/* xref: /linux/virt/kvm/guest_memfd.c (revision bf4afc53b77aeaa48b5409da5c8da6bb4eff7f43) */
// SPDX-License-Identifier: GPL-2.0
#include <linux/anon_inodes.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/kvm_host.h>
#include <linux/mempolicy.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>

#include "kvm_mm.h"

static struct vfsmount *kvm_gmem_mnt;

/*
 * A guest_memfd instance can be associated with multiple VMs, each with its
 * own "view" of the underlying physical memory.
 *
 * The gmem's inode is effectively the raw underlying physical storage, and is
 * used to track properties of the physical memory, while each gmem file is
 * effectively a single VM's view of that storage, and is used to track assets
 * specific to its associated VM, e.g. memslots=>gmem bindings.
 */
struct gmem_file {
	struct kvm *kvm;
	struct xarray bindings;
	struct list_head entry;
};

struct gmem_inode {
	struct shared_policy policy;
	struct inode vfs_inode;

	u64 flags;
};
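
/*
 * Illustrative restatement of the comment above (no new state, just the
 * resulting object graph): two VMs sharing one guest_memfd, each through its
 * own file.
 *
 *   VM A: gmem_file { kvm = A, bindings } --\
 *                                            +--> gmem_inode { policy, flags }
 *   VM B: gmem_file { kvm = B, bindings } --/        (the physical memory)
 *
 * The files are chained off the inode's i_mapping->i_private_list via
 * gmem_file.entry, which is what kvm_gmem_for_each_file() below walks.
 */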

static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
{
	return container_of(inode, struct gmem_inode, vfs_inode);
}

#define kvm_gmem_for_each_file(f, mapping) \
	list_for_each_entry(f, &(mapping)->i_private_list, entry)

/**
 * folio_file_pfn - like folio_file_page, but return a pfn.
 * @folio: The folio which contains this index.
 * @index: The index we want to look up.
 *
 * Return: The pfn for this index.
 */
static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
{
	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
}

static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
{
	return gfn - slot->base_gfn + slot->gmem.pgoff;
}
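
/*
 * Worked example for the index math above, with hypothetical numbers: a
 * memslot with base_gfn = 0x100 bound at gmem.pgoff = 0x20 maps gfn 0x105 to
 * file index 0x105 - 0x100 + 0x20 = 0x25, i.e. byte offset 0x25 << PAGE_SHIFT
 * into the guest_memfd instance.
 */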

static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
				    pgoff_t index, struct folio *folio)
{
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
	kvm_pfn_t pfn = folio_file_pfn(folio, index);
	gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
	int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));
	if (rc) {
		pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
				    index, gfn, pfn, rc);
		return rc;
	}
#endif

	return 0;
}

/*
 * Process @folio, which contains @gfn, so that the guest can use it.
 * The folio must be locked and the gfn must be contained in @slot.
 * On successful return the guest sees a zero page so as to avoid
 * leaking host data and the up-to-date flag is set.
 */
static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
				  gfn_t gfn, struct folio *folio)
{
	pgoff_t index;

	/*
	 * Preparing huge folios should always be safe, since it should
	 * be possible to split them later if needed.
	 *
	 * Right now the folio order is always going to be zero, but the
	 * code is ready for huge folios.  The only assumption is that
	 * the base pgoff of memslots is naturally aligned with the
	 * requested page order, ensuring that huge folios can also use
	 * huge page table entries for GPA->HPA mapping.
	 *
	 * The order will be passed when creating the guest_memfd, and
	 * checked when creating memslots.
	 */
	WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
	index = kvm_gmem_get_index(slot, gfn);
	index = ALIGN_DOWN(index, folio_nr_pages(folio));

	return __kvm_gmem_prepare_folio(kvm, slot, index, folio);
}

/*
 * Returns a locked folio on success.  The caller is responsible for
 * setting the up-to-date flag before the memory is mapped into the guest.
 * There is no backing storage for the memory, so the folio will remain
 * up-to-date until it's removed.
 *
 * Ignore accessed, referenced, and dirty flags.  The memory is
 * unevictable and there is no storage to write back to.
 */
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
	/* TODO: Support huge pages. */
	struct mempolicy *policy;
	struct folio *folio;

	/*
	 * Fast-path: See if folio is already present in mapping to avoid
	 * policy_lookup.
	 */
	folio = __filemap_get_folio(inode->i_mapping, index,
				    FGP_LOCK | FGP_ACCESSED, 0);
	if (!IS_ERR(folio))
		return folio;

	policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
	folio = __filemap_get_folio_mpol(inode->i_mapping, index,
					 FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
					 mapping_gfp_mask(inode->i_mapping), policy);
	mpol_cond_put(policy);

	/*
	 * External interfaces like kvm_gmem_get_pfn() support dealing
	 * with hugepages to a degree, but internally, guest_memfd currently
	 * assumes that all folios are order-0 and handling would need
	 * to be updated for anything otherwise (e.g. page-clearing
	 * operations).
	 */
	WARN_ON_ONCE(!IS_ERR(folio) && folio_order(folio));

	return folio;
}

static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
{
	if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)
		return KVM_FILTER_SHARED;

	return KVM_FILTER_PRIVATE;
}

static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start,
					pgoff_t end,
					enum kvm_gfn_range_filter attr_filter)
{
	bool flush = false, found_memslot = false;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = f->kvm;
	unsigned long index;

	xa_for_each_range(&f->bindings, index, slot, start, end - 1) {
		pgoff_t pgoff = slot->gmem.pgoff;

		struct kvm_gfn_range gfn_range = {
			.start = slot->base_gfn + max(pgoff, start) - pgoff,
			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
			.slot = slot,
			.may_block = true,
			.attr_filter = attr_filter,
		};

		if (!found_memslot) {
			found_memslot = true;

			KVM_MMU_LOCK(kvm);
			kvm_mmu_invalidate_begin(kvm);
		}

		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
	}

	if (flush)
		kvm_flush_remote_tlbs(kvm);

	if (found_memslot)
		KVM_MMU_UNLOCK(kvm);
}
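
/*
 * Worked example for the gfn_range clamping above, with hypothetical numbers:
 * a binding at pgoff 0x40 with npages = 0x10 and base_gfn = 0x1000,
 * invalidated over [start, end) = [0x44, 0x100), yields
 * gfn_range.start = 0x1000 + max(0x40, 0x44) - 0x40 = 0x1004 and
 * gfn_range.end   = 0x1000 + min(0x50, 0x100) - 0x40 = 0x1010,
 * i.e. the unmap is clamped to the overlap between the invalidated range and
 * the pages the memslot actually binds.
 */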

static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start,
				      pgoff_t end)
{
	enum kvm_gfn_range_filter attr_filter;
	struct gmem_file *f;

	attr_filter = kvm_gmem_get_invalidate_filter(inode);

	kvm_gmem_for_each_file(f, inode->i_mapping)
		__kvm_gmem_invalidate_begin(f, start, end, attr_filter);
}

static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start,
				      pgoff_t end)
{
	struct kvm *kvm = f->kvm;

	if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
		KVM_MMU_LOCK(kvm);
		kvm_mmu_invalidate_end(kvm);
		KVM_MMU_UNLOCK(kvm);
	}
}

static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start,
				    pgoff_t end)
{
	struct gmem_file *f;

	kvm_gmem_for_each_file(f, inode->i_mapping)
		__kvm_gmem_invalidate_end(f, start, end);
}

static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	pgoff_t start = offset >> PAGE_SHIFT;
	pgoff_t end = (offset + len) >> PAGE_SHIFT;

	/*
	 * Bindings must be stable across invalidation to ensure the start+end
	 * are balanced.
	 */
	filemap_invalidate_lock(inode->i_mapping);

	kvm_gmem_invalidate_begin(inode, start, end);

	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);

	kvm_gmem_invalidate_end(inode, start, end);

	filemap_invalidate_unlock(inode->i_mapping);

	return 0;
}

static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t start, index, end;
	int r;

	/* Dedicated guest is immutable by default. */
	if (offset + len > i_size_read(inode))
		return -EINVAL;

	filemap_invalidate_lock_shared(mapping);

	start = offset >> PAGE_SHIFT;
	end = (offset + len) >> PAGE_SHIFT;

	r = 0;
	for (index = start; index < end; ) {
		struct folio *folio;

		if (signal_pending(current)) {
			r = -EINTR;
			break;
		}

		folio = kvm_gmem_get_folio(inode, index);
		if (IS_ERR(folio)) {
			r = PTR_ERR(folio);
			break;
		}

		index = folio_next_index(folio);

		folio_unlock(folio);
		folio_put(folio);

		/* 64-bit only, wrapping the index should be impossible. */
		if (WARN_ON_ONCE(!index))
			break;

		cond_resched();
	}

	filemap_invalidate_unlock_shared(mapping);

	return r;
}

static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
			       loff_t len)
{
	int ret;

	if (!(mode & FALLOC_FL_KEEP_SIZE))
		return -EOPNOTSUPP;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
		return -EINVAL;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
	else
		ret = kvm_gmem_allocate(file_inode(file), offset, len);

	if (!ret)
		file_modified(file);
	return ret;
}
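
/*
 * Illustrative userspace sketch (not part of this kernel file): punching a
 * hole in a guest_memfd.  Per the checks above, FALLOC_FL_KEEP_SIZE is
 * mandatory (the file never grows or shrinks) and offset/len must be
 * page-aligned.  Assumes <fcntl.h> and a gmem_fd from KVM_CREATE_GUEST_MEMFD.
 */
#if 0
static int punch_hole_2m(int gmem_fd, off_t offset)
{
	/* Frees the backing pages and zaps any guest mappings of the range. */
	return fallocate(gmem_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			 offset, 0x200000);
}
#endif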

static int kvm_gmem_release(struct inode *inode, struct file *file)
{
	struct gmem_file *f = file->private_data;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = f->kvm;
	unsigned long index;

	/*
	 * Prevent concurrent attempts to *unbind* a memslot.  This is the last
	 * reference to the file and thus no new bindings can be created, but
	 * dereferencing the slot for existing bindings needs to be protected
	 * against memslot updates, specifically so that unbind doesn't race
	 * and free the memslot (kvm_gmem_get_file() will return NULL).
	 *
	 * Since .release is called only when the reference count is zero,
	 * after which file_ref_get() and get_file_active() fail,
	 * kvm_gmem_get_pfn() cannot be using the file concurrently.
	 * file_ref_put() provides a full barrier, and get_file_active() the
	 * matching acquire barrier.
	 */
	mutex_lock(&kvm->slots_lock);

	filemap_invalidate_lock(inode->i_mapping);

	xa_for_each(&f->bindings, index, slot)
		WRITE_ONCE(slot->gmem.file, NULL);

	/*
	 * All in-flight operations are gone and new bindings can be created.
	 * Zap all SPTEs pointed at by this file.  Do not free the backing
	 * memory, as its lifetime is associated with the inode, not the file.
	 */
	__kvm_gmem_invalidate_begin(f, 0, -1ul,
				    kvm_gmem_get_invalidate_filter(inode));
	__kvm_gmem_invalidate_end(f, 0, -1ul);

	list_del(&f->entry);

	filemap_invalidate_unlock(inode->i_mapping);

	mutex_unlock(&kvm->slots_lock);

	xa_destroy(&f->bindings);
	kfree(f);

	kvm_put_kvm(kvm);

	return 0;
}

static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
{
	/*
	 * Do not return slot->gmem.file if it has already been closed;
	 * there might be some time between the last fput() and when
	 * kvm_gmem_release() clears slot->gmem.file.
	 */
	return get_file_active(&slot->gmem.file);
}

DEFINE_CLASS(gmem_get_file, struct file *, if (_T) fput(_T),
	     kvm_gmem_get_file(slot), struct kvm_memory_slot *slot);

static bool kvm_gmem_supports_mmap(struct inode *inode)
{
	return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP;
}

static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct folio *folio;
	vm_fault_t ret = VM_FAULT_LOCKED;

	if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
		return VM_FAULT_SIGBUS;

	if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
		return VM_FAULT_SIGBUS;

	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
	if (IS_ERR(folio)) {
		if (PTR_ERR(folio) == -EAGAIN)
			return VM_FAULT_RETRY;

		return vmf_error(PTR_ERR(folio));
	}

	if (WARN_ON_ONCE(folio_test_large(folio))) {
		ret = VM_FAULT_SIGBUS;
		goto out_folio;
	}

	if (!folio_test_uptodate(folio)) {
		clear_highpage(folio_page(folio, 0));
		folio_mark_uptodate(folio);
	}

	vmf->page = folio_file_page(folio, vmf->pgoff);

out_folio:
	if (ret != VM_FAULT_LOCKED) {
		folio_unlock(folio);
		folio_put(folio);
	}

	return ret;
}

#ifdef CONFIG_NUMA
static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
{
	struct inode *inode = file_inode(vma->vm_file);

	return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol);
}

static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
					     unsigned long addr, pgoff_t *pgoff)
{
	struct inode *inode = file_inode(vma->vm_file);

	*pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);

	/*
	 * Return the memory policy for this index, or NULL if none is set.
	 *
	 * Returning NULL, e.g. instead of the current task's memory policy, is
	 * important for the .get_policy kernel ABI: it indicates that no
	 * explicit policy has been set via mbind() for this memory. The caller
	 * can then replace NULL with the default memory policy instead of the
	 * current task's memory policy.
	 */
	return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
}
#endif /* CONFIG_NUMA */
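
/*
 * Illustrative userspace sketch (not part of this kernel file): an explicit
 * NUMA policy reaches kvm_gmem_set_policy() above via mbind() on a shared
 * mapping of the guest_memfd, which requires GUEST_MEMFD_FLAG_MMAP (and
 * GUEST_MEMFD_FLAG_INIT_SHARED for the pages to be faultable).  Assumes
 * <sys/mman.h> and <numaif.h>.
 */
#if 0
static int bind_gmem_to_node0(int gmem_fd, size_t size)
{
	unsigned long nodemask = 1UL << 0;	/* NUMA node 0 */
	void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
			 gmem_fd, 0);

	if (mem == MAP_FAILED)
		return -1;

	/* Stored in the inode's shared_policy, so it applies to all views. */
	return mbind(mem, size, MPOL_BIND, &nodemask, 2, 0);
}
#endif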

static const struct vm_operations_struct kvm_gmem_vm_ops = {
	.fault		= kvm_gmem_fault_user_mapping,
#ifdef CONFIG_NUMA
	.get_policy	= kvm_gmem_get_policy,
	.set_policy	= kvm_gmem_set_policy,
#endif
};

static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (!kvm_gmem_supports_mmap(file_inode(file)))
		return -ENODEV;

	if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
	    (VM_SHARED | VM_MAYSHARE)) {
		return -EINVAL;
	}

	vma->vm_ops = &kvm_gmem_vm_ops;

	return 0;
}

static struct file_operations kvm_gmem_fops = {
	.mmap		= kvm_gmem_mmap,
	.open		= generic_file_open,
	.release	= kvm_gmem_release,
	.fallocate	= kvm_gmem_fallocate,
};

static int kvm_gmem_migrate_folio(struct address_space *mapping,
				  struct folio *dst, struct folio *src,
				  enum migrate_mode mode)
{
	WARN_ON_ONCE(1);
	return -EINVAL;
}

static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
	pgoff_t start, end;

	filemap_invalidate_lock_shared(mapping);

	start = folio->index;
	end = start + folio_nr_pages(folio);

	kvm_gmem_invalidate_begin(mapping->host, start, end);

	/*
	 * Do not truncate the range; what action is taken in response to the
	 * error is userspace's decision (assuming the architecture supports
	 * gracefully handling memory errors).  If/when the guest attempts to
	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
	 * at which point KVM can either terminate the VM or propagate the
	 * error to userspace.
	 */

	kvm_gmem_invalidate_end(mapping->host, start, end);

	filemap_invalidate_unlock_shared(mapping);

	return MF_DELAYED;
}

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
static void kvm_gmem_free_folio(struct folio *folio)
{
	struct page *page = folio_page(folio, 0);
	kvm_pfn_t pfn = page_to_pfn(page);
	int order = folio_order(folio);

	kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
}
#endif

static const struct address_space_operations kvm_gmem_aops = {
	.dirty_folio = noop_dirty_folio,
	.migrate_folio	= kvm_gmem_migrate_folio,
	.error_remove_folio = kvm_gmem_error_folio,
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
	.free_folio = kvm_gmem_free_folio,
#endif
};

static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			    struct iattr *attr)
{
	return -EINVAL;
}
static const struct inode_operations kvm_gmem_iops = {
	.setattr	= kvm_gmem_setattr,
};

bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
{
	return true;
}

static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
	static const char *name = "[kvm-gmem]";
	struct gmem_file *f;
	struct inode *inode;
	struct file *file;
	int fd, err;

	fd = get_unused_fd_flags(0);
	if (fd < 0)
		return fd;

	f = kzalloc_obj(*f);
	if (!f) {
		err = -ENOMEM;
		goto err_fd;
	}

	/* __fput() will take care of fops_put(). */
	if (!fops_get(&kvm_gmem_fops)) {
		err = -ENOENT;
		goto err_gmem;
	}

	inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		goto err_fops;
	}

	inode->i_op = &kvm_gmem_iops;
	inode->i_mapping->a_ops = &kvm_gmem_aops;
	inode->i_mode |= S_IFREG;
	inode->i_size = size;
	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
	mapping_set_inaccessible(inode->i_mapping);
	/* Unmovable mappings are supposed to be marked unevictable as well. */
	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

	GMEM_I(inode)->flags = flags;

	file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
	if (IS_ERR(file)) {
		err = PTR_ERR(file);
		goto err_inode;
	}

	file->f_flags |= O_LARGEFILE;
	file->private_data = f;

	kvm_get_kvm(kvm);
	f->kvm = kvm;
	xa_init(&f->bindings);
	list_add(&f->entry, &inode->i_mapping->i_private_list);

	fd_install(fd, file);
	return fd;

err_inode:
	iput(inode);
err_fops:
	fops_put(&kvm_gmem_fops);
err_gmem:
	kfree(f);
err_fd:
	put_unused_fd(fd);
	return err;
}

int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
	loff_t size = args->size;
	u64 flags = args->flags;

	if (flags & ~kvm_gmem_get_supported_flags(kvm))
		return -EINVAL;

	if (size <= 0 || !PAGE_ALIGNED(size))
		return -EINVAL;

	return __kvm_gmem_create(kvm, size, flags);
}
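
/*
 * Illustrative userspace sketch (not part of this kernel file): creating a
 * guest_memfd with the KVM_CREATE_GUEST_MEMFD ioctl on a VM fd.  Flags the
 * kernel does not report as supported are rejected with -EINVAL above.
 * Assumes <stdint.h>, <sys/ioctl.h> and <linux/kvm.h>.
 */
#if 0
static int create_gmem(int vm_fd, uint64_t size, uint64_t flags)
{
	struct kvm_create_guest_memfd args = {
		.size = size,	/* must be nonzero and page-aligned */
		.flags = flags,
	};

	/* Returns a new guest_memfd file descriptor on success. */
	return ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);
}
#endif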

int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
		  unsigned int fd, loff_t offset)
{
	loff_t size = slot->npages << PAGE_SHIFT;
	unsigned long start, end;
	struct gmem_file *f;
	struct inode *inode;
	struct file *file;
	int r = -EINVAL;

	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));

	file = fget(fd);
	if (!file)
		return -EBADF;

	if (file->f_op != &kvm_gmem_fops)
		goto err;

	f = file->private_data;
	if (f->kvm != kvm)
		goto err;

	inode = file_inode(file);

	if (offset < 0 || !PAGE_ALIGNED(offset) ||
	    offset + size > i_size_read(inode))
		goto err;

	filemap_invalidate_lock(inode->i_mapping);

	start = offset >> PAGE_SHIFT;
	end = start + slot->npages;

	if (!xa_empty(&f->bindings) &&
	    xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
		filemap_invalidate_unlock(inode->i_mapping);
		goto err;
	}

	/*
	 * Memslots with the KVM_MEM_GUEST_MEMFD flag are immutable once
	 * created, so kvm_gmem_bind() must occur on a new memslot.  Because
	 * the memslot is not visible yet, kvm_gmem_get_pfn() is guaranteed to
	 * see the file.
	 */
	WRITE_ONCE(slot->gmem.file, file);
	slot->gmem.pgoff = start;
	if (kvm_gmem_supports_mmap(inode))
		slot->flags |= KVM_MEMSLOT_GMEM_ONLY;

	xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
	filemap_invalidate_unlock(inode->i_mapping);

	/*
	 * Drop the reference to the file, even on success.  The file pins KVM,
	 * not the other way 'round.  Active bindings are invalidated if the
	 * file is closed before memslots are destroyed.
	 */
	r = 0;
err:
	fput(file);
	return r;
}
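
/*
 * Illustrative userspace sketch (not part of this kernel file):
 * kvm_gmem_bind() is reached by installing a memslot with the
 * KVM_MEM_GUEST_MEMFD flag via KVM_SET_USER_MEMORY_REGION2.  Assumes
 * <stdint.h>, <sys/ioctl.h> and <linux/kvm.h>.
 */
#if 0
static int bind_gmem_slot(int vm_fd, int gmem_fd, uint64_t gpa, uint64_t size)
{
	struct kvm_userspace_memory_region2 region = {
		.slot = 0,
		.flags = KVM_MEM_GUEST_MEMFD,
		.guest_phys_addr = gpa,
		.memory_size = size,
		.guest_memfd = gmem_fd,
		.guest_memfd_offset = 0,	/* must be page-aligned */
	};

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
}
#endif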

static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct gmem_file *f)
{
	unsigned long start = slot->gmem.pgoff;
	unsigned long end = start + slot->npages;

	xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL);

	/*
	 * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
	 * cannot see this memslot.
	 */
	WRITE_ONCE(slot->gmem.file, NULL);
}

void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
	/*
	 * Nothing to do if the underlying file was _already_ closed, as
	 * kvm_gmem_release() invalidates and nullifies all bindings.
	 */
	if (!slot->gmem.file)
		return;

	CLASS(gmem_get_file, file)(slot);

	/*
	 * However, if the file is _being_ closed, then the bindings need to be
	 * removed as kvm_gmem_release() might not run until after the memslot
	 * is freed.  Note, modifying the bindings is safe even though the file
	 * is dying as kvm_gmem_release() nullifies slot->gmem.file under
	 * slots_lock, and only puts its reference to KVM after destroying all
	 * bindings.  I.e. reaching this point means kvm_gmem_release() hasn't
	 * yet destroyed the bindings or freed the gmem_file, and can't do so
	 * until the caller drops slots_lock.
	 */
	if (!file) {
		__kvm_gmem_unbind(slot, slot->gmem.file->private_data);
		return;
	}

	filemap_invalidate_lock(file->f_mapping);
	__kvm_gmem_unbind(slot, file->private_data);
	filemap_invalidate_unlock(file->f_mapping);
}

/* Returns a locked folio on success.  */
static struct folio *__kvm_gmem_get_pfn(struct file *file,
					struct kvm_memory_slot *slot,
					pgoff_t index, kvm_pfn_t *pfn,
					int *max_order)
{
	struct file *slot_file = READ_ONCE(slot->gmem.file);
	struct gmem_file *f = file->private_data;
	struct folio *folio;

	if (file != slot_file) {
		WARN_ON_ONCE(slot_file);
		return ERR_PTR(-EFAULT);
	}

	if (xa_load(&f->bindings, index) != slot) {
		WARN_ON_ONCE(xa_load(&f->bindings, index));
		return ERR_PTR(-EIO);
	}

	folio = kvm_gmem_get_folio(file_inode(file), index);
	if (IS_ERR(folio))
		return folio;

	if (folio_test_hwpoison(folio)) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(-EHWPOISON);
	}

	*pfn = folio_file_pfn(folio, index);
	if (max_order)
		*max_order = 0;

	return folio;
}

int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
		     gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
		     int *max_order)
{
	pgoff_t index = kvm_gmem_get_index(slot, gfn);
	struct folio *folio;
	int r = 0;

	CLASS(gmem_get_file, file)(slot);
	if (!file)
		return -EFAULT;

	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	if (!folio_test_uptodate(folio)) {
		clear_highpage(folio_page(folio, 0));
		folio_mark_uptodate(folio);
	}

	r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);

	folio_unlock(folio);

	if (!r)
		*page = folio_file_page(folio, index);
	else
		folio_put(folio);

	return r;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE

static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
				struct file *file, gfn_t gfn, struct page *src_page,
				kvm_gmem_populate_cb post_populate, void *opaque)
{
	pgoff_t index = kvm_gmem_get_index(slot, gfn);
	struct folio *folio;
	kvm_pfn_t pfn;
	int ret;

	filemap_invalidate_lock(file->f_mapping);

	folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, NULL);
	if (IS_ERR(folio)) {
		ret = PTR_ERR(folio);
		goto out_unlock;
	}

	folio_unlock(folio);

	if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1,
					     KVM_MEMORY_ATTRIBUTE_PRIVATE,
					     KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
		ret = -EINVAL;
		goto out_put_folio;
	}

	ret = post_populate(kvm, gfn, pfn, src_page, opaque);
	if (!ret)
		folio_mark_uptodate(folio);

out_put_folio:
	folio_put(folio);
out_unlock:
	filemap_invalidate_unlock(file->f_mapping);
	return ret;
}

long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
		       kvm_gmem_populate_cb post_populate, void *opaque)
{
	struct kvm_memory_slot *slot;
	int ret = 0;
	long i;

	lockdep_assert_held(&kvm->slots_lock);

	if (WARN_ON_ONCE(npages <= 0))
		return -EINVAL;

	if (WARN_ON_ONCE(!PAGE_ALIGNED(src)))
		return -EINVAL;

	slot = gfn_to_memslot(kvm, start_gfn);
	if (!kvm_slot_has_gmem(slot))
		return -EINVAL;

	CLASS(gmem_get_file, file)(slot);
	if (!file)
		return -EFAULT;

	npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
	for (i = 0; i < npages; i++) {
		struct page *src_page = NULL;

		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		if (src) {
			unsigned long uaddr = (unsigned long)src + i * PAGE_SIZE;

			ret = get_user_pages_fast(uaddr, 1, 0, &src_page);
			if (ret < 0)
				break;
			if (ret != 1) {
				ret = -ENOMEM;
				break;
			}
		}

		ret = __kvm_gmem_populate(kvm, slot, file, start_gfn + i, src_page,
					  post_populate, opaque);

		if (src_page)
			put_page(src_page);

		if (ret)
			break;
	}

	return ret && !i ? ret : i;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);
#endif
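
/*
 * Illustrative sketch (not part of this kernel file): a minimal
 * kvm_gmem_populate_cb matching the post_populate() call site above, i.e.
 * assuming the callback in this tree takes (kvm, gfn, pfn, src_page, opaque).
 * A real arch callback (e.g. an SNP or TDX launch-update path) would encrypt
 * and/or measure the page here rather than do a plain copy.
 */
#if 0
static int example_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
				 struct page *src_page, void *opaque)
{
	/* Copy the staged source page into the guest_memfd page, if any. */
	if (src_page)
		copy_highpage(pfn_to_page(pfn), src_page);

	return 0;
}
#endif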

static struct kmem_cache *kvm_gmem_inode_cachep;

static void kvm_gmem_init_inode_once(void *__gi)
{
	struct gmem_inode *gi = __gi;

	/*
	 * Note!  Don't initialize the inode with anything specific to the
	 * guest_memfd instance, or that might be specific to how the inode is
	 * used (from the VFS-layer's perspective).  This hook is called only
	 * during the initial slab allocation, i.e. only fields/state that are
	 * idempotent across _all_ use of the inode _object_ can be initialized
	 * at this time!
	 */
	inode_init_once(&gi->vfs_inode);
}

static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
{
	struct gmem_inode *gi;

	gi = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL);
	if (!gi)
		return NULL;

	mpol_shared_policy_init(&gi->policy, NULL);

	gi->flags = 0;
	return &gi->vfs_inode;
}

static void kvm_gmem_destroy_inode(struct inode *inode)
{
	mpol_free_shared_policy(&GMEM_I(inode)->policy);
}

static void kvm_gmem_free_inode(struct inode *inode)
{
	kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode));
}

static const struct super_operations kvm_gmem_super_operations = {
	.statfs		= simple_statfs,
	.alloc_inode	= kvm_gmem_alloc_inode,
	.destroy_inode	= kvm_gmem_destroy_inode,
	.free_inode	= kvm_gmem_free_inode,
};

static int kvm_gmem_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx;

	if (!init_pseudo(fc, GUEST_MEMFD_MAGIC))
		return -ENOMEM;

	fc->s_iflags |= SB_I_NOEXEC;
	fc->s_iflags |= SB_I_NODEV;
	ctx = fc->fs_private;
	ctx->ops = &kvm_gmem_super_operations;

	return 0;
}

static struct file_system_type kvm_gmem_fs = {
	.name		 = "guest_memfd",
	.init_fs_context = kvm_gmem_init_fs_context,
	.kill_sb	 = kill_anon_super,
};

static int kvm_gmem_init_mount(void)
{
	kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);

	if (IS_ERR(kvm_gmem_mnt))
		return PTR_ERR(kvm_gmem_mnt);

	kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
	return 0;
}

int kvm_gmem_init(struct module *module)
{
	struct kmem_cache_args args = {
		.align = 0,
		.ctor = kvm_gmem_init_inode_once,
	};
	int ret;

	kvm_gmem_fops.owner = module;
	kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache",
						  sizeof(struct gmem_inode),
						  &args, SLAB_ACCOUNT);
	if (!kvm_gmem_inode_cachep)
		return -ENOMEM;

	ret = kvm_gmem_init_mount();
	if (ret) {
		kmem_cache_destroy(kvm_gmem_inode_cachep);
		return ret;
	}
	return 0;
}

void kvm_gmem_exit(void)
{
	kern_unmount(kvm_gmem_mnt);
	kvm_gmem_mnt = NULL;
	rcu_barrier();
	kmem_cache_destroy(kvm_gmem_inode_cachep);
}
1030