xref: /linux/virt/kvm/guest_memfd.c (revision c98d767b34574be82b74d77d02264a830ae1cadd)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/anon_inodes.h>
3 #include <linux/backing-dev.h>
4 #include <linux/falloc.h>
5 #include <linux/fs.h>
6 #include <linux/kvm_host.h>
7 #include <linux/mempolicy.h>
8 #include <linux/pseudo_fs.h>
9 #include <linux/pagemap.h>
10 
11 #include "kvm_mm.h"
12 
/* Internal mount of the guest_memfd pseudo filesystem; set by kvm_gmem_init_mount(). */
static struct vfsmount *kvm_gmem_mnt;
14 
/*
 * A guest_memfd instance can be associated with multiple VMs, each with its
 * own "view" of the underlying physical memory.
 *
 * The gmem's inode is effectively the raw underlying physical storage, and is
 * used to track properties of the physical memory, while each gmem file is
 * effectively a single VM's view of that storage, and is used to track assets
 * specific to its associated VM, e.g. memslots=>gmem bindings.
 */
struct gmem_file {
	/* VM this view belongs to; a reference is taken in __kvm_gmem_create(). */
	struct kvm *kvm;
	/* Maps file page offsets to the memslot bound at that offset. */
	struct xarray bindings;
	/* Link in the owning inode's gmem_inode.gmem_file_list. */
	struct list_head entry;
};
29 
/*
 * Per-inode guest_memfd state.  The VFS inode is embedded so that GMEM_I()
 * can recover this structure from a struct inode pointer.
 */
struct gmem_inode {
	/* Shared memory policy looked up when allocating folios. */
	struct shared_policy policy;
	/* Embedded VFS inode returned by kvm_gmem_alloc_inode(). */
	struct inode vfs_inode;
	/* List of gmem_file instances, linked through gmem_file.entry. */
	struct list_head gmem_file_list;

	/* GUEST_MEMFD_FLAG_* bits recorded by __kvm_gmem_create(). */
	u64 flags;
};
37 
38 static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
39 {
40 	return container_of(inode, struct gmem_inode, vfs_inode);
41 }
42 
/* Iterate @f over every gmem_file linked on @inode's gmem_file_list. */
#define kvm_gmem_for_each_file(f, inode) \
	list_for_each_entry(f, &GMEM_I(inode)->gmem_file_list, entry)
45 
46 /**
47  * folio_file_pfn - like folio_file_page, but return a pfn.
48  * @folio: The folio which contains this index.
49  * @index: The index we want to look up.
50  *
51  * Return: The pfn for this index.
52  */
53 static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
54 {
55 	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
56 }
57 
58 static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
59 {
60 	return gfn - slot->base_gfn + slot->gmem.pgoff;
61 }
62 
63 static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
64 				    pgoff_t index, struct folio *folio)
65 {
66 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
67 	kvm_pfn_t pfn = folio_file_pfn(folio, index);
68 	gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
69 	int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));
70 	if (rc) {
71 		pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
72 				    index, gfn, pfn, rc);
73 		return rc;
74 	}
75 #endif
76 
77 	return 0;
78 }
79 
80 /*
81  * Process @folio, which contains @gfn, so that the guest can use it.
82  * The folio must be locked and the gfn must be contained in @slot.
83  * On successful return the guest sees a zero page so as to avoid
84  * leaking host data and the up-to-date flag is set.
85  */
86 static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
87 				  gfn_t gfn, struct folio *folio)
88 {
89 	pgoff_t index;
90 
91 	/*
92 	 * Preparing huge folios should always be safe, since it should
93 	 * be possible to split them later if needed.
94 	 *
95 	 * Right now the folio order is always going to be zero, but the
96 	 * code is ready for huge folios.  The only assumption is that
97 	 * the base pgoff of memslots is naturally aligned with the
98 	 * requested page order, ensuring that huge folios can also use
99 	 * huge page table entries for GPA->HPA mapping.
100 	 *
101 	 * The order will be passed when creating the guest_memfd, and
102 	 * checked when creating memslots.
103 	 */
104 	WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
105 	index = kvm_gmem_get_index(slot, gfn);
106 	index = ALIGN_DOWN(index, folio_nr_pages(folio));
107 
108 	return __kvm_gmem_prepare_folio(kvm, slot, index, folio);
109 }
110 
111 /*
112  * Returns a locked folio on success.  The caller is responsible for
113  * setting the up-to-date flag before the memory is mapped into the guest.
114  * There is no backing storage for the memory, so the folio will remain
115  * up-to-date until it's removed.
116  *
117  * Ignore accessed, referenced, and dirty flags.  The memory is
118  * unevictable and there is no storage to write back to.
119  */
120 static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
121 {
122 	/* TODO: Support huge pages. */
123 	struct mempolicy *policy;
124 	struct folio *folio;
125 
126 	/*
127 	 * Fast-path: See if folio is already present in mapping to avoid
128 	 * policy_lookup.
129 	 */
130 	folio = filemap_lock_folio(inode->i_mapping, index);
131 	if (!IS_ERR(folio))
132 		return folio;
133 
134 	policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
135 	folio = __filemap_get_folio_mpol(inode->i_mapping, index,
136 					 FGP_LOCK | FGP_CREAT,
137 					 mapping_gfp_mask(inode->i_mapping), policy);
138 	mpol_cond_put(policy);
139 
140 	/*
141 	 * External interfaces like kvm_gmem_get_pfn() support dealing
142 	 * with hugepages to a degree, but internally, guest_memfd currently
143 	 * assumes that all folios are order-0 and handling would need
144 	 * to be updated for anything otherwise (e.g. page-clearing
145 	 * operations).
146 	 */
147 	WARN_ON_ONCE(!IS_ERR(folio) && folio_order(folio));
148 
149 	return folio;
150 }
151 
152 static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
153 {
154 	if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)
155 		return KVM_FILTER_SHARED;
156 
157 	return KVM_FILTER_PRIVATE;
158 }
159 
/*
 * Begin invalidation of the page-offset range [@start, @end) for one file:
 * unmap the corresponding gfn range of every memslot bound to @f inside that
 * range, flushing remote TLBs if any unmap requested it.  If no binding
 * intersects the range, KVM's MMU lock is never taken and
 * kvm_mmu_invalidate_start() is not called.
 */
static void __kvm_gmem_invalidate_start(struct gmem_file *f, pgoff_t start,
					pgoff_t end,
					enum kvm_gfn_range_filter attr_filter)
{
	bool flush = false, found_memslot = false;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = f->kvm;
	unsigned long index;

	xa_for_each_range(&f->bindings, index, slot, start, end - 1) {
		pgoff_t pgoff = slot->gmem.pgoff;

		/* Clamp the file range to this slot and convert it to gfns. */
		struct kvm_gfn_range gfn_range = {
			.start = slot->base_gfn + max(pgoff, start) - pgoff,
			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
			.slot = slot,
			.may_block = true,
			.attr_filter = attr_filter,
		};

		/* Take the MMU lock lazily, on the first binding found. */
		if (!found_memslot) {
			found_memslot = true;

			KVM_MMU_LOCK(kvm);
			kvm_mmu_invalidate_start(kvm);
		}

		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
	}

	if (flush)
		kvm_flush_remote_tlbs(kvm);

	/* Only unlock if the loop above actually took the lock. */
	if (found_memslot)
		KVM_MMU_UNLOCK(kvm);
}
196 
197 static void kvm_gmem_invalidate_start(struct inode *inode, pgoff_t start,
198 				      pgoff_t end)
199 {
200 	enum kvm_gfn_range_filter attr_filter;
201 	struct gmem_file *f;
202 
203 	attr_filter = kvm_gmem_get_invalidate_filter(inode);
204 
205 	kvm_gmem_for_each_file(f, inode)
206 		__kvm_gmem_invalidate_start(f, start, end, attr_filter);
207 }
208 
209 static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start,
210 				      pgoff_t end)
211 {
212 	struct kvm *kvm = f->kvm;
213 
214 	if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
215 		KVM_MMU_LOCK(kvm);
216 		kvm_mmu_invalidate_end(kvm);
217 		KVM_MMU_UNLOCK(kvm);
218 	}
219 }
220 
221 static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start,
222 				    pgoff_t end)
223 {
224 	struct gmem_file *f;
225 
226 	kvm_gmem_for_each_file(f, inode)
227 		__kvm_gmem_invalidate_end(f, start, end);
228 }
229 
230 static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
231 {
232 	pgoff_t start = offset >> PAGE_SHIFT;
233 	pgoff_t end = (offset + len) >> PAGE_SHIFT;
234 
235 	/*
236 	 * Bindings must be stable across invalidation to ensure the start+end
237 	 * are balanced.
238 	 */
239 	filemap_invalidate_lock(inode->i_mapping);
240 
241 	kvm_gmem_invalidate_start(inode, start, end);
242 
243 	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
244 
245 	kvm_gmem_invalidate_end(inode, start, end);
246 
247 	filemap_invalidate_unlock(inode->i_mapping);
248 
249 	return 0;
250 }
251 
/*
 * Allocate folios for the byte range [@offset, @offset + @len).
 *
 * Returns 0 on success, -EINVAL if the range extends past the inode size,
 * -EINTR if a signal is pending, or the error from kvm_gmem_get_folio().
 * Folios allocated before an error are left in place.
 */
static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t start, index, end;
	int r;

	/* Dedicated guest is immutable by default. */
	if (offset + len > i_size_read(inode))
		return -EINVAL;

	filemap_invalidate_lock_shared(mapping);

	start = offset >> PAGE_SHIFT;
	end = (offset + len) >> PAGE_SHIFT;

	r = 0;
	for (index = start; index < end; ) {
		struct folio *folio;

		if (signal_pending(current)) {
			r = -EINTR;
			break;
		}

		folio = kvm_gmem_get_folio(inode, index);
		if (IS_ERR(folio)) {
			r = PTR_ERR(folio);
			break;
		}

		/* Advance past the whole folio, not just one page. */
		index = folio_next_index(folio);

		/* The folio now lives in the page cache; drop our lock and ref. */
		folio_unlock(folio);
		folio_put(folio);

		/* 64-bit only, wrapping the index should be impossible. */
		if (WARN_ON_ONCE(!index))
			break;

		cond_resched();
	}

	filemap_invalidate_unlock_shared(mapping);

	return r;
}
298 
299 static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
300 			       loff_t len)
301 {
302 	int ret;
303 
304 	if (!(mode & FALLOC_FL_KEEP_SIZE))
305 		return -EOPNOTSUPP;
306 
307 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
308 		return -EOPNOTSUPP;
309 
310 	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
311 		return -EINVAL;
312 
313 	if (mode & FALLOC_FL_PUNCH_HOLE)
314 		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
315 	else
316 		ret = kvm_gmem_allocate(file_inode(file), offset, len);
317 
318 	if (!ret)
319 		file_modified(file);
320 	return ret;
321 }
322 
/*
 * .release handler: tear down one VM's view of the guest_memfd.  Clears the
 * file pointer in every bound memslot, zaps the guest mappings of the whole
 * file, unlinks the gmem_file from its inode, frees it, and finally drops
 * the reference on the VM taken at creation.  Always returns 0.
 */
static int kvm_gmem_release(struct inode *inode, struct file *file)
{
	struct gmem_file *f = file->private_data;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = f->kvm;
	unsigned long index;

	/*
	 * Prevent concurrent attempts to *unbind* a memslot.  This is the last
	 * reference to the file and thus no new bindings can be created, but
	 * dereferencing the slot for existing bindings needs to be protected
	 * against memslot updates, specifically so that unbind doesn't race
	 * and free the memslot (kvm_gmem_get_file() will return NULL).
	 *
	 * Since .release is called only when the reference count is zero,
	 * after which file_ref_get() and get_file_active() fail,
	 * kvm_gmem_get_pfn() cannot be using the file concurrently.
	 * file_ref_put() provides a full barrier, and get_file_active() the
	 * matching acquire barrier.
	 */
	mutex_lock(&kvm->slots_lock);

	filemap_invalidate_lock(inode->i_mapping);

	/* Detach every bound memslot from this (dying) file. */
	xa_for_each(&f->bindings, index, slot)
		WRITE_ONCE(slot->gmem.file, NULL);

	/*
	 * All in-flight operations are gone and new bindings can be created.
	 * Zap all SPTEs pointed at by this file.  Do not free the backing
	 * memory, as its lifetime is associated with the inode, not the file.
	 */
	__kvm_gmem_invalidate_start(f, 0, -1ul,
				    kvm_gmem_get_invalidate_filter(inode));
	__kvm_gmem_invalidate_end(f, 0, -1ul);

	list_del(&f->entry);

	filemap_invalidate_unlock(inode->i_mapping);

	mutex_unlock(&kvm->slots_lock);

	xa_destroy(&f->bindings);
	kfree(f);

	/* Drop the VM reference last; @kvm is not touched after this. */
	kvm_put_kvm(kvm);

	return 0;
}
372 
/*
 * Get a reference to the guest_memfd file bound to @slot via
 * get_file_active().  Callers in this file treat a NULL result as "no usable
 * file" and fput() a non-NULL one when done.
 */
static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
{
	/*
	 * Do not return slot->gmem.file if it has already been closed;
	 * there might be some time between the last fput() and when
	 * kvm_gmem_release() clears slot->gmem.file.
	 */
	return get_file_active(&slot->gmem.file);
}
382 
/*
 * Scope-based wrapper around kvm_gmem_get_file(): CLASS(gmem_get_file, file)(slot)
 * declares "file", and a non-NULL file is fput() when it goes out of scope.
 */
DEFINE_CLASS(gmem_get_file, struct file *, if (_T) fput(_T),
	     kvm_gmem_get_file(slot), struct kvm_memory_slot *slot);
385 
386 static bool kvm_gmem_supports_mmap(struct inode *inode)
387 {
388 	return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP;
389 }
390 
391 static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
392 {
393 	struct inode *inode = file_inode(vmf->vma->vm_file);
394 	struct folio *folio;
395 	vm_fault_t ret = VM_FAULT_LOCKED;
396 
397 	if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
398 		return VM_FAULT_SIGBUS;
399 
400 	if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
401 		return VM_FAULT_SIGBUS;
402 
403 	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
404 	if (IS_ERR(folio)) {
405 		if (PTR_ERR(folio) == -EAGAIN)
406 			return VM_FAULT_RETRY;
407 
408 		return vmf_error(PTR_ERR(folio));
409 	}
410 
411 	if (WARN_ON_ONCE(folio_test_large(folio))) {
412 		ret = VM_FAULT_SIGBUS;
413 		goto out_folio;
414 	}
415 
416 	if (!folio_test_uptodate(folio)) {
417 		clear_highpage(folio_page(folio, 0));
418 		folio_mark_uptodate(folio);
419 	}
420 
421 	vmf->page = folio_file_page(folio, vmf->pgoff);
422 
423 out_folio:
424 	if (ret != VM_FAULT_LOCKED) {
425 		folio_unlock(folio);
426 		folio_put(folio);
427 	}
428 
429 	return ret;
430 }
431 
432 #ifdef CONFIG_NUMA
433 static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
434 {
435 	struct inode *inode = file_inode(vma->vm_file);
436 
437 	return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol);
438 }
439 
440 static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
441 					     unsigned long addr, pgoff_t *ilx)
442 {
443 	pgoff_t pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
444 	struct inode *inode = file_inode(vma->vm_file);
445 
446 	*ilx = inode->i_ino;
447 
448 	/*
449 	 * Return the memory policy for this index, or NULL if none is set.
450 	 *
451 	 * Returning NULL, e.g. instead of the current task's memory policy, is
452 	 * important for the .get_policy kernel ABI: it indicates that no
453 	 * explicit policy has been set via mbind() for this memory. The caller
454 	 * can then replace NULL with the default memory policy instead of the
455 	 * current task's memory policy.
456 	 */
457 	return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, pgoff);
458 }
459 #endif /* CONFIG_NUMA */
460 
/* VMA operations installed by kvm_gmem_mmap(); policy hooks only with NUMA. */
static const struct vm_operations_struct kvm_gmem_vm_ops = {
	.fault		= kvm_gmem_fault_user_mapping,
#ifdef CONFIG_NUMA
	.get_policy	= kvm_gmem_get_policy,
	.set_policy	= kvm_gmem_set_policy,
#endif
};
468 
469 static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
470 {
471 	if (!kvm_gmem_supports_mmap(file_inode(file)))
472 		return -ENODEV;
473 
474 	if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
475 	    (VM_SHARED | VM_MAYSHARE)) {
476 		return -EINVAL;
477 	}
478 
479 	vma->vm_ops = &kvm_gmem_vm_ops;
480 
481 	return 0;
482 }
483 
/*
 * File operations of a guest_memfd file.  Not const: kvm_gmem_init() fills
 * in .owner at runtime.  The address of this table is also what
 * kvm_gmem_bind() uses to recognise a guest_memfd file.
 */
static struct file_operations kvm_gmem_fops = {
	.mmap		= kvm_gmem_mmap,
	.open		= generic_file_open,
	.release	= kvm_gmem_release,
	.fallocate	= kvm_gmem_fallocate,
};
490 
/*
 * .migrate_folio handler.  Being called at all is treated as a bug: warn
 * once and refuse the migration with -EINVAL.
 */
static int kvm_gmem_migrate_folio(struct address_space *mapping,
				  struct folio *dst, struct folio *src,
				  enum migrate_mode mode)
{
	WARN_ON_ONCE(1);
	return -EINVAL;
}
498 
/*
 * .error_remove_folio handler: zap guest mappings of the page-offset range
 * covered by @folio, but leave the folio in the page cache.  Always returns
 * MF_DELAYED.
 */
static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
	pgoff_t start, end;

	filemap_invalidate_lock_shared(mapping);

	start = folio->index;
	end = start + folio_nr_pages(folio);

	kvm_gmem_invalidate_start(mapping->host, start, end);

	/*
	 * Do not truncate the range, what action is taken in response to the
	 * error is userspace's decision (assuming the architecture supports
	 * gracefully handling memory errors).  If/when the guest attempts to
	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
	 * at which point KVM can either terminate the VM or propagate the
	 * error to userspace.
	 */

	kvm_gmem_invalidate_end(mapping->host, start, end);

	filemap_invalidate_unlock_shared(mapping);

	return MF_DELAYED;
}
525 
526 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
527 static void kvm_gmem_free_folio(struct folio *folio)
528 {
529 	struct page *page = folio_page(folio, 0);
530 	kvm_pfn_t pfn = page_to_pfn(page);
531 	int order = folio_order(folio);
532 
533 	kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
534 }
535 #endif
536 
/*
 * Address-space operations for guest_memfd inodes.  .free_folio is provided
 * only when the architecture has an invalidate hook.
 */
static const struct address_space_operations kvm_gmem_aops = {
	.dirty_folio = noop_dirty_folio,
	.migrate_folio	= kvm_gmem_migrate_folio,
	.error_remove_folio = kvm_gmem_error_folio,
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
	.free_folio = kvm_gmem_free_folio,
#endif
};
545 
/* .setattr handler: every attribute change is rejected with -EINVAL. */
static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			    struct iattr *attr)
{
	return -EINVAL;
}
/* Inode operations for guest_memfd inodes; installed by __kvm_gmem_create(). */
static const struct inode_operations kvm_gmem_iops = {
	.setattr	= kvm_gmem_setattr,
};
554 
/*
 * Weak default that reports support unconditionally; being __weak, it can be
 * replaced by a non-weak definition elsewhere.
 */
bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
{
	return true;
}
559 
/*
 * Create a guest_memfd of @size bytes with @flags for @kvm and install it in
 * the caller's fd table.  The arguments are validated by kvm_gmem_create().
 *
 * Returns the new file descriptor on success or a negative errno.  On
 * success the file holds a reference to @kvm, which kvm_gmem_release()
 * drops.
 */
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
	static const char *name = "[kvm-gmem]";
	struct gmem_file *f;
	struct inode *inode;
	struct file *file;
	int fd, err;

	fd = get_unused_fd_flags(0);
	if (fd < 0)
		return fd;

	f = kzalloc_obj(*f);
	if (!f) {
		err = -ENOMEM;
		goto err_fd;
	}

	/* __fput() will take care of fops_put(). */
	if (!fops_get(&kvm_gmem_fops)) {
		err = -ENOENT;
		goto err_gmem;
	}

	inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		goto err_fops;
	}

	inode->i_op = &kvm_gmem_iops;
	inode->i_mapping->a_ops = &kvm_gmem_aops;
	inode->i_mode |= S_IFREG;
	inode->i_size = size;
	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
	mapping_set_inaccessible(inode->i_mapping);
	/* Unmovable mappings are supposed to be marked unevictable as well. */
	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

	GMEM_I(inode)->flags = flags;

	file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
	if (IS_ERR(file)) {
		err = PTR_ERR(file);
		goto err_inode;
	}

	file->f_flags |= O_LARGEFILE;
	file->private_data = f;

	/* Nothing below can fail; now take the VM reference and link up. */
	kvm_get_kvm(kvm);
	f->kvm = kvm;
	xa_init(&f->bindings);
	list_add(&f->entry, &GMEM_I(inode)->gmem_file_list);

	fd_install(fd, file);
	return fd;

	/* Unwind in the reverse order of acquisition. */
err_inode:
	iput(inode);
err_fops:
	fops_put(&kvm_gmem_fops);
err_gmem:
	kfree(f);
err_fd:
	put_unused_fd(fd);
	return err;
}
628 
629 int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
630 {
631 	loff_t size = args->size;
632 	u64 flags = args->flags;
633 
634 	if (flags & ~kvm_gmem_get_supported_flags(kvm))
635 		return -EINVAL;
636 
637 	if (size <= 0 || !PAGE_ALIGNED(size))
638 		return -EINVAL;
639 
640 	return __kvm_gmem_create(kvm, size, flags);
641 }
642 
643 int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
644 		  unsigned int fd, uoff_t offset)
645 {
646 	uoff_t size = slot->npages << PAGE_SHIFT;
647 	unsigned long start, end;
648 	struct gmem_file *f;
649 	struct inode *inode;
650 	struct file *file;
651 	int r = -EINVAL;
652 
653 	BUILD_BUG_ON(sizeof(gpa_t) != sizeof(offset));
654 	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
655 
656 	file = fget(fd);
657 	if (!file)
658 		return -EBADF;
659 
660 	if (file->f_op != &kvm_gmem_fops)
661 		goto err;
662 
663 	f = file->private_data;
664 	if (f->kvm != kvm)
665 		goto err;
666 
667 	inode = file_inode(file);
668 
669 	if (!PAGE_ALIGNED(offset) || offset + size > i_size_read(inode))
670 		goto err;
671 
672 	filemap_invalidate_lock(inode->i_mapping);
673 
674 	start = offset >> PAGE_SHIFT;
675 	end = start + slot->npages;
676 
677 	if (!xa_empty(&f->bindings) &&
678 	    xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
679 		r = -EEXIST;
680 		filemap_invalidate_unlock(inode->i_mapping);
681 		goto err;
682 	}
683 
684 	/*
685 	 * memslots of flag KVM_MEM_GUEST_MEMFD are immutable to change, so
686 	 * kvm_gmem_bind() must occur on a new memslot.  Because the memslot
687 	 * is not visible yet, kvm_gmem_get_pfn() is guaranteed to see the file.
688 	 */
689 	WRITE_ONCE(slot->gmem.file, file);
690 	slot->gmem.pgoff = start;
691 	if (kvm_gmem_supports_mmap(inode))
692 		slot->flags |= KVM_MEMSLOT_GMEM_ONLY;
693 
694 	xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
695 	filemap_invalidate_unlock(inode->i_mapping);
696 
697 	/*
698 	 * Drop the reference to the file, even on success.  The file pins KVM,
699 	 * not the other way 'round.  Active bindings are invalidated if the
700 	 * file is closed before memslots are destroyed.
701 	 */
702 	r = 0;
703 err:
704 	fput(file);
705 	return r;
706 }
707 
/*
 * Remove @slot's entries from @f's bindings and clear the slot's file
 * pointer.  The range removed is [gmem.pgoff, gmem.pgoff + npages).
 */
static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct gmem_file *f)
{
	unsigned long start = slot->gmem.pgoff;
	unsigned long end = start + slot->npages;

	/* Storing NULL over the range erases the binding. */
	xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL);

	/*
	 * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
	 * cannot see this memslot.
	 */
	WRITE_ONCE(slot->gmem.file, NULL);
}
721 
/*
 * Undo kvm_gmem_bind() for @slot.  A no-op if the slot has no file (never
 * bound, or the file was already released).  Per the comment below, the
 * caller is expected to hold slots_lock.
 */
void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
	/*
	 * Nothing to do if the underlying file was _already_ closed, as
	 * kvm_gmem_release() invalidates and nullifies all bindings.
	 */
	if (!slot->gmem.file)
		return;

	/* "file" is NULL if the file's refcount already hit zero. */
	CLASS(gmem_get_file, file)(slot);

	/*
	 * However, if the file is _being_ closed, then the bindings need to be
	 * removed as kvm_gmem_release() might not run until after the memslot
	 * is freed.  Note, modifying the bindings is safe even though the file
	 * is dying as kvm_gmem_release() nullifies slot->gmem.file under
	 * slots_lock, and only puts its reference to KVM after destroying all
	 * bindings.  I.e. reaching this point means kvm_gmem_release() hasn't
	 * yet destroyed the bindings or freed the gmem_file, and can't do so
	 * until the caller drops slots_lock.
	 */
	if (!file) {
		__kvm_gmem_unbind(slot, slot->gmem.file->private_data);
		return;
	}

	/* Live file: serialize against other users of the bindings. */
	filemap_invalidate_lock(file->f_mapping);
	__kvm_gmem_unbind(slot, file->private_data);
	filemap_invalidate_unlock(file->f_mapping);
}
752 
/*
 * Look up the folio backing file offset @index for @slot and report its pfn
 * through @pfn.  If @max_order is non-NULL it is set to 0.
 *
 * Returns a locked folio on success.  On failure returns ERR_PTR():
 * -EFAULT if @file is not the file currently bound to @slot, -EIO if @index
 * is not bound to @slot, -EHWPOISON if the folio is poisoned, or the error
 * from kvm_gmem_get_folio().
 */
static struct folio *__kvm_gmem_get_pfn(struct file *file,
					struct kvm_memory_slot *slot,
					pgoff_t index, kvm_pfn_t *pfn,
					int *max_order)
{
	struct file *slot_file = READ_ONCE(slot->gmem.file);
	struct gmem_file *f = file->private_data;
	struct folio *folio;

	/* A NULL slot_file is tolerated; a different file is not. */
	if (file != slot_file) {
		WARN_ON_ONCE(slot_file);
		return ERR_PTR(-EFAULT);
	}

	/* Warn only if the index is bound to some other slot. */
	if (xa_load(&f->bindings, index) != slot) {
		WARN_ON_ONCE(xa_load(&f->bindings, index));
		return ERR_PTR(-EIO);
	}

	folio = kvm_gmem_get_folio(file_inode(file), index);
	if (IS_ERR(folio))
		return folio;

	if (folio_test_hwpoison(folio)) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(-EHWPOISON);
	}

	*pfn = folio_file_pfn(folio, index);
	if (max_order)
		*max_order = 0;

	return folio;
}
789 
790 int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
791 		     gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
792 		     int *max_order)
793 {
794 	pgoff_t index = kvm_gmem_get_index(slot, gfn);
795 	struct folio *folio;
796 	int r = 0;
797 
798 	CLASS(gmem_get_file, file)(slot);
799 	if (!file)
800 		return -EFAULT;
801 
802 	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
803 	if (IS_ERR(folio))
804 		return PTR_ERR(folio);
805 
806 	if (!folio_test_uptodate(folio)) {
807 		clear_highpage(folio_page(folio, 0));
808 		folio_mark_uptodate(folio);
809 	}
810 
811 	r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
812 
813 	folio_unlock(folio);
814 
815 	if (!r)
816 		*page = folio_file_page(folio, index);
817 	else
818 		folio_put(folio);
819 
820 	return r;
821 }
822 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
823 
824 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
825 
/*
 * Populate the single page backing @gfn: look up its folio, check that the
 * gfn has the PRIVATE memory attribute, and invoke @post_populate with the
 * pfn and the optional @src_page.  The folio is marked up-to-date only if
 * the callback succeeds.
 *
 * Returns 0 on success, -EINVAL if the gfn is not private, or the error
 * from the folio lookup or the callback.
 */
static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
				struct file *file, gfn_t gfn, struct page *src_page,
				kvm_gmem_populate_cb post_populate, void *opaque)
{
	pgoff_t index = kvm_gmem_get_index(slot, gfn);
	struct folio *folio;
	kvm_pfn_t pfn;
	int ret;

	filemap_invalidate_lock(file->f_mapping);

	folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, NULL);
	if (IS_ERR(folio)) {
		ret = PTR_ERR(folio);
		goto out_unlock;
	}

	/* Keep the reference but drop the folio lock before the callback. */
	folio_unlock(folio);

	if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1,
					     KVM_MEMORY_ATTRIBUTE_PRIVATE,
					     KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
		ret = -EINVAL;
		goto out_put_folio;
	}

	ret = post_populate(kvm, gfn, pfn, src_page, opaque);
	if (!ret)
		folio_mark_uptodate(folio);

out_put_folio:
	folio_put(folio);
out_unlock:
	filemap_invalidate_unlock(file->f_mapping);
	return ret;
}
862 
/*
 * Populate up to @npages guest_memfd pages starting at @start_gfn, one page
 * at a time, optionally sourcing each page from the page-aligned userspace
 * buffer @src (which may be NULL).  The range is clamped to the end of the
 * memslot containing @start_gfn.  The caller must hold kvm->slots_lock.
 *
 * Returns the number of pages processed before stopping, or a negative errno
 * if the very first page failed: -EINVAL for bad arguments or a slot without
 * guest_memfd, -EFAULT if the slot has no live file, -EINTR on a pending
 * signal, -ENOMEM if the source page could not be pinned, or the error from
 * __kvm_gmem_populate().
 */
long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src,
		       long npages, bool may_writeback_src,
		       kvm_gmem_populate_cb post_populate, void *opaque)
{
	struct kvm_memory_slot *slot;
	int ret = 0;
	long i;

	lockdep_assert_held(&kvm->slots_lock);

	if (WARN_ON_ONCE(npages <= 0))
		return -EINVAL;

	if (WARN_ON_ONCE(!PAGE_ALIGNED(src)))
		return -EINVAL;

	slot = gfn_to_memslot(kvm, start_gfn);
	if (!kvm_slot_has_gmem(slot))
		return -EINVAL;

	CLASS(gmem_get_file, file)(slot);
	if (!file)
		return -EFAULT;

	/* Never run past the end of the memslot. */
	npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
	for (i = 0; i < npages; i++) {
		struct page *src_page = NULL;

		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		if (src) {
			unsigned long uaddr = (unsigned long)src + i * PAGE_SIZE;
			unsigned int flags = may_writeback_src ? FOLL_WRITE : 0;

			/* Pin exactly one source page for this iteration. */
			ret = get_user_pages_fast(uaddr, 1, flags, &src_page);
			if (ret < 0)
				break;
			if (ret != 1) {
				ret = -ENOMEM;
				break;
			}
		}

		ret = __kvm_gmem_populate(kvm, slot, file, start_gfn + i, src_page,
					  post_populate, opaque);

		if (src_page)
			put_page(src_page);

		if (ret)
			break;
	}

	/* Report the error only if no page at all was processed. */
	return ret && !i ? ret : i;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);
922 #endif
923 
/* Slab cache for struct gmem_inode; created in kvm_gmem_init(). */
static struct kmem_cache *kvm_gmem_inode_cachep;
925 
926 static void kvm_gmem_init_inode_once(void *__gi)
927 {
928 	struct gmem_inode *gi = __gi;
929 
930 	/*
931 	 * Note!  Don't initialize the inode with anything specific to the
932 	 * guest_memfd instance, or that might be specific to how the inode is
933 	 * used (from the VFS-layer's perspective).  This hook is called only
934 	 * during the initial slab allocation, i.e. only fields/state that are
935 	 * idempotent across _all_ use of the inode _object_ can be initialized
936 	 * at this time!
937 	 */
938 	inode_init_once(&gi->vfs_inode);
939 }
940 
941 static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
942 {
943 	struct gmem_inode *gi;
944 
945 	gi = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL);
946 	if (!gi)
947 		return NULL;
948 
949 	mpol_shared_policy_init(&gi->policy, NULL);
950 
951 	gi->flags = 0;
952 	INIT_LIST_HEAD(&gi->gmem_file_list);
953 	return &gi->vfs_inode;
954 }
955 
/* .destroy_inode handler: release the inode's shared memory policy. */
static void kvm_gmem_destroy_inode(struct inode *inode)
{
	mpol_free_shared_policy(&GMEM_I(inode)->policy);
}
960 
/* .free_inode handler: return the containing gmem_inode to the slab cache. */
static void kvm_gmem_free_inode(struct inode *inode)
{
	kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode));
}
965 
/* Superblock operations; hooked up in kvm_gmem_init_fs_context(). */
static const struct super_operations kvm_gmem_super_operations = {
	.statfs		= simple_statfs,
	.alloc_inode	= kvm_gmem_alloc_inode,
	.destroy_inode	= kvm_gmem_destroy_inode,
	.free_inode	= kvm_gmem_free_inode,
};
972 
/*
 * .init_fs_context handler: set up a pseudo filesystem context with
 * GUEST_MEMFD_MAGIC and point it at the guest_memfd superblock operations.
 * Returns 0, or -ENOMEM if init_pseudo() fails.
 */
static int kvm_gmem_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx;

	if (!init_pseudo(fc, GUEST_MEMFD_MAGIC))
		return -ENOMEM;

	ctx = fc->fs_private;
	ctx->ops = &kvm_gmem_super_operations;

	return 0;
}
985 
/* Filesystem type mounted internally by kvm_gmem_init_mount(). */
static struct file_system_type kvm_gmem_fs = {
	.name		 = "guest_memfd",
	.init_fs_context = kvm_gmem_init_fs_context,
	.kill_sb	 = kill_anon_super,
};
991 
992 static int kvm_gmem_init_mount(void)
993 {
994 	kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);
995 
996 	if (IS_ERR(kvm_gmem_mnt))
997 		return PTR_ERR(kvm_gmem_mnt);
998 
999 	kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
1000 	return 0;
1001 }
1002 
1003 int kvm_gmem_init(struct module *module)
1004 {
1005 	struct kmem_cache_args args = {
1006 		.align = 0,
1007 		.ctor = kvm_gmem_init_inode_once,
1008 	};
1009 	int ret;
1010 
1011 	kvm_gmem_fops.owner = module;
1012 	kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache",
1013 						  sizeof(struct gmem_inode),
1014 						  &args, SLAB_ACCOUNT);
1015 	if (!kvm_gmem_inode_cachep)
1016 		return -ENOMEM;
1017 
1018 	ret = kvm_gmem_init_mount();
1019 	if (ret) {
1020 		kmem_cache_destroy(kvm_gmem_inode_cachep);
1021 		return ret;
1022 	}
1023 	return 0;
1024 }
1025 
/*
 * Module-level teardown, undoing kvm_gmem_init(): unmount the filesystem,
 * then destroy the inode cache.
 */
void kvm_gmem_exit(void)
{
	kern_unmount(kvm_gmem_mnt);
	kvm_gmem_mnt = NULL;
	/* NOTE(review): presumably waits for RCU-deferred inode frees before the cache goes away; confirm. */
	rcu_barrier();
	kmem_cache_destroy(kvm_gmem_inode_cachep);
}
1033