xref: /linux/virt/kvm/guest_memfd.c (revision 02e5f74ef08d3e6afec438d571487d0d0cec3c48)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/backing-dev.h>
3 #include <linux/falloc.h>
4 #include <linux/kvm_host.h>
5 #include <linux/pagemap.h>
6 #include <linux/anon_inodes.h>
7 
8 #include "kvm_mm.h"
9 
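/*
 * Per-file state for a guest_memfd instance.  @kvm is the VM that owns the
 * file, @bindings maps file offsets (pgoff) to the memslots bound to them,
 * and @entry links this instance into the inode's i_private_list so that
 * invalidations reach every guest_memfd file sharing the inode's mapping.
 */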
10 struct kvm_gmem {
11 	struct kvm *kvm;
12 	struct xarray bindings;
13 	struct list_head entry;
14 };
15 
16 /**
17  * folio_file_pfn - like folio_file_page, but return a pfn.
18  * @folio: The folio which contains this index.
19  * @index: The index we want to look up.
20  *
21  * Return: The pfn for this index.
22  */
23 static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
24 {
25 	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
26 }
27 
28 static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
29 				    pgoff_t index, struct folio *folio)
30 {
31 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
32 	kvm_pfn_t pfn = folio_file_pfn(folio, index);
33 	gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
34 	int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));
35 	if (rc) {
36 		pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
37 				    index, gfn, pfn, rc);
38 		return rc;
39 	}
40 #endif
41 
42 	return 0;
43 }
44 
45 static inline void kvm_gmem_mark_prepared(struct folio *folio)
46 {
47 	folio_mark_uptodate(folio);
48 }
49 
50 /*
51  * Process @folio, which contains @gfn, so that the guest can use it.
52  * The folio must be locked and the gfn must be contained in @slot.
53  * On successful return the folio has been zeroed, so no stale host data
54  * is leaked to the guest, and the up-to-date flag is set.
55  */
56 static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
57 				  gfn_t gfn, struct folio *folio)
58 {
59 	unsigned long nr_pages, i;
60 	pgoff_t index;
61 	int r;
62 
63 	nr_pages = folio_nr_pages(folio);
64 	for (i = 0; i < nr_pages; i++)
65 		clear_highpage(folio_page(folio, i));
66 
67 	/*
68 	 * Preparing huge folios should always be safe, since it should
69 	 * be possible to split them later if needed.
70 	 *
71 	 * Right now the folio order is always going to be zero, but the
72 	 * code is ready for huge folios.  The only assumption is that
73 	 * the base pgoff of memslots is naturally aligned with the
74 	 * requested page order, ensuring that huge folios can also use
75 	 * huge page table entries for GPA->HPA mapping.
76 	 *
77 	 * The order will be passed when creating the guest_memfd, and
78 	 * checked when creating memslots.
79 	 */
80 	WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio)));
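	/*
	 * Convert the gfn to a file index and align it down to the start of
	 * the folio, so that the arch hook prepares the folio as a whole
	 * (e.g. if the folio were order-9, a gfn in the middle of it would
	 * map back to the folio's first index).
	 */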
81 	index = gfn - slot->base_gfn + slot->gmem.pgoff;
82 	index = ALIGN_DOWN(index, 1 << folio_order(folio));
83 	r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
84 	if (!r)
85 		kvm_gmem_mark_prepared(folio);
86 
87 	return r;
88 }
89 
90 /*
91  * Returns a locked folio on success.  The caller is responsible for
92  * setting the up-to-date flag before the memory is mapped into the guest.
93  * There is no backing storage for the memory, so the folio will remain
94  * up-to-date until it's removed.
95  *
96  * Ignore accessed, referenced, and dirty flags.  The memory is
97  * unevictable and there is no storage to write back to.
98  */
99 static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
100 {
101 	/* TODO: Support huge pages. */
102 	return filemap_grab_folio(inode->i_mapping, index);
103 }
104 
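/*
 * A guest_memfd created with GUEST_MEMFD_FLAG_INIT_SHARED backs memory that
 * the guest accesses as shared, otherwise the memory is private; restrict
 * invalidations to the corresponding class of mappings.
 */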
105 static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
106 {
107 	if ((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED)
108 		return KVM_FILTER_SHARED;
109 
110 	return KVM_FILTER_PRIVATE;
111 }
112 
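/*
 * Zap the mappings of every memslot bound to the range [start, end) and, if
 * any binding was found, open an MMU invalidation window.  Must be paired
 * with __kvm_gmem_invalidate_end() to close the window.
 */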
113 static void __kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
114 					pgoff_t end,
115 					enum kvm_gfn_range_filter attr_filter)
116 {
117 	bool flush = false, found_memslot = false;
118 	struct kvm_memory_slot *slot;
119 	struct kvm *kvm = gmem->kvm;
120 	unsigned long index;
121 
122 	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
123 		pgoff_t pgoff = slot->gmem.pgoff;
124 
125 		struct kvm_gfn_range gfn_range = {
126 			.start = slot->base_gfn + max(pgoff, start) - pgoff,
127 			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
128 			.slot = slot,
129 			.may_block = true,
130 			.attr_filter = attr_filter,
131 		};
132 
133 		if (!found_memslot) {
134 			found_memslot = true;
135 
136 			KVM_MMU_LOCK(kvm);
137 			kvm_mmu_invalidate_begin(kvm);
138 		}
139 
140 		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
141 	}
142 
143 	if (flush)
144 		kvm_flush_remote_tlbs(kvm);
145 
146 	if (found_memslot)
147 		KVM_MMU_UNLOCK(kvm);
148 }
149 
150 static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start,
151 				      pgoff_t end)
152 {
153 	struct list_head *gmem_list = &inode->i_mapping->i_private_list;
154 	enum kvm_gfn_range_filter attr_filter;
155 	struct kvm_gmem *gmem;
156 
157 	attr_filter = kvm_gmem_get_invalidate_filter(inode);
158 
159 	list_for_each_entry(gmem, gmem_list, entry)
160 		__kvm_gmem_invalidate_begin(gmem, start, end, attr_filter);
161 }
162 
163 static void __kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
164 				      pgoff_t end)
165 {
166 	struct kvm *kvm = gmem->kvm;
167 
168 	if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
169 		KVM_MMU_LOCK(kvm);
170 		kvm_mmu_invalidate_end(kvm);
171 		KVM_MMU_UNLOCK(kvm);
172 	}
173 }
174 
175 static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start,
176 				    pgoff_t end)
177 {
178 	struct list_head *gmem_list = &inode->i_mapping->i_private_list;
179 	struct kvm_gmem *gmem;
180 
181 	list_for_each_entry(gmem, gmem_list, entry)
182 		__kvm_gmem_invalidate_end(gmem, start, end);
183 }
184 
185 static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
186 {
187 	pgoff_t start = offset >> PAGE_SHIFT;
188 	pgoff_t end = (offset + len) >> PAGE_SHIFT;
189 
190 	/*
191 	 * Bindings must be stable across invalidation to ensure the start+end
192 	 * are balanced.
193 	 */
194 	filemap_invalidate_lock(inode->i_mapping);
195 
196 	kvm_gmem_invalidate_begin(inode, start, end);
197 
198 	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
199 
200 	kvm_gmem_invalidate_end(inode, start, end);
201 
202 	filemap_invalidate_unlock(inode->i_mapping);
203 
204 	return 0;
205 }
206 
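/*
 * Preallocate folios for the byte range [offset, offset + len); backs
 * fallocate() without FALLOC_FL_PUNCH_HOLE.  The folios are not zeroed here,
 * that happens when they are first prepared for the guest or faulted in by
 * userspace.
 */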
207 static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
208 {
209 	struct address_space *mapping = inode->i_mapping;
210 	pgoff_t start, index, end;
211 	int r;
212 
213 	/* The file size is fixed at creation; allocating beyond i_size is not allowed. */
214 	if (offset + len > i_size_read(inode))
215 		return -EINVAL;
216 
217 	filemap_invalidate_lock_shared(mapping);
218 
219 	start = offset >> PAGE_SHIFT;
220 	end = (offset + len) >> PAGE_SHIFT;
221 
222 	r = 0;
223 	for (index = start; index < end; ) {
224 		struct folio *folio;
225 
226 		if (signal_pending(current)) {
227 			r = -EINTR;
228 			break;
229 		}
230 
231 		folio = kvm_gmem_get_folio(inode, index);
232 		if (IS_ERR(folio)) {
233 			r = PTR_ERR(folio);
234 			break;
235 		}
236 
237 		index = folio_next_index(folio);
238 
239 		folio_unlock(folio);
240 		folio_put(folio);
241 
242 		/* 64-bit only, wrapping the index should be impossible. */
243 		if (WARN_ON_ONCE(!index))
244 			break;
245 
246 		cond_resched();
247 	}
248 
249 	filemap_invalidate_unlock_shared(mapping);
250 
251 	return r;
252 }
253 
254 static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
255 			       loff_t len)
256 {
257 	int ret;
258 
259 	if (!(mode & FALLOC_FL_KEEP_SIZE))
260 		return -EOPNOTSUPP;
261 
262 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
263 		return -EOPNOTSUPP;
264 
265 	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
266 		return -EINVAL;
267 
268 	if (mode & FALLOC_FL_PUNCH_HOLE)
269 		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
270 	else
271 		ret = kvm_gmem_allocate(file_inode(file), offset, len);
272 
273 	if (!ret)
274 		file_modified(file);
275 	return ret;
276 }
277 
278 static int kvm_gmem_release(struct inode *inode, struct file *file)
279 {
280 	struct kvm_gmem *gmem = file->private_data;
281 	struct kvm_memory_slot *slot;
282 	struct kvm *kvm = gmem->kvm;
283 	unsigned long index;
284 
285 	/*
286 	 * Prevent concurrent attempts to *unbind* a memslot.  This is the last
287 	 * reference to the file and thus no new bindings can be created, but
288 	 * dereferencing the slot for existing bindings needs to be protected
289 	 * against memslot updates, specifically so that unbind doesn't race
290 	 * and free the memslot (kvm_gmem_get_file() will return NULL).
291 	 *
292 	 * Since .release is called only when the reference count is zero,
293 	 * after which file_ref_get() and get_file_active() fail,
294 	 * kvm_gmem_get_pfn() cannot be using the file concurrently.
295 	 * file_ref_put() provides a full barrier, and get_file_active() the
296 	 * matching acquire barrier.
297 	 */
298 	mutex_lock(&kvm->slots_lock);
299 
300 	filemap_invalidate_lock(inode->i_mapping);
301 
302 	xa_for_each(&gmem->bindings, index, slot)
303 		WRITE_ONCE(slot->gmem.file, NULL);
304 
305 	/*
306 	 * All in-flight operations are gone and no new bindings can be created.
307 	 * Zap all SPTEs pointed at by this file.  Do not free the backing
308 	 * memory, as its lifetime is associated with the inode, not the file.
309 	 */
310 	__kvm_gmem_invalidate_begin(gmem, 0, -1ul,
311 				    kvm_gmem_get_invalidate_filter(inode));
312 	__kvm_gmem_invalidate_end(gmem, 0, -1ul);
313 
314 	list_del(&gmem->entry);
315 
316 	filemap_invalidate_unlock(inode->i_mapping);
317 
318 	mutex_unlock(&kvm->slots_lock);
319 
320 	xa_destroy(&gmem->bindings);
321 	kfree(gmem);
322 
323 	kvm_put_kvm(kvm);
324 
325 	return 0;
326 }
327 
328 static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
329 {
330 	/*
331 	 * Do not return slot->gmem.file if it has already been closed;
332 	 * there might be some time between the last fput() and when
333 	 * kvm_gmem_release() clears slot->gmem.file.
334 	 */
335 	return get_file_active(&slot->gmem.file);
336 }
337 
338 static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
339 {
340 	return gfn - slot->base_gfn + slot->gmem.pgoff;
341 }
342 
343 static bool kvm_gmem_supports_mmap(struct inode *inode)
344 {
345 	const u64 flags = (u64)inode->i_private;
346 
347 	return flags & GUEST_MEMFD_FLAG_MMAP;
348 }
349 
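/*
 * Handle a userspace page fault on an mmap()ed guest_memfd.  Faults are
 * allowed only within i_size and only for files created with
 * GUEST_MEMFD_FLAG_INIT_SHARED; anything else gets SIGBUS.
 */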
350 static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
351 {
352 	struct inode *inode = file_inode(vmf->vma->vm_file);
353 	struct folio *folio;
354 	vm_fault_t ret = VM_FAULT_LOCKED;
355 
356 	if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
357 		return VM_FAULT_SIGBUS;
358 
359 	if (!((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED))
360 		return VM_FAULT_SIGBUS;
361 
362 	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
363 	if (IS_ERR(folio)) {
364 		int err = PTR_ERR(folio);
365 
366 		if (err == -EAGAIN)
367 			return VM_FAULT_RETRY;
368 
369 		return vmf_error(err);
370 	}
371 
372 	if (WARN_ON_ONCE(folio_test_large(folio))) {
373 		ret = VM_FAULT_SIGBUS;
374 		goto out_folio;
375 	}
376 
377 	if (!folio_test_uptodate(folio)) {
378 		clear_highpage(folio_page(folio, 0));
379 		kvm_gmem_mark_prepared(folio);
380 	}
381 
382 	vmf->page = folio_file_page(folio, vmf->pgoff);
383 
384 out_folio:
385 	if (ret != VM_FAULT_LOCKED) {
386 		folio_unlock(folio);
387 		folio_put(folio);
388 	}
389 
390 	return ret;
391 }
392 
393 static const struct vm_operations_struct kvm_gmem_vm_ops = {
394 	.fault = kvm_gmem_fault_user_mapping,
395 };
396 
397 static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
398 {
399 	if (!kvm_gmem_supports_mmap(file_inode(file)))
400 		return -ENODEV;
401 
402 	if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
403 	    (VM_SHARED | VM_MAYSHARE)) {
404 		return -EINVAL;
405 	}
406 
407 	vma->vm_ops = &kvm_gmem_vm_ops;
408 
409 	return 0;
410 }
411 
412 static struct file_operations kvm_gmem_fops = {
413 	.mmap		= kvm_gmem_mmap,
414 	.open		= generic_file_open,
415 	.release	= kvm_gmem_release,
416 	.fallocate	= kvm_gmem_fallocate,
417 };
418 
419 void kvm_gmem_init(struct module *module)
420 {
421 	kvm_gmem_fops.owner = module;
422 }
423 
424 static int kvm_gmem_migrate_folio(struct address_space *mapping,
425 				  struct folio *dst, struct folio *src,
426 				  enum migrate_mode mode)
427 {
428 	WARN_ON_ONCE(1);
429 	return -EINVAL;
430 }
431 
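/*
 * A folio backing guest memory has been poisoned; zap it from the guest so
 * that subsequent accesses go through kvm_gmem_get_pfn() and observe the
 * poison.
 */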
432 static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
433 {
434 	pgoff_t start, end;
435 
436 	filemap_invalidate_lock_shared(mapping);
437 
438 	start = folio->index;
439 	end = start + folio_nr_pages(folio);
440 
441 	kvm_gmem_invalidate_begin(mapping->host, start, end);
442 
443 	/*
444 	 * Do not truncate the range, what action is taken in response to the
445 	 * error is userspace's decision (assuming the architecture supports
446 	 * gracefully handling memory errors).  If/when the guest attempts to
447 	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
448 	 * at which point KVM can either terminate the VM or propagate the
449 	 * error to userspace.
450 	 */
451 
452 	kvm_gmem_invalidate_end(mapping->host, start, end);
453 
454 	filemap_invalidate_unlock_shared(mapping);
455 
456 	return MF_DELAYED;
457 }
458 
459 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
460 static void kvm_gmem_free_folio(struct folio *folio)
461 {
462 	struct page *page = folio_page(folio, 0);
463 	kvm_pfn_t pfn = page_to_pfn(page);
464 	int order = folio_order(folio);
465 
466 	kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
467 }
468 #endif
469 
470 static const struct address_space_operations kvm_gmem_aops = {
471 	.dirty_folio = noop_dirty_folio,
472 	.migrate_folio	= kvm_gmem_migrate_folio,
473 	.error_remove_folio = kvm_gmem_error_folio,
474 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
475 	.free_folio = kvm_gmem_free_folio,
476 #endif
477 };
478 
479 static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
480 			    struct iattr *attr)
481 {
482 	return -EINVAL;
483 }
484 static const struct inode_operations kvm_gmem_iops = {
485 	.setattr	= kvm_gmem_setattr,
486 };
487 
488 bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
489 {
490 	return true;
491 }
492 
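/*
 * Create the guest_memfd proper: reserve a file descriptor, allocate the
 * kvm_gmem, back it with an anonymous inode whose mapping is unevictable and
 * unmovable, stash @flags in i_private, and take a reference on @kvm that is
 * dropped in kvm_gmem_release().
 */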
493 static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
494 {
495 	const char *anon_name = "[kvm-gmem]";
496 	struct kvm_gmem *gmem;
497 	struct inode *inode;
498 	struct file *file;
499 	int fd, err;
500 
501 	fd = get_unused_fd_flags(0);
502 	if (fd < 0)
503 		return fd;
504 
505 	gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
506 	if (!gmem) {
507 		err = -ENOMEM;
508 		goto err_fd;
509 	}
510 
511 	file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
512 					 O_RDWR, NULL);
513 	if (IS_ERR(file)) {
514 		err = PTR_ERR(file);
515 		goto err_gmem;
516 	}
517 
518 	file->f_flags |= O_LARGEFILE;
519 
520 	inode = file->f_inode;
521 	WARN_ON(file->f_mapping != inode->i_mapping);
522 
523 	inode->i_private = (void *)(unsigned long)flags;
524 	inode->i_op = &kvm_gmem_iops;
525 	inode->i_mapping->a_ops = &kvm_gmem_aops;
526 	inode->i_mode |= S_IFREG;
527 	inode->i_size = size;
528 	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
529 	mapping_set_inaccessible(inode->i_mapping);
530 	/* Unmovable mappings are supposed to be marked unevictable as well. */
531 	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
532 
533 	kvm_get_kvm(kvm);
534 	gmem->kvm = kvm;
535 	xa_init(&gmem->bindings);
536 	list_add(&gmem->entry, &inode->i_mapping->i_private_list);
537 
538 	fd_install(fd, file);
539 	return fd;
540 
541 err_gmem:
542 	kfree(gmem);
543 err_fd:
544 	put_unused_fd(fd);
545 	return err;
546 }
547 
548 int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
549 {
550 	loff_t size = args->size;
551 	u64 flags = args->flags;
552 
553 	if (flags & ~kvm_gmem_get_supported_flags(kvm))
554 		return -EINVAL;
555 
556 	if (size <= 0 || !PAGE_ALIGNED(size))
557 		return -EINVAL;
558 
559 	return __kvm_gmem_create(kvm, size, flags);
560 }
561 
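/*
 * Bind @slot to the guest_memfd identified by @fd, starting at @offset.  The
 * range must lie entirely within the file and must not overlap an existing
 * binding; on success the binding is recorded in gmem->bindings so that
 * invalidations can find the memslot.
 */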
562 int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
563 		  unsigned int fd, loff_t offset)
564 {
565 	loff_t size = slot->npages << PAGE_SHIFT;
566 	unsigned long start, end;
567 	struct kvm_gmem *gmem;
568 	struct inode *inode;
569 	struct file *file;
570 	int r = -EINVAL;
571 
572 	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
573 
574 	file = fget(fd);
575 	if (!file)
576 		return -EBADF;
577 
578 	if (file->f_op != &kvm_gmem_fops)
579 		goto err;
580 
581 	gmem = file->private_data;
582 	if (gmem->kvm != kvm)
583 		goto err;
584 
585 	inode = file_inode(file);
586 
587 	if (offset < 0 || !PAGE_ALIGNED(offset) ||
588 	    offset + size > i_size_read(inode))
589 		goto err;
590 
591 	filemap_invalidate_lock(inode->i_mapping);
592 
593 	start = offset >> PAGE_SHIFT;
594 	end = start + slot->npages;
595 
596 	if (!xa_empty(&gmem->bindings) &&
597 	    xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
598 		filemap_invalidate_unlock(inode->i_mapping);
599 		goto err;
600 	}
601 
602 	/*
603 	 * Memslots with the KVM_MEM_GUEST_MEMFD flag are immutable once created,
604 	 * so kvm_gmem_bind() must occur on a new memslot.  Because the memslot
605 	 * is not yet visible, kvm_gmem_get_pfn() is guaranteed to see the file.
606 	 */
607 	WRITE_ONCE(slot->gmem.file, file);
608 	slot->gmem.pgoff = start;
609 	if (kvm_gmem_supports_mmap(inode))
610 		slot->flags |= KVM_MEMSLOT_GMEM_ONLY;
611 
612 	xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
613 	filemap_invalidate_unlock(inode->i_mapping);
614 
615 	/*
616 	 * Drop the reference to the file, even on success.  The file pins KVM,
617 	 * not the other way 'round.  Active bindings are invalidated if the
618 	 * file is closed before memslots are destroyed.
619 	 */
620 	r = 0;
621 err:
622 	fput(file);
623 	return r;
624 }
625 
626 void kvm_gmem_unbind(struct kvm_memory_slot *slot)
627 {
628 	unsigned long start = slot->gmem.pgoff;
629 	unsigned long end = start + slot->npages;
630 	struct kvm_gmem *gmem;
631 	struct file *file;
632 
633 	/*
634 	 * Nothing to do if the underlying file was already closed (or is being
635 	 * closed right now); kvm_gmem_release() invalidates all bindings.
636 	 */
637 	file = kvm_gmem_get_file(slot);
638 	if (!file)
639 		return;
640 
641 	gmem = file->private_data;
642 
643 	filemap_invalidate_lock(file->f_mapping);
644 	xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
645 
646 	/*
647 	 * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
648 	 * cannot see this memslot.
649 	 */
650 	WRITE_ONCE(slot->gmem.file, NULL);
651 	filemap_invalidate_unlock(file->f_mapping);
652 
653 	fput(file);
654 }
655 
656 /* Returns a locked folio on success.  */
657 static struct folio *__kvm_gmem_get_pfn(struct file *file,
658 					struct kvm_memory_slot *slot,
659 					pgoff_t index, kvm_pfn_t *pfn,
660 					bool *is_prepared, int *max_order)
661 {
662 	struct file *gmem_file = READ_ONCE(slot->gmem.file);
663 	struct kvm_gmem *gmem = file->private_data;
664 	struct folio *folio;
665 
666 	if (file != gmem_file) {
667 		WARN_ON_ONCE(gmem_file);
668 		return ERR_PTR(-EFAULT);
669 	}
670 
671 	gmem = file->private_data;
672 	if (xa_load(&gmem->bindings, index) != slot) {
673 		WARN_ON_ONCE(xa_load(&gmem->bindings, index));
674 		return ERR_PTR(-EIO);
675 	}
676 
677 	folio = kvm_gmem_get_folio(file_inode(file), index);
678 	if (IS_ERR(folio))
679 		return folio;
680 
681 	if (folio_test_hwpoison(folio)) {
682 		folio_unlock(folio);
683 		folio_put(folio);
684 		return ERR_PTR(-EHWPOISON);
685 	}
686 
687 	*pfn = folio_file_pfn(folio, index);
688 	if (max_order)
689 		*max_order = 0;
690 
691 	*is_prepared = folio_test_uptodate(folio);
692 	return folio;
693 }
694 
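/*
 * Resolve @gfn to a pfn/page in the memslot's guest_memfd, allocating and
 * preparing the backing folio if necessary.  On success a reference to the
 * page is held on behalf of the caller, who is responsible for releasing it.
 */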
695 int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
696 		     gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
697 		     int *max_order)
698 {
699 	pgoff_t index = kvm_gmem_get_index(slot, gfn);
700 	struct file *file = kvm_gmem_get_file(slot);
701 	struct folio *folio;
702 	bool is_prepared = false;
703 	int r = 0;
704 
705 	if (!file)
706 		return -EFAULT;
707 
708 	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
709 	if (IS_ERR(folio)) {
710 		r = PTR_ERR(folio);
711 		goto out;
712 	}
713 
714 	if (!is_prepared)
715 		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
716 
717 	folio_unlock(folio);
718 
719 	if (!r)
720 		*page = folio_file_page(folio, index);
721 	else
722 		folio_put(folio);
723 
724 out:
725 	fput(file);
726 	return r;
727 }
728 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
729 
730 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
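/*
 * Populate up to @npages pages starting at @start_gfn: for each naturally
 * aligned chunk, shrink max_order until the chunk is entirely private, then
 * hand the pfn (and the matching chunk of @src, if any) to @post_populate and
 * mark the folio prepared on success.  Returns the number of pages processed,
 * or a negative error if none were.
 */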
731 long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
732 		       kvm_gmem_populate_cb post_populate, void *opaque)
733 {
734 	struct file *file;
735 	struct kvm_memory_slot *slot;
736 	void __user *p;
737 
738 	int ret = 0, max_order;
739 	long i;
740 
741 	lockdep_assert_held(&kvm->slots_lock);
742 
743 	if (WARN_ON_ONCE(npages <= 0))
744 		return -EINVAL;
745 
746 	slot = gfn_to_memslot(kvm, start_gfn);
747 	if (!kvm_slot_has_gmem(slot))
748 		return -EINVAL;
749 
750 	file = kvm_gmem_get_file(slot);
751 	if (!file)
752 		return -EFAULT;
753 
754 	filemap_invalidate_lock(file->f_mapping);
755 
756 	npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
757 	for (i = 0; i < npages; i += (1 << max_order)) {
758 		struct folio *folio;
759 		gfn_t gfn = start_gfn + i;
760 		pgoff_t index = kvm_gmem_get_index(slot, gfn);
761 		bool is_prepared = false;
762 		kvm_pfn_t pfn;
763 
764 		if (signal_pending(current)) {
765 			ret = -EINTR;
766 			break;
767 		}
768 
769 		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order);
770 		if (IS_ERR(folio)) {
771 			ret = PTR_ERR(folio);
772 			break;
773 		}
774 
775 		if (is_prepared) {
776 			folio_unlock(folio);
777 			folio_put(folio);
778 			ret = -EEXIST;
779 			break;
780 		}
781 
782 		folio_unlock(folio);
783 		WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
784 			(npages - i) < (1 << max_order));
785 
786 		ret = -EINVAL;
787 		while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order),
788 							KVM_MEMORY_ATTRIBUTE_PRIVATE,
789 							KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
790 			if (!max_order)
791 				goto put_folio_and_exit;
792 			max_order--;
793 		}
794 
795 		p = src ? src + i * PAGE_SIZE : NULL;
796 		ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
797 		if (!ret)
798 			kvm_gmem_mark_prepared(folio);
799 
800 put_folio_and_exit:
801 		folio_put(folio);
802 		if (ret)
803 			break;
804 	}
805 
806 	filemap_invalidate_unlock(file->f_mapping);
807 
808 	fput(file);
809 	return ret && !i ? ret : i;
810 }
811 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);
812 #endif
813