// SPDX-License-Identifier: GPL-2.0
#include <linux/anon_inodes.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/kvm_host.h>
#include <linux/mempolicy.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>

#include "kvm_mm.h"

static struct vfsmount *kvm_gmem_mnt;

/*
 * A guest_memfd instance can be associated with multiple VMs, each with its
 * own "view" of the underlying physical memory.
 *
 * The gmem's inode is effectively the raw underlying physical storage, and is
 * used to track properties of the physical memory, while each gmem file is
 * effectively a single VM's view of that storage, and is used to track assets
 * specific to its associated VM, e.g. memslots=>gmem bindings.
 */
struct gmem_file {
	struct kvm *kvm;
	struct xarray bindings;
	struct list_head entry;
};

struct gmem_inode {
	struct shared_policy policy;
	struct inode vfs_inode;
	struct list_head gmem_file_list;

	u64 flags;
};

static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
{
	return container_of(inode, struct gmem_inode, vfs_inode);
}

#define kvm_gmem_for_each_file(f, inode)	\
	list_for_each_entry(f, &GMEM_I(inode)->gmem_file_list, entry)

/**
 * folio_file_pfn - like folio_file_page(), but returns a pfn.
 * @folio: The folio which contains this index.
 * @index: The index we want to look up.
 *
 * Return: The pfn for this index.
 */
static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
{
	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
}

static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
{
	return gfn - slot->base_gfn + slot->gmem.pgoff;
}

static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
				    pgoff_t index, struct folio *folio)
{
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
	kvm_pfn_t pfn = folio_file_pfn(folio, index);
	gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
	int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));

	if (rc) {
		pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
				    index, gfn, pfn, rc);
		return rc;
	}
#endif

	return 0;
}

/*
 * Process @folio, which contains @gfn, so that the guest can use it.
 * The folio must be locked and the gfn must be contained in @slot.
 * On successful return the guest sees a zero page so as to avoid
 * leaking host data and the up-to-date flag is set.
 */
static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
				  gfn_t gfn, struct folio *folio)
{
	pgoff_t index;

	/*
	 * Preparing huge folios should always be safe, since it should
	 * be possible to split them later if needed.
	 *
	 * Right now the folio order is always going to be zero, but the
	 * code is ready for huge folios.  The only assumption is that
	 * the base pgoff of memslots is naturally aligned with the
	 * requested page order, ensuring that huge folios can also use
	 * huge page table entries for GPA->HPA mapping.
	 *
	 * The order will be passed when creating the guest_memfd, and
	 * checked when creating memslots.
	 */
	WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
	index = kvm_gmem_get_index(slot, gfn);
	index = ALIGN_DOWN(index, folio_nr_pages(folio));

	return __kvm_gmem_prepare_folio(kvm, slot, index, folio);
}
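
/*
 * Example (illustrative values only): the gfn<->index conversions above
 * are plain offset arithmetic.  For a memslot with base_gfn = 0x100
 * bound at gmem.pgoff = 0x20, gfn 0x105 maps to file index 0x25, and
 * vice versa:
 *
 *	index = gfn - slot->base_gfn + slot->gmem.pgoff;
 *	gfn   = slot->base_gfn + index - slot->gmem.pgoff;
 */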

/*
 * Returns a locked folio on success.  The caller is responsible for
 * setting the up-to-date flag before the memory is mapped into the guest.
 * There is no backing storage for the memory, so the folio will remain
 * up-to-date until it's removed.
 *
 * Ignore accessed, referenced, and dirty flags.  The memory is
 * unevictable and there is no storage to write back to.
 */
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
	/* TODO: Support huge pages. */
	struct mempolicy *policy;
	struct folio *folio;

	/*
	 * Fast-path: See if folio is already present in mapping to avoid
	 * policy_lookup.
	 */
	folio = __filemap_get_folio(inode->i_mapping, index,
				    FGP_LOCK | FGP_ACCESSED, 0);
	if (!IS_ERR(folio))
		return folio;

	policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
	folio = __filemap_get_folio_mpol(inode->i_mapping, index,
					 FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
					 mapping_gfp_mask(inode->i_mapping), policy);
	mpol_cond_put(policy);

	/*
	 * External interfaces like kvm_gmem_get_pfn() support dealing
	 * with hugepages to a degree, but internally, guest_memfd currently
	 * assumes that all folios are order-0 and handling would need
	 * to be updated for anything otherwise (e.g. page-clearing
	 * operations).
	 */
	WARN_ON_ONCE(!IS_ERR(folio) && folio_order(folio));

	return folio;
}
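
/*
 * Illustrative userspace sketch (assumes a gmem_fd created with
 * GUEST_MEMFD_FLAG_MMAP): the shared policy consulted above is installed
 * by mbind()'ing a mapping of the file, after which allocations for the
 * covered indices honor the requested nodes:
 *
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
 *		       gmem_fd, 0);
 *	mbind(p, len, MPOL_BIND, &nodemask, maxnode, 0);
 */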

static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
{
	if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)
		return KVM_FILTER_SHARED;

	return KVM_FILTER_PRIVATE;
}

static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start,
					pgoff_t end,
					enum kvm_gfn_range_filter attr_filter)
{
	bool flush = false, found_memslot = false;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = f->kvm;
	unsigned long index;

	xa_for_each_range(&f->bindings, index, slot, start, end - 1) {
		pgoff_t pgoff = slot->gmem.pgoff;

		struct kvm_gfn_range gfn_range = {
			.start = slot->base_gfn + max(pgoff, start) - pgoff,
			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
			.slot = slot,
			.may_block = true,
			.attr_filter = attr_filter,
		};

		if (!found_memslot) {
			found_memslot = true;

			KVM_MMU_LOCK(kvm);
			kvm_mmu_invalidate_begin(kvm);
		}

		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
	}

	if (flush)
		kvm_flush_remote_tlbs(kvm);

	if (found_memslot)
		KVM_MMU_UNLOCK(kvm);
}

static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start,
				      pgoff_t end)
{
	enum kvm_gfn_range_filter attr_filter;
	struct gmem_file *f;

	attr_filter = kvm_gmem_get_invalidate_filter(inode);

	kvm_gmem_for_each_file(f, inode)
		__kvm_gmem_invalidate_begin(f, start, end, attr_filter);
}

static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start,
				      pgoff_t end)
{
	struct kvm *kvm = f->kvm;

	if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
		KVM_MMU_LOCK(kvm);
		kvm_mmu_invalidate_end(kvm);
		KVM_MMU_UNLOCK(kvm);
	}
}

static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start,
				    pgoff_t end)
{
	struct gmem_file *f;

	kvm_gmem_for_each_file(f, inode)
		__kvm_gmem_invalidate_end(f, start, end);
}

static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	pgoff_t start = offset >> PAGE_SHIFT;
	pgoff_t end = (offset + len) >> PAGE_SHIFT;

	/*
	 * Bindings must be stable across invalidation to ensure the start+end
	 * are balanced.
	 */
	filemap_invalidate_lock(inode->i_mapping);

	kvm_gmem_invalidate_begin(inode, start, end);

	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);

	kvm_gmem_invalidate_end(inode, start, end);

	filemap_invalidate_unlock(inode->i_mapping);

	return 0;
}

static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t start, index, end;
	int r;

	/* The file size is fixed at creation; disallow allocating beyond EOF. */
	if (offset + len > i_size_read(inode))
		return -EINVAL;

	filemap_invalidate_lock_shared(mapping);

	start = offset >> PAGE_SHIFT;
	end = (offset + len) >> PAGE_SHIFT;

	r = 0;
	for (index = start; index < end; ) {
		struct folio *folio;

		if (signal_pending(current)) {
			r = -EINTR;
			break;
		}

		folio = kvm_gmem_get_folio(inode, index);
		if (IS_ERR(folio)) {
			r = PTR_ERR(folio);
			break;
		}

		index = folio_next_index(folio);

		folio_unlock(folio);
		folio_put(folio);

		/* 64-bit only, wrapping the index should be impossible. */
		if (WARN_ON_ONCE(!index))
			break;

		cond_resched();
	}

	filemap_invalidate_unlock_shared(mapping);

	return r;
}

static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
			       loff_t len)
{
	int ret;

	if (!(mode & FALLOC_FL_KEEP_SIZE))
		return -EOPNOTSUPP;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
		return -EINVAL;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
	else
		ret = kvm_gmem_allocate(file_inode(file), offset, len);

	if (!ret)
		file_modified(file);
	return ret;
}
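
/*
 * Illustrative userspace sketch: both fallocate() modes accepted above
 * require FALLOC_FL_KEEP_SIZE, since the file size is fixed at creation.
 * Preallocating backs the range with pages up front; punching a hole
 * frees the backing pages and zaps any mappings of them:
 *
 *	fallocate(gmem_fd, FALLOC_FL_KEEP_SIZE, offset, len);
 *	fallocate(gmem_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *		  offset, len);
 */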

static int kvm_gmem_release(struct inode *inode, struct file *file)
{
	struct gmem_file *f = file->private_data;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = f->kvm;
	unsigned long index;

	/*
	 * Prevent concurrent attempts to *unbind* a memslot.  This is the last
	 * reference to the file and thus no new bindings can be created, but
	 * dereferencing the slot for existing bindings needs to be protected
	 * against memslot updates, specifically so that unbind doesn't race
	 * and free the memslot (kvm_gmem_get_file() will return NULL).
	 *
	 * Since .release is called only when the reference count is zero,
	 * after which file_ref_get() and get_file_active() fail,
	 * kvm_gmem_get_pfn() cannot be using the file concurrently.
	 * file_ref_put() provides a full barrier, and get_file_active() the
	 * matching acquire barrier.
	 */
	mutex_lock(&kvm->slots_lock);

	filemap_invalidate_lock(inode->i_mapping);

	xa_for_each(&f->bindings, index, slot)
		WRITE_ONCE(slot->gmem.file, NULL);

	/*
	 * All in-flight operations are gone and no new bindings can be
	 * created.  Zap all SPTEs pointed at by this file.  Do not free the
	 * backing memory, as its lifetime is associated with the inode, not
	 * the file.
	 */
	__kvm_gmem_invalidate_begin(f, 0, -1ul,
				    kvm_gmem_get_invalidate_filter(inode));
	__kvm_gmem_invalidate_end(f, 0, -1ul);

	list_del(&f->entry);

	filemap_invalidate_unlock(inode->i_mapping);

	mutex_unlock(&kvm->slots_lock);

	xa_destroy(&f->bindings);
	kfree(f);

	kvm_put_kvm(kvm);

	return 0;
}

static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
{
	/*
	 * Do not return slot->gmem.file if it has already been closed;
	 * there might be some time between the last fput() and when
	 * kvm_gmem_release() clears slot->gmem.file.
	 */
	return get_file_active(&slot->gmem.file);
}

DEFINE_CLASS(gmem_get_file, struct file *, if (_T) fput(_T),
	     kvm_gmem_get_file(slot), struct kvm_memory_slot *slot);
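
/*
 * Usage sketch (mirrors kvm_gmem_get_pfn() below): the scope-based class
 * above takes a reference via kvm_gmem_get_file() and drops it
 * automatically when "file" goes out of scope:
 *
 *	CLASS(gmem_get_file, file)(slot);
 *	if (!file)
 *		return -EFAULT;
 */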

static bool kvm_gmem_supports_mmap(struct inode *inode)
{
	return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP;
}

static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct folio *folio;
	vm_fault_t ret = VM_FAULT_LOCKED;

	if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
		return VM_FAULT_SIGBUS;

	if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
		return VM_FAULT_SIGBUS;

	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
	if (IS_ERR(folio)) {
		if (PTR_ERR(folio) == -EAGAIN)
			return VM_FAULT_RETRY;

		return vmf_error(PTR_ERR(folio));
	}

	if (WARN_ON_ONCE(folio_test_large(folio))) {
		ret = VM_FAULT_SIGBUS;
		goto out_folio;
	}

	if (!folio_test_uptodate(folio)) {
		clear_highpage(folio_page(folio, 0));
		folio_mark_uptodate(folio);
	}

	vmf->page = folio_file_page(folio, vmf->pgoff);

out_folio:
	if (ret != VM_FAULT_LOCKED) {
		folio_unlock(folio);
		folio_put(folio);
	}

	return ret;
}

#ifdef CONFIG_NUMA
static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
{
	struct inode *inode = file_inode(vma->vm_file);

	return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol);
}

static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
					     unsigned long addr, pgoff_t *pgoff)
{
	struct inode *inode = file_inode(vma->vm_file);

	*pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);

	/*
	 * Return the memory policy for this index, or NULL if none is set.
	 *
	 * Returning NULL, e.g. instead of the current task's memory policy, is
	 * important for the .get_policy kernel ABI: it indicates that no
	 * explicit policy has been set via mbind() for this memory.  The
	 * caller can then replace NULL with the default memory policy instead
	 * of the current task's memory policy.
	 */
	return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
}
#endif /* CONFIG_NUMA */

static const struct vm_operations_struct kvm_gmem_vm_ops = {
	.fault = kvm_gmem_fault_user_mapping,
#ifdef CONFIG_NUMA
	.get_policy = kvm_gmem_get_policy,
	.set_policy = kvm_gmem_set_policy,
#endif
};

static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (!kvm_gmem_supports_mmap(file_inode(file)))
		return -ENODEV;

	if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
	    (VM_SHARED | VM_MAYSHARE)) {
		return -EINVAL;
	}

	vma->vm_ops = &kvm_gmem_vm_ops;

	return 0;
}
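
/*
 * Illustrative sketch: only shared mappings are accepted above, so a
 * MAP_PRIVATE mapping fails with -EINVAL while MAP_SHARED succeeds
 * (provided the inode was created with GUEST_MEMFD_FLAG_MMAP):
 *
 *	mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, gmem_fd, 0);
 */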

static struct file_operations kvm_gmem_fops = {
	.mmap		= kvm_gmem_mmap,
	.open		= generic_file_open,
	.release	= kvm_gmem_release,
	.fallocate	= kvm_gmem_fallocate,
};

static int kvm_gmem_migrate_folio(struct address_space *mapping,
				  struct folio *dst, struct folio *src,
				  enum migrate_mode mode)
{
	WARN_ON_ONCE(1);
	return -EINVAL;
}

static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
	pgoff_t start, end;

	filemap_invalidate_lock_shared(mapping);

	start = folio->index;
	end = start + folio_nr_pages(folio);

	kvm_gmem_invalidate_begin(mapping->host, start, end);

	/*
	 * Do not truncate the range, what action is taken in response to the
	 * error is userspace's decision (assuming the architecture supports
	 * gracefully handling memory errors).  If/when the guest attempts to
	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
	 * at which point KVM can either terminate the VM or propagate the
	 * error to userspace.
	 */

	kvm_gmem_invalidate_end(mapping->host, start, end);

	filemap_invalidate_unlock_shared(mapping);

	return MF_DELAYED;
}

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
static void kvm_gmem_free_folio(struct folio *folio)
{
	struct page *page = folio_page(folio, 0);
	kvm_pfn_t pfn = page_to_pfn(page);
	int order = folio_order(folio);

	kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
}
#endif

static const struct address_space_operations kvm_gmem_aops = {
	.dirty_folio = noop_dirty_folio,
	.migrate_folio	= kvm_gmem_migrate_folio,
	.error_remove_folio = kvm_gmem_error_folio,
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
	.free_folio = kvm_gmem_free_folio,
#endif
};

static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			    struct iattr *attr)
{
	return -EINVAL;
}

static const struct inode_operations kvm_gmem_iops = {
	.setattr	= kvm_gmem_setattr,
};

bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
{
	return true;
}

static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
	static const char *name = "[kvm-gmem]";
	struct gmem_file *f;
	struct inode *inode;
	struct file *file;
	int fd, err;

	fd = get_unused_fd_flags(0);
	if (fd < 0)
		return fd;

	f = kzalloc_obj(*f);
	if (!f) {
		err = -ENOMEM;
		goto err_fd;
	}

	/* __fput() will take care of fops_put(). */
	if (!fops_get(&kvm_gmem_fops)) {
		err = -ENOENT;
		goto err_gmem;
	}

	inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		goto err_fops;
	}

	inode->i_op = &kvm_gmem_iops;
	inode->i_mapping->a_ops = &kvm_gmem_aops;
	inode->i_mode |= S_IFREG;
	inode->i_size = size;
	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
	mapping_set_inaccessible(inode->i_mapping);
	/* Unmovable mappings are supposed to be marked unevictable as well. */
	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

	GMEM_I(inode)->flags = flags;

	file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
	if (IS_ERR(file)) {
		err = PTR_ERR(file);
		goto err_inode;
	}

	file->f_flags |= O_LARGEFILE;
	file->private_data = f;

	kvm_get_kvm(kvm);
	f->kvm = kvm;
	xa_init(&f->bindings);
	list_add(&f->entry, &GMEM_I(inode)->gmem_file_list);

	fd_install(fd, file);
	return fd;

err_inode:
	iput(inode);
err_fops:
	fops_put(&kvm_gmem_fops);
err_gmem:
	kfree(f);
err_fd:
	put_unused_fd(fd);
	return err;
}

int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
	loff_t size = args->size;
	u64 flags = args->flags;

	if (flags & ~kvm_gmem_get_supported_flags(kvm))
		return -EINVAL;

	if (size <= 0 || !PAGE_ALIGNED(size))
		return -EINVAL;

	return __kvm_gmem_create(kvm, size, flags);
}
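
/*
 * Illustrative userspace sketch (see Documentation/virt/kvm/api.rst for
 * the authoritative ABI): a guest_memfd is created with a VM-scoped
 * ioctl, and the returned fd is then fallocate()d and/or bound to
 * memslots as shown elsewhere in this file:
 *
 *	struct kvm_create_guest_memfd args = {
 *		.size  = vm_mem_size,
 *		.flags = 0,
 *	};
 *	int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);
 */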

int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
		  unsigned int fd, loff_t offset)
{
	loff_t size = slot->npages << PAGE_SHIFT;
	unsigned long start, end;
	struct gmem_file *f;
	struct inode *inode;
	struct file *file;
	int r = -EINVAL;

	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));

	file = fget(fd);
	if (!file)
		return -EBADF;

	if (file->f_op != &kvm_gmem_fops)
		goto err;

	f = file->private_data;
	if (f->kvm != kvm)
		goto err;

	inode = file_inode(file);

	if (offset < 0 || !PAGE_ALIGNED(offset) ||
	    offset + size > i_size_read(inode))
		goto err;

	filemap_invalidate_lock(inode->i_mapping);

	start = offset >> PAGE_SHIFT;
	end = start + slot->npages;

	if (!xa_empty(&f->bindings) &&
	    xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
		filemap_invalidate_unlock(inode->i_mapping);
		goto err;
	}

	/*
	 * Memslots with the KVM_MEM_GUEST_MEMFD flag are immutable once
	 * created, so kvm_gmem_bind() must occur on a new memslot.  Because
	 * the memslot is not visible yet, kvm_gmem_get_pfn() is guaranteed to
	 * see the file.
	 */
	WRITE_ONCE(slot->gmem.file, file);
	slot->gmem.pgoff = start;
	if (kvm_gmem_supports_mmap(inode))
		slot->flags |= KVM_MEMSLOT_GMEM_ONLY;

	xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
	filemap_invalidate_unlock(inode->i_mapping);

	/*
	 * Drop the reference to the file, even on success.  The file pins KVM,
	 * not the other way 'round.  Active bindings are invalidated if the
	 * file is closed before memslots are destroyed.
	 */
	r = 0;
err:
	fput(file);
	return r;
}

static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct gmem_file *f)
{
	unsigned long start = slot->gmem.pgoff;
	unsigned long end = start + slot->npages;

	xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL);

	/*
	 * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
	 * cannot see this memslot.
	 */
	WRITE_ONCE(slot->gmem.file, NULL);
}

void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
	/*
	 * Nothing to do if the underlying file was _already_ closed, as
	 * kvm_gmem_release() invalidates and nullifies all bindings.
	 */
	if (!slot->gmem.file)
		return;

	CLASS(gmem_get_file, file)(slot);

	/*
	 * However, if the file is _being_ closed, then the bindings need to be
	 * removed as kvm_gmem_release() might not run until after the memslot
	 * is freed.  Note, modifying the bindings is safe even though the file
	 * is dying as kvm_gmem_release() nullifies slot->gmem.file under
	 * slots_lock, and only puts its reference to KVM after destroying all
	 * bindings.  I.e. reaching this point means kvm_gmem_release() hasn't
	 * yet destroyed the bindings or freed the gmem_file, and can't do so
	 * until the caller drops slots_lock.
	 */
	if (!file) {
		__kvm_gmem_unbind(slot, slot->gmem.file->private_data);
		return;
	}

	filemap_invalidate_lock(file->f_mapping);
	__kvm_gmem_unbind(slot, file->private_data);
	filemap_invalidate_unlock(file->f_mapping);
}
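
/*
 * Illustrative userspace sketch: the binding above is established when a
 * memslot is created with the KVM_MEM_GUEST_MEMFD flag, e.g.:
 *
 *	struct kvm_userspace_memory_region2 region = {
 *		.slot			= 0,
 *		.flags			= KVM_MEM_GUEST_MEMFD,
 *		.guest_phys_addr	= gpa,
 *		.memory_size		= size,
 *		.guest_memfd		= gmem_fd,
 *		.guest_memfd_offset	= 0,
 *	};
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
 */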

/* Returns a locked folio on success. */
static struct folio *__kvm_gmem_get_pfn(struct file *file,
					struct kvm_memory_slot *slot,
					pgoff_t index, kvm_pfn_t *pfn,
					int *max_order)
{
	struct file *slot_file = READ_ONCE(slot->gmem.file);
	struct gmem_file *f = file->private_data;
	struct folio *folio;

	if (file != slot_file) {
		WARN_ON_ONCE(slot_file);
		return ERR_PTR(-EFAULT);
	}

	if (xa_load(&f->bindings, index) != slot) {
		WARN_ON_ONCE(xa_load(&f->bindings, index));
		return ERR_PTR(-EIO);
	}

	folio = kvm_gmem_get_folio(file_inode(file), index);
	if (IS_ERR(folio))
		return folio;

	if (folio_test_hwpoison(folio)) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(-EHWPOISON);
	}

	*pfn = folio_file_pfn(folio, index);
	if (max_order)
		*max_order = 0;

	return folio;
}

int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
		     gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
		     int *max_order)
{
	pgoff_t index = kvm_gmem_get_index(slot, gfn);
	struct folio *folio;
	int r = 0;

	CLASS(gmem_get_file, file)(slot);
	if (!file)
		return -EFAULT;

	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	if (!folio_test_uptodate(folio)) {
		clear_highpage(folio_page(folio, 0));
		folio_mark_uptodate(folio);
	}

	r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);

	folio_unlock(folio);

	if (!r)
		*page = folio_file_page(folio, index);
	else
		folio_put(folio);

	return r;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE

static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
				struct file *file, gfn_t gfn, struct page *src_page,
				kvm_gmem_populate_cb post_populate,
				void *opaque)
{
	pgoff_t index = kvm_gmem_get_index(slot, gfn);
	struct folio *folio;
	kvm_pfn_t pfn;
	int ret;

	filemap_invalidate_lock(file->f_mapping);

	folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, NULL);
	if (IS_ERR(folio)) {
		ret = PTR_ERR(folio);
		goto out_unlock;
	}

	folio_unlock(folio);

	if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1,
					     KVM_MEMORY_ATTRIBUTE_PRIVATE,
					     KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
		ret = -EINVAL;
		goto out_put_folio;
	}

	ret = post_populate(kvm, gfn, pfn, src_page, opaque);
	if (!ret)
		folio_mark_uptodate(folio);

out_put_folio:
	folio_put(folio);
out_unlock:
	filemap_invalidate_unlock(file->f_mapping);
	return ret;
}

long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
		       kvm_gmem_populate_cb post_populate, void *opaque)
{
	struct kvm_memory_slot *slot;
	int ret = 0;
	long i;

	lockdep_assert_held(&kvm->slots_lock);

	if (WARN_ON_ONCE(npages <= 0))
		return -EINVAL;

	if (WARN_ON_ONCE(!PAGE_ALIGNED(src)))
		return -EINVAL;

	slot = gfn_to_memslot(kvm, start_gfn);
	if (!kvm_slot_has_gmem(slot))
		return -EINVAL;

	CLASS(gmem_get_file, file)(slot);
	if (!file)
		return -EFAULT;

	npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
	for (i = 0; i < npages; i++) {
		struct page *src_page = NULL;

		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		if (src) {
			unsigned long uaddr = (unsigned long)src + i * PAGE_SIZE;

			ret = get_user_pages_fast(uaddr, 1, 0, &src_page);
			if (ret < 0)
				break;
			if (ret != 1) {
				ret = -ENOMEM;
				break;
			}
		}

		ret = __kvm_gmem_populate(kvm, slot, file, start_gfn + i, src_page,
					  post_populate, opaque);

		if (src_page)
			put_page(src_page);

		if (ret)
			break;
	}

	return ret && !i ? ret : i;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);
#endif
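
/*
 * Illustrative sketch of an arch caller (hypothetical callback name; the
 * callback's parameters are inferred from the post_populate() invocation
 * above): the callback typically copies/encrypts the source page into
 * the target pfn when building the initial guest image:
 *
 *	static int my_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 *				    struct page *src_page, void *opaque)
 *	{
 *		...
 *	}
 *
 *	ret = kvm_gmem_populate(kvm, gfn, src, npages, my_post_populate, NULL);
 */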

static struct kmem_cache *kvm_gmem_inode_cachep;

static void kvm_gmem_init_inode_once(void *__gi)
{
	struct gmem_inode *gi = __gi;

	/*
	 * Note!  Don't initialize the inode with anything specific to the
	 * guest_memfd instance, or that might be specific to how the inode is
	 * used (from the VFS-layer's perspective).  This hook is called only
	 * during the initial slab allocation, i.e. only fields/state that are
	 * idempotent across _all_ use of the inode _object_ can be initialized
	 * at this time!
	 */
	inode_init_once(&gi->vfs_inode);
}

static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
{
	struct gmem_inode *gi;

	gi = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL);
	if (!gi)
		return NULL;

	mpol_shared_policy_init(&gi->policy, NULL);

	gi->flags = 0;
	INIT_LIST_HEAD(&gi->gmem_file_list);
	return &gi->vfs_inode;
}

static void kvm_gmem_destroy_inode(struct inode *inode)
{
	mpol_free_shared_policy(&GMEM_I(inode)->policy);
}

static void kvm_gmem_free_inode(struct inode *inode)
{
	kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode));
}

static const struct super_operations kvm_gmem_super_operations = {
	.statfs		= simple_statfs,
	.alloc_inode	= kvm_gmem_alloc_inode,
	.destroy_inode	= kvm_gmem_destroy_inode,
	.free_inode	= kvm_gmem_free_inode,
};

static int kvm_gmem_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx;

	if (!init_pseudo(fc, GUEST_MEMFD_MAGIC))
		return -ENOMEM;

	fc->s_iflags |= SB_I_NOEXEC;
	fc->s_iflags |= SB_I_NODEV;
	ctx = fc->fs_private;
	ctx->ops = &kvm_gmem_super_operations;

	return 0;
}

static struct file_system_type kvm_gmem_fs = {
	.name		 = "guest_memfd",
	.init_fs_context = kvm_gmem_init_fs_context,
	.kill_sb	 = kill_anon_super,
};

static int kvm_gmem_init_mount(void)
{
	kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);

	if (IS_ERR(kvm_gmem_mnt))
		return PTR_ERR(kvm_gmem_mnt);

	kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
	return 0;
}

int kvm_gmem_init(struct module *module)
{
	struct kmem_cache_args args = {
		.align = 0,
		.ctor = kvm_gmem_init_inode_once,
	};
	int ret;

	kvm_gmem_fops.owner = module;
	kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache",
						  sizeof(struct gmem_inode),
						  &args, SLAB_ACCOUNT);
	if (!kvm_gmem_inode_cachep)
		return -ENOMEM;

	ret = kvm_gmem_init_mount();
	if (ret) {
		kmem_cache_destroy(kvm_gmem_inode_cachep);
		return ret;
	}

	return 0;
}

void kvm_gmem_exit(void)
{
	kern_unmount(kvm_gmem_mnt);
	kvm_gmem_mnt = NULL;
	rcu_barrier();
	kmem_cache_destroy(kvm_gmem_inode_cachep);
}