// SPDX-License-Identifier: GPL-2.0
#include <linux/anon_inodes.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/kvm_host.h>
#include <linux/mempolicy.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>

#include "kvm_mm.h"

/*
 * Internal mount of the "guest_memfd" pseudo filesystem.  Set up by
 * kvm_gmem_init_mount() and used by __kvm_gmem_create() to allocate inodes
 * and files.
 */
static struct vfsmount *kvm_gmem_mnt;

/*
 * A guest_memfd instance can be associated with multiple VMs, each with its
 * own "view" of the underlying physical memory.
 *
 * The gmem's inode is effectively the raw underlying physical storage, and is
 * used to track properties of the physical memory, while each gmem file is
 * effectively a single VM's view of that storage, and is used to track assets
 * specific to its associated VM, e.g. memslots=>gmem bindings.
 */
struct gmem_file {
	/* VM this view belongs to; a reference is taken in __kvm_gmem_create(). */
	struct kvm *kvm;
	/* Maps file page offsets to the memslot bound at that offset. */
	struct xarray bindings;
	/* Link in the owning gmem_inode's gmem_file_list. */
	struct list_head entry;
};

struct gmem_inode {
	/* Shared NUMA memory policy, consulted when allocating folios. */
	struct shared_policy policy;
	/* Embedded VFS inode; GMEM_I() converts back to the container. */
	struct inode vfs_inode;
	/* List of gmem_file instances (linked via gmem_file.entry). */
	struct list_head gmem_file_list;

	/* GUEST_MEMFD_FLAG_* values supplied when the guest_memfd was created. */
	u64 flags;
};

/*
 * Return the gmem_inode that embeds @inode.  Only meaningful for inodes
 * handed out by kvm_gmem_alloc_inode().
 */
static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
{
	return container_of(inode, struct gmem_inode, vfs_inode);
}

/* Iterate @f over every gmem_file attached to @inode. */
#define kvm_gmem_for_each_file(f, inode) \
	list_for_each_entry(f, &GMEM_I(inode)->gmem_file_list, entry)

/**
 * folio_file_pfn - like folio_file_page, but return a pfn.
 * @folio: The folio which contains this index.
 * @index: The index we want to look up.
 *
 * The offset within the folio is obtained by masking @index with
 * folio_nr_pages() - 1, which relies on the page count being a power of two.
 *
 * Return: The pfn for this index.
 */
static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
{
	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
}

/* Translate @gfn, which lies in @slot, into a page index in the bound file. */
static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
{
	return gfn - slot->base_gfn + slot->gmem.pgoff;
}

/*
 * Run the architecture's prepare hook for @folio at file index @index.
 *
 * Compiles to a no-op returning 0 unless CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE is
 * set.  On failure the error from kvm_arch_gmem_prepare() is logged
 * (rate-limited) and returned.
 */
static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
				    pgoff_t index, struct folio *folio)
{
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
	kvm_pfn_t pfn = folio_file_pfn(folio, index);
	/* Inverse of kvm_gmem_get_index(): file index back to gfn. */
	gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
	int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));
	if (rc) {
		pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
				    index, gfn, pfn, rc);
		return rc;
	}
#endif

	return 0;
}

/*
 * Process @folio, which contains @gfn, so that the guest can use it.
 * The folio must be locked and the gfn must be contained in @slot.
 *
 * This function only computes the folio-aligned file index and invokes the
 * arch prepare hook; it does not zero the folio or set the up-to-date flag.
 * In this file, the caller (kvm_gmem_get_pfn()) zeroes the page and marks the
 * folio up-to-date before calling here.
 *
 * Return: 0 on success, otherwise the error from __kvm_gmem_prepare_folio().
 */
static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
				  gfn_t gfn, struct folio *folio)
{
	pgoff_t index;

	/*
	 * Preparing huge folios should always be safe, since it should
	 * be possible to split them later if needed.
	 *
	 * Right now the folio order is always going to be zero, but the
	 * code is ready for huge folios.  The only assumption is that
	 * the base pgoff of memslots is naturally aligned with the
	 * requested page order, ensuring that huge folios can also use
	 * huge page table entries for GPA->HPA mapping.
	 *
	 * The order will be passed when creating the guest_memfd, and
	 * checked when creating memslots.
	 */
	WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
	index = kvm_gmem_get_index(slot, gfn);
	/* Prepare from the first page of the folio, not from @gfn's page. */
	index = ALIGN_DOWN(index, folio_nr_pages(folio));

	return __kvm_gmem_prepare_folio(kvm, slot, index, folio);
}

/*
 * Returns a locked folio on success.  The caller is responsible for
 * setting the up-to-date flag before the memory is mapped into the guest.
 * There is no backing storage for the memory, so the folio will remain
 * up-to-date until it's removed.
 *
 * Ignore accessed, referenced, and dirty flags.  The memory is
 * unevictable and there is no storage to write back to.
 *
 * On failure an ERR_PTR() from the page cache lookup/allocation is returned.
 */
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
	/* TODO: Support huge pages. */
	struct mempolicy *policy;
	struct folio *folio;

	/*
	 * Fast-path: See if folio is already present in mapping to avoid
	 * policy_lookup.
	 */
	folio = filemap_lock_folio(inode->i_mapping, index);
	if (!IS_ERR(folio))
		return folio;

	/* Slow path: allocate (FGP_CREAT) using the inode's shared policy. */
	policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
	folio = __filemap_get_folio_mpol(inode->i_mapping, index,
					 FGP_LOCK | FGP_CREAT,
					 mapping_gfp_mask(inode->i_mapping), policy);
	mpol_cond_put(policy);

	/*
	 * External interfaces like kvm_gmem_get_pfn() support dealing
	 * with hugepages to a degree, but internally, guest_memfd currently
	 * assumes that all folios are order-0 and handling would need
	 * to be updated for anything otherwise (e.g. page-clearing
	 * operations).
	 */
	WARN_ON_ONCE(!IS_ERR(folio) && folio_order(folio));

	return folio;
}

/*
 * Select which mappings an invalidation targets: shared mappings when the
 * guest_memfd was created with GUEST_MEMFD_FLAG_INIT_SHARED, private
 * mappings otherwise.
 */
static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
{
	if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)
		return KVM_FILTER_SHARED;

	return KVM_FILTER_PRIVATE;
}

/*
 * Begin invalidating file page range [@start, @end) for one gmem_file.
 *
 * For every memslot bound within the range, the overlapping gfn range is
 * unmapped from the VM.  The MMU lock is taken, and the MMU invalidation is
 * started, only once a first overlapping binding is found; remote TLBs are
 * flushed if any unmap requested it.  Callers in this file pair this with
 * __kvm_gmem_invalidate_end() over the same range.
 */
static void __kvm_gmem_invalidate_start(struct gmem_file *f, pgoff_t start,
					pgoff_t end,
					enum kvm_gfn_range_filter attr_filter)
{
	bool flush = false, found_memslot = false;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = f->kvm;
	unsigned long index;

	xa_for_each_range(&f->bindings, index, slot, start, end - 1) {
		pgoff_t pgoff = slot->gmem.pgoff;

		/* Clamp [start, end) to the slot and convert to gfns. */
		struct kvm_gfn_range gfn_range = {
			.start = slot->base_gfn + max(pgoff, start) - pgoff,
			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
			.slot = slot,
			.may_block = true,
			.attr_filter = attr_filter,
		};

		if (!found_memslot) {
			found_memslot = true;

			KVM_MMU_LOCK(kvm);
			kvm_mmu_invalidate_start(kvm);
		}

		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
	}

	if (flush)
		kvm_flush_remote_tlbs(kvm);

	if (found_memslot)
		KVM_MMU_UNLOCK(kvm);
}

/* Begin invalidation of [@start, @end) for every gmem_file of @inode. */
static void kvm_gmem_invalidate_start(struct inode *inode, pgoff_t start,
				      pgoff_t end)
{
	enum kvm_gfn_range_filter attr_filter;
	struct gmem_file *f;

	attr_filter = kvm_gmem_get_invalidate_filter(inode);

	kvm_gmem_for_each_file(f, inode)
		__kvm_gmem_invalidate_start(f, start, end, attr_filter);
}

/*
 * Finish the invalidation of [@start, @end) for one gmem_file.  The MMU
 * invalidation is ended only if some binding exists in the range, mirroring
 * the condition under which __kvm_gmem_invalidate_start() started it.
 */
static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start,
				      pgoff_t end)
{
	struct kvm *kvm = f->kvm;

	if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
		KVM_MMU_LOCK(kvm);
		kvm_mmu_invalidate_end(kvm);
		KVM_MMU_UNLOCK(kvm);
	}
}

/* Finish invalidation of [@start, @end) for every gmem_file of @inode. */
static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start,
				    pgoff_t end)
{
	struct gmem_file *f;

	kvm_gmem_for_each_file(f, inode)
		__kvm_gmem_invalidate_end(f, start, end);
}

/*
 * FALLOC_FL_PUNCH_HOLE backend: unmap the byte range [@offset, @offset + @len)
 * from all bound VMs and drop the corresponding pages from the page cache.
 * Always returns 0.
 */
static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	pgoff_t start = offset >> PAGE_SHIFT;
	pgoff_t end = (offset + len) >> PAGE_SHIFT;

	/*
	 * Bindings must be stable across invalidation to ensure the start+end
	 * are balanced.
	 */
	filemap_invalidate_lock(inode->i_mapping);

	kvm_gmem_invalidate_start(inode, start, end);

	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);

	kvm_gmem_invalidate_end(inode, start, end);

	filemap_invalidate_unlock(inode->i_mapping);

	return 0;
}

/*
 * Allocation backend for fallocate(): instantiate a folio for every page in
 * [@offset, @offset + @len).
 *
 * Return: 0 on success, -EINVAL if the range extends past the file size,
 * -EINTR if a signal is pending, or the error from kvm_gmem_get_folio().
 */
static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t start, index, end;
	int r;

	/*
	 * Dedicated guest is immutable by default, i.e. the file cannot grow:
	 * reject any range that ends beyond the current size.
	 */
	if (offset + len > i_size_read(inode))
		return -EINVAL;

	filemap_invalidate_lock_shared(mapping);

	start = offset >> PAGE_SHIFT;
	end = (offset + len) >> PAGE_SHIFT;

	r = 0;
	for (index = start; index < end; ) {
		struct folio *folio;

		if (signal_pending(current)) {
			r = -EINTR;
			break;
		}

		folio = kvm_gmem_get_folio(inode, index);
		if (IS_ERR(folio)) {
			r = PTR_ERR(folio);
			break;
		}

		/* Advance past the whole folio before dropping it. */
		index = folio_next_index(folio);

		folio_unlock(folio);
		folio_put(folio);

		/* 64-bit only, wrapping the index should be impossible. */
		if (WARN_ON_ONCE(!index))
			break;

		cond_resched();
	}

	filemap_invalidate_unlock_shared(mapping);

	return r;
}

/*
 * ->fallocate() handler.  FALLOC_FL_KEEP_SIZE is mandatory and the only other
 * accepted mode bit is FALLOC_FL_PUNCH_HOLE (-EOPNOTSUPP otherwise); @offset
 * and @len must be page aligned (-EINVAL otherwise).
 */
static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
			       loff_t len)
{
	int ret;

	if (!(mode & FALLOC_FL_KEEP_SIZE))
		return -EOPNOTSUPP;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
		return -EINVAL;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
	else
		ret = kvm_gmem_allocate(file_inode(file), offset, len);

	if (!ret)
		file_modified(file);
	return ret;
}

/*
 * ->release() handler: tear down one VM's view of the guest_memfd.  Clears
 * slot->gmem.file for every bound memslot, zaps the VM's mappings of the
 * file, unlinks and frees the gmem_file, and drops the reference on the VM
 * taken in __kvm_gmem_create().
 */
static int kvm_gmem_release(struct inode *inode, struct file *file)
{
	struct gmem_file *f = file->private_data;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = f->kvm;
	unsigned long index;

	/*
	 * Prevent concurrent attempts to *unbind* a memslot.  This is the last
	 * reference to the file and thus no new bindings can be created, but
	 * dereferencing the slot for existing bindings needs to be protected
	 * against memslot updates, specifically so that unbind doesn't race
	 * and free the memslot (kvm_gmem_get_file() will return NULL).
	 *
	 * Since .release is called only when the reference count is zero,
	 * after which file_ref_get() and get_file_active() fail,
	 * kvm_gmem_get_pfn() cannot be using the file concurrently.
	 * file_ref_put() provides a full barrier, and get_file_active() the
	 * matching acquire barrier.
	 */
	mutex_lock(&kvm->slots_lock);

	filemap_invalidate_lock(inode->i_mapping);

	xa_for_each(&f->bindings, index, slot)
		WRITE_ONCE(slot->gmem.file, NULL);

	/*
	 * All in-flight operations are gone and new bindings can be created.
	 * Zap all SPTEs pointed at by this file.  Do not free the backing
	 * memory, as its lifetime is associated with the inode, not the file.
	 */
	__kvm_gmem_invalidate_start(f, 0, -1ul,
				    kvm_gmem_get_invalidate_filter(inode));
	__kvm_gmem_invalidate_end(f, 0, -1ul);

	list_del(&f->entry);

	filemap_invalidate_unlock(inode->i_mapping);

	mutex_unlock(&kvm->slots_lock);

	xa_destroy(&f->bindings);
	kfree(f);

	kvm_put_kvm(kvm);

	return 0;
}

/*
 * Get a reference to the file bound to @slot, or NULL if there is none or
 * the file is already on its way out.  A non-NULL result must be fput().
 */
static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
{
	/*
	 * Do not return slot->gmem.file if it has already been closed;
	 * there might be some time between the last fput() and when
	 * kvm_gmem_release() clears slot->gmem.file.
	 */
	return get_file_active(&slot->gmem.file);
}

/*
 * Scope-based wrapper around kvm_gmem_get_file(): CLASS(gmem_get_file, file)(slot)
 * declares "file", and a non-NULL file is fput() when it goes out of scope.
 */
DEFINE_CLASS(gmem_get_file, struct file *, if (_T) fput(_T),
	     kvm_gmem_get_file(slot), struct kvm_memory_slot *slot);

/* True if the guest_memfd was created with GUEST_MEMFD_FLAG_MMAP. */
static bool kvm_gmem_supports_mmap(struct inode *inode)
{
	return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP;
}

/*
 * ->fault() handler for userspace mappings of a guest_memfd.
 *
 * Faults beyond the file size, or on a guest_memfd without
 * GUEST_MEMFD_FLAG_INIT_SHARED, get SIGBUS.  On success the page is returned
 * in vmf->page with the folio still locked and referenced (VM_FAULT_LOCKED);
 * on any other outcome the folio, if obtained, is unlocked and released.
 */
static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct folio *folio;
	vm_fault_t ret = VM_FAULT_LOCKED;

	if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
		return VM_FAULT_SIGBUS;

	if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
		return VM_FAULT_SIGBUS;

	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
	if (IS_ERR(folio)) {
		if (PTR_ERR(folio) == -EAGAIN)
			return VM_FAULT_RETRY;

		return vmf_error(PTR_ERR(folio));
	}

	/* Only order-0 folios are expected, see kvm_gmem_get_folio(). */
	if (WARN_ON_ONCE(folio_test_large(folio))) {
		ret = VM_FAULT_SIGBUS;
		goto out_folio;
	}

	/* First use of this folio: zero it before exposing it. */
	if (!folio_test_uptodate(folio)) {
		clear_highpage(folio_page(folio, 0));
		folio_mark_uptodate(folio);
	}

	vmf->page = folio_file_page(folio, vmf->pgoff);

out_folio:
	if (ret != VM_FAULT_LOCKED) {
		folio_unlock(folio);
		folio_put(folio);
	}

	return ret;
}

#ifdef CONFIG_NUMA
/* ->set_policy(): record @mpol for @vma's range in the inode's shared policy. */
static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
{
	struct inode *inode = file_inode(vma->vm_file);

	return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol);
}

/*
 * ->get_policy(): look up the shared policy covering @addr in @vma.  *@ilx is
 * set to the inode number.
 */
static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
					     unsigned long addr, pgoff_t *ilx)
{
	/* File page index that @addr maps to. */
	pgoff_t pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
	struct inode *inode = file_inode(vma->vm_file);

	*ilx = inode->i_ino;

	/*
	 * Return the memory policy for this index, or NULL if none is set.
	 *
	 * Returning NULL, e.g. instead of the current task's memory policy, is
	 * important for the .get_policy kernel ABI: it indicates that no
	 * explicit policy has been set via mbind() for this memory.  The caller
	 * can then replace NULL with the default memory policy instead of the
	 * current task's memory policy.
	 */
	return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, pgoff);
}
#endif /* CONFIG_NUMA */

/* VMA operations installed on userspace mappings by kvm_gmem_mmap(). */
static const struct vm_operations_struct kvm_gmem_vm_ops = {
	.fault		= kvm_gmem_fault_user_mapping,
#ifdef CONFIG_NUMA
	.get_policy	= kvm_gmem_get_policy,
	.set_policy	= kvm_gmem_set_policy,
#endif
};

/*
 * ->mmap() handler.  Returns -ENODEV if the guest_memfd does not support
 * mmap, and -EINVAL unless the mapping has both VM_SHARED and VM_MAYSHARE set.
 */
static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (!kvm_gmem_supports_mmap(file_inode(file)))
		return -ENODEV;

	if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
	    (VM_SHARED | VM_MAYSHARE)) {
		return -EINVAL;
	}

	vma->vm_ops = &kvm_gmem_vm_ops;

	return 0;
}

/* Not const: .owner is filled in at runtime by kvm_gmem_init(). */
static struct file_operations kvm_gmem_fops = {
	.mmap		= kvm_gmem_mmap,
	.open		= generic_file_open,
	.release	= kvm_gmem_release,
	.fallocate	= kvm_gmem_fallocate,
};

/* ->migrate_folio(): not expected to be called; warns and fails with -EINVAL. */
static int kvm_gmem_migrate_folio(struct address_space *mapping,
				  struct folio *dst, struct folio *src,
				  enum migrate_mode mode)
{
	WARN_ON_ONCE(1);
	return -EINVAL;
}

/*
 * ->error_remove_folio(): unmap @folio's range from all bound VMs but keep
 * the folio in the page cache.  Always returns MF_DELAYED.
 */
static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
	pgoff_t start, end;

	filemap_invalidate_lock_shared(mapping);

	start = folio->index;
	end = start + folio_nr_pages(folio);

	kvm_gmem_invalidate_start(mapping->host, start, end);

	/*
	 * Do not truncate the range, what action is taken in response to the
	 * error is userspace's decision (assuming the architecture supports
	 * gracefully handling memory errors).  If/when the guest attempts to
	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
	 * at which point KVM can either terminate the VM or propagate the
	 * error to userspace.
	 */

	kvm_gmem_invalidate_end(mapping->host, start, end);

	filemap_invalidate_unlock_shared(mapping);

	return MF_DELAYED;
}

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
/* ->free_folio(): pass the folio's pfn range [pfn, pfn + 2^order) to the arch hook. */
static void kvm_gmem_free_folio(struct folio *folio)
{
	struct page *page = folio_page(folio, 0);
	kvm_pfn_t pfn = page_to_pfn(page);
	int order = folio_order(folio);

	kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
}
#endif

/* Address space operations installed on every guest_memfd inode. */
static const struct address_space_operations kvm_gmem_aops = {
	.dirty_folio = noop_dirty_folio,
	.migrate_folio	= kvm_gmem_migrate_folio,
	.error_remove_folio = kvm_gmem_error_folio,
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
	.free_folio = kvm_gmem_free_folio,
#endif
};

/* ->setattr(): attribute changes are rejected unconditionally with -EINVAL. */
static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			    struct iattr *attr)
{
	return -EINVAL;
}
static const struct inode_operations kvm_gmem_iops = {
	.setattr	= kvm_gmem_setattr,
};

/* Weak default, overridable by architecture code: always returns true. */
bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
{
	return true;
}

/*
 * Create a guest_memfd of @size bytes with @flags for @kvm and install it in
 * a new file descriptor.  Arguments are validated by kvm_gmem_create().
 *
 * Return: the new fd on success, or a negative errno; on failure everything
 * acquired so far is released in reverse order.
 */
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
	static const char *name = "[kvm-gmem]";
	struct gmem_file *f;
	struct inode *inode;
	struct file *file;
	int fd, err;

	fd = get_unused_fd_flags(0);
	if (fd < 0)
		return fd;

	f = kzalloc_obj(*f);
	if (!f) {
		err = -ENOMEM;
		goto err_fd;
	}

	/* __fput() will take care of fops_put(). */
	if (!fops_get(&kvm_gmem_fops)) {
		err = -ENOENT;
		goto err_gmem;
	}

	inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		goto err_fops;
	}

	inode->i_op = &kvm_gmem_iops;
	inode->i_mapping->a_ops = &kvm_gmem_aops;
	inode->i_mode |= S_IFREG;
	inode->i_size = size;
	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
	mapping_set_inaccessible(inode->i_mapping);
	/* Unmovable mappings are supposed to be marked unevictable as well. */
	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

	GMEM_I(inode)->flags = flags;

	file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
	if (IS_ERR(file)) {
		err = PTR_ERR(file);
		goto err_inode;
	}

	file->f_flags |= O_LARGEFILE;
	file->private_data = f;

	/* The file pins the VM; the reference is dropped in kvm_gmem_release(). */
	kvm_get_kvm(kvm);
	f->kvm = kvm;
	xa_init(&f->bindings);
	list_add(&f->entry, &GMEM_I(inode)->gmem_file_list);

	fd_install(fd, file);
	return fd;

err_inode:
	iput(inode);
err_fops:
	fops_put(&kvm_gmem_fops);
err_gmem:
	kfree(f);
err_fd:
	put_unused_fd(fd);
	return err;
}

/*
 * Validate @args and create a guest_memfd for @kvm.  Returns -EINVAL for
 * flags outside kvm_gmem_get_supported_flags() or for a size that is not a
 * positive multiple of the page size; otherwise see __kvm_gmem_create().
 */
int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
	loff_t size = args->size;
	u64 flags = args->flags;

	if (flags & ~kvm_gmem_get_supported_flags(kvm))
		return -EINVAL;

	if (size <= 0 || !PAGE_ALIGNED(size))
		return -EINVAL;

	return __kvm_gmem_create(kvm, size, flags);
}

/*
 * Bind @slot to the guest_memfd referred to by @fd, starting at byte @offset
 * within the file.
 *
 * Return: 0 on success; -EBADF if @fd is not an open file; -EINVAL if the
 * file is not a guest_memfd, belongs to a different VM, or @offset is
 * unaligned or the range exceeds the file size; -EEXIST if any page of the
 * range is already bound.
 */
int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
		  unsigned int fd, uoff_t offset)
{
	uoff_t size = slot->npages << PAGE_SHIFT;
	unsigned long start, end;
	struct gmem_file *f;
	struct inode *inode;
	struct file *file;
	int r = -EINVAL;

	BUILD_BUG_ON(sizeof(gpa_t) != sizeof(offset));
	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));

	file = fget(fd);
	if (!file)
		return -EBADF;

	/* Identify guest_memfd files by their file operations. */
	if (file->f_op != &kvm_gmem_fops)
		goto err;

	f = file->private_data;
	if (f->kvm != kvm)
		goto err;

	inode = file_inode(file);

	if (!PAGE_ALIGNED(offset) || offset + size > i_size_read(inode))
		goto err;

	filemap_invalidate_lock(inode->i_mapping);

	start = offset >> PAGE_SHIFT;
	end = start + slot->npages;

	if (!xa_empty(&f->bindings) &&
	    xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
		r = -EEXIST;
		filemap_invalidate_unlock(inode->i_mapping);
		goto err;
	}

	/*
	 * memslots of flag KVM_MEM_GUEST_MEMFD are immutable to change, so
	 * kvm_gmem_bind() must occur on a new memslot.  Because the memslot
	 * is not visible yet, kvm_gmem_get_pfn() is guaranteed to see the file.
	 */
	WRITE_ONCE(slot->gmem.file, file);
	slot->gmem.pgoff = start;
	if (kvm_gmem_supports_mmap(inode))
		slot->flags |= KVM_MEMSLOT_GMEM_ONLY;

	/*
	 * NOTE(review): the return value of xa_store_range() is not checked
	 * here, so a failed store would go unnoticed -- confirm whether that
	 * is acceptable.
	 */
	xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
	filemap_invalidate_unlock(inode->i_mapping);

	/*
	 * Drop the reference to the file, even on success.  The file pins KVM,
	 * not the other way 'round.  Active bindings are invalidated if the
	 * file is closed before memslots are destroyed.
	 */
	r = 0;
err:
	fput(file);
	return r;
}

/* Remove @slot's range from @f's bindings and clear slot->gmem.file. */
static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct gmem_file *f)
{
	unsigned long start = slot->gmem.pgoff;
	unsigned long end = start + slot->npages;

	xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL);

	/*
	 * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
	 * cannot see this memslot.
	 */
	WRITE_ONCE(slot->gmem.file, NULL);
}

/*
 * Undo kvm_gmem_bind() for @slot.  The comments below rely on the caller
 * holding kvm->slots_lock.
 */
void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
	/*
	 * Nothing to do if the underlying file was _already_ closed, as
	 * kvm_gmem_release() invalidates and nullifies all bindings.
	 */
	if (!slot->gmem.file)
		return;

	CLASS(gmem_get_file, file)(slot);

	/*
	 * However, if the file is _being_ closed, then the bindings need to be
	 * removed as kvm_gmem_release() might not run until after the memslot
	 * is freed.  Note, modifying the bindings is safe even though the file
	 * is dying as kvm_gmem_release() nullifies slot->gmem.file under
	 * slots_lock, and only puts its reference to KVM after destroying all
	 * bindings.  I.e. reaching this point means kvm_gmem_release() hasn't
	 * yet destroyed the bindings or freed the gmem_file, and can't do so
	 * until the caller drops slots_lock.
	 */
	if (!file) {
		__kvm_gmem_unbind(slot, slot->gmem.file->private_data);
		return;
	}

	filemap_invalidate_lock(file->f_mapping);
	__kvm_gmem_unbind(slot, file->private_data);
	filemap_invalidate_unlock(file->f_mapping);
}

/*
 * Returns a locked folio on success.
 *
 * Looks up (allocating if needed) the folio backing file page @index and
 * stores its pfn in *@pfn.  *@max_order, if non-NULL, is set to 0.
 *
 * Errors are returned as ERR_PTR(): -EFAULT if @file is not the file currently
 * bound to @slot, -EIO if @index is not bound to @slot in @file, -EHWPOISON
 * if the folio is poisoned, or whatever kvm_gmem_get_folio() failed with.
 */
static struct folio *__kvm_gmem_get_pfn(struct file *file,
					struct kvm_memory_slot *slot,
					pgoff_t index, kvm_pfn_t *pfn,
					int *max_order)
{
	struct file *slot_file = READ_ONCE(slot->gmem.file);
	struct gmem_file *f = file->private_data;
	struct folio *folio;

	if (file != slot_file) {
		/* A mismatch is only expected when the slot has no file. */
		WARN_ON_ONCE(slot_file);
		return ERR_PTR(-EFAULT);
	}

	if (xa_load(&f->bindings, index) != slot) {
		/* Bound to a different slot: unexpected, hence the warning. */
		WARN_ON_ONCE(xa_load(&f->bindings, index));
		return ERR_PTR(-EIO);
	}

	folio = kvm_gmem_get_folio(file_inode(file), index);
	if (IS_ERR(folio))
		return folio;

	if (folio_test_hwpoison(folio)) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(-EHWPOISON);
	}

	*pfn = folio_file_pfn(folio, index);
	if (max_order)
		*max_order = 0;

	return folio;
}

/*
 * Resolve @gfn in @slot to the guest_memfd page backing it.
 *
 * The folio is zeroed and marked up-to-date on first use, then handed to the
 * arch prepare hook.  On success *@pfn and *@page are set, the folio is
 * unlocked, and the folio reference is left held for the caller.  On failure
 * the reference is dropped.
 *
 * Return: 0 on success, -EFAULT if the slot has no live file, or the error
 * from __kvm_gmem_get_pfn() / kvm_gmem_prepare_folio().
 */
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
		     gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
		     int *max_order)
{
	pgoff_t index = kvm_gmem_get_index(slot, gfn);
	struct folio *folio;
	int r = 0;

	CLASS(gmem_get_file, file)(slot);
	if (!file)
		return -EFAULT;

	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	if (!folio_test_uptodate(folio)) {
		clear_highpage(folio_page(folio, 0));
		folio_mark_uptodate(folio);
	}

	r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);

	folio_unlock(folio);

	if (!r)
		*page = folio_file_page(folio, index);
	else
		folio_put(folio);

	return r;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE

/*
 * Populate the single page backing @gfn.
 *
 * Under the mapping's invalidate lock: look up the folio, require that @gfn
 * has KVM_MEMORY_ATTRIBUTE_PRIVATE set (-EINVAL otherwise), then invoke
 * @post_populate.  The folio is unlocked before the callback runs and is
 * marked up-to-date only if the callback succeeds.
 *
 * Return: 0 on success or a negative errno.
 */
static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
				struct file *file, gfn_t gfn, struct page *src_page,
				kvm_gmem_populate_cb post_populate, void *opaque)
{
	pgoff_t index = kvm_gmem_get_index(slot, gfn);
	struct folio *folio;
	kvm_pfn_t pfn;
	int ret;

	filemap_invalidate_lock(file->f_mapping);

	folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, NULL);
	if (IS_ERR(folio)) {
		ret = PTR_ERR(folio);
		goto out_unlock;
	}

	folio_unlock(folio);

	if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1,
					     KVM_MEMORY_ATTRIBUTE_PRIVATE,
					     KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
		ret = -EINVAL;
		goto out_put_folio;
	}

	ret = post_populate(kvm, gfn, pfn, src_page, opaque);
	if (!ret)
		folio_mark_uptodate(folio);

out_put_folio:
	folio_put(folio);
out_unlock:
	filemap_invalidate_unlock(file->f_mapping);
	return ret;
}

/*
 * Populate up to @npages guest pages starting at @start_gfn, one page at a
 * time, calling @post_populate for each.
 *
 * @src, if non-NULL, is a page-aligned userspace address; the matching source
 * page is pinned for each iteration (with FOLL_WRITE when @may_writeback_src)
 * and passed to the callback.  The range is clamped to the end of the memslot
 * containing @start_gfn.  kvm->slots_lock must be held.
 *
 * Return: the number of pages populated before stopping, or a negative errno
 * if the very first page failed or the arguments are invalid.
 */
long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src,
		       long npages, bool may_writeback_src,
		       kvm_gmem_populate_cb post_populate, void *opaque)
{
	struct kvm_memory_slot *slot;
	int ret = 0;
	long i;

	lockdep_assert_held(&kvm->slots_lock);

	if (WARN_ON_ONCE(npages <= 0))
		return -EINVAL;

	if (WARN_ON_ONCE(!PAGE_ALIGNED(src)))
		return -EINVAL;

	slot = gfn_to_memslot(kvm, start_gfn);
	if (!kvm_slot_has_gmem(slot))
		return -EINVAL;

	CLASS(gmem_get_file, file)(slot);
	if (!file)
		return -EFAULT;

	/* Do not run past the end of the memslot. */
	npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
	for (i = 0; i < npages; i++) {
		struct page *src_page = NULL;

		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		if (src) {
			unsigned long uaddr = (unsigned long)src + i * PAGE_SIZE;
			unsigned int flags = may_writeback_src ? FOLL_WRITE : 0;

			ret = get_user_pages_fast(uaddr, 1, flags, &src_page);
			if (ret < 0)
				break;
			if (ret != 1) {
				ret = -ENOMEM;
				break;
			}
		}

		ret = __kvm_gmem_populate(kvm, slot, file, start_gfn + i, src_page,
					  post_populate, opaque);

		if (src_page)
			put_page(src_page);

		if (ret)
			break;
	}

	/* Report the error only if nothing at all was populated. */
	return ret && !i ? ret : i;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);
#endif

/* Slab cache for struct gmem_inode, created in kvm_gmem_init(). */
static struct kmem_cache *kvm_gmem_inode_cachep;

/* Slab constructor for gmem_inode objects. */
static void kvm_gmem_init_inode_once(void *__gi)
{
	struct gmem_inode *gi = __gi;

	/*
	 * Note!  Don't initialize the inode with anything specific to the
	 * guest_memfd instance, or that might be specific to how the inode is
	 * used (from the VFS-layer's perspective).  This hook is called only
	 * during the initial slab allocation, i.e. only fields/state that are
	 * idempotent across _all_ use of the inode _object_ can be initialized
	 * at this time!
	 */
	inode_init_once(&gi->vfs_inode);
}

/*
 * ->alloc_inode(): allocate a gmem_inode and initialise its per-instance
 * state (shared policy, flags, file list).  Returns the embedded VFS inode,
 * or NULL on allocation failure.
 */
static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
{
	struct gmem_inode *gi;

	gi = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL);
	if (!gi)
		return NULL;

	mpol_shared_policy_init(&gi->policy, NULL);

	gi->flags = 0;
	INIT_LIST_HEAD(&gi->gmem_file_list);
	return &gi->vfs_inode;
}

/* ->destroy_inode(): release the inode's shared memory policy. */
static void kvm_gmem_destroy_inode(struct inode *inode)
{
	mpol_free_shared_policy(&GMEM_I(inode)->policy);
}

/* ->free_inode(): return the gmem_inode to its slab cache. */
static void kvm_gmem_free_inode(struct inode *inode)
{
	kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode));
}

/* Superblock operations for the guest_memfd pseudo filesystem. */
static const struct super_operations kvm_gmem_super_operations = {
	.statfs		= simple_statfs,
	.alloc_inode	= kvm_gmem_alloc_inode,
	.destroy_inode	= kvm_gmem_destroy_inode,
	.free_inode	= kvm_gmem_free_inode,
};

/*
 * ->init_fs_context(): set up a pseudo filesystem context and install the
 * guest_memfd superblock operations.  Returns -ENOMEM if init_pseudo() fails.
 */
static int kvm_gmem_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx;

	if (!init_pseudo(fc, GUEST_MEMFD_MAGIC))
		return -ENOMEM;

	ctx = fc->fs_private;
	ctx->ops = &kvm_gmem_super_operations;

	return 0;
}

/* Filesystem type backing the internal guest_memfd mount. */
static struct file_system_type kvm_gmem_fs = {
	.name		 = "guest_memfd",
	.init_fs_context = kvm_gmem_init_fs_context,
	.kill_sb	 = kill_anon_super,
};

/*
 * Create the internal mount and mark it MNT_NOEXEC.  Returns 0 or the error
 * from kern_mount().
 */
static int kvm_gmem_init_mount(void)
{
	kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);

	if (IS_ERR(kvm_gmem_mnt))
		return PTR_ERR(kvm_gmem_mnt);

	kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
	return 0;
}

/*
 * One-time setup: record @module as owner of the file operations, create the
 * inode slab cache and the internal mount.  Returns 0 on success or a
 * negative errno; the cache is destroyed again if mounting fails.
 */
int kvm_gmem_init(struct module *module)
{
	struct kmem_cache_args args = {
		.align = 0,
		.ctor = kvm_gmem_init_inode_once,
	};
	int ret;

	kvm_gmem_fops.owner = module;
	kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache",
						  sizeof(struct gmem_inode),
						  &args, SLAB_ACCOUNT);
	if (!kvm_gmem_inode_cachep)
		return -ENOMEM;

	ret = kvm_gmem_init_mount();
	if (ret) {
		kmem_cache_destroy(kvm_gmem_inode_cachep);
		return ret;
	}
	return 0;
}

/* Undo kvm_gmem_init(): unmount, then destroy the inode cache. */
void kvm_gmem_exit(void)
{
	kern_unmount(kvm_gmem_mnt);
	kvm_gmem_mnt = NULL;
	/*
	 * NOTE(review): presumably waits for RCU-deferred inode frees to
	 * finish before the cache they return to is destroyed -- confirm.
	 */
	rcu_barrier();
	kmem_cache_destroy(kvm_gmem_inode_cachep);
}