// SPDX-License-Identifier: GPL-2.0
#include <linux/anon_inodes.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/kvm_host.h>
#include <linux/mempolicy.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>

#include "kvm_mm.h"

static struct vfsmount *kvm_gmem_mnt;

/*
 * A guest_memfd instance can be associated with multiple VMs, each with its
 * own "view" of the underlying physical memory.
 *
 * The gmem's inode is effectively the raw underlying physical storage, and is
 * used to track properties of the physical memory, while each gmem file is
 * effectively a single VM's view of that storage, and is used to track assets
 * specific to its associated VM, e.g. memslots=>gmem bindings.
 */
struct gmem_file {
        struct kvm *kvm;
        struct xarray bindings;
        struct list_head entry;
};

struct gmem_inode {
        struct shared_policy policy;
        struct inode vfs_inode;

        u64 flags;
};

static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
{
        return container_of(inode, struct gmem_inode, vfs_inode);
}

#define kvm_gmem_for_each_file(f, mapping) \
        list_for_each_entry(f, &(mapping)->i_private_list, entry)

/**
 * folio_file_pfn - like folio_file_page, but return a pfn.
 * @folio: The folio which contains this index.
 * @index: The index we want to look up.
 *
 * Return: The pfn for this index.
 */
static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
{
        return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
}

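/*
 * Map a gfn to its index in the guest_memfd file: the gfn's offset within the
 * memslot plus the memslot's binding offset (gmem.pgoff) into the file.  For
 * example (illustrative values only), a memslot with base_gfn = 0x100 bound
 * at gmem.pgoff = 0x10 maps gfn 0x105 to file index 0x15, i.e. byte offset
 * 0x15000 with 4KiB pages.
 */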
static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
{
        return gfn - slot->base_gfn + slot->gmem.pgoff;
}

static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
                                    pgoff_t index, struct folio *folio)
{
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
        kvm_pfn_t pfn = folio_file_pfn(folio, index);
        gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
        int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));
        if (rc) {
                pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
                                    index, gfn, pfn, rc);
                return rc;
        }
#endif

        return 0;
}

/*
 * Process @folio, which contains @gfn, so that the guest can use it.
 * The folio must be locked and the gfn must be contained in @slot.
 * On successful return the guest sees a zero page so as to avoid
 * leaking host data and the up-to-date flag is set.
 */
static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
                                  gfn_t gfn, struct folio *folio)
{
        pgoff_t index;

        /*
         * Preparing huge folios should always be safe, since it should
         * be possible to split them later if needed.
         *
         * Right now the folio order is always going to be zero, but the
         * code is ready for huge folios.  The only assumption is that
         * the base pgoff of memslots is naturally aligned with the
         * requested page order, ensuring that huge folios can also use
         * huge page table entries for GPA->HPA mapping.
         *
         * The order will be passed when creating the guest_memfd, and
         * checked when creating memslots.
         */
        WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
        index = kvm_gmem_get_index(slot, gfn);
        index = ALIGN_DOWN(index, folio_nr_pages(folio));

        return __kvm_gmem_prepare_folio(kvm, slot, index, folio);
}

/*
 * Returns a locked folio on success.  The caller is responsible for
 * setting the up-to-date flag before the memory is mapped into the guest.
 * There is no backing storage for the memory, so the folio will remain
 * up-to-date until it's removed.
 *
 * Ignore accessed, referenced, and dirty flags.  The memory is
 * unevictable and there is no storage to write back to.
 */
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
        /* TODO: Support huge pages. */
        struct mempolicy *policy;
        struct folio *folio;

        /*
         * Fast-path: See if folio is already present in mapping to avoid
         * policy_lookup.
         */
        folio = __filemap_get_folio(inode->i_mapping, index,
                                    FGP_LOCK | FGP_ACCESSED, 0);
        if (!IS_ERR(folio))
                return folio;

        policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
        folio = __filemap_get_folio_mpol(inode->i_mapping, index,
                                         FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
                                         mapping_gfp_mask(inode->i_mapping), policy);
        mpol_cond_put(policy);

        /*
         * External interfaces like kvm_gmem_get_pfn() support dealing
         * with hugepages to a degree, but internally, guest_memfd currently
         * assumes that all folios are order-0, and handling would need to be
         * updated for anything larger (e.g. page-clearing operations).
         */
        WARN_ON_ONCE(!IS_ERR(folio) && folio_order(folio));

        return folio;
}

static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
{
        if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)
                return KVM_FILTER_SHARED;

        return KVM_FILTER_PRIVATE;
}

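/*
 * Invalidation of guest mappings is done in begin/end pairs: "begin" marks an
 * invalidation as in-progress (so concurrent faults are retried) and zaps any
 * mappings for the affected range, "end" clears that state once the backing
 * pages have been truncated or the error handled.  Callers hold the mapping's
 * invalidate_lock (shared or exclusive) across the pair so the set of
 * bindings cannot change in between; see kvm_gmem_punch_hole() below for the
 * typical pattern, roughly:
 *
 *      filemap_invalidate_lock(inode->i_mapping);
 *      kvm_gmem_invalidate_begin(inode, start, end);
 *      truncate_inode_pages_range(inode->i_mapping, ...);
 *      kvm_gmem_invalidate_end(inode, start, end);
 *      filemap_invalidate_unlock(inode->i_mapping);
 */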
static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start,
                                        pgoff_t end,
                                        enum kvm_gfn_range_filter attr_filter)
{
        bool flush = false, found_memslot = false;
        struct kvm_memory_slot *slot;
        struct kvm *kvm = f->kvm;
        unsigned long index;

        xa_for_each_range(&f->bindings, index, slot, start, end - 1) {
                pgoff_t pgoff = slot->gmem.pgoff;

                struct kvm_gfn_range gfn_range = {
                        .start = slot->base_gfn + max(pgoff, start) - pgoff,
                        .end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
                        .slot = slot,
                        .may_block = true,
                        .attr_filter = attr_filter,
                };

                if (!found_memslot) {
                        found_memslot = true;

                        KVM_MMU_LOCK(kvm);
                        kvm_mmu_invalidate_begin(kvm);
                }

                flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
        }

        if (flush)
                kvm_flush_remote_tlbs(kvm);

        if (found_memslot)
                KVM_MMU_UNLOCK(kvm);
}

static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start,
                                      pgoff_t end)
{
        enum kvm_gfn_range_filter attr_filter;
        struct gmem_file *f;

        attr_filter = kvm_gmem_get_invalidate_filter(inode);

        kvm_gmem_for_each_file(f, inode->i_mapping)
                __kvm_gmem_invalidate_begin(f, start, end, attr_filter);
}

static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start,
                                      pgoff_t end)
{
        struct kvm *kvm = f->kvm;

        if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
                KVM_MMU_LOCK(kvm);
                kvm_mmu_invalidate_end(kvm);
                KVM_MMU_UNLOCK(kvm);
        }
}

static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start,
                                    pgoff_t end)
{
        struct gmem_file *f;

        kvm_gmem_for_each_file(f, inode->i_mapping)
                __kvm_gmem_invalidate_end(f, start, end);
}

static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
        pgoff_t start = offset >> PAGE_SHIFT;
        pgoff_t end = (offset + len) >> PAGE_SHIFT;

        /*
         * Bindings must be stable across invalidation to ensure the start+end
         * are balanced.
         */
        filemap_invalidate_lock(inode->i_mapping);

        kvm_gmem_invalidate_begin(inode, start, end);

        truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);

        kvm_gmem_invalidate_end(inode, start, end);

        filemap_invalidate_unlock(inode->i_mapping);

        return 0;
}

static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
{
        struct address_space *mapping = inode->i_mapping;
        pgoff_t start, index, end;
        int r;

        /* Dedicated guest is immutable by default. */
        if (offset + len > i_size_read(inode))
                return -EINVAL;

        filemap_invalidate_lock_shared(mapping);

        start = offset >> PAGE_SHIFT;
        end = (offset + len) >> PAGE_SHIFT;

        r = 0;
        for (index = start; index < end; ) {
                struct folio *folio;

                if (signal_pending(current)) {
                        r = -EINTR;
                        break;
                }

                folio = kvm_gmem_get_folio(inode, index);
                if (IS_ERR(folio)) {
                        r = PTR_ERR(folio);
                        break;
                }

                index = folio_next_index(folio);

                folio_unlock(folio);
                folio_put(folio);

                /* 64-bit only, wrapping the index should be impossible. */
                if (WARN_ON_ONCE(!index))
                        break;

                cond_resched();
        }

        filemap_invalidate_unlock_shared(mapping);

        return r;
}

static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
                               loff_t len)
{
        int ret;

        if (!(mode & FALLOC_FL_KEEP_SIZE))
                return -EOPNOTSUPP;

        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
                return -EOPNOTSUPP;

        if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
                return -EINVAL;

        if (mode & FALLOC_FL_PUNCH_HOLE)
                ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
        else
                ret = kvm_gmem_allocate(file_inode(file), offset, len);

        if (!ret)
                file_modified(file);
        return ret;
}

static int kvm_gmem_release(struct inode *inode, struct file *file)
{
        struct gmem_file *f = file->private_data;
        struct kvm_memory_slot *slot;
        struct kvm *kvm = f->kvm;
        unsigned long index;

        /*
         * Prevent concurrent attempts to *unbind* a memslot.  This is the last
         * reference to the file and thus no new bindings can be created, but
         * dereferencing the slot for existing bindings needs to be protected
         * against memslot updates, specifically so that unbind doesn't race
         * and free the memslot (kvm_gmem_get_file() will return NULL).
         *
         * Since .release is called only when the reference count is zero,
         * after which file_ref_get() and get_file_active() fail,
         * kvm_gmem_get_pfn() cannot be using the file concurrently.
         * file_ref_put() provides a full barrier, and get_file_active() the
         * matching acquire barrier.
         */
        mutex_lock(&kvm->slots_lock);

        filemap_invalidate_lock(inode->i_mapping);

        xa_for_each(&f->bindings, index, slot)
                WRITE_ONCE(slot->gmem.file, NULL);

        /*
         * All in-flight operations are gone and no new bindings can be
         * created.  Zap all SPTEs pointed at by this file.  Do not free the
         * backing memory, as its lifetime is associated with the inode, not
         * the file.
         */
        __kvm_gmem_invalidate_begin(f, 0, -1ul,
                                    kvm_gmem_get_invalidate_filter(inode));
        __kvm_gmem_invalidate_end(f, 0, -1ul);

        list_del(&f->entry);

        filemap_invalidate_unlock(inode->i_mapping);

        mutex_unlock(&kvm->slots_lock);

        xa_destroy(&f->bindings);
        kfree(f);

        kvm_put_kvm(kvm);

        return 0;
}

static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
{
        /*
         * Do not return slot->gmem.file if it has already been closed;
         * there might be some time between the last fput() and when
         * kvm_gmem_release() clears slot->gmem.file.
         */
        return get_file_active(&slot->gmem.file);
}

DEFINE_CLASS(gmem_get_file, struct file *, if (_T) fput(_T),
             kvm_gmem_get_file(slot), struct kvm_memory_slot *slot);

static bool kvm_gmem_supports_mmap(struct inode *inode)
{
        return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP;
}

static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
{
        struct inode *inode = file_inode(vmf->vma->vm_file);
        struct folio *folio;
        vm_fault_t ret = VM_FAULT_LOCKED;

        if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
                return VM_FAULT_SIGBUS;

        if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
                return VM_FAULT_SIGBUS;

        folio = kvm_gmem_get_folio(inode, vmf->pgoff);
        if (IS_ERR(folio)) {
                if (PTR_ERR(folio) == -EAGAIN)
                        return VM_FAULT_RETRY;

                return vmf_error(PTR_ERR(folio));
        }

        if (WARN_ON_ONCE(folio_test_large(folio))) {
                ret = VM_FAULT_SIGBUS;
                goto out_folio;
        }

        if (!folio_test_uptodate(folio)) {
                clear_highpage(folio_page(folio, 0));
                folio_mark_uptodate(folio);
        }

        vmf->page = folio_file_page(folio, vmf->pgoff);

out_folio:
        if (ret != VM_FAULT_LOCKED) {
                folio_unlock(folio);
                folio_put(folio);
        }

        return ret;
}

#ifdef CONFIG_NUMA
static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
{
        struct inode *inode = file_inode(vma->vm_file);

        return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol);
}

static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
                                             unsigned long addr, pgoff_t *pgoff)
{
        struct inode *inode = file_inode(vma->vm_file);

        *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);

        /*
         * Return the memory policy for this index, or NULL if none is set.
         *
         * Returning NULL, e.g. instead of the current task's memory policy, is
         * important for the .get_policy kernel ABI: it indicates that no
         * explicit policy has been set via mbind() for this memory.  The
         * caller can then replace NULL with the default memory policy instead
         * of the current task's memory policy.
         */
        return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
}
#endif /* CONFIG_NUMA */

static const struct vm_operations_struct kvm_gmem_vm_ops = {
        .fault = kvm_gmem_fault_user_mapping,
#ifdef CONFIG_NUMA
        .get_policy = kvm_gmem_get_policy,
        .set_policy = kvm_gmem_set_policy,
#endif
};

static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
{
        if (!kvm_gmem_supports_mmap(file_inode(file)))
                return -ENODEV;

        if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
            (VM_SHARED | VM_MAYSHARE)) {
                return -EINVAL;
        }

        vma->vm_ops = &kvm_gmem_vm_ops;

        return 0;
}

static struct file_operations kvm_gmem_fops = {
        .mmap = kvm_gmem_mmap,
        .open = generic_file_open,
        .release = kvm_gmem_release,
        .fallocate = kvm_gmem_fallocate,
};

static int kvm_gmem_migrate_folio(struct address_space *mapping,
                                  struct folio *dst, struct folio *src,
                                  enum migrate_mode mode)
{
        WARN_ON_ONCE(1);
        return -EINVAL;
}

static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
        pgoff_t start, end;

        filemap_invalidate_lock_shared(mapping);

        start = folio->index;
        end = start + folio_nr_pages(folio);

        kvm_gmem_invalidate_begin(mapping->host, start, end);

        /*
         * Do not truncate the range, what action is taken in response to the
         * error is userspace's decision (assuming the architecture supports
         * gracefully handling memory errors).  If/when the guest attempts to
         * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
         * at which point KVM can either terminate the VM or propagate the
         * error to userspace.
         */

        kvm_gmem_invalidate_end(mapping->host, start, end);

        filemap_invalidate_unlock_shared(mapping);

        return MF_DELAYED;
}

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
static void kvm_gmem_free_folio(struct folio *folio)
{
        struct page *page = folio_page(folio, 0);
        kvm_pfn_t pfn = page_to_pfn(page);
        int order = folio_order(folio);

        kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
}
#endif

static const struct address_space_operations kvm_gmem_aops = {
        .dirty_folio = noop_dirty_folio,
        .migrate_folio = kvm_gmem_migrate_folio,
        .error_remove_folio = kvm_gmem_error_folio,
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
        .free_folio = kvm_gmem_free_folio,
#endif
};

static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                            struct iattr *attr)
{
        return -EINVAL;
}

static const struct inode_operations kvm_gmem_iops = {
        .setattr = kvm_gmem_setattr,
};

bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
{
        return true;
}

static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
        static const char *name = "[kvm-gmem]";
        struct gmem_file *f;
        struct inode *inode;
        struct file *file;
        int fd, err;

        fd = get_unused_fd_flags(0);
        if (fd < 0)
                return fd;

        f = kzalloc(sizeof(*f), GFP_KERNEL);
        if (!f) {
                err = -ENOMEM;
                goto err_fd;
        }

        /* __fput() will take care of fops_put(). */
        if (!fops_get(&kvm_gmem_fops)) {
                err = -ENOENT;
                goto err_gmem;
        }

        inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                goto err_fops;
        }

        inode->i_op = &kvm_gmem_iops;
        inode->i_mapping->a_ops = &kvm_gmem_aops;
        inode->i_mode |= S_IFREG;
        inode->i_size = size;
        mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
        mapping_set_inaccessible(inode->i_mapping);
        /* Unmovable mappings are supposed to be marked unevictable as well. */
        WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

        GMEM_I(inode)->flags = flags;

        file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
        if (IS_ERR(file)) {
                err = PTR_ERR(file);
                goto err_inode;
        }

        file->f_flags |= O_LARGEFILE;
        file->private_data = f;

        kvm_get_kvm(kvm);
        f->kvm = kvm;
        xa_init(&f->bindings);
        list_add(&f->entry, &inode->i_mapping->i_private_list);

        fd_install(fd, file);
        return fd;

err_inode:
        iput(inode);
err_fops:
        fops_put(&kvm_gmem_fops);
err_gmem:
        kfree(f);
err_fd:
        put_unused_fd(fd);
        return err;
}

int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
        loff_t size = args->size;
        u64 flags = args->flags;

        if (flags & ~kvm_gmem_get_supported_flags(kvm))
                return -EINVAL;

        if (size <= 0 || !PAGE_ALIGNED(size))
                return -EINVAL;

        return __kvm_gmem_create(kvm, size, flags);
}

int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
                  unsigned int fd, loff_t offset)
{
        loff_t size = slot->npages << PAGE_SHIFT;
        unsigned long start, end;
        struct gmem_file *f;
        struct inode *inode;
        struct file *file;
        int r = -EINVAL;

        BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));

        file = fget(fd);
        if (!file)
                return -EBADF;

        if (file->f_op != &kvm_gmem_fops)
                goto err;

        f = file->private_data;
        if (f->kvm != kvm)
                goto err;

        inode = file_inode(file);

        if (offset < 0 || !PAGE_ALIGNED(offset) ||
            offset + size > i_size_read(inode))
                goto err;

        filemap_invalidate_lock(inode->i_mapping);

        start = offset >> PAGE_SHIFT;
        end = start + slot->npages;

        if (!xa_empty(&f->bindings) &&
            xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
                filemap_invalidate_unlock(inode->i_mapping);
                goto err;
        }

        /*
         * Memslots with the KVM_MEM_GUEST_MEMFD flag are immutable once
         * created, so kvm_gmem_bind() must occur on a new memslot.  Because
         * the memslot is not visible yet, kvm_gmem_get_pfn() is guaranteed to
         * see the file.
         */
        WRITE_ONCE(slot->gmem.file, file);
        slot->gmem.pgoff = start;
        if (kvm_gmem_supports_mmap(inode))
                slot->flags |= KVM_MEMSLOT_GMEM_ONLY;

        xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
        filemap_invalidate_unlock(inode->i_mapping);

        /*
         * Drop the reference to the file, even on success.  The file pins KVM,
         * not the other way 'round.  Active bindings are invalidated if the
         * file is closed before memslots are destroyed.
         */
        r = 0;
err:
        fput(file);
        return r;
}

static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct gmem_file *f)
{
        unsigned long start = slot->gmem.pgoff;
        unsigned long end = start + slot->npages;

        xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL);

        /*
         * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
         * cannot see this memslot.
         */
        WRITE_ONCE(slot->gmem.file, NULL);
}

void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
        /*
         * Nothing to do if the underlying file was _already_ closed, as
         * kvm_gmem_release() invalidates and nullifies all bindings.
         */
        if (!slot->gmem.file)
                return;

        CLASS(gmem_get_file, file)(slot);

        /*
         * However, if the file is _being_ closed, then the bindings need to be
         * removed as kvm_gmem_release() might not run until after the memslot
         * is freed.  Note, modifying the bindings is safe even though the file
         * is dying as kvm_gmem_release() nullifies slot->gmem.file under
         * slots_lock, and only puts its reference to KVM after destroying all
         * bindings.  I.e. reaching this point means kvm_gmem_release() hasn't
         * yet destroyed the bindings or freed the gmem_file, and can't do so
         * until the caller drops slots_lock.
         */
        if (!file) {
                __kvm_gmem_unbind(slot, slot->gmem.file->private_data);
                return;
        }

        filemap_invalidate_lock(file->f_mapping);
        __kvm_gmem_unbind(slot, file->private_data);
        filemap_invalidate_unlock(file->f_mapping);
}

/* Returns a locked folio on success. */
static struct folio *__kvm_gmem_get_pfn(struct file *file,
                                        struct kvm_memory_slot *slot,
                                        pgoff_t index, kvm_pfn_t *pfn,
                                        int *max_order)
{
        struct file *slot_file = READ_ONCE(slot->gmem.file);
        struct gmem_file *f = file->private_data;
        struct folio *folio;

        if (file != slot_file) {
                WARN_ON_ONCE(slot_file);
                return ERR_PTR(-EFAULT);
        }

        if (xa_load(&f->bindings, index) != slot) {
                WARN_ON_ONCE(xa_load(&f->bindings, index));
                return ERR_PTR(-EIO);
        }

        folio = kvm_gmem_get_folio(file_inode(file), index);
        if (IS_ERR(folio))
                return folio;

        if (folio_test_hwpoison(folio)) {
                folio_unlock(folio);
                folio_put(folio);
                return ERR_PTR(-EHWPOISON);
        }

        *pfn = folio_file_pfn(folio, index);
        if (max_order)
                *max_order = 0;

        return folio;
}

int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
                     gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
                     int *max_order)
{
        pgoff_t index = kvm_gmem_get_index(slot, gfn);
        struct folio *folio;
        int r = 0;

        CLASS(gmem_get_file, file)(slot);
        if (!file)
                return -EFAULT;

        folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
        if (IS_ERR(folio))
                return PTR_ERR(folio);

        if (!folio_test_uptodate(folio)) {
                clear_highpage(folio_page(folio, 0));
                folio_mark_uptodate(folio);
        }

        r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);

        folio_unlock(folio);

        if (!r)
                *page = folio_file_page(folio, index);
        else
                folio_put(folio);

        return r;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE

static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
                                struct file *file, gfn_t gfn, struct page *src_page,
                                kvm_gmem_populate_cb post_populate,
                                void *opaque)
{
        pgoff_t index = kvm_gmem_get_index(slot, gfn);
        struct folio *folio;
        kvm_pfn_t pfn;
        int ret;

        filemap_invalidate_lock(file->f_mapping);

        folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, NULL);
        if (IS_ERR(folio)) {
                ret = PTR_ERR(folio);
                goto out_unlock;
        }

        folio_unlock(folio);

        if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1,
                                             KVM_MEMORY_ATTRIBUTE_PRIVATE,
                                             KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
                ret = -EINVAL;
                goto out_put_folio;
        }

        ret = post_populate(kvm, gfn, pfn, src_page, opaque);
        if (!ret)
                folio_mark_uptodate(folio);

out_put_folio:
        folio_put(folio);
out_unlock:
        filemap_invalidate_unlock(file->f_mapping);
        return ret;
}

long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
                       kvm_gmem_populate_cb post_populate, void *opaque)
{
        struct kvm_memory_slot *slot;
        int ret = 0;
        long i;

        lockdep_assert_held(&kvm->slots_lock);

        if (WARN_ON_ONCE(npages <= 0))
                return -EINVAL;

        if (WARN_ON_ONCE(!PAGE_ALIGNED(src)))
                return -EINVAL;

        slot = gfn_to_memslot(kvm, start_gfn);
        if (!kvm_slot_has_gmem(slot))
                return -EINVAL;

        CLASS(gmem_get_file, file)(slot);
        if (!file)
                return -EFAULT;

        npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
        for (i = 0; i < npages; i++) {
                struct page *src_page = NULL;

                if (signal_pending(current)) {
                        ret = -EINTR;
                        break;
                }

                if (src) {
                        unsigned long uaddr = (unsigned long)src + i * PAGE_SIZE;

                        ret = get_user_pages_fast(uaddr, 1, 0, &src_page);
                        if (ret < 0)
                                break;
                        if (ret != 1) {
                                ret = -ENOMEM;
                                break;
                        }
                }

                ret = __kvm_gmem_populate(kvm, slot, file, start_gfn + i, src_page,
                                          post_populate, opaque);

                if (src_page)
                        put_page(src_page);

                if (ret)
                        break;
        }

        return ret && !i ? ret : i;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);
#endif

static struct kmem_cache *kvm_gmem_inode_cachep;

static void kvm_gmem_init_inode_once(void *__gi)
{
        struct gmem_inode *gi = __gi;

        /*
         * Note!  Don't initialize the inode with anything specific to the
         * guest_memfd instance, or that might be specific to how the inode is
         * used (from the VFS-layer's perspective).  This hook is called only
         * during the initial slab allocation, i.e. only fields/state that are
         * idempotent across _all_ use of the inode _object_ can be initialized
         * at this time!
         */
        inode_init_once(&gi->vfs_inode);
}

static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
{
        struct gmem_inode *gi;

        gi = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL);
        if (!gi)
                return NULL;

        mpol_shared_policy_init(&gi->policy, NULL);

        gi->flags = 0;
        return &gi->vfs_inode;
}

static void kvm_gmem_destroy_inode(struct inode *inode)
{
        mpol_free_shared_policy(&GMEM_I(inode)->policy);
}

static void kvm_gmem_free_inode(struct inode *inode)
{
        kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode));
}

static const struct super_operations kvm_gmem_super_operations = {
        .statfs = simple_statfs,
        .alloc_inode = kvm_gmem_alloc_inode,
        .destroy_inode = kvm_gmem_destroy_inode,
        .free_inode = kvm_gmem_free_inode,
};

static int kvm_gmem_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *ctx;

        if (!init_pseudo(fc, GUEST_MEMFD_MAGIC))
                return -ENOMEM;

        fc->s_iflags |= SB_I_NOEXEC;
        fc->s_iflags |= SB_I_NODEV;
        ctx = fc->fs_private;
        ctx->ops = &kvm_gmem_super_operations;

        return 0;
}

static struct file_system_type kvm_gmem_fs = {
        .name = "guest_memfd",
        .init_fs_context = kvm_gmem_init_fs_context,
        .kill_sb = kill_anon_super,
};

static int kvm_gmem_init_mount(void)
{
        kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);

        if (IS_ERR(kvm_gmem_mnt))
                return PTR_ERR(kvm_gmem_mnt);

        kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
        return 0;
}

int kvm_gmem_init(struct module *module)
{
        struct kmem_cache_args args = {
                .align = 0,
                .ctor = kvm_gmem_init_inode_once,
        };
        int ret;

        kvm_gmem_fops.owner = module;
        kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache",
                                                  sizeof(struct gmem_inode),
                                                  &args, SLAB_ACCOUNT);
        if (!kvm_gmem_inode_cachep)
                return -ENOMEM;

        ret = kvm_gmem_init_mount();
        if (ret) {
                kmem_cache_destroy(kvm_gmem_inode_cachep);
                return ret;
        }
        return 0;
}

void kvm_gmem_exit(void)
{
        kern_unmount(kvm_gmem_mnt);
        kvm_gmem_mnt = NULL;
        rcu_barrier();
        kmem_cache_destroy(kvm_gmem_inode_cachep);
}