// SPDX-License-Identifier: GPL-2.0
#include <linux/anon_inodes.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/kvm_host.h>
#include <linux/mempolicy.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>

#include "kvm_mm.h"

static struct vfsmount *kvm_gmem_mnt;

/*
 * A guest_memfd instance can be associated with multiple VMs, each with its
 * own "view" of the underlying physical memory.
 *
 * The gmem's inode is effectively the raw underlying physical storage, and is
 * used to track properties of the physical memory, while each gmem file is
 * effectively a single VM's view of that storage, and is used to track assets
 * specific to its associated VM, e.g. memslots=>gmem bindings.
 */
struct gmem_file {
	struct kvm *kvm;
	struct xarray bindings;
	struct list_head entry;
};

struct gmem_inode {
	struct shared_policy policy;
	struct inode vfs_inode;

	u64 flags;
};

static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
{
	return container_of(inode, struct gmem_inode, vfs_inode);
}

#define kvm_gmem_for_each_file(f, mapping) \
	list_for_each_entry(f, &(mapping)->i_private_list, entry)

/**
 * folio_file_pfn - like folio_file_page, but return a pfn.
 * @folio: The folio which contains this index.
 * @index: The index we want to look up.
 *
 * Return: The pfn for this index.
 */
static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
{
	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
}

static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
{
	return gfn - slot->base_gfn + slot->gmem.pgoff;
}
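
/*
 * Worked example (illustrative values only, not taken from this code): a
 * memslot with base_gfn = 0x100 and gmem.pgoff = 0x10 maps GFN 0x105 to
 * file index 0x105 - 0x100 + 0x10 = 0x15, i.e. byte offset
 * 0x15 << PAGE_SHIFT into the guest_memfd instance.  The binding and
 * invalidation code below relies on this linear GFN<->index relationship.
 */
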
static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
				    pgoff_t index, struct folio *folio)
{
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
	kvm_pfn_t pfn = folio_file_pfn(folio, index);
	gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
	int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));
	if (rc) {
		pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
				    index, gfn, pfn, rc);
		return rc;
	}
#endif

	return 0;
}

static inline void kvm_gmem_mark_prepared(struct folio *folio)
{
	folio_mark_uptodate(folio);
}

/*
 * Process @folio, which contains @gfn, so that the guest can use it.
 * The folio must be locked and the gfn must be contained in @slot.
 * On successful return the guest sees a zero page so as to avoid
 * leaking host data and the up-to-date flag is set.
 */
static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
				  gfn_t gfn, struct folio *folio)
{
	unsigned long nr_pages, i;
	pgoff_t index;
	int r;

	nr_pages = folio_nr_pages(folio);
	for (i = 0; i < nr_pages; i++)
		clear_highpage(folio_page(folio, i));

	/*
	 * Preparing huge folios should always be safe, since it should
	 * be possible to split them later if needed.
	 *
	 * Right now the folio order is always going to be zero, but the
	 * code is ready for huge folios.  The only assumption is that
	 * the base pgoff of memslots is naturally aligned with the
	 * requested page order, ensuring that huge folios can also use
	 * huge page table entries for GPA->HPA mapping.
	 *
	 * The order will be passed when creating the guest_memfd, and
	 * checked when creating memslots.
	 */
	WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
	index = kvm_gmem_get_index(slot, gfn);
	index = ALIGN_DOWN(index, folio_nr_pages(folio));
	r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
	if (!r)
		kvm_gmem_mark_prepared(folio);

	return r;
}

/*
 * Returns a locked folio on success.  The caller is responsible for
 * setting the up-to-date flag before the memory is mapped into the guest.
 * There is no backing storage for the memory, so the folio will remain
 * up-to-date until it's removed.
 *
 * Ignore accessed, referenced, and dirty flags.  The memory is
 * unevictable and there is no storage to write back to.
 */
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
	/* TODO: Support huge pages. */
	struct mempolicy *policy;
	struct folio *folio;

	/*
	 * Fast-path: See if folio is already present in mapping to avoid
	 * policy_lookup.
	 */
	folio = __filemap_get_folio(inode->i_mapping, index,
				    FGP_LOCK | FGP_ACCESSED, 0);
	if (!IS_ERR(folio))
		return folio;

	policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
	folio = __filemap_get_folio_mpol(inode->i_mapping, index,
					 FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
					 mapping_gfp_mask(inode->i_mapping), policy);
	mpol_cond_put(policy);

	return folio;
}

static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
{
	if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)
		return KVM_FILTER_SHARED;

	return KVM_FILTER_PRIVATE;
}

static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start,
					pgoff_t end,
					enum kvm_gfn_range_filter attr_filter)
{
	bool flush = false, found_memslot = false;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = f->kvm;
	unsigned long index;

	xa_for_each_range(&f->bindings, index, slot, start, end - 1) {
		pgoff_t pgoff = slot->gmem.pgoff;

		struct kvm_gfn_range gfn_range = {
			.start = slot->base_gfn + max(pgoff, start) - pgoff,
			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
			.slot = slot,
			.may_block = true,
			.attr_filter = attr_filter,
		};

		if (!found_memslot) {
			found_memslot = true;

			KVM_MMU_LOCK(kvm);
			kvm_mmu_invalidate_begin(kvm);
		}

		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
	}

	if (flush)
		kvm_flush_remote_tlbs(kvm);

	if (found_memslot)
		KVM_MMU_UNLOCK(kvm);
}

static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start,
				      pgoff_t end)
{
	enum kvm_gfn_range_filter attr_filter;
	struct gmem_file *f;

	attr_filter = kvm_gmem_get_invalidate_filter(inode);

	kvm_gmem_for_each_file(f, inode->i_mapping)
		__kvm_gmem_invalidate_begin(f, start, end, attr_filter);
}

static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start,
				      pgoff_t end)
{
	struct kvm *kvm = f->kvm;

	if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
		KVM_MMU_LOCK(kvm);
		kvm_mmu_invalidate_end(kvm);
		KVM_MMU_UNLOCK(kvm);
	}
}

static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start,
				    pgoff_t end)
{
	struct gmem_file *f;

	kvm_gmem_for_each_file(f, inode->i_mapping)
		__kvm_gmem_invalidate_end(f, start, end);
}
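
/*
 * Worked example for the hole-punch path below (illustrative values only):
 * offset = 0x3000 and len = 0x2000 with 4KiB pages give start = 3 and
 * end = 5, so indices 3 and 4 are invalidated in every binding and then
 * truncated from the page cache.
 */
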
static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	pgoff_t start = offset >> PAGE_SHIFT;
	pgoff_t end = (offset + len) >> PAGE_SHIFT;

	/*
	 * Bindings must be stable across invalidation to ensure the start+end
	 * are balanced.
	 */
	filemap_invalidate_lock(inode->i_mapping);

	kvm_gmem_invalidate_begin(inode, start, end);

	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);

	kvm_gmem_invalidate_end(inode, start, end);

	filemap_invalidate_unlock(inode->i_mapping);

	return 0;
}

static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t start, index, end;
	int r;

	/* Dedicated guest is immutable by default. */
	if (offset + len > i_size_read(inode))
		return -EINVAL;

	filemap_invalidate_lock_shared(mapping);

	start = offset >> PAGE_SHIFT;
	end = (offset + len) >> PAGE_SHIFT;

	r = 0;
	for (index = start; index < end; ) {
		struct folio *folio;

		if (signal_pending(current)) {
			r = -EINTR;
			break;
		}

		folio = kvm_gmem_get_folio(inode, index);
		if (IS_ERR(folio)) {
			r = PTR_ERR(folio);
			break;
		}

		index = folio_next_index(folio);

		folio_unlock(folio);
		folio_put(folio);

		/* 64-bit only, wrapping the index should be impossible. */
		if (WARN_ON_ONCE(!index))
			break;

		cond_resched();
	}

	filemap_invalidate_unlock_shared(mapping);

	return r;
}

static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
			       loff_t len)
{
	int ret;

	if (!(mode & FALLOC_FL_KEEP_SIZE))
		return -EOPNOTSUPP;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
		return -EINVAL;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
	else
		ret = kvm_gmem_allocate(file_inode(file), offset, len);

	if (!ret)
		file_modified(file);
	return ret;
}
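
/*
 * Illustrative userspace usage (a sketch, not an ABI reference): backing
 * pages can be preallocated or discarded via fallocate() on the
 * guest_memfd fd, e.g.:
 *
 *	fallocate(gmem_fd, FALLOC_FL_KEEP_SIZE, 0, size);
 *	fallocate(gmem_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *		  offset, len);
 *
 * Per the checks above, offset and len must be PAGE_SIZE aligned and
 * FALLOC_FL_KEEP_SIZE is mandatory, as the file size is fixed at creation
 * time.
 */
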
static int kvm_gmem_release(struct inode *inode, struct file *file)
{
	struct gmem_file *f = file->private_data;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = f->kvm;
	unsigned long index;

	/*
	 * Prevent concurrent attempts to *unbind* a memslot.  This is the last
	 * reference to the file and thus no new bindings can be created, but
	 * dereferencing the slot for existing bindings needs to be protected
	 * against memslot updates, specifically so that unbind doesn't race
	 * and free the memslot (kvm_gmem_get_file() will return NULL).
	 *
	 * Since .release is called only when the reference count is zero,
	 * after which file_ref_get() and get_file_active() fail,
	 * kvm_gmem_get_pfn() cannot be using the file concurrently.
	 * file_ref_put() provides a full barrier, and get_file_active() the
	 * matching acquire barrier.
	 */
	mutex_lock(&kvm->slots_lock);

	filemap_invalidate_lock(inode->i_mapping);

	xa_for_each(&f->bindings, index, slot)
		WRITE_ONCE(slot->gmem.file, NULL);

	/*
	 * All in-flight operations are gone and new bindings can be created.
	 * Zap all SPTEs pointed at by this file.  Do not free the backing
	 * memory, as its lifetime is associated with the inode, not the file.
	 */
	__kvm_gmem_invalidate_begin(f, 0, -1ul,
				    kvm_gmem_get_invalidate_filter(inode));
	__kvm_gmem_invalidate_end(f, 0, -1ul);

	list_del(&f->entry);

	filemap_invalidate_unlock(inode->i_mapping);

	mutex_unlock(&kvm->slots_lock);

	xa_destroy(&f->bindings);
	kfree(f);

	kvm_put_kvm(kvm);

	return 0;
}

static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
{
	/*
	 * Do not return slot->gmem.file if it has already been closed;
	 * there might be some time between the last fput() and when
	 * kvm_gmem_release() clears slot->gmem.file.
	 */
	return get_file_active(&slot->gmem.file);
}

DEFINE_CLASS(gmem_get_file, struct file *, if (_T) fput(_T),
	     kvm_gmem_get_file(slot), struct kvm_memory_slot *slot);

static bool kvm_gmem_supports_mmap(struct inode *inode)
{
	return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP;
}

static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct folio *folio;
	vm_fault_t ret = VM_FAULT_LOCKED;

	if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
		return VM_FAULT_SIGBUS;

	if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
		return VM_FAULT_SIGBUS;

	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
	if (IS_ERR(folio)) {
		if (PTR_ERR(folio) == -EAGAIN)
			return VM_FAULT_RETRY;

		return vmf_error(PTR_ERR(folio));
	}

	if (WARN_ON_ONCE(folio_test_large(folio))) {
		ret = VM_FAULT_SIGBUS;
		goto out_folio;
	}

	if (!folio_test_uptodate(folio)) {
		clear_highpage(folio_page(folio, 0));
		kvm_gmem_mark_prepared(folio);
	}

	vmf->page = folio_file_page(folio, vmf->pgoff);

out_folio:
	if (ret != VM_FAULT_LOCKED) {
		folio_unlock(folio);
		folio_put(folio);
	}

	return ret;
}

#ifdef CONFIG_NUMA
static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
{
	struct inode *inode = file_inode(vma->vm_file);

	return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol);
}

static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
					     unsigned long addr, pgoff_t *pgoff)
{
	struct inode *inode = file_inode(vma->vm_file);

	*pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);

	/*
	 * Return the memory policy for this index, or NULL if none is set.
	 *
	 * Returning NULL, e.g. instead of the current task's memory policy, is
	 * important for the .get_policy kernel ABI: it indicates that no
	 * explicit policy has been set via mbind() for this memory.  The
	 * caller can then replace NULL with the default memory policy instead
	 * of the current task's memory policy.
	 */
	return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
}
#endif /* CONFIG_NUMA */
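
/*
 * Illustrative userspace usage of the NUMA hooks above (a sketch; assumes
 * the guest_memfd was created with GUEST_MEMFD_FLAG_MMAP): the shared
 * policy is set by mmap()ing the file and calling mbind() on the mapping,
 * e.g.:
 *
 *	void *p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
 *		       gmem_fd, 0);
 *	mbind(p, size, MPOL_BIND, &nodemask, maxnode, 0);
 *
 * The policy lands in the inode's shared policy tree, so it also steers
 * allocations done in kvm_gmem_get_folio() on behalf of the guest.
 */
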
static const struct vm_operations_struct kvm_gmem_vm_ops = {
	.fault = kvm_gmem_fault_user_mapping,
#ifdef CONFIG_NUMA
	.get_policy = kvm_gmem_get_policy,
	.set_policy = kvm_gmem_set_policy,
#endif
};

static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (!kvm_gmem_supports_mmap(file_inode(file)))
		return -ENODEV;

	if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
	    (VM_SHARED | VM_MAYSHARE)) {
		return -EINVAL;
	}

	vma->vm_ops = &kvm_gmem_vm_ops;

	return 0;
}

static struct file_operations kvm_gmem_fops = {
	.mmap = kvm_gmem_mmap,
	.open = generic_file_open,
	.release = kvm_gmem_release,
	.fallocate = kvm_gmem_fallocate,
};

static int kvm_gmem_migrate_folio(struct address_space *mapping,
				  struct folio *dst, struct folio *src,
				  enum migrate_mode mode)
{
	WARN_ON_ONCE(1);
	return -EINVAL;
}

static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
	pgoff_t start, end;

	filemap_invalidate_lock_shared(mapping);

	start = folio->index;
	end = start + folio_nr_pages(folio);

	kvm_gmem_invalidate_begin(mapping->host, start, end);

	/*
	 * Do not truncate the range, what action is taken in response to the
	 * error is userspace's decision (assuming the architecture supports
	 * gracefully handling memory errors).  If/when the guest attempts to
	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
	 * at which point KVM can either terminate the VM or propagate the
	 * error to userspace.
	 */

	kvm_gmem_invalidate_end(mapping->host, start, end);

	filemap_invalidate_unlock_shared(mapping);

	return MF_DELAYED;
}

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
static void kvm_gmem_free_folio(struct folio *folio)
{
	struct page *page = folio_page(folio, 0);
	kvm_pfn_t pfn = page_to_pfn(page);
	int order = folio_order(folio);

	kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
}
#endif

static const struct address_space_operations kvm_gmem_aops = {
	.dirty_folio = noop_dirty_folio,
	.migrate_folio = kvm_gmem_migrate_folio,
	.error_remove_folio = kvm_gmem_error_folio,
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
	.free_folio = kvm_gmem_free_folio,
#endif
};

static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			    struct iattr *attr)
{
	return -EINVAL;
}

static const struct inode_operations kvm_gmem_iops = {
	.setattr = kvm_gmem_setattr,
};

bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
{
	return true;
}

static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
	static const char *name = "[kvm-gmem]";
	struct gmem_file *f;
	struct inode *inode;
	struct file *file;
	int fd, err;

	fd = get_unused_fd_flags(0);
	if (fd < 0)
		return fd;

	f = kzalloc(sizeof(*f), GFP_KERNEL);
	if (!f) {
		err = -ENOMEM;
		goto err_fd;
	}

	/* __fput() will take care of fops_put(). */
	if (!fops_get(&kvm_gmem_fops)) {
		err = -ENOENT;
		goto err_gmem;
	}

	inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		goto err_fops;
	}

	inode->i_op = &kvm_gmem_iops;
	inode->i_mapping->a_ops = &kvm_gmem_aops;
	inode->i_mode |= S_IFREG;
	inode->i_size = size;
	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
	mapping_set_inaccessible(inode->i_mapping);
	/* Unmovable mappings are supposed to be marked unevictable as well. */
	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

	GMEM_I(inode)->flags = flags;

	file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
	if (IS_ERR(file)) {
		err = PTR_ERR(file);
		goto err_inode;
	}

	file->f_flags |= O_LARGEFILE;
	file->private_data = f;

	kvm_get_kvm(kvm);
	f->kvm = kvm;
	xa_init(&f->bindings);
	list_add(&f->entry, &inode->i_mapping->i_private_list);

	fd_install(fd, file);
	return fd;

err_inode:
	iput(inode);
err_fops:
	fops_put(&kvm_gmem_fops);
err_gmem:
	kfree(f);
err_fd:
	put_unused_fd(fd);
	return err;
}

int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
	loff_t size = args->size;
	u64 flags = args->flags;

	if (flags & ~kvm_gmem_get_supported_flags(kvm))
		return -EINVAL;

	if (size <= 0 || !PAGE_ALIGNED(size))
		return -EINVAL;

	return __kvm_gmem_create(kvm, size, flags);
}

int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
		  unsigned int fd, loff_t offset)
{
	loff_t size = slot->npages << PAGE_SHIFT;
	unsigned long start, end;
	struct gmem_file *f;
	struct inode *inode;
	struct file *file;
	int r = -EINVAL;

	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));

	file = fget(fd);
	if (!file)
		return -EBADF;

	if (file->f_op != &kvm_gmem_fops)
		goto err;

	f = file->private_data;
	if (f->kvm != kvm)
		goto err;

	inode = file_inode(file);

	if (offset < 0 || !PAGE_ALIGNED(offset) ||
	    offset + size > i_size_read(inode))
		goto err;

	filemap_invalidate_lock(inode->i_mapping);

	start = offset >> PAGE_SHIFT;
	end = start + slot->npages;

	if (!xa_empty(&f->bindings) &&
	    xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
		filemap_invalidate_unlock(inode->i_mapping);
		goto err;
	}

	/*
	 * memslots of flag KVM_MEM_GUEST_MEMFD are immutable to change, so
	 * kvm_gmem_bind() must occur on a new memslot.  Because the memslot
	 * is not visible yet, kvm_gmem_get_pfn() is guaranteed to see the file.
	 */
	WRITE_ONCE(slot->gmem.file, file);
	slot->gmem.pgoff = start;
	if (kvm_gmem_supports_mmap(inode))
		slot->flags |= KVM_MEMSLOT_GMEM_ONLY;

	xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
	filemap_invalidate_unlock(inode->i_mapping);

	/*
	 * Drop the reference to the file, even on success.  The file pins KVM,
	 * not the other way 'round.  Active bindings are invalidated if the
	 * file is closed before memslots are destroyed.
	 */
	r = 0;
err:
	fput(file);
	return r;
}
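
/*
 * Illustrative userspace flow (a sketch, not an ABI reference): a
 * guest_memfd instance is created with KVM_CREATE_GUEST_MEMFD and bound to
 * a memslot with KVM_SET_USER_MEMORY_REGION2, e.g.:
 *
 *	struct kvm_create_guest_memfd gmem = { .size = size };
 *	int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &gmem);
 *
 *	struct kvm_userspace_memory_region2 region = {
 *		.slot = 0,
 *		.flags = KVM_MEM_GUEST_MEMFD,
 *		.guest_phys_addr = gpa,
 *		.memory_size = size,
 *		.guest_memfd = gmem_fd,
 *		.guest_memfd_offset = 0,
 *	};
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
 *
 * Per kvm_gmem_bind(), the offset must be page-aligned, the range must fit
 * within the file, and it must not overlap an existing binding.
 */
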
static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct gmem_file *f)
{
	unsigned long start = slot->gmem.pgoff;
	unsigned long end = start + slot->npages;

	xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL);

	/*
	 * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
	 * cannot see this memslot.
	 */
	WRITE_ONCE(slot->gmem.file, NULL);
}

void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
	/*
	 * Nothing to do if the underlying file was _already_ closed, as
	 * kvm_gmem_release() invalidates and nullifies all bindings.
	 */
	if (!slot->gmem.file)
		return;

	CLASS(gmem_get_file, file)(slot);

	/*
	 * However, if the file is _being_ closed, then the bindings need to be
	 * removed as kvm_gmem_release() might not run until after the memslot
	 * is freed.  Note, modifying the bindings is safe even though the file
	 * is dying as kvm_gmem_release() nullifies slot->gmem.file under
	 * slots_lock, and only puts its reference to KVM after destroying all
	 * bindings.  I.e. reaching this point means kvm_gmem_release() hasn't
	 * yet destroyed the bindings or freed the gmem_file, and can't do so
	 * until the caller drops slots_lock.
	 */
	if (!file) {
		__kvm_gmem_unbind(slot, slot->gmem.file->private_data);
		return;
	}

	filemap_invalidate_lock(file->f_mapping);
	__kvm_gmem_unbind(slot, file->private_data);
	filemap_invalidate_unlock(file->f_mapping);
}

/* Returns a locked folio on success. */
static struct folio *__kvm_gmem_get_pfn(struct file *file,
					struct kvm_memory_slot *slot,
					pgoff_t index, kvm_pfn_t *pfn,
					bool *is_prepared, int *max_order)
{
	struct file *slot_file = READ_ONCE(slot->gmem.file);
	struct gmem_file *f = file->private_data;
	struct folio *folio;

	if (file != slot_file) {
		WARN_ON_ONCE(slot_file);
		return ERR_PTR(-EFAULT);
	}

	if (xa_load(&f->bindings, index) != slot) {
		WARN_ON_ONCE(xa_load(&f->bindings, index));
		return ERR_PTR(-EIO);
	}

	folio = kvm_gmem_get_folio(file_inode(file), index);
	if (IS_ERR(folio))
		return folio;

	if (folio_test_hwpoison(folio)) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(-EHWPOISON);
	}

	*pfn = folio_file_pfn(folio, index);
	if (max_order)
		*max_order = 0;

	*is_prepared = folio_test_uptodate(folio);
	return folio;
}

int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
		     gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
		     int *max_order)
{
	pgoff_t index = kvm_gmem_get_index(slot, gfn);
	struct folio *folio;
	bool is_prepared = false;
	int r = 0;

	CLASS(gmem_get_file, file)(slot);
	if (!file)
		return -EFAULT;

	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	if (!is_prepared)
		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);

	folio_unlock(folio);

	if (!r)
		*page = folio_file_page(folio, index);
	else
		folio_put(folio);

	return r;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
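
/*
 * Sketch of a hypothetical consumer of kvm_gmem_get_pfn() (architecture
 * code resolving a gmem-backed fault; details vary by arch):
 *
 *	struct page *page;
 *	kvm_pfn_t pfn;
 *	int max_order, r;
 *
 *	r = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &page, &max_order);
 *	if (r)
 *		return r;
 *	// ... install pfn into the stage-2 / TDP mapping ...
 *	put_page(page);
 *
 * On success the returned page carries a reference that the caller must
 * drop once it is done with the pfn (or if installing the mapping fails).
 */
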
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
		       kvm_gmem_populate_cb post_populate, void *opaque)
{
	struct kvm_memory_slot *slot;
	void __user *p;

	int ret = 0, max_order;
	long i;

	lockdep_assert_held(&kvm->slots_lock);

	if (WARN_ON_ONCE(npages <= 0))
		return -EINVAL;

	slot = gfn_to_memslot(kvm, start_gfn);
	if (!kvm_slot_has_gmem(slot))
		return -EINVAL;

	CLASS(gmem_get_file, file)(slot);
	if (!file)
		return -EFAULT;

	filemap_invalidate_lock(file->f_mapping);

	npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
	for (i = 0; i < npages; i += (1 << max_order)) {
		struct folio *folio;
		gfn_t gfn = start_gfn + i;
		pgoff_t index = kvm_gmem_get_index(slot, gfn);
		bool is_prepared = false;
		kvm_pfn_t pfn;

		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order);
		if (IS_ERR(folio)) {
			ret = PTR_ERR(folio);
			break;
		}

		if (is_prepared) {
			folio_unlock(folio);
			folio_put(folio);
			ret = -EEXIST;
			break;
		}

		folio_unlock(folio);
		WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
			(npages - i) < (1 << max_order));

		ret = -EINVAL;
		while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order),
							KVM_MEMORY_ATTRIBUTE_PRIVATE,
							KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
			if (!max_order)
				goto put_folio_and_exit;
			max_order--;
		}

		p = src ? src + i * PAGE_SIZE : NULL;
		ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
		if (!ret)
			kvm_gmem_mark_prepared(folio);

put_folio_and_exit:
		folio_put(folio);
		if (ret)
			break;
	}

	filemap_invalidate_unlock(file->f_mapping);

	return ret && !i ? ret : i;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);
#endif

static struct kmem_cache *kvm_gmem_inode_cachep;

static void kvm_gmem_init_inode_once(void *__gi)
{
	struct gmem_inode *gi = __gi;

	/*
	 * Note!  Don't initialize the inode with anything specific to the
	 * guest_memfd instance, or that might be specific to how the inode is
	 * used (from the VFS-layer's perspective).  This hook is called only
	 * during the initial slab allocation, i.e. only fields/state that are
	 * idempotent across _all_ use of the inode _object_ can be initialized
	 * at this time!
	 */
	inode_init_once(&gi->vfs_inode);
}

static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
{
	struct gmem_inode *gi;

	gi = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL);
	if (!gi)
		return NULL;

	mpol_shared_policy_init(&gi->policy, NULL);

	gi->flags = 0;
	return &gi->vfs_inode;
}

static void kvm_gmem_destroy_inode(struct inode *inode)
{
	mpol_free_shared_policy(&GMEM_I(inode)->policy);
}

static void kvm_gmem_free_inode(struct inode *inode)
{
	kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode));
}

static const struct super_operations kvm_gmem_super_operations = {
	.statfs = simple_statfs,
	.alloc_inode = kvm_gmem_alloc_inode,
	.destroy_inode = kvm_gmem_destroy_inode,
	.free_inode = kvm_gmem_free_inode,
};

static int kvm_gmem_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx;

	if (!init_pseudo(fc, GUEST_MEMFD_MAGIC))
		return -ENOMEM;

	fc->s_iflags |= SB_I_NOEXEC;
	fc->s_iflags |= SB_I_NODEV;
	ctx = fc->fs_private;
	ctx->ops = &kvm_gmem_super_operations;

	return 0;
}

static struct file_system_type kvm_gmem_fs = {
	.name = "guest_memfd",
	.init_fs_context = kvm_gmem_init_fs_context,
	.kill_sb = kill_anon_super,
};

static int kvm_gmem_init_mount(void)
{
	kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);

	if (IS_ERR(kvm_gmem_mnt))
		return PTR_ERR(kvm_gmem_mnt);

	kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
	return 0;
}

int kvm_gmem_init(struct module *module)
{
	struct kmem_cache_args args = {
		.align = 0,
		.ctor = kvm_gmem_init_inode_once,
	};
	int ret;

	kvm_gmem_fops.owner = module;
	kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache",
						  sizeof(struct gmem_inode),
						  &args, SLAB_ACCOUNT);
	if (!kvm_gmem_inode_cachep)
		return -ENOMEM;

	ret = kvm_gmem_init_mount();
	if (ret) {
		kmem_cache_destroy(kvm_gmem_inode_cachep);
		return ret;
	}
	return 0;
}

void kvm_gmem_exit(void)
{
	kern_unmount(kvm_gmem_mnt);
	kvm_gmem_mnt = NULL;
	rcu_barrier();
	kmem_cache_destroy(kvm_gmem_inode_cachep);
}