// SPDX-License-Identifier: GPL-2.0
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/kvm_host.h>
#include <linux/pagemap.h>
#include <linux/anon_inodes.h>

#include "kvm_mm.h"

struct kvm_gmem {
        struct kvm *kvm;
        struct xarray bindings;
        struct list_head entry;
};

/**
 * folio_file_pfn - like folio_file_page, but return a pfn.
 * @folio: The folio which contains this index.
 * @index: The index we want to look up.
 *
 * Return: The pfn for this index.
 */
static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
{
        return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
}

static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
                                    pgoff_t index, struct folio *folio)
{
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
        kvm_pfn_t pfn = folio_file_pfn(folio, index);
        gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
        int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));
        if (rc) {
                pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
                                    index, gfn, pfn, rc);
                return rc;
        }
#endif

        return 0;
}

static inline void kvm_gmem_mark_prepared(struct folio *folio)
{
        folio_mark_uptodate(folio);
}

/*
 * Process @folio, which contains @gfn, so that the guest can use it.
 * The folio must be locked and the gfn must be contained in @slot.
 * On successful return the guest sees a zero page, so as to avoid
 * leaking host data, and the up-to-date flag is set.
 */
static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
                                  gfn_t gfn, struct folio *folio)
{
        unsigned long nr_pages, i;
        pgoff_t index;
        int r;

        nr_pages = folio_nr_pages(folio);
        for (i = 0; i < nr_pages; i++)
                clear_highpage(folio_page(folio, i));

        /*
         * Preparing huge folios should always be safe, since it should
         * be possible to split them later if needed.
         *
         * Right now the folio order is always going to be zero, but the
         * code is ready for huge folios. The only assumption is that
         * the base pgoff of memslots is naturally aligned with the
         * requested page order, ensuring that huge folios can also use
         * huge page table entries for GPA->HPA mapping.
         *
         * The order will be passed when creating the guest_memfd, and
         * checked when creating memslots.
         */
        WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio)));
        index = gfn - slot->base_gfn + slot->gmem.pgoff;
        index = ALIGN_DOWN(index, 1 << folio_order(folio));
        r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
        if (!r)
                kvm_gmem_mark_prepared(folio);

        return r;
}

/*
 * Returns a locked folio on success. The caller is responsible for
 * setting the up-to-date flag before the memory is mapped into the guest.
 * There is no backing storage for the memory, so the folio will remain
 * up-to-date until it's removed.
 *
 * Ignore accessed, referenced, and dirty flags. The memory is
 * unevictable and there is no storage to write back to.
 */
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
        /* TODO: Support huge pages. */
        return filemap_grab_folio(inode->i_mapping, index);
}
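
/*
 * Worked example (illustrative values only) of the gfn <-> pgoff translation
 * used throughout this file:
 *
 *      slot->base_gfn   = 0x1000
 *      slot->gmem.pgoff = 0x200
 *      gfn              = 0x1034
 *
 *      index = gfn - slot->base_gfn + slot->gmem.pgoff = 0x234
 *
 * folio_file_pfn() then adds the offset of @index within its folio to
 * folio_pfn(); for the order-0 folios used today that offset is always zero.
 */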

static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
                                      pgoff_t end)
{
        bool flush = false, found_memslot = false;
        struct kvm_memory_slot *slot;
        struct kvm *kvm = gmem->kvm;
        unsigned long index;

        xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
                pgoff_t pgoff = slot->gmem.pgoff;

                struct kvm_gfn_range gfn_range = {
                        .start = slot->base_gfn + max(pgoff, start) - pgoff,
                        .end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
                        .slot = slot,
                        .may_block = true,
                };

                if (!found_memslot) {
                        found_memslot = true;

                        KVM_MMU_LOCK(kvm);
                        kvm_mmu_invalidate_begin(kvm);
                }

                flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
        }

        if (flush)
                kvm_flush_remote_tlbs(kvm);

        if (found_memslot)
                KVM_MMU_UNLOCK(kvm);
}

static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
                                    pgoff_t end)
{
        struct kvm *kvm = gmem->kvm;

        if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
                KVM_MMU_LOCK(kvm);
                kvm_mmu_invalidate_end(kvm);
                KVM_MMU_UNLOCK(kvm);
        }
}

static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
        struct list_head *gmem_list = &inode->i_mapping->i_private_list;
        pgoff_t start = offset >> PAGE_SHIFT;
        pgoff_t end = (offset + len) >> PAGE_SHIFT;
        struct kvm_gmem *gmem;

        /*
         * Bindings must be stable across invalidation to ensure the start+end
         * are balanced.
         */
        filemap_invalidate_lock(inode->i_mapping);

        list_for_each_entry(gmem, gmem_list, entry)
                kvm_gmem_invalidate_begin(gmem, start, end);

        truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);

        list_for_each_entry(gmem, gmem_list, entry)
                kvm_gmem_invalidate_end(gmem, start, end);

        filemap_invalidate_unlock(inode->i_mapping);

        return 0;
}

static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
{
        struct address_space *mapping = inode->i_mapping;
        pgoff_t start, index, end;
        int r;

        /* Dedicated guest is immutable by default. */
        if (offset + len > i_size_read(inode))
                return -EINVAL;

        filemap_invalidate_lock_shared(mapping);

        start = offset >> PAGE_SHIFT;
        end = (offset + len) >> PAGE_SHIFT;

        r = 0;
        for (index = start; index < end; ) {
                struct folio *folio;

                if (signal_pending(current)) {
                        r = -EINTR;
                        break;
                }

                folio = kvm_gmem_get_folio(inode, index);
                if (IS_ERR(folio)) {
                        r = PTR_ERR(folio);
                        break;
                }

                index = folio_next_index(folio);

                folio_unlock(folio);
                folio_put(folio);

                /* 64-bit only, wrapping the index should be impossible. */
                if (WARN_ON_ONCE(!index))
                        break;

                cond_resched();
        }

        filemap_invalidate_unlock_shared(mapping);

        return r;
}

static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
                               loff_t len)
{
        int ret;

        if (!(mode & FALLOC_FL_KEEP_SIZE))
                return -EOPNOTSUPP;

        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
                return -EOPNOTSUPP;

        if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
                return -EINVAL;

        if (mode & FALLOC_FL_PUNCH_HOLE)
                ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
        else
                ret = kvm_gmem_allocate(file_inode(file), offset, len);

        if (!ret)
                file_modified(file);
        return ret;
}
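
/*
 * Illustrative userspace usage of the fallocate() handler above. This is a
 * sketch; "gmem_fd" is a hypothetical guest_memfd file descriptor returned
 * by KVM_CREATE_GUEST_MEMFD:
 *
 *      // Preallocate 2 MiB at offset 0. FALLOC_FL_KEEP_SIZE is mandatory,
 *      // since a guest_memfd's size is fixed at creation time.
 *      fallocate(gmem_fd, FALLOC_FL_KEEP_SIZE, 0, 2 * 1024 * 1024);
 *
 *      // Discard the same range again, e.g. once the guest has converted
 *      // it from private back to shared memory.
 *      fallocate(gmem_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *                0, 2 * 1024 * 1024);
 */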

static int kvm_gmem_release(struct inode *inode, struct file *file)
{
        struct kvm_gmem *gmem = file->private_data;
        struct kvm_memory_slot *slot;
        struct kvm *kvm = gmem->kvm;
        unsigned long index;

        /*
         * Prevent concurrent attempts to *unbind* a memslot. This is the last
         * reference to the file and thus no new bindings can be created, but
         * dereferencing the slot for existing bindings needs to be protected
         * against memslot updates, specifically so that unbind doesn't race
         * and free the memslot (kvm_gmem_get_file() will return NULL).
         */
        mutex_lock(&kvm->slots_lock);

        filemap_invalidate_lock(inode->i_mapping);

        xa_for_each(&gmem->bindings, index, slot)
                rcu_assign_pointer(slot->gmem.file, NULL);

        synchronize_rcu();

        /*
         * All in-flight operations are gone and new bindings can be created.
         * Zap all SPTEs pointed at by this file. Do not free the backing
         * memory, as its lifetime is associated with the inode, not the file.
         */
        kvm_gmem_invalidate_begin(gmem, 0, -1ul);
        kvm_gmem_invalidate_end(gmem, 0, -1ul);

        list_del(&gmem->entry);

        filemap_invalidate_unlock(inode->i_mapping);

        mutex_unlock(&kvm->slots_lock);

        xa_destroy(&gmem->bindings);
        kfree(gmem);

        kvm_put_kvm(kvm);

        return 0;
}
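
/*
 * slot->gmem.file is cleared with rcu_assign_pointer() followed by
 * synchronize_rcu() in kvm_gmem_release() and kvm_gmem_unbind(). Readers
 * therefore take a reference on the file before dereferencing anything
 * hanging off of it, roughly (a sketch of the pattern used by
 * kvm_gmem_get_pfn() and kvm_gmem_populate() below):
 *
 *      file = kvm_gmem_get_file(slot);
 *      if (!file)
 *              return -EFAULT;         // file already closed
 *      ... look up folios via the slot's binding ...
 *      fput(file);
 */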
static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
{
        /*
         * Do not return slot->gmem.file if it has already been closed;
         * there might be some time between the last fput() and when
         * kvm_gmem_release() clears slot->gmem.file, and you do not
         * want to spin in the meanwhile.
         */
        return get_file_active(&slot->gmem.file);
}

static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
{
        return gfn - slot->base_gfn + slot->gmem.pgoff;
}

static struct file_operations kvm_gmem_fops = {
        .open           = generic_file_open,
        .release        = kvm_gmem_release,
        .fallocate      = kvm_gmem_fallocate,
};

void kvm_gmem_init(struct module *module)
{
        kvm_gmem_fops.owner = module;
}

static int kvm_gmem_migrate_folio(struct address_space *mapping,
                                  struct folio *dst, struct folio *src,
                                  enum migrate_mode mode)
{
        WARN_ON_ONCE(1);
        return -EINVAL;
}

static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
        struct list_head *gmem_list = &mapping->i_private_list;
        struct kvm_gmem *gmem;
        pgoff_t start, end;

        filemap_invalidate_lock_shared(mapping);

        start = folio->index;
        end = start + folio_nr_pages(folio);

        list_for_each_entry(gmem, gmem_list, entry)
                kvm_gmem_invalidate_begin(gmem, start, end);

        /*
         * Do not truncate the range, what action is taken in response to the
         * error is userspace's decision (assuming the architecture supports
         * gracefully handling memory errors). If/when the guest attempts to
         * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
         * at which point KVM can either terminate the VM or propagate the
         * error to userspace.
         */

        list_for_each_entry(gmem, gmem_list, entry)
                kvm_gmem_invalidate_end(gmem, start, end);

        filemap_invalidate_unlock_shared(mapping);

        return MF_DELAYED;
}

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
static void kvm_gmem_free_folio(struct folio *folio)
{
        struct page *page = folio_page(folio, 0);
        kvm_pfn_t pfn = page_to_pfn(page);
        int order = folio_order(folio);

        kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
}
#endif

static const struct address_space_operations kvm_gmem_aops = {
        .dirty_folio            = noop_dirty_folio,
        .migrate_folio          = kvm_gmem_migrate_folio,
        .error_remove_folio     = kvm_gmem_error_folio,
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
        .free_folio             = kvm_gmem_free_folio,
#endif
};

static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
                            struct kstat *stat, u32 request_mask,
                            unsigned int query_flags)
{
        struct inode *inode = path->dentry->d_inode;

        generic_fillattr(idmap, request_mask, inode, stat);
        return 0;
}

static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                            struct iattr *attr)
{
        return -EINVAL;
}

static const struct inode_operations kvm_gmem_iops = {
        .getattr        = kvm_gmem_getattr,
        .setattr        = kvm_gmem_setattr,
};

static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
        const char *anon_name = "[kvm-gmem]";
        struct kvm_gmem *gmem;
        struct inode *inode;
        struct file *file;
        int fd, err;

        fd = get_unused_fd_flags(0);
        if (fd < 0)
                return fd;

        gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
        if (!gmem) {
                err = -ENOMEM;
                goto err_fd;
        }

        file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
                                         O_RDWR, NULL);
        if (IS_ERR(file)) {
                err = PTR_ERR(file);
                goto err_gmem;
        }

        file->f_flags |= O_LARGEFILE;

        inode = file->f_inode;
        WARN_ON(file->f_mapping != inode->i_mapping);

        inode->i_private = (void *)(unsigned long)flags;
        inode->i_op = &kvm_gmem_iops;
        inode->i_mapping->a_ops = &kvm_gmem_aops;
        inode->i_mode |= S_IFREG;
        inode->i_size = size;
        mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
        mapping_set_inaccessible(inode->i_mapping);
        /* Unmovable mappings are supposed to be marked unevictable as well. */
        WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

        kvm_get_kvm(kvm);
        gmem->kvm = kvm;
        xa_init(&gmem->bindings);
        list_add(&gmem->entry, &inode->i_mapping->i_private_list);

        fd_install(fd, file);
        return fd;

err_gmem:
        kfree(gmem);
err_fd:
        put_unused_fd(fd);
        return err;
}
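
/*
 * Illustrative userspace path into kvm_gmem_create() below. This is a
 * sketch based on the KVM_CREATE_GUEST_MEMFD uAPI; "vm_fd" is a
 * hypothetical VM file descriptor and error handling is omitted:
 *
 *      struct kvm_create_guest_memfd gmem = {
 *              .size  = 512 * 1024 * 1024,     // must be > 0 and page aligned
 *              .flags = 0,                     // no flags are accepted yet
 *      };
 *      int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &gmem);
 */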
int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
        loff_t size = args->size;
        u64 flags = args->flags;
        u64 valid_flags = 0;

        if (flags & ~valid_flags)
                return -EINVAL;

        if (size <= 0 || !PAGE_ALIGNED(size))
                return -EINVAL;

        return __kvm_gmem_create(kvm, size, flags);
}

int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
                  unsigned int fd, loff_t offset)
{
        loff_t size = slot->npages << PAGE_SHIFT;
        unsigned long start, end;
        struct kvm_gmem *gmem;
        struct inode *inode;
        struct file *file;
        int r = -EINVAL;

        BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));

        file = fget(fd);
        if (!file)
                return -EBADF;

        if (file->f_op != &kvm_gmem_fops)
                goto err;

        gmem = file->private_data;
        if (gmem->kvm != kvm)
                goto err;

        inode = file_inode(file);

        if (offset < 0 || !PAGE_ALIGNED(offset) ||
            offset + size > i_size_read(inode))
                goto err;

        filemap_invalidate_lock(inode->i_mapping);

        start = offset >> PAGE_SHIFT;
        end = start + slot->npages;

        if (!xa_empty(&gmem->bindings) &&
            xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
                filemap_invalidate_unlock(inode->i_mapping);
                goto err;
        }

        /*
         * No synchronize_rcu() needed, any in-flight readers are guaranteed
         * to see either a NULL file or this new file; there is no need for
         * them to go away.
         */
        rcu_assign_pointer(slot->gmem.file, file);
        slot->gmem.pgoff = start;

        xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
        filemap_invalidate_unlock(inode->i_mapping);

        /*
         * Drop the reference to the file, even on success. The file pins KVM,
         * not the other way 'round. Active bindings are invalidated if the
         * file is closed before memslots are destroyed.
         */
        r = 0;
err:
        fput(file);
        return r;
}

void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
        unsigned long start = slot->gmem.pgoff;
        unsigned long end = start + slot->npages;
        struct kvm_gmem *gmem;
        struct file *file;

        /*
         * Nothing to do if the underlying file was already closed (or is being
         * closed right now), kvm_gmem_release() invalidates all bindings.
         */
        file = kvm_gmem_get_file(slot);
        if (!file)
                return;

        gmem = file->private_data;

        filemap_invalidate_lock(file->f_mapping);
        xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
        rcu_assign_pointer(slot->gmem.file, NULL);
        synchronize_rcu();
        filemap_invalidate_unlock(file->f_mapping);

        fput(file);
}
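
/*
 * kvm_gmem_bind() above is reached when userspace installs a memslot backed
 * by guest_memfd. A sketch of such a call, assuming the
 * KVM_SET_USER_MEMORY_REGION2 uAPI (values illustrative, error handling
 * omitted):
 *
 *      struct kvm_userspace_memory_region2 region = {
 *              .slot               = 0,
 *              .flags              = KVM_MEM_GUEST_MEMFD,
 *              .guest_phys_addr    = 0,
 *              .memory_size        = 512 * 1024 * 1024,
 *              .guest_memfd        = gmem_fd,
 *              .guest_memfd_offset = 0,        // page aligned, within i_size
 *      };
 *      ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
 */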

/* Returns a locked folio on success. */
static struct folio *__kvm_gmem_get_pfn(struct file *file,
                                        struct kvm_memory_slot *slot,
                                        pgoff_t index, kvm_pfn_t *pfn,
                                        bool *is_prepared, int *max_order)
{
        struct kvm_gmem *gmem = file->private_data;
        struct folio *folio;

        if (file != slot->gmem.file) {
                WARN_ON_ONCE(slot->gmem.file);
                return ERR_PTR(-EFAULT);
        }

        gmem = file->private_data;
        if (xa_load(&gmem->bindings, index) != slot) {
                WARN_ON_ONCE(xa_load(&gmem->bindings, index));
                return ERR_PTR(-EIO);
        }

        folio = kvm_gmem_get_folio(file_inode(file), index);
        if (IS_ERR(folio))
                return folio;

        if (folio_test_hwpoison(folio)) {
                folio_unlock(folio);
                folio_put(folio);
                return ERR_PTR(-EHWPOISON);
        }

        *pfn = folio_file_pfn(folio, index);
        if (max_order)
                *max_order = 0;

        *is_prepared = folio_test_uptodate(folio);
        return folio;
}

int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
                     gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
                     int *max_order)
{
        pgoff_t index = kvm_gmem_get_index(slot, gfn);
        struct file *file = kvm_gmem_get_file(slot);
        struct folio *folio;
        bool is_prepared = false;
        int r = 0;

        if (!file)
                return -EFAULT;

        folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
        if (IS_ERR(folio)) {
                r = PTR_ERR(folio);
                goto out;
        }

        if (!is_prepared)
                r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);

        folio_unlock(folio);

        if (!r)
                *page = folio_file_page(folio, index);
        else
                folio_put(folio);

out:
        fput(file);
        return r;
}
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
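
/*
 * kvm_gmem_populate() below hands each pfn it resolves to a caller-supplied
 * post_populate callback. A minimal sketch of such a callback, assuming the
 * kvm_gmem_populate_cb signature from <linux/kvm_host.h>; the encryption
 * helper it calls is purely hypothetical:
 *
 *      static int demo_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 *                                    void __user *src, int order, void *opaque)
 *      {
 *              // e.g. copy @src into the target page and encrypt it in place;
 *              // a non-zero return aborts the population loop.
 *              return demo_encrypt_page(kvm, gfn, pfn, src, order, opaque);
 *      }
 *
 *      ret = kvm_gmem_populate(kvm, start_gfn, src, npages,
 *                              demo_post_populate, NULL);
 */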

#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM
long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
                       kvm_gmem_populate_cb post_populate, void *opaque)
{
        struct file *file;
        struct kvm_memory_slot *slot;
        void __user *p;

        int ret = 0, max_order;
        long i;

        lockdep_assert_held(&kvm->slots_lock);
        if (npages < 0)
                return -EINVAL;

        slot = gfn_to_memslot(kvm, start_gfn);
        if (!kvm_slot_can_be_private(slot))
                return -EINVAL;

        file = kvm_gmem_get_file(slot);
        if (!file)
                return -EFAULT;

        filemap_invalidate_lock(file->f_mapping);

        npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
        for (i = 0; i < npages; i += (1 << max_order)) {
                struct folio *folio;
                gfn_t gfn = start_gfn + i;
                pgoff_t index = kvm_gmem_get_index(slot, gfn);
                bool is_prepared = false;
                kvm_pfn_t pfn;

                if (signal_pending(current)) {
                        ret = -EINTR;
                        break;
                }

                folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order);
                if (IS_ERR(folio)) {
                        ret = PTR_ERR(folio);
                        break;
                }

                if (is_prepared) {
                        folio_unlock(folio);
                        folio_put(folio);
                        ret = -EEXIST;
                        break;
                }

                folio_unlock(folio);
                WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
                        (npages - i) < (1 << max_order));

                ret = -EINVAL;
                while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order),
                                                        KVM_MEMORY_ATTRIBUTE_PRIVATE,
                                                        KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
                        if (!max_order)
                                goto put_folio_and_exit;
                        max_order--;
                }

                p = src ? src + i * PAGE_SIZE : NULL;
                ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
                if (!ret)
                        kvm_gmem_mark_prepared(folio);

put_folio_and_exit:
                folio_put(folio);
                if (ret)
                        break;
        }

        filemap_invalidate_unlock(file->f_mapping);

        fput(file);
        return ret && !i ? ret : i;
}
EXPORT_SYMBOL_GPL(kvm_gmem_populate);
#endif