// SPDX-License-Identifier: GPL-2.0
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/kvm_host.h>
#include <linux/pagemap.h>
#include <linux/anon_inodes.h>

#include "kvm_mm.h"

struct kvm_gmem {
	struct kvm *kvm;
	struct xarray bindings;
	struct list_head entry;
};

static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio)
{
#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
	struct list_head *gmem_list = &inode->i_mapping->i_private_list;
	struct kvm_gmem *gmem;

	list_for_each_entry(gmem, gmem_list, entry) {
		struct kvm_memory_slot *slot;
		struct kvm *kvm = gmem->kvm;
		struct page *page;
		kvm_pfn_t pfn;
		gfn_t gfn;
		int rc;

		if (!kvm_arch_gmem_prepare_needed(kvm))
			continue;

		slot = xa_load(&gmem->bindings, index);
		if (!slot)
			continue;

		page = folio_file_page(folio, index);
		pfn = page_to_pfn(page);
		gfn = slot->base_gfn + index - slot->gmem.pgoff;
		rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, compound_order(compound_head(page)));
		if (rc) {
			pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
					    index, gfn, pfn, rc);
			return rc;
		}
	}

#endif
	return 0;
}

static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool prepare)
{
	struct folio *folio;

	/* TODO: Support huge pages. */
	folio = filemap_grab_folio(inode->i_mapping, index);
	if (IS_ERR(folio))
		return folio;

	/*
	 * Use the up-to-date flag to track whether or not the memory has been
	 * zeroed before being handed off to the guest. There is no backing
	 * storage for the memory, so the folio will remain up-to-date until
	 * it's removed.
	 *
	 * TODO: Skip clearing pages when trusted firmware will do it when
	 * assigning memory to the guest.
	 */
	if (!folio_test_uptodate(folio)) {
		unsigned long nr_pages = folio_nr_pages(folio);
		unsigned long i;

		for (i = 0; i < nr_pages; i++)
			clear_highpage(folio_page(folio, i));

		folio_mark_uptodate(folio);
	}

	if (prepare) {
		int r = kvm_gmem_prepare_folio(inode, index, folio);
		if (r < 0) {
			folio_unlock(folio);
			folio_put(folio);
			return ERR_PTR(r);
		}
	}

	/*
	 * Ignore accessed, referenced, and dirty flags. The memory is
	 * unevictable and there is no storage to write back to.
	 */
	return folio;
}
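
/*
 * Invalidation is split into a begin/end pair, mirroring the mmu_notifier
 * protocol: kvm_gmem_invalidate_begin() zaps the SPTEs covering any bound
 * memslots under the MMU lock and marks an invalidation as in progress via
 * kvm_mmu_invalidate_begin(), while kvm_gmem_invalidate_end() retires it via
 * kvm_mmu_invalidate_end() once the folios have actually been truncated (or
 * the poisoned page otherwise handled). Callers hold filemap_invalidate_lock()
 * across the whole sequence so the bindings cannot change in between.
 */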
static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
				      pgoff_t end)
{
	bool flush = false, found_memslot = false;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = gmem->kvm;
	unsigned long index;

	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
		pgoff_t pgoff = slot->gmem.pgoff;

		struct kvm_gfn_range gfn_range = {
			.start = slot->base_gfn + max(pgoff, start) - pgoff,
			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
			.slot = slot,
			.may_block = true,
		};

		if (!found_memslot) {
			found_memslot = true;

			KVM_MMU_LOCK(kvm);
			kvm_mmu_invalidate_begin(kvm);
		}

		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
	}

	if (flush)
		kvm_flush_remote_tlbs(kvm);

	if (found_memslot)
		KVM_MMU_UNLOCK(kvm);
}

static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
				    pgoff_t end)
{
	struct kvm *kvm = gmem->kvm;

	if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
		KVM_MMU_LOCK(kvm);
		kvm_mmu_invalidate_end(kvm);
		KVM_MMU_UNLOCK(kvm);
	}
}

static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	struct list_head *gmem_list = &inode->i_mapping->i_private_list;
	pgoff_t start = offset >> PAGE_SHIFT;
	pgoff_t end = (offset + len) >> PAGE_SHIFT;
	struct kvm_gmem *gmem;

	/*
	 * Bindings must be stable across invalidation to ensure the start+end
	 * are balanced.
	 */
	filemap_invalidate_lock(inode->i_mapping);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_begin(gmem, start, end);

	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_end(gmem, start, end);

	filemap_invalidate_unlock(inode->i_mapping);

	return 0;
}

static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t start, index, end;
	int r;

	/* Dedicated guest is immutable by default. */
	if (offset + len > i_size_read(inode))
		return -EINVAL;

	filemap_invalidate_lock_shared(mapping);

	start = offset >> PAGE_SHIFT;
	end = (offset + len) >> PAGE_SHIFT;

	r = 0;
	for (index = start; index < end; ) {
		struct folio *folio;

		if (signal_pending(current)) {
			r = -EINTR;
			break;
		}

		folio = kvm_gmem_get_folio(inode, index, true);
		if (IS_ERR(folio)) {
			r = PTR_ERR(folio);
			break;
		}

		index = folio_next_index(folio);

		folio_unlock(folio);
		folio_put(folio);

		/* 64-bit only, wrapping the index should be impossible. */
		if (WARN_ON_ONCE(!index))
			break;

		cond_resched();
	}

	filemap_invalidate_unlock_shared(mapping);

	return r;
}
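
/*
 * fallocate() handler. FALLOC_FL_KEEP_SIZE is mandatory because the file size
 * is fixed at creation time (see kvm_gmem_setattr() below). Punching a hole
 * zaps the range from the guest and frees the backing folios; the default
 * mode preallocates (and arch-prepares) the range so that failures surface
 * here rather than at guest fault time.
 */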
static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
			       loff_t len)
{
	int ret;

	if (!(mode & FALLOC_FL_KEEP_SIZE))
		return -EOPNOTSUPP;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
		return -EINVAL;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
	else
		ret = kvm_gmem_allocate(file_inode(file), offset, len);

	if (!ret)
		file_modified(file);
	return ret;
}

static int kvm_gmem_release(struct inode *inode, struct file *file)
{
	struct kvm_gmem *gmem = file->private_data;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = gmem->kvm;
	unsigned long index;

	/*
	 * Prevent concurrent attempts to *unbind* a memslot. This is the last
	 * reference to the file and thus no new bindings can be created, but
	 * dereferencing the slot for existing bindings needs to be protected
	 * against memslot updates, specifically so that unbind doesn't race
	 * and free the memslot (kvm_gmem_get_file() will return NULL).
	 */
	mutex_lock(&kvm->slots_lock);

	filemap_invalidate_lock(inode->i_mapping);

	xa_for_each(&gmem->bindings, index, slot)
		rcu_assign_pointer(slot->gmem.file, NULL);

	synchronize_rcu();

	/*
	 * All in-flight operations are gone and new bindings can be created.
	 * Zap all SPTEs pointed at by this file. Do not free the backing
	 * memory, as its lifetime is associated with the inode, not the file.
	 */
	kvm_gmem_invalidate_begin(gmem, 0, -1ul);
	kvm_gmem_invalidate_end(gmem, 0, -1ul);

	list_del(&gmem->entry);

	filemap_invalidate_unlock(inode->i_mapping);

	mutex_unlock(&kvm->slots_lock);

	xa_destroy(&gmem->bindings);
	kfree(gmem);

	kvm_put_kvm(kvm);

	return 0;
}

static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
{
	/*
	 * Do not return slot->gmem.file if it has already been closed;
	 * there might be some time between the last fput() and when
	 * kvm_gmem_release() clears slot->gmem.file, and you do not
	 * want to spin in the meanwhile.
	 */
	return get_file_active(&slot->gmem.file);
}
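
/*
 * Deliberately not const: kvm_gmem_init() patches in .owner at module init so
 * that the owning module is pinned while any guest_memfd file is open.
 */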
static struct file_operations kvm_gmem_fops = {
	.open = generic_file_open,
	.release = kvm_gmem_release,
	.fallocate = kvm_gmem_fallocate,
};

void kvm_gmem_init(struct module *module)
{
	kvm_gmem_fops.owner = module;
}

static int kvm_gmem_migrate_folio(struct address_space *mapping,
				  struct folio *dst, struct folio *src,
				  enum migrate_mode mode)
{
	WARN_ON_ONCE(1);
	return -EINVAL;
}

static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
	struct list_head *gmem_list = &mapping->i_private_list;
	struct kvm_gmem *gmem;
	pgoff_t start, end;

	filemap_invalidate_lock_shared(mapping);

	start = folio->index;
	end = start + folio_nr_pages(folio);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_begin(gmem, start, end);

	/*
	 * Do not truncate the range, what action is taken in response to the
	 * error is userspace's decision (assuming the architecture supports
	 * gracefully handling memory errors). If/when the guest attempts to
	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
	 * at which point KVM can either terminate the VM or propagate the
	 * error to userspace.
	 */

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_end(gmem, start, end);

	filemap_invalidate_unlock_shared(mapping);

	return MF_DELAYED;
}

#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
static void kvm_gmem_free_folio(struct folio *folio)
{
	struct page *page = folio_page(folio, 0);
	kvm_pfn_t pfn = page_to_pfn(page);
	int order = folio_order(folio);

	kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
}
#endif

static const struct address_space_operations kvm_gmem_aops = {
	.dirty_folio = noop_dirty_folio,
	.migrate_folio = kvm_gmem_migrate_folio,
	.error_remove_folio = kvm_gmem_error_folio,
#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
	.free_folio = kvm_gmem_free_folio,
#endif
};

static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
			    struct kstat *stat, u32 request_mask,
			    unsigned int query_flags)
{
	struct inode *inode = path->dentry->d_inode;

	generic_fillattr(idmap, request_mask, inode, stat);
	return 0;
}

static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			    struct iattr *attr)
{
	return -EINVAL;
}

static const struct inode_operations kvm_gmem_iops = {
	.getattr = kvm_gmem_getattr,
	.setattr = kvm_gmem_setattr,
};
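
/*
 * Create the backing file for a guest_memfd: an anonymous inode whose mapping
 * never writes back, never migrates, and is unevictable, so the page cache
 * cannot move or reclaim guest-private folios behind KVM's back. The file has
 * no ->mmap handler, so userspace cannot map the memory directly.
 */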
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
	const char *anon_name = "[kvm-gmem]";
	struct kvm_gmem *gmem;
	struct inode *inode;
	struct file *file;
	int fd, err;

	fd = get_unused_fd_flags(0);
	if (fd < 0)
		return fd;

	gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
	if (!gmem) {
		err = -ENOMEM;
		goto err_fd;
	}

	file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
					 O_RDWR, NULL);
	if (IS_ERR(file)) {
		err = PTR_ERR(file);
		goto err_gmem;
	}

	file->f_flags |= O_LARGEFILE;

	inode = file->f_inode;
	WARN_ON(file->f_mapping != inode->i_mapping);

	inode->i_private = (void *)(unsigned long)flags;
	inode->i_op = &kvm_gmem_iops;
	inode->i_mapping->a_ops = &kvm_gmem_aops;
	inode->i_mode |= S_IFREG;
	inode->i_size = size;
	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
	mapping_set_inaccessible(inode->i_mapping);
	/* Unmovable mappings are supposed to be marked unevictable as well. */
	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

	kvm_get_kvm(kvm);
	gmem->kvm = kvm;
	xa_init(&gmem->bindings);
	list_add(&gmem->entry, &inode->i_mapping->i_private_list);

	fd_install(fd, file);
	return fd;

err_gmem:
	kfree(gmem);
err_fd:
	put_unused_fd(fd);
	return err;
}

int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
	loff_t size = args->size;
	u64 flags = args->flags;
	u64 valid_flags = 0;

	if (flags & ~valid_flags)
		return -EINVAL;

	if (size <= 0 || !PAGE_ALIGNED(size))
		return -EINVAL;

	return __kvm_gmem_create(kvm, size, flags);
}

int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
		  unsigned int fd, loff_t offset)
{
	loff_t size = slot->npages << PAGE_SHIFT;
	unsigned long start, end;
	struct kvm_gmem *gmem;
	struct inode *inode;
	struct file *file;
	int r = -EINVAL;

	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));

	file = fget(fd);
	if (!file)
		return -EBADF;

	if (file->f_op != &kvm_gmem_fops)
		goto err;

	gmem = file->private_data;
	if (gmem->kvm != kvm)
		goto err;

	inode = file_inode(file);

	if (offset < 0 || !PAGE_ALIGNED(offset) ||
	    offset + size > i_size_read(inode))
		goto err;

	filemap_invalidate_lock(inode->i_mapping);

	start = offset >> PAGE_SHIFT;
	end = start + slot->npages;

	if (!xa_empty(&gmem->bindings) &&
	    xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
		filemap_invalidate_unlock(inode->i_mapping);
		goto err;
	}

	/*
	 * No synchronize_rcu() needed, any in-flight readers are guaranteed to
	 * see either a NULL file or this new file; there is no need for them
	 * to go away.
	 */
	rcu_assign_pointer(slot->gmem.file, file);
	slot->gmem.pgoff = start;

	xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
	filemap_invalidate_unlock(inode->i_mapping);

	/*
	 * Drop the reference to the file, even on success. The file pins KVM,
	 * not the other way 'round. Active bindings are invalidated if the
	 * file is closed before memslots are destroyed.
	 */
	r = 0;
err:
	fput(file);
	return r;
}
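
/*
 * Illustrative sketch (not part of this translation unit): kvm_gmem_bind() is
 * reached when userspace installs a memslot that carries a guest_memfd, e.g.
 * via KVM_SET_USER_MEMORY_REGION2 with the KVM_MEM_GUEST_MEMFD flag. vm_fd,
 * gmem_fd, shared_mem and size are placeholders:
 *
 *	struct kvm_userspace_memory_region2 region = {
 *		.slot			= 0,
 *		.flags			= KVM_MEM_GUEST_MEMFD,
 *		.guest_phys_addr	= 0,
 *		.memory_size		= size,
 *		.userspace_addr		= (__u64)shared_mem,	// shared counterpart
 *		.guest_memfd		= gmem_fd,
 *		.guest_memfd_offset	= 0,
 *	};
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
 *
 * Consult include/uapi/linux/kvm.h for the authoritative struct layout.
 */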
void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
	unsigned long start = slot->gmem.pgoff;
	unsigned long end = start + slot->npages;
	struct kvm_gmem *gmem;
	struct file *file;

	/*
	 * Nothing to do if the underlying file was already closed (or is being
	 * closed right now), kvm_gmem_release() invalidates all bindings.
	 */
	file = kvm_gmem_get_file(slot);
	if (!file)
		return;

	gmem = file->private_data;

	filemap_invalidate_lock(file->f_mapping);
	xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
	rcu_assign_pointer(slot->gmem.file, NULL);
	synchronize_rcu();
	filemap_invalidate_unlock(file->f_mapping);

	fput(file);
}

static int __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
			      gfn_t gfn, kvm_pfn_t *pfn, int *max_order, bool prepare)
{
	pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
	struct kvm_gmem *gmem = file->private_data;
	struct folio *folio;
	struct page *page;
	int r;

	if (file != slot->gmem.file) {
		WARN_ON_ONCE(slot->gmem.file);
		return -EFAULT;
	}

	gmem = file->private_data;
	if (xa_load(&gmem->bindings, index) != slot) {
		WARN_ON_ONCE(xa_load(&gmem->bindings, index));
		return -EIO;
	}

	folio = kvm_gmem_get_folio(file_inode(file), index, prepare);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	if (folio_test_hwpoison(folio)) {
		folio_unlock(folio);
		folio_put(folio);
		return -EHWPOISON;
	}

	page = folio_file_page(folio, index);

	*pfn = page_to_pfn(page);
	if (max_order)
		*max_order = 0;

	r = 0;

	folio_unlock(folio);

	return r;
}

int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
		     gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
{
	struct file *file = kvm_gmem_get_file(slot);
	int r;

	if (!file)
		return -EFAULT;

	r = __kvm_gmem_get_pfn(file, slot, gfn, pfn, max_order, true);
	fput(file);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);

long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
		       kvm_gmem_populate_cb post_populate, void *opaque)
{
	struct file *file;
	struct kvm_memory_slot *slot;
	void __user *p;

	int ret = 0, max_order;
	long i;

	lockdep_assert_held(&kvm->slots_lock);
	if (npages < 0)
		return -EINVAL;

	slot = gfn_to_memslot(kvm, start_gfn);
	if (!kvm_slot_can_be_private(slot))
		return -EINVAL;

	file = kvm_gmem_get_file(slot);
	if (!file)
		return -EFAULT;

	filemap_invalidate_lock(file->f_mapping);

	npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
	for (i = 0; i < npages; i += (1 << max_order)) {
		gfn_t gfn = start_gfn + i;
		kvm_pfn_t pfn;

		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		ret = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &max_order, false);
		if (ret)
			break;

		if (!IS_ALIGNED(gfn, (1 << max_order)) ||
		    (npages - i) < (1 << max_order))
			max_order = 0;

		p = src ? src + i * PAGE_SIZE : NULL;
		ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);

		put_page(pfn_to_page(pfn));
		if (ret)
			break;
	}

	filemap_invalidate_unlock(file->f_mapping);

	fput(file);
	return ret && !i ? ret : i;
}
EXPORT_SYMBOL_GPL(kvm_gmem_populate);
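
/*
 * Illustrative usage sketch (userspace, not compiled here): a VMM obtains a
 * guest_memfd from KVM_CREATE_GUEST_MEMFD and can preallocate or discard
 * ranges with fallocate(), which lands in kvm_gmem_fallocate() above. vm_fd
 * is assumed to be an open VM file descriptor and a 4KiB page size is assumed:
 *
 *	struct kvm_create_guest_memfd args = { .size = 2 * 1024 * 1024 };
 *	int fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);
 *
 *	// Preallocate the whole range; KEEP_SIZE is required by the handler.
 *	fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, args.size);
 *
 *	// Discard one page: zaps it from the guest and frees the folio.
 *	fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, 4096);
 */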