1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2012 - Virtual Open Systems and Columbia University 4 * Author: Christoffer Dall <c.dall@virtualopensystems.com> 5 */ 6 7 #include <linux/acpi.h> 8 #include <linux/mman.h> 9 #include <linux/kvm_host.h> 10 #include <linux/io.h> 11 #include <linux/hugetlb.h> 12 #include <linux/sched/signal.h> 13 #include <trace/events/kvm.h> 14 #include <asm/acpi.h> 15 #include <asm/pgalloc.h> 16 #include <asm/cacheflush.h> 17 #include <asm/kvm_arm.h> 18 #include <asm/kvm_mmu.h> 19 #include <asm/kvm_pgtable.h> 20 #include <asm/kvm_pkvm.h> 21 #include <asm/kvm_asm.h> 22 #include <asm/kvm_emulate.h> 23 #include <asm/virt.h> 24 25 #include "trace.h" 26 27 static struct kvm_pgtable *hyp_pgtable; 28 static DEFINE_MUTEX(kvm_hyp_pgd_mutex); 29 30 static unsigned long __ro_after_init hyp_idmap_start; 31 static unsigned long __ro_after_init hyp_idmap_end; 32 static phys_addr_t __ro_after_init hyp_idmap_vector; 33 34 u32 __ro_after_init __hyp_va_bits; 35 36 static unsigned long __ro_after_init io_map_base; 37 38 #define KVM_PGT_FN(fn) (!is_protected_kvm_enabled() ? fn : p ## fn) 39 40 static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end, 41 phys_addr_t size) 42 { 43 phys_addr_t boundary = ALIGN_DOWN(addr + size, size); 44 45 return (boundary - 1 < end - 1) ? boundary : end; 46 } 47 48 static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) 49 { 50 phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); 51 52 return __stage2_range_addr_end(addr, end, size); 53 } 54 55 /* 56 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise, 57 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK, 58 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too 59 * long will also starve other vCPUs. We have to also make sure that the page 60 * tables are not freed while we released the lock. 
61 */ 62 static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, 63 phys_addr_t end, 64 int (*fn)(struct kvm_pgtable *, u64, u64), 65 bool resched) 66 { 67 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 68 int ret; 69 u64 next; 70 71 do { 72 struct kvm_pgtable *pgt = mmu->pgt; 73 if (!pgt) 74 return -EINVAL; 75 76 next = stage2_range_addr_end(addr, end); 77 ret = fn(pgt, addr, next - addr); 78 if (ret) 79 break; 80 81 if (resched && next != end) 82 cond_resched_rwlock_write(&kvm->mmu_lock); 83 } while (addr = next, addr != end); 84 85 return ret; 86 } 87 88 #define stage2_apply_range_resched(mmu, addr, end, fn) \ 89 stage2_apply_range(mmu, addr, end, fn, true) 90 91 /* 92 * Get the maximum number of page-tables pages needed to split a range 93 * of blocks into PAGE_SIZE PTEs. It assumes the range is already 94 * mapped at level 2, or at level 1 if allowed. 95 */ 96 static int kvm_mmu_split_nr_page_tables(u64 range) 97 { 98 int n = 0; 99 100 if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2) 101 n += DIV_ROUND_UP(range, PUD_SIZE); 102 n += DIV_ROUND_UP(range, PMD_SIZE); 103 return n; 104 } 105 106 static bool need_split_memcache_topup_or_resched(struct kvm *kvm) 107 { 108 struct kvm_mmu_memory_cache *cache; 109 u64 chunk_size, min; 110 111 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) 112 return true; 113 114 chunk_size = kvm->arch.mmu.split_page_chunk_size; 115 min = kvm_mmu_split_nr_page_tables(chunk_size); 116 cache = &kvm->arch.mmu.split_page_cache; 117 return kvm_mmu_memory_cache_nr_free_objects(cache) < min; 118 } 119 120 static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr, 121 phys_addr_t end) 122 { 123 struct kvm_mmu_memory_cache *cache; 124 struct kvm_pgtable *pgt; 125 int ret, cache_capacity; 126 u64 next, chunk_size; 127 128 lockdep_assert_held_write(&kvm->mmu_lock); 129 130 chunk_size = kvm->arch.mmu.split_page_chunk_size; 131 cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size); 132 133 if (chunk_size == 0) 134 return 0; 
135 136 cache = &kvm->arch.mmu.split_page_cache; 137 138 do { 139 if (need_split_memcache_topup_or_resched(kvm)) { 140 write_unlock(&kvm->mmu_lock); 141 cond_resched(); 142 /* Eager page splitting is best-effort. */ 143 ret = __kvm_mmu_topup_memory_cache(cache, 144 cache_capacity, 145 cache_capacity); 146 write_lock(&kvm->mmu_lock); 147 if (ret) 148 break; 149 } 150 151 pgt = kvm->arch.mmu.pgt; 152 if (!pgt) 153 return -EINVAL; 154 155 next = __stage2_range_addr_end(addr, end, chunk_size); 156 ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache); 157 if (ret) 158 break; 159 } while (addr = next, addr != end); 160 161 return ret; 162 } 163 164 static bool memslot_is_logging(struct kvm_memory_slot *memslot) 165 { 166 return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); 167 } 168 169 /** 170 * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8 171 * @kvm: pointer to kvm structure. 172 * 173 * Interface to HYP function to flush all VM TLB entries 174 */ 175 int kvm_arch_flush_remote_tlbs(struct kvm *kvm) 176 { 177 if (is_protected_kvm_enabled()) 178 kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); 179 else 180 kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); 181 return 0; 182 } 183 184 int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, 185 gfn_t gfn, u64 nr_pages) 186 { 187 u64 size = nr_pages << PAGE_SHIFT; 188 u64 addr = gfn << PAGE_SHIFT; 189 190 if (is_protected_kvm_enabled()) 191 kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); 192 else 193 kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size); 194 return 0; 195 } 196 197 static void *stage2_memcache_zalloc_page(void *arg) 198 { 199 struct kvm_mmu_memory_cache *mc = arg; 200 void *virt; 201 202 /* Allocated with __GFP_ZERO, so no need to zero */ 203 virt = kvm_mmu_memory_cache_alloc(mc); 204 if (virt) 205 kvm_account_pgtable_pages(virt, 1); 206 return virt; 207 } 208 209 static void *kvm_host_zalloc_pages_exact(size_t size) 
210 { 211 return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); 212 } 213 214 static void *kvm_s2_zalloc_pages_exact(size_t size) 215 { 216 void *virt = kvm_host_zalloc_pages_exact(size); 217 218 if (virt) 219 kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT)); 220 return virt; 221 } 222 223 static void kvm_s2_free_pages_exact(void *virt, size_t size) 224 { 225 kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT)); 226 free_pages_exact(virt, size); 227 } 228 229 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops; 230 231 static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head) 232 { 233 struct page *page = container_of(head, struct page, rcu_head); 234 void *pgtable = page_to_virt(page); 235 s8 level = page_private(page); 236 237 KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level); 238 } 239 240 static void stage2_free_unlinked_table(void *addr, s8 level) 241 { 242 struct page *page = virt_to_page(addr); 243 244 set_page_private(page, (unsigned long)level); 245 call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb); 246 } 247 248 static void kvm_host_get_page(void *addr) 249 { 250 get_page(virt_to_page(addr)); 251 } 252 253 static void kvm_host_put_page(void *addr) 254 { 255 put_page(virt_to_page(addr)); 256 } 257 258 static void kvm_s2_put_page(void *addr) 259 { 260 struct page *p = virt_to_page(addr); 261 /* Dropping last refcount, the page will be freed */ 262 if (page_count(p) == 1) 263 kvm_account_pgtable_pages(addr, -1); 264 put_page(p); 265 } 266 267 static int kvm_host_page_count(void *addr) 268 { 269 return page_count(virt_to_page(addr)); 270 } 271 272 static phys_addr_t kvm_host_pa(void *addr) 273 { 274 return __pa(addr); 275 } 276 277 static void *kvm_host_va(phys_addr_t phys) 278 { 279 return __va(phys); 280 } 281 282 static void clean_dcache_guest_page(void *va, size_t size) 283 { 284 __clean_dcache_guest_page(va, size); 285 } 286 287 static void invalidate_icache_guest_page(void *va, size_t size) 
288 { 289 __invalidate_icache_guest_page(va, size); 290 } 291 292 /* 293 * Unmapping vs dcache management: 294 * 295 * If a guest maps certain memory pages as uncached, all writes will 296 * bypass the data cache and go directly to RAM. However, the CPUs 297 * can still speculate reads (not writes) and fill cache lines with 298 * data. 299 * 300 * Those cache lines will be *clean* cache lines though, so a 301 * clean+invalidate operation is equivalent to an invalidate 302 * operation, because no cache lines are marked dirty. 303 * 304 * Those clean cache lines could be filled prior to an uncached write 305 * by the guest, and the cache coherent IO subsystem would therefore 306 * end up writing old data to disk. 307 * 308 * This is why right after unmapping a page/section and invalidating 309 * the corresponding TLBs, we flush to make sure the IO subsystem will 310 * never hit in the cache. 311 * 312 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as 313 * we then fully enforce cacheability of RAM, no matter what the guest 314 * does. 315 */ 316 /** 317 * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range 318 * @mmu: The KVM stage-2 MMU pointer 319 * @start: The intermediate physical base address of the range to unmap 320 * @size: The size of the area to unmap 321 * @may_block: Whether or not we are permitted to block 322 * 323 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must 324 * be called while holding mmu_lock (unless for freeing the stage2 pgd before 325 * destroying the VM), otherwise another faulting VCPU may come in and mess 326 * with things behind our backs. 
327 */ 328 static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size, 329 bool may_block) 330 { 331 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 332 phys_addr_t end = start + size; 333 334 lockdep_assert_held_write(&kvm->mmu_lock); 335 WARN_ON(size & ~PAGE_MASK); 336 WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap), 337 may_block)); 338 } 339 340 void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, 341 u64 size, bool may_block) 342 { 343 if (kvm_vm_is_protected(kvm_s2_mmu_to_kvm(mmu))) 344 return; 345 346 __unmap_stage2_range(mmu, start, size, may_block); 347 } 348 349 void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) 350 { 351 stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush)); 352 } 353 354 static void stage2_flush_memslot(struct kvm *kvm, 355 struct kvm_memory_slot *memslot) 356 { 357 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; 358 phys_addr_t end = addr + PAGE_SIZE * memslot->npages; 359 360 kvm_stage2_flush_range(&kvm->arch.mmu, addr, end); 361 } 362 363 /** 364 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2 365 * @kvm: The struct kvm pointer 366 * 367 * Go through the stage 2 page tables and invalidate any cache lines 368 * backing memory already mapped to the VM. 
369 */ 370 static void stage2_flush_vm(struct kvm *kvm) 371 { 372 struct kvm_memslots *slots; 373 struct kvm_memory_slot *memslot; 374 int idx, bkt; 375 376 idx = srcu_read_lock(&kvm->srcu); 377 write_lock(&kvm->mmu_lock); 378 379 slots = kvm_memslots(kvm); 380 kvm_for_each_memslot(memslot, bkt, slots) 381 stage2_flush_memslot(kvm, memslot); 382 383 kvm_nested_s2_flush(kvm); 384 385 write_unlock(&kvm->mmu_lock); 386 srcu_read_unlock(&kvm->srcu, idx); 387 } 388 389 /** 390 * free_hyp_pgds - free Hyp-mode page tables 391 */ 392 void __init free_hyp_pgds(void) 393 { 394 mutex_lock(&kvm_hyp_pgd_mutex); 395 if (hyp_pgtable) { 396 kvm_pgtable_hyp_destroy(hyp_pgtable); 397 kfree(hyp_pgtable); 398 hyp_pgtable = NULL; 399 } 400 mutex_unlock(&kvm_hyp_pgd_mutex); 401 } 402 403 static bool kvm_host_owns_hyp_mappings(void) 404 { 405 if (is_kernel_in_hyp_mode()) 406 return false; 407 408 if (static_branch_likely(&kvm_protected_mode_initialized)) 409 return false; 410 411 /* 412 * This can happen at boot time when __create_hyp_mappings() is called 413 * after the hyp protection has been enabled, but the static key has 414 * not been flipped yet. 
415 */ 416 if (!hyp_pgtable && is_protected_kvm_enabled()) 417 return false; 418 419 WARN_ON(!hyp_pgtable); 420 421 return true; 422 } 423 424 int __create_hyp_mappings(unsigned long start, unsigned long size, 425 unsigned long phys, enum kvm_pgtable_prot prot) 426 { 427 int err; 428 429 if (WARN_ON(!kvm_host_owns_hyp_mappings())) 430 return -EINVAL; 431 432 mutex_lock(&kvm_hyp_pgd_mutex); 433 err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot); 434 mutex_unlock(&kvm_hyp_pgd_mutex); 435 436 return err; 437 } 438 439 static phys_addr_t kvm_kaddr_to_phys(void *kaddr) 440 { 441 if (!is_vmalloc_addr(kaddr)) { 442 BUG_ON(!virt_addr_valid(kaddr)); 443 return __pa(kaddr); 444 } else { 445 return page_to_phys(vmalloc_to_page(kaddr)) + 446 offset_in_page(kaddr); 447 } 448 } 449 450 struct hyp_shared_pfn { 451 u64 pfn; 452 int count; 453 struct rb_node node; 454 }; 455 456 static DEFINE_MUTEX(hyp_shared_pfns_lock); 457 static struct rb_root hyp_shared_pfns = RB_ROOT; 458 459 static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node, 460 struct rb_node **parent) 461 { 462 struct hyp_shared_pfn *this; 463 464 *node = &hyp_shared_pfns.rb_node; 465 *parent = NULL; 466 while (**node) { 467 this = container_of(**node, struct hyp_shared_pfn, node); 468 *parent = **node; 469 if (this->pfn < pfn) 470 *node = &((**node)->rb_left); 471 else if (this->pfn > pfn) 472 *node = &((**node)->rb_right); 473 else 474 return this; 475 } 476 477 return NULL; 478 } 479 480 static int share_pfn_hyp(u64 pfn) 481 { 482 struct rb_node **node, *parent; 483 struct hyp_shared_pfn *this; 484 int ret = 0; 485 486 mutex_lock(&hyp_shared_pfns_lock); 487 this = find_shared_pfn(pfn, &node, &parent); 488 if (this) { 489 this->count++; 490 goto unlock; 491 } 492 493 this = kzalloc_obj(*this); 494 if (!this) { 495 ret = -ENOMEM; 496 goto unlock; 497 } 498 499 this->pfn = pfn; 500 this->count = 1; 501 rb_link_node(&this->node, parent, node); 502 rb_insert_color(&this->node, 
&hyp_shared_pfns); 503 ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn); 504 if (ret) { 505 rb_erase(&this->node, &hyp_shared_pfns); 506 kfree(this); 507 } 508 unlock: 509 mutex_unlock(&hyp_shared_pfns_lock); 510 511 return ret; 512 } 513 514 static int unshare_pfn_hyp(u64 pfn) 515 { 516 struct rb_node **node, *parent; 517 struct hyp_shared_pfn *this; 518 int ret = 0; 519 520 mutex_lock(&hyp_shared_pfns_lock); 521 this = find_shared_pfn(pfn, &node, &parent); 522 if (WARN_ON(!this)) { 523 ret = -ENOENT; 524 goto unlock; 525 } 526 527 if (this->count > 1) { 528 this->count--; 529 goto unlock; 530 } 531 532 ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn); 533 if (ret) 534 goto unlock; 535 536 rb_erase(&this->node, &hyp_shared_pfns); 537 kfree(this); 538 unlock: 539 mutex_unlock(&hyp_shared_pfns_lock); 540 541 return ret; 542 } 543 544 int kvm_share_hyp(void *from, void *to) 545 { 546 phys_addr_t start, end, cur; 547 int ret = 0; 548 u64 pfn; 549 550 if (is_kernel_in_hyp_mode()) 551 return 0; 552 553 /* 554 * The share hcall maps things in the 'fixed-offset' region of the hyp 555 * VA space, so we can only share physically contiguous data-structures 556 * for now. 557 */ 558 if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to)) 559 return -EINVAL; 560 561 if (kvm_host_owns_hyp_mappings()) 562 return create_hyp_mappings(from, to, PAGE_HYP); 563 564 start = ALIGN_DOWN(__pa(from), PAGE_SIZE); 565 end = PAGE_ALIGN(__pa(to)); 566 for (cur = start; cur < end; cur += PAGE_SIZE) { 567 pfn = __phys_to_pfn(cur); 568 ret = share_pfn_hyp(pfn); 569 if (ret) 570 break; 571 } 572 573 if (!ret) 574 return 0; 575 576 /* 577 * Roll back the pages shared by this call. A failed unshare leaks 578 * the page (it stays shared with the hypervisor and is no longer 579 * reusable for pKVM) but breaks no isolation guarantee, so warn and 580 * continue. Not expected in practice. 
581 */ 582 for (end = cur, cur = start; cur < end; cur += PAGE_SIZE) { 583 pfn = __phys_to_pfn(cur); 584 WARN_ON(unshare_pfn_hyp(pfn)); 585 } 586 587 return ret; 588 } 589 590 void kvm_unshare_hyp(void *from, void *to) 591 { 592 phys_addr_t start, end, cur; 593 u64 pfn; 594 595 if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from) 596 return; 597 598 start = ALIGN_DOWN(__pa(from), PAGE_SIZE); 599 end = PAGE_ALIGN(__pa(to)); 600 for (cur = start; cur < end; cur += PAGE_SIZE) { 601 pfn = __phys_to_pfn(cur); 602 /* 603 * A failed unshare leaks the page: it stays shared with the 604 * hypervisor and is no longer reusable for pKVM. No isolation 605 * guarantee is broken, and this is not expected in practice. 606 */ 607 WARN_ON(unshare_pfn_hyp(pfn)); 608 } 609 } 610 611 /** 612 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode 613 * @from: The virtual kernel start address of the range 614 * @to: The virtual kernel end address of the range (exclusive) 615 * @prot: The protection to be applied to this range 616 * 617 * The same virtual address as the kernel virtual address is also used 618 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying 619 * physical pages. 
620 */ 621 int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) 622 { 623 phys_addr_t phys_addr; 624 unsigned long virt_addr; 625 unsigned long start = kern_hyp_va((unsigned long)from); 626 unsigned long end = kern_hyp_va((unsigned long)to); 627 628 if (is_kernel_in_hyp_mode()) 629 return 0; 630 631 if (!kvm_host_owns_hyp_mappings()) 632 return -EPERM; 633 634 start = start & PAGE_MASK; 635 end = PAGE_ALIGN(end); 636 637 for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { 638 int err; 639 640 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); 641 err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr, 642 prot); 643 if (err) 644 return err; 645 } 646 647 return 0; 648 } 649 650 static int __hyp_alloc_private_va_range(unsigned long base) 651 { 652 lockdep_assert_held(&kvm_hyp_pgd_mutex); 653 654 if (!PAGE_ALIGNED(base)) 655 return -EINVAL; 656 657 /* 658 * Verify that BIT(VA_BITS - 1) hasn't been flipped by 659 * allocating the new area, as it would indicate we've 660 * overflowed the idmap/IO address range. 661 */ 662 if ((base ^ io_map_base) & BIT(VA_BITS - 1)) 663 return -ENOMEM; 664 665 io_map_base = base; 666 667 return 0; 668 } 669 670 /** 671 * hyp_alloc_private_va_range - Allocates a private VA range. 672 * @size: The size of the VA range to reserve. 673 * @haddr: The hypervisor virtual start address of the allocation. 674 * 675 * The private virtual address (VA) range is allocated below io_map_base 676 * and aligned based on the order of @size. 677 * 678 * Return: 0 on success or negative error code on failure. 679 */ 680 int hyp_alloc_private_va_range(size_t size, unsigned long *haddr) 681 { 682 unsigned long base; 683 int ret = 0; 684 685 mutex_lock(&kvm_hyp_pgd_mutex); 686 687 /* 688 * This assumes that we have enough space below the idmap 689 * page to allocate our VAs. If not, the check in 690 * __hyp_alloc_private_va_range() will kick. 
A potential 691 * alternative would be to detect that overflow and switch 692 * to an allocation above the idmap. 693 * 694 * The allocated size is always a multiple of PAGE_SIZE. 695 */ 696 size = PAGE_ALIGN(size); 697 base = io_map_base - size; 698 ret = __hyp_alloc_private_va_range(base); 699 700 mutex_unlock(&kvm_hyp_pgd_mutex); 701 702 if (!ret) 703 *haddr = base; 704 705 return ret; 706 } 707 708 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, 709 unsigned long *haddr, 710 enum kvm_pgtable_prot prot) 711 { 712 unsigned long addr; 713 int ret = 0; 714 715 if (!kvm_host_owns_hyp_mappings()) { 716 addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping, 717 phys_addr, size, prot); 718 if (IS_ERR_VALUE(addr)) 719 return addr; 720 *haddr = addr; 721 722 return 0; 723 } 724 725 size = PAGE_ALIGN(size + offset_in_page(phys_addr)); 726 ret = hyp_alloc_private_va_range(size, &addr); 727 if (ret) 728 return ret; 729 730 ret = __create_hyp_mappings(addr, size, phys_addr, prot); 731 if (ret) 732 return ret; 733 734 *haddr = addr + offset_in_page(phys_addr); 735 return ret; 736 } 737 738 int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr) 739 { 740 unsigned long base; 741 size_t size; 742 int ret; 743 744 mutex_lock(&kvm_hyp_pgd_mutex); 745 /* 746 * Efficient stack verification using the NVHE_STACK_SHIFT bit implies 747 * an alignment of our allocation on the order of the size. 748 */ 749 size = NVHE_STACK_SIZE * 2; 750 base = ALIGN_DOWN(io_map_base - size, size); 751 752 ret = __hyp_alloc_private_va_range(base); 753 754 mutex_unlock(&kvm_hyp_pgd_mutex); 755 756 if (ret) { 757 kvm_err("Cannot allocate hyp stack guard page\n"); 758 return ret; 759 } 760 761 /* 762 * Since the stack grows downwards, map the stack to the page 763 * at the higher address and leave the lower guard page 764 * unbacked. 
765 * 766 * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1 767 * and addresses corresponding to the guard page have the 768 * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection. 769 */ 770 ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE, 771 phys_addr, PAGE_HYP); 772 if (ret) 773 kvm_err("Cannot map hyp stack\n"); 774 775 *haddr = base + size; 776 777 return ret; 778 } 779 780 /** 781 * create_hyp_io_mappings - Map IO into both kernel and HYP 782 * @phys_addr: The physical start address which gets mapped 783 * @size: Size of the region being mapped 784 * @kaddr: Kernel VA for this mapping 785 * @haddr: HYP VA for this mapping 786 */ 787 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, 788 void __iomem **kaddr, 789 void __iomem **haddr) 790 { 791 unsigned long addr; 792 int ret; 793 794 if (is_protected_kvm_enabled()) 795 return -EPERM; 796 797 *kaddr = ioremap(phys_addr, size); 798 if (!*kaddr) 799 return -ENOMEM; 800 801 if (is_kernel_in_hyp_mode()) { 802 *haddr = *kaddr; 803 return 0; 804 } 805 806 ret = __create_hyp_private_mapping(phys_addr, size, 807 &addr, PAGE_HYP_DEVICE); 808 if (ret) { 809 iounmap(*kaddr); 810 *kaddr = NULL; 811 *haddr = NULL; 812 return ret; 813 } 814 815 *haddr = (void __iomem *)addr; 816 return 0; 817 } 818 819 /** 820 * create_hyp_exec_mappings - Map an executable range into HYP 821 * @phys_addr: The physical start address which gets mapped 822 * @size: Size of the region being mapped 823 * @haddr: HYP VA for this mapping 824 */ 825 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, 826 void **haddr) 827 { 828 unsigned long addr; 829 int ret; 830 831 BUG_ON(is_kernel_in_hyp_mode()); 832 833 ret = __create_hyp_private_mapping(phys_addr, size, 834 &addr, PAGE_HYP_EXEC); 835 if (ret) { 836 *haddr = NULL; 837 return ret; 838 } 839 840 *haddr = (void *)addr; 841 return 0; 842 } 843 844 static struct kvm_pgtable_mm_ops kvm_user_mm_ops = { 845 /* We shouldn't need any 
other callback to walk the PT */ 846 .phys_to_virt = kvm_host_va, 847 }; 848 849 static int get_user_mapping_size(struct kvm *kvm, u64 addr) 850 { 851 struct kvm_pgtable pgt = { 852 .pgd = (kvm_pteref_t)kvm->mm->pgd, 853 .ia_bits = vabits_actual, 854 .start_level = (KVM_PGTABLE_LAST_LEVEL - 855 ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1), 856 .mm_ops = &kvm_user_mm_ops, 857 }; 858 unsigned long flags; 859 kvm_pte_t pte = 0; /* Keep GCC quiet... */ 860 s8 level = S8_MAX; 861 int ret; 862 863 /* 864 * Disable IRQs so that we hazard against a concurrent 865 * teardown of the userspace page tables (which relies on 866 * IPI-ing threads). 867 */ 868 local_irq_save(flags); 869 ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); 870 local_irq_restore(flags); 871 872 if (ret) 873 return ret; 874 875 /* 876 * Not seeing an error, but not updating level? Something went 877 * deeply wrong... 878 */ 879 if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL)) 880 return -EFAULT; 881 if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL)) 882 return -EFAULT; 883 884 /* Oops, the userspace PTs are gone... 
Replay the fault */ 885 if (!kvm_pte_valid(pte)) 886 return -EAGAIN; 887 888 return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); 889 } 890 891 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { 892 .zalloc_page = stage2_memcache_zalloc_page, 893 .zalloc_pages_exact = kvm_s2_zalloc_pages_exact, 894 .free_pages_exact = kvm_s2_free_pages_exact, 895 .free_unlinked_table = stage2_free_unlinked_table, 896 .get_page = kvm_host_get_page, 897 .put_page = kvm_s2_put_page, 898 .page_count = kvm_host_page_count, 899 .phys_to_virt = kvm_host_va, 900 .virt_to_phys = kvm_host_pa, 901 .dcache_clean_inval_poc = clean_dcache_guest_page, 902 .icache_inval_pou = invalidate_icache_guest_page, 903 }; 904 905 static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type) 906 { 907 u32 kvm_ipa_limit = get_kvm_ipa_limit(); 908 u64 mmfr0, mmfr1; 909 u32 phys_shift; 910 911 phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); 912 if (is_protected_kvm_enabled()) { 913 phys_shift = kvm_ipa_limit; 914 } else if (phys_shift) { 915 if (phys_shift > kvm_ipa_limit || 916 phys_shift < ARM64_MIN_PARANGE_BITS) 917 return -EINVAL; 918 } else { 919 phys_shift = KVM_PHYS_SHIFT; 920 if (phys_shift > kvm_ipa_limit) { 921 pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", 922 current->comm); 923 return -EINVAL; 924 } 925 } 926 927 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); 928 mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); 929 mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); 930 931 return 0; 932 } 933 934 /* 935 * Assume that @pgt is valid and unlinked from the KVM MMU to free the 936 * page-table without taking the kvm_mmu_lock and without performing any 937 * TLB invalidations. 938 * 939 * Also, the range of addresses can be large enough to cause need_resched 940 * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke 941 * cond_resched() periodically to prevent hogging the CPU for a long time 942 * and schedule something else, if required. 
943 */ 944 static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr, 945 phys_addr_t end) 946 { 947 u64 next; 948 949 do { 950 next = stage2_range_addr_end(addr, end); 951 KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr, 952 next - addr); 953 if (next != end) 954 cond_resched(); 955 } while (addr = next, addr != end); 956 } 957 958 static void kvm_stage2_destroy(struct kvm_pgtable *pgt) 959 { 960 unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr); 961 962 stage2_destroy_range(pgt, 0, BIT(ia_bits)); 963 KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt); 964 } 965 966 /** 967 * kvm_init_stage2_mmu - Initialise a S2 MMU structure 968 * @kvm: The pointer to the KVM structure 969 * @mmu: The pointer to the s2 MMU structure 970 * @type: The machine type of the virtual machine 971 * 972 * Allocates only the stage-2 HW PGD level table(s). 973 * Note we don't need locking here as this is only called in two cases: 974 * 975 * - when the VM is created, which can't race against anything 976 * 977 * - when secondary kvm_s2_mmu structures are initialised for NV 978 * guests, and the caller must hold kvm->lock as this is called on a 979 * per-vcpu basis. 980 */ 981 int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) 982 { 983 int cpu, err; 984 struct kvm_pgtable *pgt; 985 986 /* 987 * If we already have our page tables in place, and that the 988 * MMU context is the canonical one, we have a bug somewhere, 989 * as this is only supposed to ever happen once per VM. 990 * 991 * Otherwise, we're building nested page tables, and that's 992 * probably because userspace called KVM_ARM_VCPU_INIT more 993 * than once on the same vcpu. Since that's actually legal, 994 * don't kick a fuss and leave gracefully. 
995 */ 996 if (mmu->pgt != NULL) { 997 if (kvm_is_nested_s2_mmu(kvm, mmu)) 998 return 0; 999 1000 kvm_err("kvm_arch already initialized?\n"); 1001 return -EINVAL; 1002 } 1003 1004 err = kvm_init_ipa_range(mmu, type); 1005 if (err) 1006 return err; 1007 1008 pgt = kzalloc_obj(*pgt, GFP_KERNEL_ACCOUNT); 1009 if (!pgt) 1010 return -ENOMEM; 1011 1012 mmu->arch = &kvm->arch; 1013 err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops); 1014 if (err) 1015 goto out_free_pgtable; 1016 1017 mmu->pgt = pgt; 1018 if (is_protected_kvm_enabled()) 1019 return 0; 1020 1021 mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran)); 1022 if (!mmu->last_vcpu_ran) { 1023 err = -ENOMEM; 1024 goto out_destroy_pgtable; 1025 } 1026 1027 for_each_possible_cpu(cpu) 1028 *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1; 1029 1030 /* The eager page splitting is disabled by default */ 1031 mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; 1032 mmu->split_page_cache.gfp_zero = __GFP_ZERO; 1033 1034 mmu->pgd_phys = __pa(pgt->pgd); 1035 1036 if (kvm_is_nested_s2_mmu(kvm, mmu)) 1037 kvm_init_nested_s2_mmu(mmu); 1038 1039 return 0; 1040 1041 out_destroy_pgtable: 1042 kvm_stage2_destroy(pgt); 1043 mmu->pgt = NULL; 1044 out_free_pgtable: 1045 kfree(pgt); 1046 return err; 1047 } 1048 1049 void kvm_uninit_stage2_mmu(struct kvm *kvm) 1050 { 1051 kvm_free_stage2_pgd(&kvm->arch.mmu); 1052 kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); 1053 } 1054 1055 static void stage2_unmap_memslot(struct kvm *kvm, 1056 struct kvm_memory_slot *memslot) 1057 { 1058 hva_t hva = memslot->userspace_addr; 1059 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; 1060 phys_addr_t size = PAGE_SIZE * memslot->npages; 1061 hva_t reg_end = hva + size; 1062 1063 /* 1064 * A memory region could potentially cover multiple VMAs, and any holes 1065 * between them, so iterate over all of them to find out if we should 1066 * unmap any of them. 
1067 * 1068 * +--------------------------------------------+ 1069 * +---------------+----------------+ +----------------+ 1070 * | : VMA 1 | VMA 2 | | VMA 3 : | 1071 * +---------------+----------------+ +----------------+ 1072 * | memory region | 1073 * +--------------------------------------------+ 1074 */ 1075 do { 1076 struct vm_area_struct *vma; 1077 hva_t vm_start, vm_end; 1078 1079 vma = find_vma_intersection(current->mm, hva, reg_end); 1080 if (!vma) 1081 break; 1082 1083 /* 1084 * Take the intersection of this VMA with the memory region 1085 */ 1086 vm_start = max(hva, vma->vm_start); 1087 vm_end = min(reg_end, vma->vm_end); 1088 1089 if (!(vma->vm_flags & VM_PFNMAP)) { 1090 gpa_t gpa = addr + (vm_start - memslot->userspace_addr); 1091 kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true); 1092 } 1093 hva = vm_end; 1094 } while (hva < reg_end); 1095 } 1096 1097 /** 1098 * stage2_unmap_vm - Unmap Stage-2 RAM mappings 1099 * @kvm: The struct kvm pointer 1100 * 1101 * Go through the memregions and unmap any regular RAM 1102 * backing memory already mapped to the VM. 
1103 */ 1104 void stage2_unmap_vm(struct kvm *kvm) 1105 { 1106 struct kvm_memslots *slots; 1107 struct kvm_memory_slot *memslot; 1108 int idx, bkt; 1109 1110 idx = srcu_read_lock(&kvm->srcu); 1111 mmap_read_lock(current->mm); 1112 write_lock(&kvm->mmu_lock); 1113 1114 slots = kvm_memslots(kvm); 1115 kvm_for_each_memslot(memslot, bkt, slots) 1116 stage2_unmap_memslot(kvm, memslot); 1117 1118 kvm_nested_s2_unmap(kvm, true); 1119 1120 write_unlock(&kvm->mmu_lock); 1121 mmap_read_unlock(current->mm); 1122 srcu_read_unlock(&kvm->srcu, idx); 1123 } 1124 1125 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) 1126 { 1127 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 1128 struct kvm_pgtable *pgt = NULL; 1129 1130 write_lock(&kvm->mmu_lock); 1131 pgt = mmu->pgt; 1132 if (pgt) { 1133 mmu->pgd_phys = 0; 1134 mmu->pgt = NULL; 1135 free_percpu(mmu->last_vcpu_ran); 1136 } 1137 1138 if (kvm_is_nested_s2_mmu(kvm, mmu)) 1139 kvm_init_nested_s2_mmu(mmu); 1140 1141 write_unlock(&kvm->mmu_lock); 1142 1143 if (pgt) { 1144 kvm_stage2_destroy(pgt); 1145 kfree(pgt); 1146 } 1147 } 1148 1149 static void hyp_mc_free_fn(void *addr, void *mc) 1150 { 1151 struct kvm_hyp_memcache *memcache = mc; 1152 1153 if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) 1154 kvm_account_pgtable_pages(addr, -1); 1155 1156 free_page((unsigned long)addr); 1157 } 1158 1159 static void *hyp_mc_alloc_fn(void *mc) 1160 { 1161 struct kvm_hyp_memcache *memcache = mc; 1162 void *addr; 1163 1164 addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); 1165 if (addr && memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) 1166 kvm_account_pgtable_pages(addr, 1); 1167 1168 return addr; 1169 } 1170 1171 void free_hyp_memcache(struct kvm_hyp_memcache *mc) 1172 { 1173 if (!is_protected_kvm_enabled()) 1174 return; 1175 1176 kfree(mc->mapping); 1177 __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc); 1178 } 1179 1180 int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) 1181 { 1182 if (!is_protected_kvm_enabled()) 
1183 return 0; 1184 1185 if (!mc->mapping) { 1186 mc->mapping = kzalloc_obj(struct pkvm_mapping, 1187 GFP_KERNEL_ACCOUNT); 1188 if (!mc->mapping) 1189 return -ENOMEM; 1190 } 1191 1192 return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn, 1193 kvm_host_pa, mc); 1194 } 1195 1196 /** 1197 * kvm_phys_addr_ioremap - map a device range to guest IPA 1198 * 1199 * @kvm: The KVM pointer 1200 * @guest_ipa: The IPA at which to insert the mapping 1201 * @pa: The physical address of the device 1202 * @size: The size of the mapping 1203 * @writable: Whether or not to create a writable mapping 1204 */ 1205 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, 1206 phys_addr_t pa, unsigned long size, bool writable) 1207 { 1208 phys_addr_t addr; 1209 int ret = 0; 1210 struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO }; 1211 struct kvm_s2_mmu *mmu = &kvm->arch.mmu; 1212 struct kvm_pgtable *pgt = mmu->pgt; 1213 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE | 1214 KVM_PGTABLE_PROT_R | 1215 (writable ? 
KVM_PGTABLE_PROT_W : 0); 1216 1217 if (is_protected_kvm_enabled()) 1218 return -EPERM; 1219 1220 size += offset_in_page(guest_ipa); 1221 guest_ipa &= PAGE_MASK; 1222 1223 for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) { 1224 ret = kvm_mmu_topup_memory_cache(&cache, 1225 kvm_mmu_cache_min_pages(mmu)); 1226 if (ret) 1227 break; 1228 1229 write_lock(&kvm->mmu_lock); 1230 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE, 1231 pa, prot, &cache, 0); 1232 write_unlock(&kvm->mmu_lock); 1233 if (ret) 1234 break; 1235 1236 pa += PAGE_SIZE; 1237 } 1238 1239 kvm_mmu_free_memory_cache(&cache); 1240 return ret; 1241 } 1242 1243 /** 1244 * kvm_stage2_wp_range() - write protect stage2 memory region range 1245 * @mmu: The KVM stage-2 MMU pointer 1246 * @addr: Start address of range 1247 * @end: End address of range 1248 */ 1249 void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) 1250 { 1251 stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect)); 1252 } 1253 1254 /** 1255 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot 1256 * @kvm: The KVM pointer 1257 * @slot: The memory slot to write protect 1258 * 1259 * Called to start logging dirty pages after memory region 1260 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns 1261 * all present PUD, PMD and PTEs are write protected in the memory region. 1262 * Afterwards read of dirty page log can be called. 1263 * 1264 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, 1265 * serializing operations for VM memory regions. 
1266 */ 1267 static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) 1268 { 1269 struct kvm_memslots *slots = kvm_memslots(kvm); 1270 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); 1271 phys_addr_t start, end; 1272 1273 if (WARN_ON_ONCE(!memslot)) 1274 return; 1275 1276 start = memslot->base_gfn << PAGE_SHIFT; 1277 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1278 1279 write_lock(&kvm->mmu_lock); 1280 kvm_stage2_wp_range(&kvm->arch.mmu, start, end); 1281 kvm_nested_s2_wp(kvm); 1282 write_unlock(&kvm->mmu_lock); 1283 kvm_flush_remote_tlbs_memslot(kvm, memslot); 1284 } 1285 1286 /** 1287 * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE 1288 * pages for memory slot 1289 * @kvm: The KVM pointer 1290 * @slot: The memory slot to split 1291 * 1292 * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired, 1293 * serializing operations for VM memory regions. 1294 */ 1295 static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot) 1296 { 1297 struct kvm_memslots *slots; 1298 struct kvm_memory_slot *memslot; 1299 phys_addr_t start, end; 1300 1301 lockdep_assert_held(&kvm->slots_lock); 1302 1303 slots = kvm_memslots(kvm); 1304 memslot = id_to_memslot(slots, slot); 1305 1306 start = memslot->base_gfn << PAGE_SHIFT; 1307 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1308 1309 write_lock(&kvm->mmu_lock); 1310 kvm_mmu_split_huge_pages(kvm, start, end); 1311 write_unlock(&kvm->mmu_lock); 1312 } 1313 1314 /* 1315 * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages. 1316 * @kvm: The KVM pointer 1317 * @slot: The memory slot associated with mask 1318 * @gfn_offset: The gfn offset in memory slot 1319 * @mask: The mask of pages at offset 'gfn_offset' in this memory 1320 * slot to enable dirty logging on 1321 * 1322 * Writes protect selected pages to enable dirty logging, and then 1323 * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock. 
1324 */ 1325 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, 1326 struct kvm_memory_slot *slot, 1327 gfn_t gfn_offset, unsigned long mask) 1328 { 1329 phys_addr_t base_gfn = slot->base_gfn + gfn_offset; 1330 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; 1331 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; 1332 1333 lockdep_assert_held_write(&kvm->mmu_lock); 1334 1335 kvm_stage2_wp_range(&kvm->arch.mmu, start, end); 1336 1337 /* 1338 * Eager-splitting is done when manual-protect is set. We 1339 * also check for initially-all-set because we can avoid 1340 * eager-splitting if initially-all-set is false. 1341 * Initially-all-set equal false implies that huge-pages were 1342 * already split when enabling dirty logging: no need to do it 1343 * again. 1344 */ 1345 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) 1346 kvm_mmu_split_huge_pages(kvm, start, end); 1347 1348 kvm_nested_s2_wp(kvm); 1349 } 1350 1351 static void kvm_send_hwpoison_signal(unsigned long address, short lsb) 1352 { 1353 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); 1354 } 1355 1356 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, 1357 unsigned long hva, 1358 unsigned long map_size) 1359 { 1360 gpa_t gpa_start; 1361 hva_t uaddr_start, uaddr_end; 1362 size_t size; 1363 1364 /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */ 1365 if (map_size == PAGE_SIZE) 1366 return true; 1367 1368 /* pKVM only supports PMD_SIZE huge-mappings */ 1369 if (is_protected_kvm_enabled() && map_size != PMD_SIZE) 1370 return false; 1371 1372 size = memslot->npages * PAGE_SIZE; 1373 1374 gpa_start = memslot->base_gfn << PAGE_SHIFT; 1375 1376 uaddr_start = memslot->userspace_addr; 1377 uaddr_end = uaddr_start + size; 1378 1379 /* 1380 * Pages belonging to memslots that don't have the same alignment 1381 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2 1382 * PMD/PUD entries, because we'll 
end up mapping the wrong pages. 1383 * 1384 * Consider a layout like the following: 1385 * 1386 * memslot->userspace_addr: 1387 * +-----+--------------------+--------------------+---+ 1388 * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| 1389 * +-----+--------------------+--------------------+---+ 1390 * 1391 * memslot->base_gfn << PAGE_SHIFT: 1392 * +---+--------------------+--------------------+-----+ 1393 * |abc|def Stage-2 block | Stage-2 block |tvxyz| 1394 * +---+--------------------+--------------------+-----+ 1395 * 1396 * If we create those stage-2 blocks, we'll end up with this incorrect 1397 * mapping: 1398 * d -> f 1399 * e -> g 1400 * f -> h 1401 */ 1402 if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) 1403 return false; 1404 1405 /* 1406 * Next, let's make sure we're not trying to map anything not covered 1407 * by the memslot. This means we have to prohibit block size mappings 1408 * for the beginning and end of a non-block aligned and non-block sized 1409 * memory slot (illustrated by the head and tail parts of the 1410 * userspace view above containing pages 'abcde' and 'xyz', 1411 * respectively). 1412 * 1413 * Note that it doesn't matter if we do the check using the 1414 * userspace_addr or the base_gfn, as both are equally aligned (per 1415 * the check above) and equally sized. 1416 */ 1417 return (hva & ~(map_size - 1)) >= uaddr_start && 1418 (hva & ~(map_size - 1)) + map_size <= uaddr_end; 1419 } 1420 1421 /* 1422 * Check if the given hva is backed by a transparent huge page (THP) and 1423 * whether it can be mapped using block mapping in stage2. If so, adjust 1424 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently 1425 * supported. This will need to be updated to support other THP sizes. 1426 * 1427 * Returns the size of the mapping. 
1428 */ 1429 static long 1430 transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, 1431 unsigned long hva, kvm_pfn_t *pfnp, gfn_t *gfnp) 1432 { 1433 kvm_pfn_t pfn = *pfnp; 1434 gfn_t gfn = *gfnp; 1435 1436 /* 1437 * Make sure the adjustment is done only for THP pages. Also make 1438 * sure that the HVA and IPA are sufficiently aligned and that the 1439 * block map is contained within the memslot. 1440 */ 1441 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { 1442 int sz = get_user_mapping_size(kvm, hva); 1443 1444 if (sz < 0) 1445 return sz; 1446 1447 if (sz < PMD_SIZE) 1448 return PAGE_SIZE; 1449 1450 gfn &= ~(PTRS_PER_PMD - 1); 1451 *gfnp = gfn; 1452 pfn &= ~(PTRS_PER_PMD - 1); 1453 *pfnp = pfn; 1454 1455 return PMD_SIZE; 1456 } 1457 1458 /* Use page mapping if we cannot use block mapping. */ 1459 return PAGE_SIZE; 1460 } 1461 1462 static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva) 1463 { 1464 unsigned long pa; 1465 1466 if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP)) 1467 return huge_page_shift(hstate_vma(vma)); 1468 1469 if (!(vma->vm_flags & VM_PFNMAP)) 1470 return PAGE_SHIFT; 1471 1472 VM_BUG_ON(is_vm_hugetlb_page(vma)); 1473 1474 pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start); 1475 1476 #ifndef __PAGETABLE_PMD_FOLDED 1477 if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) && 1478 ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start && 1479 ALIGN(hva, PUD_SIZE) <= vma->vm_end) 1480 return PUD_SHIFT; 1481 #endif 1482 1483 if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) && 1484 ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start && 1485 ALIGN(hva, PMD_SIZE) <= vma->vm_end) 1486 return PMD_SHIFT; 1487 1488 return PAGE_SHIFT; 1489 } 1490 1491 /* 1492 * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be 1493 * able to see the page's tags and therefore they must be initialised first. If 1494 * PG_mte_tagged is set, tags have already been initialised. 
1495 * 1496 * Must be called with kvm->mmu_lock held to ensure the memory remains mapped 1497 * while the tags are zeroed. 1498 */ 1499 static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, 1500 unsigned long size) 1501 { 1502 unsigned long i, nr_pages = size >> PAGE_SHIFT; 1503 struct page *page = pfn_to_page(pfn); 1504 struct folio *folio = page_folio(page); 1505 1506 if (!kvm_has_mte(kvm)) 1507 return; 1508 1509 if (is_zero_pfn(pfn)) { 1510 WARN_ON_ONCE(nr_pages != 1); 1511 return; 1512 } 1513 1514 if (folio_test_hugetlb(folio)) { 1515 /* Hugetlb has MTE flags set on head page only */ 1516 if (folio_try_hugetlb_mte_tagging(folio)) { 1517 for (i = 0; i < nr_pages; i++, page++) 1518 mte_clear_page_tags(page_address(page)); 1519 folio_set_hugetlb_mte_tagged(folio); 1520 } 1521 return; 1522 } 1523 1524 for (i = 0; i < nr_pages; i++, page++) { 1525 if (try_page_mte_tagging(page)) { 1526 mte_clear_page_tags(page_address(page)); 1527 set_page_mte_tagged(page); 1528 } 1529 } 1530 } 1531 1532 static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) 1533 { 1534 return vma->vm_flags & VM_MTE_ALLOWED; 1535 } 1536 1537 static bool kvm_vma_is_cacheable(struct vm_area_struct *vma) 1538 { 1539 switch (FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(vma->vm_page_prot))) { 1540 case MT_NORMAL_NC: 1541 case MT_DEVICE_nGnRnE: 1542 case MT_DEVICE_nGnRE: 1543 return false; 1544 default: 1545 return true; 1546 } 1547 } 1548 1549 static void *get_mmu_memcache(struct kvm_vcpu *vcpu) 1550 { 1551 if (!is_protected_kvm_enabled()) 1552 return &vcpu->arch.mmu_page_cache; 1553 else 1554 return &vcpu->arch.pkvm_memcache; 1555 } 1556 1557 static int topup_mmu_memcache(struct kvm_vcpu *vcpu, void *memcache) 1558 { 1559 int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); 1560 1561 if (!is_protected_kvm_enabled()) 1562 return kvm_mmu_topup_memory_cache(memcache, min_pages); 1563 1564 return topup_hyp_memcache(memcache, min_pages); 1565 } 1566 1567 /* 1568 * Potentially reduce shadow S2 
permissions to match the guest's own S2. For
 * exec faults, we'd only reach this point if the guest actually allowed it (see
 * kvm_s2_handle_perm_fault).
 *
 * Also encode the level of the original translation in the SW bits of the leaf
 * entry as a proxy for the span of that translation. This will be retrieved on
 * TLB invalidation from the guest and used to limit the invalidation scope if a
 * TTL hint or a range isn't provided.
 */
static enum kvm_pgtable_prot adjust_nested_fault_perms(struct kvm_s2_trans *nested,
						       enum kvm_pgtable_prot prot)
{
	if (!kvm_s2_trans_writable(nested))
		prot &= ~KVM_PGTABLE_PROT_W;
	if (!kvm_s2_trans_readable(nested))
		prot &= ~KVM_PGTABLE_PROT_R;

	return prot | kvm_encode_nested_level(nested);
}

/*
 * Clear the EL0 (UX) and EL1 (PX) execute permissions from @prot when the
 * guest's own stage-2 translation does not grant them, and return the
 * resulting permissions.
 */
static enum kvm_pgtable_prot adjust_nested_exec_perms(struct kvm *kvm,
						      struct kvm_s2_trans *nested,
						      enum kvm_pgtable_prot prot)
{
	if (!kvm_s2_trans_exec_el0(kvm, nested))
		prot &= ~KVM_PGTABLE_PROT_UX;
	if (!kvm_s2_trans_exec_el1(kvm, nested))
		prot &= ~KVM_PGTABLE_PROT_PX;

	return prot;
}

/*
 * Description of a stage-2 fault, as passed to the fault handlers below.
 * @nested is NULL when there is no guest stage-2 translation to take into
 * account (the handlers test it before every use).
 */
struct kvm_s2_fault_desc {
	struct kvm_vcpu		*vcpu;
	phys_addr_t		fault_ipa;
	struct kvm_s2_trans	*nested;
	struct kvm_memory_slot	*memslot;
	unsigned long		hva;
};

/*
 * Handle a stage-2 fault whose PFN is obtained through kvm_gmem_get_pfn().
 * The mapping is always PAGE_SIZE. Returns 0 when the fault was handled or
 * needs to be retried (-EAGAIN is turned into 0), and a negative error code
 * otherwise.
 */
static int gmem_abort(const struct kvm_s2_fault_desc *s2fd)
{
	bool write_fault, exec_fault;
	bool perm_fault = kvm_vcpu_trap_is_permission_fault(s2fd->vcpu);
	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
	struct kvm_pgtable *pgt = s2fd->vcpu->arch.hw_mmu->pgt;
	unsigned long mmu_seq;
	struct page *page;
	struct kvm *kvm = s2fd->vcpu->kvm;
	void *memcache = NULL;
	kvm_pfn_t pfn;
	gfn_t gfn;
	int ret;

	/* The memcache is only topped up (and used) for non-permission faults. */
	if (!perm_fault) {
		memcache = get_mmu_memcache(s2fd->vcpu);
		ret = topup_mmu_memcache(s2fd->vcpu, memcache);
		if (ret)
			return ret;
	}

	/* With a nested translation, the gfn comes from its output address. */
	if (s2fd->nested)
		gfn = kvm_s2_trans_output(s2fd->nested) >> PAGE_SHIFT;
	else
		gfn = s2fd->fault_ipa >> PAGE_SHIFT;

	write_fault = kvm_is_write_fault(s2fd->vcpu);
	exec_fault = kvm_vcpu_trap_is_exec_fault(s2fd->vcpu);

	VM_WARN_ON_ONCE(write_fault && exec_fault);

	/* Snapshot the invalidation sequence before looking up the PFN. */
	mmu_seq = kvm->mmu_invalidate_seq;
	/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
	smp_rmb();

	ret = kvm_gmem_get_pfn(kvm, s2fd->memslot, gfn, &pfn, &page, NULL);
	if (ret) {
		kvm_prepare_memory_fault_exit(s2fd->vcpu, s2fd->fault_ipa, PAGE_SIZE,
					      write_fault, exec_fault, false);
		return ret;
	}

	if (!(s2fd->memslot->flags & KVM_MEM_READONLY))
		prot |= KVM_PGTABLE_PROT_W;

	if (s2fd->nested)
		prot = adjust_nested_fault_perms(s2fd->nested, prot);

	if (exec_fault || cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
		prot |= KVM_PGTABLE_PROT_X;

	/* Applied after X has been added so that it can be taken away again. */
	if (s2fd->nested)
		prot = adjust_nested_exec_perms(kvm, s2fd->nested, prot);

	kvm_fault_lock(kvm);
	/* An invalidation raced with the PFN lookup: drop the page and retry. */
	if (mmu_invalidate_retry(kvm, mmu_seq)) {
		ret = -EAGAIN;
		goto out_unlock;
	}

	if (perm_fault) {
		/*
		 * Drop the SW bits in favour of those stored in the
		 * PTE, which will be preserved.
		 */
		prot &= ~KVM_NV_GUEST_MAP_SZ;
		ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, s2fd->fault_ipa,
								 prot, flags);
	} else {
		ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, s2fd->fault_ipa, PAGE_SIZE,
							 __pfn_to_phys(pfn), prot,
							 memcache, flags);
	}

out_unlock:
	kvm_release_faultin_page(kvm, page, !!ret, prot & KVM_PGTABLE_PROT_W);
	kvm_fault_unlock(kvm);

	/* Only a successfully installed writable mapping dirties the page. */
	if ((prot & KVM_PGTABLE_PROT_W) && !ret)
		mark_page_dirty_in_slot(kvm, s2fd->memslot, gfn);

	return ret != -EAGAIN ? ret : 0;
}

/*
 * Per-fault state gathered from the VMA and the PFN lookup by
 * kvm_s2_fault_get_vma_info() and kvm_s2_fault_pin_pfn(), and consumed by
 * kvm_s2_fault_compute_prot() and kvm_s2_fault_map().
 */
struct kvm_s2_fault_vma_info {
	unsigned long	mmu_seq;
	long		vma_pagesize;
	vm_flags_t	vm_flags;
	unsigned long	max_map_size;
	struct page	*page;
	kvm_pfn_t	pfn;
	gfn_t		gfn;
	bool		device;
	bool		mte_allowed;
	bool		is_vma_cacheable;
	bool		map_writable;
	bool		map_non_cacheable;
};

/*
 * Handle a stage-2 fault by pinning a single page at s2fd->hva and mapping
 * it RWX through pkvm_pgtable_stage2_map(). The page is accounted as locked
 * memory; on any failure the pin and the accounting are undone. Returns 0
 * on success, when a hwpoison signal was sent, or when the map operation
 * returned -EAGAIN; a negative error code otherwise.
 */
static int pkvm_mem_abort(const struct kvm_s2_fault_desc *s2fd)
{
	unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
	struct kvm_vcpu *vcpu = s2fd->vcpu;
	struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
	struct mm_struct *mm = current->mm;
	struct kvm *kvm = vcpu->kvm;
	void *hyp_memcache;
	struct page *page;
	int ret;

	hyp_memcache = get_mmu_memcache(vcpu);
	ret = topup_mmu_memcache(vcpu, hyp_memcache);
	if (ret)
		return -ENOMEM;

	ret = account_locked_vm(mm, 1, true);
	if (ret)
		return ret;

	mmap_read_lock(mm);
	ret = pin_user_pages(s2fd->hva, 1, flags, &page);
	mmap_read_unlock(mm);

	if (ret == -EHWPOISON) {
		kvm_send_hwpoison_signal(s2fd->hva, PAGE_SHIFT);
		ret = 0;
		goto dec_account;
	} else if (ret != 1) {
		ret = -EFAULT;
		goto dec_account;
	} else if (!folio_test_swapbacked(page_folio(page))) {
		/*
		 * We really can't deal with page-cache pages returned by GUP
		 * because (a) we may trigger writeback of a page for which we
		 * no longer have access and (b) page_mkclean() won't find the
		 * stage-2 mapping in the rmap so we can get out-of-whack with
		 * the filesystem when marking the page dirty during unpinning
		 * (see cc5095747edf ("ext4: don't BUG if someone dirty pages
		 * without asking ext4 first")).
		 *
		 * Ideally we'd just restrict ourselves to anonymous pages, but
		 * we also want to allow memfd (i.e. shmem) pages, so check for
		 * pages backed by swap in the knowledge that the GUP pin will
		 * prevent try_to_unmap() from succeeding.
		 */
		ret = -EIO;
		goto unpin;
	}

	write_lock(&kvm->mmu_lock);
	ret = pkvm_pgtable_stage2_map(pgt, s2fd->fault_ipa, PAGE_SIZE,
				      page_to_phys(page), KVM_PGTABLE_PROT_RWX,
				      hyp_memcache, 0);
	write_unlock(&kvm->mmu_lock);
	if (ret) {
		/* -EAGAIN is not reported as an error, but the pin is dropped. */
		if (ret == -EAGAIN)
			ret = 0;
		goto unpin;
	}

	/* Success: the pin and the locked-vm accounting are kept. */
	return 0;
unpin:
	unpin_user_pages(&page, 1);
dec_account:
	account_locked_vm(mm, 1, false);
	return ret;
}

/*
 * Work out the page shift to use for the fault described by @s2fd and
 * record the largest permitted mapping size in s2vi->max_map_size. The
 * shift starts from the VMA (or PAGE_SHIFT when the memslot is logging),
 * is demoted until the memslot layout supports a block of that size, and
 * is further capped by the guest's own stage-2 mapping size when @s2fd
 * carries a nested translation.
 */
static short kvm_s2_resolve_vma_size(const struct kvm_s2_fault_desc *s2fd,
				     struct kvm_s2_fault_vma_info *s2vi,
				     struct vm_area_struct *vma)
{
	short vma_shift;

	if (memslot_is_logging(s2fd->memslot)) {
		s2vi->max_map_size = PAGE_SIZE;
		vma_shift = PAGE_SHIFT;
	} else {
		s2vi->max_map_size = PUD_SIZE;
		vma_shift = get_vma_page_shift(vma, s2fd->hva);
	}

	/* Each case falls through to the next smaller size when unsupported. */
	switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
	case PUD_SHIFT:
		if (fault_supports_stage2_huge_mapping(s2fd->memslot, s2fd->hva, PUD_SIZE))
			break;
		fallthrough;
#endif
	case CONT_PMD_SHIFT:
		vma_shift = PMD_SHIFT;
		fallthrough;
	case PMD_SHIFT:
		if (fault_supports_stage2_huge_mapping(s2fd->memslot, s2fd->hva, PMD_SIZE))
			break;
		fallthrough;
	case CONT_PTE_SHIFT:
		vma_shift = PAGE_SHIFT;
		s2vi->max_map_size = PAGE_SIZE;
		fallthrough;
	case PAGE_SHIFT:
		break;
	default:
		WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
	}

	if (s2fd->nested) {
		unsigned long max_map_size;

		max_map_size = min(s2vi->max_map_size, PUD_SIZE);

		/*
		 * If we're about to create a shadow stage 2 entry, then we
		 * can only create a block mapping if the guest stage 2 page
		 * table uses at least as big a mapping.
		 */
		max_map_size = min(kvm_s2_trans_size(s2fd->nested), max_map_size);

		/*
		 * Be careful that if the mapping size falls between
		 * two host sizes, take the smallest of the two.
		 */
		if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE)
			max_map_size = PMD_SIZE;
		else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE)
			max_map_size = PAGE_SIZE;

		s2vi->max_map_size = max_map_size;
		vma_shift = min_t(short, vma_shift, __ffs(max_map_size));
	}

	return vma_shift;
}

/* True when the fault described by @s2fd is a permission fault. */
static bool kvm_s2_fault_is_perm(const struct kvm_s2_fault_desc *s2fd)
{
	return kvm_vcpu_trap_is_permission_fault(s2fd->vcpu);
}

/*
 * Look up the VMA backing s2fd->hva under the mmap read lock and fill in
 * the VMA-derived fields of @s2vi (mapping size, gfn, MTE/cacheability
 * attributes, VMA flags) together with a snapshot of the MMU invalidation
 * sequence. Returns 0 on success or -EFAULT when no VMA covers the address.
 */
static int kvm_s2_fault_get_vma_info(const struct kvm_s2_fault_desc *s2fd,
				     struct kvm_s2_fault_vma_info *s2vi)
{
	struct vm_area_struct *vma;
	struct kvm *kvm = s2fd->vcpu->kvm;

	mmap_read_lock(current->mm);
	vma = vma_lookup(current->mm, s2fd->hva);
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", s2fd->hva);
		mmap_read_unlock(current->mm);
		return -EFAULT;
	}

	s2vi->vma_pagesize = BIT(kvm_s2_resolve_vma_size(s2fd, s2vi, vma));

	/*
	 * Both the canonical IPA and fault IPA must be aligned to the
	 * mapping size to ensure we find the right PFN and lay down the
	 * mapping in the right place.
	 */
	s2vi->gfn = ALIGN_DOWN(s2fd->fault_ipa, s2vi->vma_pagesize) >> PAGE_SHIFT;

	s2vi->mte_allowed = kvm_vma_mte_allowed(vma);

	s2vi->vm_flags = vma->vm_flags;

	s2vi->is_vma_cacheable = kvm_vma_is_cacheable(vma);

	/*
	 * Read mmu_invalidate_seq so that KVM can detect if the results of
	 * vma_lookup() or __kvm_faultin_pfn() become stale prior to
	 * acquiring kvm->mmu_lock.
	 *
	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
	 * with the smp_wmb() in kvm_mmu_invalidate_end().
	 */
	s2vi->mmu_seq = kvm->mmu_invalidate_seq;
	mmap_read_unlock(current->mm);

	return 0;
}

/*
 * Return the gfn to use for the PFN lookup and for dirty tracking: the
 * fault gfn when there is no nested translation, otherwise the output
 * address of the nested translation aligned down to the mapping size.
 */
static gfn_t get_canonical_gfn(const struct kvm_s2_fault_desc *s2fd,
			       const struct kvm_s2_fault_vma_info *s2vi)
{
	phys_addr_t ipa;

	if (!s2fd->nested)
		return s2vi->gfn;

	ipa = kvm_s2_trans_output(s2fd->nested);
	return ALIGN_DOWN(ipa, s2vi->vma_pagesize) >> PAGE_SHIFT;
}

/*
 * Gather the VMA information and fault in the PFN for @s2fd.
 *
 * Returns 1 when the PFN was obtained and fault handling should continue,
 * 0 when the fault needs no further handling (a hwpoison signal has been
 * sent), and a negative error code on failure.
 */
static int kvm_s2_fault_pin_pfn(const struct kvm_s2_fault_desc *s2fd,
				struct kvm_s2_fault_vma_info *s2vi)
{
	int ret;

	ret = kvm_s2_fault_get_vma_info(s2fd, s2vi);
	if (ret)
		return ret;

	s2vi->pfn = __kvm_faultin_pfn(s2fd->memslot, get_canonical_gfn(s2fd, s2vi),
				      kvm_is_write_fault(s2fd->vcpu) ? FOLL_WRITE : 0,
				      &s2vi->map_writable, &s2vi->page);
	if (unlikely(is_error_noslot_pfn(s2vi->pfn))) {
		if (s2vi->pfn == KVM_PFN_ERR_HWPOISON) {
			kvm_send_hwpoison_signal(s2fd->hva, __ffs(s2vi->vma_pagesize));
			return 0;
		}
		return -EFAULT;
	}

	/*
	 * Check if this is non-struct page memory PFN, and cannot support
	 * CMOs. It could potentially be unsafe to access as cacheable.
	 */
	if (s2vi->vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(s2vi->pfn)) {
		if (s2vi->is_vma_cacheable) {
			/*
			 * Whilst the VMA owner expects cacheable mapping to this
			 * PFN, hardware also has to support the FWB and CACHE DIC
			 * features.
			 *
			 * ARM64 KVM relies on kernel VA mapping to the PFN to
			 * perform cache maintenance as the CMO instructions work on
			 * virtual addresses. VM_PFNMAP region are not necessarily
			 * mapped to a KVA and hence the presence of hardware features
			 * S2FWB and CACHE DIC are mandatory to avoid the need for
			 * cache maintenance.
			 */
			if (!kvm_supports_cacheable_pfnmap()) {
				kvm_release_faultin_page(s2fd->vcpu->kvm, s2vi->page, true, false);
				return -EFAULT;
			}
		} else {
			/*
			 * If the page was identified as device early by looking at
			 * the VMA flags, vma_pagesize is already representing the
			 * largest quantity we can map.  If instead it was mapped
			 * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
			 * and must not be upgraded.
			 *
			 * In both cases, we don't let transparent_hugepage_adjust()
			 * change things at the last minute.
			 */
			s2vi->map_non_cacheable = true;
		}

		s2vi->device = true;
	}

	return 1;
}

/*
 * Compute the stage-2 permissions and memory attributes for the fault and
 * store them in @prot.
 *
 * Returns 0 when @prot is valid, 1 when an abort has been injected into
 * the guest instead (@prot is left untouched), and a negative error code
 * when the fault cannot be satisfied.
 */
static int kvm_s2_fault_compute_prot(const struct kvm_s2_fault_desc *s2fd,
				     const struct kvm_s2_fault_vma_info *s2vi,
				     enum kvm_pgtable_prot *prot)
{
	struct kvm *kvm = s2fd->vcpu->kvm;

	/* Execution from a non-cacheable mapping is refused. */
	if (kvm_vcpu_trap_is_exec_fault(s2fd->vcpu) && s2vi->map_non_cacheable)
		return -ENOEXEC;

	/*
	 * Guest performs atomic/exclusive operations on memory with unsupported
	 * attributes (e.g. ld64b/st64b on normal memory when no FEAT_LS64WB)
	 * and trigger the exception here. Since the memslot is valid, inject
	 * the fault back to the guest.
	 */
	if (esr_fsc_is_excl_atomic_fault(kvm_vcpu_get_esr(s2fd->vcpu))) {
		kvm_inject_dabt_excl_atomic(s2fd->vcpu, kvm_vcpu_get_hfar(s2fd->vcpu));
		return 1;
	}

	*prot = KVM_PGTABLE_PROT_R;

	/*
	 * Write permission needs a writable PFN and, for a non-device page
	 * in a logging memslot, an actual write fault.
	 */
	if (s2vi->map_writable && (s2vi->device ||
				   !memslot_is_logging(s2fd->memslot) ||
				   kvm_is_write_fault(s2fd->vcpu)))
		*prot |= KVM_PGTABLE_PROT_W;

	if (s2fd->nested)
		*prot = adjust_nested_fault_perms(s2fd->nested, *prot);

	if (kvm_vcpu_trap_is_exec_fault(s2fd->vcpu))
		*prot |= KVM_PGTABLE_PROT_X;

	if (s2vi->map_non_cacheable)
		*prot |= (s2vi->vm_flags & VM_ALLOW_ANY_UNCACHED) ?
			 KVM_PGTABLE_PROT_NORMAL_NC : KVM_PGTABLE_PROT_DEVICE;
	else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
		*prot |= KVM_PGTABLE_PROT_X;

	/* Applied after X has been added so that it can be taken away again. */
	if (s2fd->nested)
		*prot = adjust_nested_exec_perms(kvm, s2fd->nested, *prot);

	if (!kvm_s2_fault_is_perm(s2fd) && !s2vi->map_non_cacheable && kvm_has_mte(kvm)) {
		/* Check the VMM hasn't introduced a new disallowed VMA */
		if (!s2vi->mte_allowed)
			return -EFAULT;
	}

	return 0;
}

/*
 * Install (or relax the permissions of) the stage-2 mapping for the fault,
 * under the fault lock, then release the faulted-in page and update dirty
 * tracking. Returns 0 on success or when the fault must simply be retried
 * (-EAGAIN is turned into 0), and a negative error code otherwise.
 */
static int kvm_s2_fault_map(const struct kvm_s2_fault_desc *s2fd,
			    const struct kvm_s2_fault_vma_info *s2vi,
			    enum kvm_pgtable_prot prot,
			    void *memcache)
{
	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
	bool writable = prot & KVM_PGTABLE_PROT_W;
	struct kvm *kvm = s2fd->vcpu->kvm;
	struct kvm_pgtable *pgt;
	long perm_fault_granule;
	long mapping_size;
	kvm_pfn_t pfn;
	gfn_t gfn;
	int ret;

	kvm_fault_lock(kvm);
	pgt = s2fd->vcpu->arch.hw_mmu->pgt;
	/* Default outcome if the sequence check below fails. */
	ret = -EAGAIN;
	if (mmu_invalidate_retry(kvm, s2vi->mmu_seq))
		goto out_unlock;

	/* Zero for anything other than a permission fault. */
	perm_fault_granule = (kvm_s2_fault_is_perm(s2fd) ?
			      kvm_vcpu_trap_get_perm_fault_granule(s2fd->vcpu) : 0);
	mapping_size = s2vi->vma_pagesize;
	pfn = s2vi->pfn;
	gfn = s2vi->gfn;

	/*
	 * If we are not forced to use page mapping, check if we are
	 * backed by a THP and thus use block mapping if possible.
	 */
	if (mapping_size == PAGE_SIZE &&
	    !(s2vi->max_map_size == PAGE_SIZE || s2vi->map_non_cacheable)) {
		if (perm_fault_granule > PAGE_SIZE) {
			mapping_size = perm_fault_granule;
		} else {
			mapping_size = transparent_hugepage_adjust(kvm, s2fd->memslot,
								   s2fd->hva, &pfn,
								   &gfn);
			if (mapping_size < 0) {
				ret = mapping_size;
				goto out_unlock;
			}
		}
	}

	if (!perm_fault_granule && !s2vi->map_non_cacheable && kvm_has_mte(kvm))
		sanitise_mte_tags(kvm, pfn, mapping_size);

	/*
	 * Under the premise of getting a FSC_PERM fault, we just need to relax
	 * permissions only if mapping_size equals perm_fault_granule. Otherwise,
	 * kvm_pgtable_stage2_map() should be called to change block size.
	 */
	if (mapping_size == perm_fault_granule) {
		/*
		 * Drop the SW bits in favour of those stored in the
		 * PTE, which will be preserved.
		 */
		prot &= ~KVM_NV_GUEST_MAP_SZ;
		ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, gfn_to_gpa(gfn),
								 prot, flags);
	} else {
		ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, gfn_to_gpa(gfn), mapping_size,
							 __pfn_to_phys(pfn), prot,
							 memcache, flags);
	}

out_unlock:
	kvm_release_faultin_page(kvm, s2vi->page, !!ret, writable);
	kvm_fault_unlock(kvm);

	/*
	 * Mark the page dirty only if the fault is handled successfully,
	 * making sure we adjust the canonical IPA if the mapping size has
	 * been updated (via a THP upgrade, for example).
	 */
	if (writable && !ret) {
		phys_addr_t ipa = gfn_to_gpa(get_canonical_gfn(s2fd, s2vi));
		ipa &= ~(mapping_size - 1);
		mark_page_dirty_in_slot(kvm, s2fd->memslot, gpa_to_gfn(ipa));
	}

	if (ret != -EAGAIN)
		return ret;
	return 0;
}

/*
 * Handle a stage-2 fault on memory backed by a userspace mapping: top up
 * the memcache when needed, fault in the PFN, compute the permissions and
 * install the mapping. Returns whatever the first step that does not ask
 * to continue returns, or the result of kvm_s2_fault_map().
 */
static int user_mem_abort(const struct kvm_s2_fault_desc *s2fd)
{
	bool perm_fault = kvm_vcpu_trap_is_permission_fault(s2fd->vcpu);
	struct kvm_s2_fault_vma_info s2vi = {};
	enum kvm_pgtable_prot prot;
	void *memcache;
	int ret;

	/*
	 * Permission faults just need to update the existing leaf entry,
	 * and so normally don't require allocations from the memcache. The
	 * only exception to this is when dirty logging is enabled at runtime
	 * and a write fault needs to collapse a block entry into a table.
	 */
	memcache = get_mmu_memcache(s2fd->vcpu);
	if (!perm_fault || (memslot_is_logging(s2fd->memslot) &&
			    kvm_is_write_fault(s2fd->vcpu))) {
		ret = topup_mmu_memcache(s2fd->vcpu, memcache);
		if (ret)
			return ret;
	}

	/*
	 * Let's check if we will get back a huge page backed by hugetlbfs, or
	 * get block mapping for device MMIO region.
	 */
	ret = kvm_s2_fault_pin_pfn(s2fd, &s2vi);
	/* Anything but 1 means there is nothing more to do here. */
	if (ret != 1)
		return ret;

	ret = kvm_s2_fault_compute_prot(s2fd, &s2vi, &prot);
	if (ret) {
		/* The page was faulted in but will not be mapped. */
		kvm_release_page_unused(s2vi.page);
		return ret;
	}

	return kvm_s2_fault_map(s2fd, &s2vi, prot, memcache);
}

/* Resolve the access fault by making the page young again.
*/ 2144 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) 2145 { 2146 enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED; 2147 struct kvm_s2_mmu *mmu; 2148 2149 trace_kvm_access_fault(fault_ipa); 2150 2151 read_lock(&vcpu->kvm->mmu_lock); 2152 mmu = vcpu->arch.hw_mmu; 2153 KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags); 2154 read_unlock(&vcpu->kvm->mmu_lock); 2155 } 2156 2157 /* 2158 * Returns true if the SEA should be handled locally within KVM if the abort 2159 * is caused by a kernel memory allocation (e.g. stage-2 table memory). 2160 */ 2161 static bool host_owns_sea(struct kvm_vcpu *vcpu, u64 esr) 2162 { 2163 /* 2164 * Without FEAT_RAS HCR_EL2.TEA is RES0, meaning any external abort 2165 * taken from a guest EL to EL2 is due to a host-imposed access (e.g. 2166 * stage-2 PTW). 2167 */ 2168 if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) 2169 return true; 2170 2171 /* KVM owns the VNCR when the vCPU isn't in a nested context. */ 2172 if (is_hyp_ctxt(vcpu) && !kvm_vcpu_trap_is_iabt(vcpu) && (esr & ESR_ELx_VNCR)) 2173 return true; 2174 2175 /* 2176 * Determining if an external abort during a table walk happened at 2177 * stage-2 is only possible with S1PTW is set. Otherwise, since KVM 2178 * sets HCR_EL2.TEA, SEAs due to a stage-1 walk (i.e. accessing the 2179 * PA of the stage-1 descriptor) can reach here and are reported 2180 * with a TTW ESR value. 2181 */ 2182 return (esr_fsc_is_sea_ttw(esr) && (esr & ESR_ELx_S1PTW)); 2183 } 2184 2185 int kvm_handle_guest_sea(struct kvm_vcpu *vcpu) 2186 { 2187 struct kvm *kvm = vcpu->kvm; 2188 struct kvm_run *run = vcpu->run; 2189 u64 esr = kvm_vcpu_get_esr(vcpu); 2190 u64 esr_mask = ESR_ELx_EC_MASK | 2191 ESR_ELx_IL | 2192 ESR_ELx_FnV | 2193 ESR_ELx_EA | 2194 ESR_ELx_CM | 2195 ESR_ELx_WNR | 2196 ESR_ELx_FSC; 2197 u64 ipa; 2198 2199 /* 2200 * Give APEI the opportunity to claim the abort before handling it 2201 * within KVM. 
apei_claim_sea() expects to be called with IRQs enabled. 2202 */ 2203 lockdep_assert_irqs_enabled(); 2204 if (apei_claim_sea(NULL) == 0) 2205 return 1; 2206 2207 if (host_owns_sea(vcpu, esr) || 2208 !test_bit(KVM_ARCH_FLAG_EXIT_SEA, &vcpu->kvm->arch.flags)) 2209 return kvm_inject_serror(vcpu); 2210 2211 /* ESR_ELx.SET is RES0 when FEAT_RAS isn't implemented. */ 2212 if (kvm_has_ras(kvm)) 2213 esr_mask |= ESR_ELx_SET_MASK; 2214 2215 /* 2216 * Exit to userspace, and provide faulting guest virtual and physical 2217 * addresses in case userspace wants to emulate SEA to guest by 2218 * writing to FAR_ELx and HPFAR_ELx registers. 2219 */ 2220 memset(&run->arm_sea, 0, sizeof(run->arm_sea)); 2221 run->exit_reason = KVM_EXIT_ARM_SEA; 2222 run->arm_sea.esr = esr & esr_mask; 2223 2224 if (!(esr & ESR_ELx_FnV)) 2225 run->arm_sea.gva = kvm_vcpu_get_hfar(vcpu); 2226 2227 ipa = kvm_vcpu_get_fault_ipa(vcpu); 2228 if (ipa != INVALID_GPA) { 2229 run->arm_sea.flags |= KVM_EXIT_ARM_SEA_FLAG_GPA_VALID; 2230 run->arm_sea.gpa = ipa; 2231 } 2232 2233 return 0; 2234 } 2235 2236 /** 2237 * kvm_handle_guest_abort - handles all 2nd stage aborts 2238 * @vcpu: the VCPU pointer 2239 * 2240 * Any abort that gets to the host is almost guaranteed to be caused by a 2241 * missing second stage translation table entry, which can mean that either the 2242 * guest simply needs more memory and we must allocate an appropriate page or it 2243 * can mean that the guest tried to access I/O memory, which is emulated by user 2244 * space. The distinction is based on the IPA causing the fault and whether this 2245 * memory region has been registered as standard RAM by user space. 
2246 */ 2247 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) 2248 { 2249 struct kvm_s2_trans nested_trans, *nested = NULL; 2250 unsigned long esr; 2251 phys_addr_t fault_ipa; /* The address we faulted on */ 2252 phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */ 2253 struct kvm_memory_slot *memslot; 2254 unsigned long hva; 2255 bool is_iabt, write_fault, writable; 2256 gfn_t gfn; 2257 int ret, idx; 2258 2259 if (kvm_vcpu_abt_issea(vcpu)) 2260 return kvm_handle_guest_sea(vcpu); 2261 2262 esr = kvm_vcpu_get_esr(vcpu); 2263 2264 /* 2265 * The fault IPA should be reliable at this point as we're not dealing 2266 * with an SEA. 2267 */ 2268 ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); 2269 if (KVM_BUG_ON(ipa == INVALID_GPA, vcpu->kvm)) 2270 return -EFAULT; 2271 2272 is_iabt = kvm_vcpu_trap_is_iabt(vcpu); 2273 2274 if (esr_fsc_is_translation_fault(esr)) { 2275 /* Beyond sanitised PARange (which is the IPA limit) */ 2276 if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) { 2277 kvm_inject_size_fault(vcpu); 2278 return 1; 2279 } 2280 2281 /* Falls between the IPA range and the PARange? */ 2282 if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) { 2283 fault_ipa |= FAR_TO_FIPA_OFFSET(kvm_vcpu_get_hfar(vcpu)); 2284 2285 return kvm_inject_sea(vcpu, is_iabt, fault_ipa); 2286 } 2287 } 2288 2289 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu), 2290 kvm_vcpu_get_hfar(vcpu), fault_ipa); 2291 2292 /* Check the stage-2 fault is trans. 
fault or write fault */ 2293 if (!esr_fsc_is_translation_fault(esr) && 2294 !esr_fsc_is_permission_fault(esr) && 2295 !esr_fsc_is_access_flag_fault(esr) && 2296 !esr_fsc_is_excl_atomic_fault(esr)) { 2297 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", 2298 kvm_vcpu_trap_get_class(vcpu), 2299 (unsigned long)kvm_vcpu_trap_get_fault(vcpu), 2300 (unsigned long)kvm_vcpu_get_esr(vcpu)); 2301 return -EFAULT; 2302 } 2303 2304 idx = srcu_read_lock(&vcpu->kvm->srcu); 2305 2306 /* 2307 * We may have faulted on a shadow stage 2 page table if we are 2308 * running a nested guest. In this case, we have to resolve the L2 2309 * IPA to the L1 IPA first, before knowing what kind of memory should 2310 * back the L1 IPA. 2311 * 2312 * If the shadow stage 2 page table walk faults, then we simply inject 2313 * this to the guest and carry on. 2314 * 2315 * If there are no shadow S2 PTs because S2 is disabled, there is 2316 * nothing to walk and we treat it as a 1:1 before going through the 2317 * canonical translation. 
2318 */ 2319 if (kvm_is_nested_s2_mmu(vcpu->kvm,vcpu->arch.hw_mmu) && 2320 vcpu->arch.hw_mmu->nested_stage2_enabled) { 2321 u32 esr; 2322 2323 ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans); 2324 if (ret == -EAGAIN) { 2325 ret = 1; 2326 goto out_unlock; 2327 } 2328 2329 if (ret) { 2330 esr = kvm_s2_trans_esr(&nested_trans); 2331 kvm_inject_s2_fault(vcpu, esr); 2332 goto out_unlock; 2333 } 2334 2335 ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans); 2336 if (ret) { 2337 esr = kvm_s2_trans_esr(&nested_trans); 2338 kvm_inject_s2_fault(vcpu, esr); 2339 goto out_unlock; 2340 } 2341 2342 ipa = kvm_s2_trans_output(&nested_trans); 2343 nested = &nested_trans; 2344 } 2345 2346 gfn = ipa >> PAGE_SHIFT; 2347 memslot = gfn_to_memslot(vcpu->kvm, gfn); 2348 hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); 2349 write_fault = kvm_is_write_fault(vcpu); 2350 if (kvm_is_error_hva(hva) || (write_fault && !writable)) { 2351 /* 2352 * The guest has put either its instructions or its page-tables 2353 * somewhere it shouldn't have. Userspace won't be able to do 2354 * anything about this (there's no syndrome for a start), so 2355 * re-inject the abort back into the guest. 2356 */ 2357 if (is_iabt) { 2358 ret = -ENOEXEC; 2359 goto out; 2360 } 2361 2362 if (kvm_vcpu_abt_iss1tw(vcpu)) { 2363 ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); 2364 goto out_unlock; 2365 } 2366 2367 /* 2368 * Check for a cache maintenance operation. Since we 2369 * ended-up here, we know it is outside of any memory 2370 * slot. But we can't find out if that is for a device, 2371 * or if the guest is just being stupid. The only thing 2372 * we know for sure is that this range cannot be cached. 2373 * 2374 * So let's assume that the guest is just being 2375 * cautious, and skip the instruction. 
2376 */ 2377 if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) { 2378 kvm_incr_pc(vcpu); 2379 ret = 1; 2380 goto out_unlock; 2381 } 2382 2383 /* 2384 * The IPA is reported as [MAX:12], so we need to 2385 * complement it with the bottom 12 bits from the 2386 * faulting VA. This is always 12 bits, irrespective 2387 * of the page size. 2388 */ 2389 ipa |= FAR_TO_FIPA_OFFSET(kvm_vcpu_get_hfar(vcpu)); 2390 ret = io_mem_abort(vcpu, ipa); 2391 goto out_unlock; 2392 } 2393 2394 /* Userspace should not be able to register out-of-bounds IPAs */ 2395 VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu)); 2396 2397 if (esr_fsc_is_access_flag_fault(esr)) { 2398 handle_access_fault(vcpu, fault_ipa); 2399 ret = 1; 2400 goto out_unlock; 2401 } 2402 2403 const struct kvm_s2_fault_desc s2fd = { 2404 .vcpu = vcpu, 2405 .fault_ipa = fault_ipa, 2406 .nested = nested, 2407 .memslot = memslot, 2408 .hva = hva, 2409 }; 2410 2411 if (kvm_vm_is_protected(vcpu->kvm)) { 2412 ret = pkvm_mem_abort(&s2fd); 2413 } else { 2414 VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) && 2415 !write_fault && 2416 !kvm_vcpu_trap_is_exec_fault(vcpu)); 2417 2418 if (kvm_slot_has_gmem(memslot)) 2419 ret = gmem_abort(&s2fd); 2420 else 2421 ret = user_mem_abort(&s2fd); 2422 } 2423 2424 if (ret == 0) 2425 ret = 1; 2426 out: 2427 if (ret == -ENOEXEC) 2428 ret = kvm_inject_sea_iabt(vcpu, kvm_vcpu_get_hfar(vcpu)); 2429 out_unlock: 2430 srcu_read_unlock(&vcpu->kvm->srcu, idx); 2431 return ret; 2432 } 2433 2434 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) 2435 { 2436 if (!kvm->arch.mmu.pgt || kvm_vm_is_protected(kvm)) 2437 return false; 2438 2439 __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT, 2440 (range->end - range->start) << PAGE_SHIFT, 2441 range->may_block); 2442 2443 kvm_nested_s2_unmap(kvm, range->may_block); 2444 return false; 2445 } 2446 2447 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 2448 { 2449 u64 size = (range->end - range->start) << 
PAGE_SHIFT; 2450 2451 if (!kvm->arch.mmu.pgt || kvm_vm_is_protected(kvm)) 2452 return false; 2453 2454 return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, 2455 range->start << PAGE_SHIFT, 2456 size, true); 2457 /* 2458 * TODO: Handle nested_mmu structures here using the reverse mapping in 2459 * a later version of patch series. 2460 */ 2461 } 2462 2463 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 2464 { 2465 u64 size = (range->end - range->start) << PAGE_SHIFT; 2466 2467 if (!kvm->arch.mmu.pgt || kvm_vm_is_protected(kvm)) 2468 return false; 2469 2470 return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, 2471 range->start << PAGE_SHIFT, 2472 size, false); 2473 } 2474 2475 phys_addr_t kvm_mmu_get_httbr(void) 2476 { 2477 return __pa(hyp_pgtable->pgd); 2478 } 2479 2480 phys_addr_t kvm_get_idmap_vector(void) 2481 { 2482 return hyp_idmap_vector; 2483 } 2484 2485 static int kvm_map_idmap_text(void) 2486 { 2487 unsigned long size = hyp_idmap_end - hyp_idmap_start; 2488 int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start, 2489 PAGE_HYP_EXEC); 2490 if (err) 2491 kvm_err("Failed to idmap %lx-%lx\n", 2492 hyp_idmap_start, hyp_idmap_end); 2493 2494 return err; 2495 } 2496 2497 static void *kvm_hyp_zalloc_page(void *arg) 2498 { 2499 return (void *)get_zeroed_page(GFP_KERNEL); 2500 } 2501 2502 static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = { 2503 .zalloc_page = kvm_hyp_zalloc_page, 2504 .get_page = kvm_host_get_page, 2505 .put_page = kvm_host_put_page, 2506 .phys_to_virt = kvm_host_va, 2507 .virt_to_phys = kvm_host_pa, 2508 }; 2509 2510 int __init kvm_mmu_init(u32 hyp_va_bits) 2511 { 2512 int err; 2513 2514 hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start); 2515 hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); 2516 hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end); 2517 hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE); 2518 hyp_idmap_vector = __pa_symbol(__kvm_hyp_init); 2519 2520 
/* 2521 * We rely on the linker script to ensure at build time that the HYP 2522 * init code does not cross a page boundary. 2523 */ 2524 BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); 2525 2526 kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits); 2527 kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); 2528 kvm_debug("HYP VA range: %lx:%lx\n", 2529 kern_hyp_va(PAGE_OFFSET), 2530 kern_hyp_va((unsigned long)high_memory - 1)); 2531 2532 if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) && 2533 hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) && 2534 hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) { 2535 /* 2536 * The idmap page is intersecting with the VA space, 2537 * it is not safe to continue further. 2538 */ 2539 kvm_err("IDMAP intersecting with HYP VA, unable to continue\n"); 2540 err = -EINVAL; 2541 goto out; 2542 } 2543 2544 hyp_pgtable = kzalloc_obj(*hyp_pgtable); 2545 if (!hyp_pgtable) { 2546 kvm_err("Hyp mode page-table not allocated\n"); 2547 err = -ENOMEM; 2548 goto out; 2549 } 2550 2551 err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits, &kvm_hyp_mm_ops); 2552 if (err) 2553 goto out_free_pgtable; 2554 2555 err = kvm_map_idmap_text(); 2556 if (err) 2557 goto out_destroy_pgtable; 2558 2559 io_map_base = hyp_idmap_start; 2560 __hyp_va_bits = hyp_va_bits; 2561 return 0; 2562 2563 out_destroy_pgtable: 2564 kvm_pgtable_hyp_destroy(hyp_pgtable); 2565 out_free_pgtable: 2566 kfree(hyp_pgtable); 2567 hyp_pgtable = NULL; 2568 out: 2569 return err; 2570 } 2571 2572 void kvm_arch_commit_memory_region(struct kvm *kvm, 2573 struct kvm_memory_slot *old, 2574 const struct kvm_memory_slot *new, 2575 enum kvm_mr_change change) 2576 { 2577 bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES; 2578 2579 /* 2580 * At this point memslot has been committed and there is an 2581 * allocated dirty_bitmap[], dirty pages will be tracked while the 2582 * memory slot is write protected. 
2583 */ 2584 if (log_dirty_pages) { 2585 2586 if (change == KVM_MR_DELETE) 2587 return; 2588 2589 /* 2590 * Huge and normal pages are write-protected and split 2591 * on either of these two cases: 2592 * 2593 * 1. with initial-all-set: gradually with CLEAR ioctls, 2594 */ 2595 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) 2596 return; 2597 /* 2598 * or 2599 * 2. without initial-all-set: all in one shot when 2600 * enabling dirty logging. 2601 */ 2602 kvm_mmu_wp_memory_region(kvm, new->id); 2603 kvm_mmu_split_memory_region(kvm, new->id); 2604 } else { 2605 /* 2606 * Free any leftovers from the eager page splitting cache. Do 2607 * this when deleting, moving, disabling dirty logging, or 2608 * creating the memslot (a nop). Doing it for deletes makes 2609 * sure we don't leak memory, and there's no need to keep the 2610 * cache around for any of the other cases. 2611 */ 2612 kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); 2613 } 2614 } 2615 2616 int kvm_arch_prepare_memory_region(struct kvm *kvm, 2617 const struct kvm_memory_slot *old, 2618 struct kvm_memory_slot *new, 2619 enum kvm_mr_change change) 2620 { 2621 hva_t hva, reg_end; 2622 int ret = 0; 2623 2624 if (kvm_vm_is_protected(kvm)) { 2625 /* Cannot modify memslots once a pVM has run. */ 2626 if (pkvm_hyp_vm_is_created(kvm) && 2627 (change == KVM_MR_DELETE || change == KVM_MR_MOVE)) { 2628 return -EPERM; 2629 } 2630 2631 if (new && 2632 new->flags & (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)) { 2633 return -EPERM; 2634 } 2635 } 2636 2637 if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && 2638 change != KVM_MR_FLAGS_ONLY) 2639 return 0; 2640 2641 /* 2642 * Prevent userspace from creating a memory region outside of the IPA 2643 * space addressable by the KVM guest IPA space. 
2644 */ 2645 if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT)) 2646 return -EFAULT; 2647 2648 /* 2649 * Only support guest_memfd backed memslots with mappable memory, since 2650 * there aren't any CoCo VMs that support only private memory on arm64. 2651 */ 2652 if (kvm_slot_has_gmem(new) && !kvm_memslot_is_gmem_only(new)) 2653 return -EINVAL; 2654 2655 hva = new->userspace_addr; 2656 reg_end = hva + (new->npages << PAGE_SHIFT); 2657 2658 mmap_read_lock(current->mm); 2659 /* 2660 * A memory region could potentially cover multiple VMAs, and any holes 2661 * between them, so iterate over all of them. 2662 * 2663 * +--------------------------------------------+ 2664 * +---------------+----------------+ +----------------+ 2665 * | : VMA 1 | VMA 2 | | VMA 3 : | 2666 * +---------------+----------------+ +----------------+ 2667 * | memory region | 2668 * +--------------------------------------------+ 2669 */ 2670 do { 2671 struct vm_area_struct *vma; 2672 2673 vma = find_vma_intersection(current->mm, hva, reg_end); 2674 if (!vma) 2675 break; 2676 2677 if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) { 2678 ret = -EINVAL; 2679 break; 2680 } 2681 2682 if (vma->vm_flags & VM_PFNMAP) { 2683 /* IO region dirty page logging not allowed */ 2684 if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { 2685 ret = -EINVAL; 2686 break; 2687 } 2688 2689 /* 2690 * Cacheable PFNMAP is allowed only if the hardware 2691 * supports it. 
2692 */ 2693 if (kvm_vma_is_cacheable(vma) && !kvm_supports_cacheable_pfnmap()) { 2694 ret = -EINVAL; 2695 break; 2696 } 2697 } 2698 hva = min(reg_end, vma->vm_end); 2699 } while (hva < reg_end); 2700 2701 mmap_read_unlock(current->mm); 2702 return ret; 2703 } 2704 2705 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) 2706 { 2707 } 2708 2709 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) 2710 { 2711 } 2712 2713 void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 2714 struct kvm_memory_slot *slot) 2715 { 2716 gpa_t gpa = slot->base_gfn << PAGE_SHIFT; 2717 phys_addr_t size = slot->npages << PAGE_SHIFT; 2718 2719 write_lock(&kvm->mmu_lock); 2720 kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true); 2721 kvm_nested_s2_unmap(kvm, true); 2722 write_unlock(&kvm->mmu_lock); 2723 } 2724 2725 /* 2726 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). 2727 * 2728 * Main problems: 2729 * - S/W ops are local to a CPU (not broadcast) 2730 * - We have line migration behind our back (speculation) 2731 * - System caches don't support S/W at all (damn!) 2732 * 2733 * In the face of the above, the best we can do is to try and convert 2734 * S/W ops to VA ops. Because the guest is not allowed to infer the 2735 * S/W to PA mapping, it can only use S/W to nuke the whole cache, 2736 * which is a rather good thing for us. 2737 * 2738 * Also, it is only used when turning caches on/off ("The expected 2739 * usage of the cache maintenance instructions that operate by set/way 2740 * is associated with the cache maintenance instructions associated 2741 * with the powerdown and powerup of caches, if this is required by 2742 * the implementation."). 2743 * 2744 * We use the following policy: 2745 * 2746 * - If we trap a S/W operation, we enable VM trapping to detect 2747 * caches being turned on/off, and do a full clean. 2748 * 2749 * - We flush the caches on both caches being turned on and off. 
2750 * 2751 * - Once the caches are enabled, we stop trapping VM ops. 2752 */ 2753 void kvm_set_way_flush(struct kvm_vcpu *vcpu) 2754 { 2755 unsigned long hcr = *vcpu_hcr(vcpu); 2756 2757 /* 2758 * If this is the first time we do a S/W operation 2759 * (i.e. HCR_TVM not set) flush the whole memory, and set the 2760 * VM trapping. 2761 * 2762 * Otherwise, rely on the VM trapping to wait for the MMU + 2763 * Caches to be turned off. At that point, we'll be able to 2764 * clean the caches again. 2765 */ 2766 if (!(hcr & HCR_TVM)) { 2767 trace_kvm_set_way_flush(*vcpu_pc(vcpu), 2768 vcpu_has_cache_enabled(vcpu)); 2769 stage2_flush_vm(vcpu->kvm); 2770 *vcpu_hcr(vcpu) = hcr | HCR_TVM; 2771 } 2772 } 2773 2774 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled) 2775 { 2776 bool now_enabled = vcpu_has_cache_enabled(vcpu); 2777 2778 /* 2779 * If switching the MMU+caches on, need to invalidate the caches. 2780 * If switching it off, need to clean the caches. 2781 * Clean + invalidate does the trick always. 2782 */ 2783 if (now_enabled != was_enabled) 2784 stage2_flush_vm(vcpu->kvm); 2785 2786 /* Caches are now on, stop trapping VM ops (until a S/W op) */ 2787 if (now_enabled) 2788 *vcpu_hcr(vcpu) &= ~HCR_TVM; 2789 2790 trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled); 2791 } 2792