1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2012 - Virtual Open Systems and Columbia University 4 * Author: Christoffer Dall <c.dall@virtualopensystems.com> 5 */ 6 7 #include <linux/acpi.h> 8 #include <linux/mman.h> 9 #include <linux/kvm_host.h> 10 #include <linux/io.h> 11 #include <linux/hugetlb.h> 12 #include <linux/sched/signal.h> 13 #include <trace/events/kvm.h> 14 #include <asm/acpi.h> 15 #include <asm/pgalloc.h> 16 #include <asm/cacheflush.h> 17 #include <asm/kvm_arm.h> 18 #include <asm/kvm_mmu.h> 19 #include <asm/kvm_pgtable.h> 20 #include <asm/kvm_pkvm.h> 21 #include <asm/kvm_asm.h> 22 #include <asm/kvm_emulate.h> 23 #include <asm/virt.h> 24 25 #include "trace.h" 26 27 static struct kvm_pgtable *hyp_pgtable; 28 static DEFINE_MUTEX(kvm_hyp_pgd_mutex); 29 30 static unsigned long __ro_after_init hyp_idmap_start; 31 static unsigned long __ro_after_init hyp_idmap_end; 32 static phys_addr_t __ro_after_init hyp_idmap_vector; 33 34 u32 __ro_after_init __hyp_va_bits; 35 36 static unsigned long __ro_after_init io_map_base; 37 38 #define KVM_PGT_FN(fn) (!is_protected_kvm_enabled() ? fn : p ## fn) 39 40 static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end, 41 phys_addr_t size) 42 { 43 phys_addr_t boundary = ALIGN_DOWN(addr + size, size); 44 45 return (boundary - 1 < end - 1) ? boundary : end; 46 } 47 48 static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) 49 { 50 phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); 51 52 return __stage2_range_addr_end(addr, end, size); 53 } 54 55 /* 56 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise, 57 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK, 58 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too 59 * long will also starve other vCPUs. We have to also make sure that the page 60 * tables are not freed while we released the lock. 61 */ 62 static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, 63 phys_addr_t end, 64 int (*fn)(struct kvm_pgtable *, u64, u64), 65 bool resched) 66 { 67 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 68 int ret; 69 u64 next; 70 71 do { 72 struct kvm_pgtable *pgt = mmu->pgt; 73 if (!pgt) 74 return -EINVAL; 75 76 next = stage2_range_addr_end(addr, end); 77 ret = fn(pgt, addr, next - addr); 78 if (ret) 79 break; 80 81 if (resched && next != end) 82 cond_resched_rwlock_write(&kvm->mmu_lock); 83 } while (addr = next, addr != end); 84 85 return ret; 86 } 87 88 #define stage2_apply_range_resched(mmu, addr, end, fn) \ 89 stage2_apply_range(mmu, addr, end, fn, true) 90 91 /* 92 * Get the maximum number of page-tables pages needed to split a range 93 * of blocks into PAGE_SIZE PTEs. It assumes the range is already 94 * mapped at level 2, or at level 1 if allowed. 
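 * In the worst case that is one table per PUD-sized chunk of the range (when
 * level-1 blocks are allowed) plus one table per PMD-sized chunk, as counted
 * below.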
95 */ 96 static int kvm_mmu_split_nr_page_tables(u64 range) 97 { 98 int n = 0; 99 100 if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2) 101 n += DIV_ROUND_UP(range, PUD_SIZE); 102 n += DIV_ROUND_UP(range, PMD_SIZE); 103 return n; 104 } 105 106 static bool need_split_memcache_topup_or_resched(struct kvm *kvm) 107 { 108 struct kvm_mmu_memory_cache *cache; 109 u64 chunk_size, min; 110 111 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) 112 return true; 113 114 chunk_size = kvm->arch.mmu.split_page_chunk_size; 115 min = kvm_mmu_split_nr_page_tables(chunk_size); 116 cache = &kvm->arch.mmu.split_page_cache; 117 return kvm_mmu_memory_cache_nr_free_objects(cache) < min; 118 } 119 120 static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr, 121 phys_addr_t end) 122 { 123 struct kvm_mmu_memory_cache *cache; 124 struct kvm_pgtable *pgt; 125 int ret, cache_capacity; 126 u64 next, chunk_size; 127 128 lockdep_assert_held_write(&kvm->mmu_lock); 129 130 chunk_size = kvm->arch.mmu.split_page_chunk_size; 131 cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size); 132 133 if (chunk_size == 0) 134 return 0; 135 136 cache = &kvm->arch.mmu.split_page_cache; 137 138 do { 139 if (need_split_memcache_topup_or_resched(kvm)) { 140 write_unlock(&kvm->mmu_lock); 141 cond_resched(); 142 /* Eager page splitting is best-effort. */ 143 ret = __kvm_mmu_topup_memory_cache(cache, 144 cache_capacity, 145 cache_capacity); 146 write_lock(&kvm->mmu_lock); 147 if (ret) 148 break; 149 } 150 151 pgt = kvm->arch.mmu.pgt; 152 if (!pgt) 153 return -EINVAL; 154 155 next = __stage2_range_addr_end(addr, end, chunk_size); 156 ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache); 157 if (ret) 158 break; 159 } while (addr = next, addr != end); 160 161 return ret; 162 } 163 164 static bool memslot_is_logging(struct kvm_memory_slot *memslot) 165 { 166 return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); 167 } 168 169 /** 170 * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8 171 * @kvm: pointer to kvm structure. 
172 * 173 * Interface to HYP function to flush all VM TLB entries 174 */ 175 int kvm_arch_flush_remote_tlbs(struct kvm *kvm) 176 { 177 if (is_protected_kvm_enabled()) 178 kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); 179 else 180 kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); 181 return 0; 182 } 183 184 int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, 185 gfn_t gfn, u64 nr_pages) 186 { 187 u64 size = nr_pages << PAGE_SHIFT; 188 u64 addr = gfn << PAGE_SHIFT; 189 190 if (is_protected_kvm_enabled()) 191 kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); 192 else 193 kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size); 194 return 0; 195 } 196 197 static void *stage2_memcache_zalloc_page(void *arg) 198 { 199 struct kvm_mmu_memory_cache *mc = arg; 200 void *virt; 201 202 /* Allocated with __GFP_ZERO, so no need to zero */ 203 virt = kvm_mmu_memory_cache_alloc(mc); 204 if (virt) 205 kvm_account_pgtable_pages(virt, 1); 206 return virt; 207 } 208 209 static void *kvm_host_zalloc_pages_exact(size_t size) 210 { 211 return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); 212 } 213 214 static void *kvm_s2_zalloc_pages_exact(size_t size) 215 { 216 void *virt = kvm_host_zalloc_pages_exact(size); 217 218 if (virt) 219 kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT)); 220 return virt; 221 } 222 223 static void kvm_s2_free_pages_exact(void *virt, size_t size) 224 { 225 kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT)); 226 free_pages_exact(virt, size); 227 } 228 229 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops; 230 231 static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head) 232 { 233 struct page *page = container_of(head, struct page, rcu_head); 234 void *pgtable = page_to_virt(page); 235 s8 level = page_private(page); 236 237 KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level); 238 } 239 240 static void stage2_free_unlinked_table(void *addr, s8 level) 241 { 242 struct page *page = virt_to_page(addr); 243 244 set_page_private(page, (unsigned long)level); 245 call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb); 246 } 247 248 static void kvm_host_get_page(void *addr) 249 { 250 get_page(virt_to_page(addr)); 251 } 252 253 static void kvm_host_put_page(void *addr) 254 { 255 put_page(virt_to_page(addr)); 256 } 257 258 static void kvm_s2_put_page(void *addr) 259 { 260 struct page *p = virt_to_page(addr); 261 /* Dropping last refcount, the page will be freed */ 262 if (page_count(p) == 1) 263 kvm_account_pgtable_pages(addr, -1); 264 put_page(p); 265 } 266 267 static int kvm_host_page_count(void *addr) 268 { 269 return page_count(virt_to_page(addr)); 270 } 271 272 static phys_addr_t kvm_host_pa(void *addr) 273 { 274 return __pa(addr); 275 } 276 277 static void *kvm_host_va(phys_addr_t phys) 278 { 279 return __va(phys); 280 } 281 282 static void clean_dcache_guest_page(void *va, size_t size) 283 { 284 __clean_dcache_guest_page(va, size); 285 } 286 287 static void invalidate_icache_guest_page(void *va, size_t size) 288 { 289 __invalidate_icache_guest_page(va, size); 290 } 291 292 /* 293 * Unmapping vs dcache management: 294 * 295 * If a guest maps certain memory pages as uncached, all writes will 296 * bypass the data cache and go directly to RAM. However, the CPUs 297 * can still speculate reads (not writes) and fill cache lines with 298 * data. 
299 * 300 * Those cache lines will be *clean* cache lines though, so a 301 * clean+invalidate operation is equivalent to an invalidate 302 * operation, because no cache lines are marked dirty. 303 * 304 * Those clean cache lines could be filled prior to an uncached write 305 * by the guest, and the cache coherent IO subsystem would therefore 306 * end up writing old data to disk. 307 * 308 * This is why right after unmapping a page/section and invalidating 309 * the corresponding TLBs, we flush to make sure the IO subsystem will 310 * never hit in the cache. 311 * 312 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as 313 * we then fully enforce cacheability of RAM, no matter what the guest 314 * does. 315 */ 316 /** 317 * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range 318 * @mmu: The KVM stage-2 MMU pointer 319 * @start: The intermediate physical base address of the range to unmap 320 * @size: The size of the area to unmap 321 * @may_block: Whether or not we are permitted to block 322 * 323 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must 324 * be called while holding mmu_lock (unless for freeing the stage2 pgd before 325 * destroying the VM), otherwise another faulting VCPU may come in and mess 326 * with things behind our backs. 327 */ 328 static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size, 329 bool may_block) 330 { 331 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 332 phys_addr_t end = start + size; 333 334 lockdep_assert_held_write(&kvm->mmu_lock); 335 WARN_ON(size & ~PAGE_MASK); 336 WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap), 337 may_block)); 338 } 339 340 void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, 341 u64 size, bool may_block) 342 { 343 __unmap_stage2_range(mmu, start, size, may_block); 344 } 345 346 void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) 347 { 348 stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush)); 349 } 350 351 static void stage2_flush_memslot(struct kvm *kvm, 352 struct kvm_memory_slot *memslot) 353 { 354 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; 355 phys_addr_t end = addr + PAGE_SIZE * memslot->npages; 356 357 kvm_stage2_flush_range(&kvm->arch.mmu, addr, end); 358 } 359 360 /** 361 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2 362 * @kvm: The struct kvm pointer 363 * 364 * Go through the stage 2 page tables and invalidate any cache lines 365 * backing memory already mapped to the VM. 
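 *
 * The walk is done under the srcu read lock, with the mmu_lock held for
 * writing.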
 */
static void stage2_flush_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx, bkt;

	idx = srcu_read_lock(&kvm->srcu);
	write_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, bkt, slots)
		stage2_flush_memslot(kvm, memslot);

	kvm_nested_s2_flush(kvm);

	write_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

/**
 * free_hyp_pgds - free Hyp-mode page tables
 */
void __init free_hyp_pgds(void)
{
	mutex_lock(&kvm_hyp_pgd_mutex);
	if (hyp_pgtable) {
		kvm_pgtable_hyp_destroy(hyp_pgtable);
		kfree(hyp_pgtable);
		hyp_pgtable = NULL;
	}
	mutex_unlock(&kvm_hyp_pgd_mutex);
}

static bool kvm_host_owns_hyp_mappings(void)
{
	if (is_kernel_in_hyp_mode())
		return false;

	if (static_branch_likely(&kvm_protected_mode_initialized))
		return false;

	/*
	 * This can happen at boot time when __create_hyp_mappings() is called
	 * after the hyp protection has been enabled, but the static key has
	 * not been flipped yet.
	 */
	if (!hyp_pgtable && is_protected_kvm_enabled())
		return false;

	WARN_ON(!hyp_pgtable);

	return true;
}

int __create_hyp_mappings(unsigned long start, unsigned long size,
			  unsigned long phys, enum kvm_pgtable_prot prot)
{
	int err;

	if (WARN_ON(!kvm_host_owns_hyp_mappings()))
		return -EINVAL;

	mutex_lock(&kvm_hyp_pgd_mutex);
	err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
	mutex_unlock(&kvm_hyp_pgd_mutex);

	return err;
}

static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
{
	if (!is_vmalloc_addr(kaddr)) {
		BUG_ON(!virt_addr_valid(kaddr));
		return __pa(kaddr);
	} else {
		return page_to_phys(vmalloc_to_page(kaddr)) +
		       offset_in_page(kaddr);
	}
}

struct hyp_shared_pfn {
	u64 pfn;
	int count;
	struct rb_node node;
};

static DEFINE_MUTEX(hyp_shared_pfns_lock);
static struct rb_root hyp_shared_pfns = RB_ROOT;

static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node,
					      struct rb_node **parent)
{
	struct hyp_shared_pfn *this;

	*node = &hyp_shared_pfns.rb_node;
	*parent = NULL;
	while (**node) {
		this = container_of(**node, struct hyp_shared_pfn, node);
		*parent = **node;
		if (this->pfn < pfn)
			*node = &((**node)->rb_left);
		else if (this->pfn > pfn)
			*node = &((**node)->rb_right);
		else
			return this;
	}

	return NULL;
}

static int share_pfn_hyp(u64 pfn)
{
	struct rb_node **node, *parent;
	struct hyp_shared_pfn *this;
	int ret = 0;

	mutex_lock(&hyp_shared_pfns_lock);
	this = find_shared_pfn(pfn, &node, &parent);
	if (this) {
		this->count++;
		goto unlock;
	}

	this = kzalloc(sizeof(*this), GFP_KERNEL);
	if (!this) {
		ret = -ENOMEM;
		goto unlock;
	}

	this->pfn = pfn;
	this->count = 1;
	rb_link_node(&this->node, parent, node);
	rb_insert_color(&this->node, &hyp_shared_pfns);
	ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn);
unlock:
	mutex_unlock(&hyp_shared_pfns_lock);

	return ret;
}

static int unshare_pfn_hyp(u64 pfn)
{
	struct rb_node **node, *parent;
	struct hyp_shared_pfn *this;
	int ret = 0;

	mutex_lock(&hyp_shared_pfns_lock);
	this = find_shared_pfn(pfn, &node, &parent);
	if (WARN_ON(!this)) {
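		/* Unbalanced unshare: this pfn was never shared with hyp. */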
ret = -ENOENT; 517 goto unlock; 518 } 519 520 this->count--; 521 if (this->count) 522 goto unlock; 523 524 rb_erase(&this->node, &hyp_shared_pfns); 525 kfree(this); 526 ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn); 527 unlock: 528 mutex_unlock(&hyp_shared_pfns_lock); 529 530 return ret; 531 } 532 533 int kvm_share_hyp(void *from, void *to) 534 { 535 phys_addr_t start, end, cur; 536 u64 pfn; 537 int ret; 538 539 if (is_kernel_in_hyp_mode()) 540 return 0; 541 542 /* 543 * The share hcall maps things in the 'fixed-offset' region of the hyp 544 * VA space, so we can only share physically contiguous data-structures 545 * for now. 546 */ 547 if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to)) 548 return -EINVAL; 549 550 if (kvm_host_owns_hyp_mappings()) 551 return create_hyp_mappings(from, to, PAGE_HYP); 552 553 start = ALIGN_DOWN(__pa(from), PAGE_SIZE); 554 end = PAGE_ALIGN(__pa(to)); 555 for (cur = start; cur < end; cur += PAGE_SIZE) { 556 pfn = __phys_to_pfn(cur); 557 ret = share_pfn_hyp(pfn); 558 if (ret) 559 return ret; 560 } 561 562 return 0; 563 } 564 565 void kvm_unshare_hyp(void *from, void *to) 566 { 567 phys_addr_t start, end, cur; 568 u64 pfn; 569 570 if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from) 571 return; 572 573 start = ALIGN_DOWN(__pa(from), PAGE_SIZE); 574 end = PAGE_ALIGN(__pa(to)); 575 for (cur = start; cur < end; cur += PAGE_SIZE) { 576 pfn = __phys_to_pfn(cur); 577 WARN_ON(unshare_pfn_hyp(pfn)); 578 } 579 } 580 581 /** 582 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode 583 * @from: The virtual kernel start address of the range 584 * @to: The virtual kernel end address of the range (exclusive) 585 * @prot: The protection to be applied to this range 586 * 587 * The same virtual address as the kernel virtual address is also used 588 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying 589 * physical pages. 590 */ 591 int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) 592 { 593 phys_addr_t phys_addr; 594 unsigned long virt_addr; 595 unsigned long start = kern_hyp_va((unsigned long)from); 596 unsigned long end = kern_hyp_va((unsigned long)to); 597 598 if (is_kernel_in_hyp_mode()) 599 return 0; 600 601 if (!kvm_host_owns_hyp_mappings()) 602 return -EPERM; 603 604 start = start & PAGE_MASK; 605 end = PAGE_ALIGN(end); 606 607 for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { 608 int err; 609 610 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); 611 err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr, 612 prot); 613 if (err) 614 return err; 615 } 616 617 return 0; 618 } 619 620 static int __hyp_alloc_private_va_range(unsigned long base) 621 { 622 lockdep_assert_held(&kvm_hyp_pgd_mutex); 623 624 if (!PAGE_ALIGNED(base)) 625 return -EINVAL; 626 627 /* 628 * Verify that BIT(VA_BITS - 1) hasn't been flipped by 629 * allocating the new area, as it would indicate we've 630 * overflowed the idmap/IO address range. 631 */ 632 if ((base ^ io_map_base) & BIT(VA_BITS - 1)) 633 return -ENOMEM; 634 635 io_map_base = base; 636 637 return 0; 638 } 639 640 /** 641 * hyp_alloc_private_va_range - Allocates a private VA range. 642 * @size: The size of the VA range to reserve. 643 * @haddr: The hypervisor virtual start address of the allocation. 644 * 645 * The private virtual address (VA) range is allocated below io_map_base 646 * and aligned based on the order of @size. 647 * 648 * Return: 0 on success or negative error code on failure. 
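 * The start of the allocated range is written to @haddr only on success.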
649 */ 650 int hyp_alloc_private_va_range(size_t size, unsigned long *haddr) 651 { 652 unsigned long base; 653 int ret = 0; 654 655 mutex_lock(&kvm_hyp_pgd_mutex); 656 657 /* 658 * This assumes that we have enough space below the idmap 659 * page to allocate our VAs. If not, the check in 660 * __hyp_alloc_private_va_range() will kick. A potential 661 * alternative would be to detect that overflow and switch 662 * to an allocation above the idmap. 663 * 664 * The allocated size is always a multiple of PAGE_SIZE. 665 */ 666 size = PAGE_ALIGN(size); 667 base = io_map_base - size; 668 ret = __hyp_alloc_private_va_range(base); 669 670 mutex_unlock(&kvm_hyp_pgd_mutex); 671 672 if (!ret) 673 *haddr = base; 674 675 return ret; 676 } 677 678 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, 679 unsigned long *haddr, 680 enum kvm_pgtable_prot prot) 681 { 682 unsigned long addr; 683 int ret = 0; 684 685 if (!kvm_host_owns_hyp_mappings()) { 686 addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping, 687 phys_addr, size, prot); 688 if (IS_ERR_VALUE(addr)) 689 return addr; 690 *haddr = addr; 691 692 return 0; 693 } 694 695 size = PAGE_ALIGN(size + offset_in_page(phys_addr)); 696 ret = hyp_alloc_private_va_range(size, &addr); 697 if (ret) 698 return ret; 699 700 ret = __create_hyp_mappings(addr, size, phys_addr, prot); 701 if (ret) 702 return ret; 703 704 *haddr = addr + offset_in_page(phys_addr); 705 return ret; 706 } 707 708 int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr) 709 { 710 unsigned long base; 711 size_t size; 712 int ret; 713 714 mutex_lock(&kvm_hyp_pgd_mutex); 715 /* 716 * Efficient stack verification using the NVHE_STACK_SHIFT bit implies 717 * an alignment of our allocation on the order of the size. 718 */ 719 size = NVHE_STACK_SIZE * 2; 720 base = ALIGN_DOWN(io_map_base - size, size); 721 722 ret = __hyp_alloc_private_va_range(base); 723 724 mutex_unlock(&kvm_hyp_pgd_mutex); 725 726 if (ret) { 727 kvm_err("Cannot allocate hyp stack guard page\n"); 728 return ret; 729 } 730 731 /* 732 * Since the stack grows downwards, map the stack to the page 733 * at the higher address and leave the lower guard page 734 * unbacked. 735 * 736 * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1 737 * and addresses corresponding to the guard page have the 738 * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection. 
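	 * An overflowing stack pointer thus walks off the mapped page straight
	 * into the unbacked guard page and faults.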
739 */ 740 ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE, 741 phys_addr, PAGE_HYP); 742 if (ret) 743 kvm_err("Cannot map hyp stack\n"); 744 745 *haddr = base + size; 746 747 return ret; 748 } 749 750 /** 751 * create_hyp_io_mappings - Map IO into both kernel and HYP 752 * @phys_addr: The physical start address which gets mapped 753 * @size: Size of the region being mapped 754 * @kaddr: Kernel VA for this mapping 755 * @haddr: HYP VA for this mapping 756 */ 757 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, 758 void __iomem **kaddr, 759 void __iomem **haddr) 760 { 761 unsigned long addr; 762 int ret; 763 764 if (is_protected_kvm_enabled()) 765 return -EPERM; 766 767 *kaddr = ioremap(phys_addr, size); 768 if (!*kaddr) 769 return -ENOMEM; 770 771 if (is_kernel_in_hyp_mode()) { 772 *haddr = *kaddr; 773 return 0; 774 } 775 776 ret = __create_hyp_private_mapping(phys_addr, size, 777 &addr, PAGE_HYP_DEVICE); 778 if (ret) { 779 iounmap(*kaddr); 780 *kaddr = NULL; 781 *haddr = NULL; 782 return ret; 783 } 784 785 *haddr = (void __iomem *)addr; 786 return 0; 787 } 788 789 /** 790 * create_hyp_exec_mappings - Map an executable range into HYP 791 * @phys_addr: The physical start address which gets mapped 792 * @size: Size of the region being mapped 793 * @haddr: HYP VA for this mapping 794 */ 795 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, 796 void **haddr) 797 { 798 unsigned long addr; 799 int ret; 800 801 BUG_ON(is_kernel_in_hyp_mode()); 802 803 ret = __create_hyp_private_mapping(phys_addr, size, 804 &addr, PAGE_HYP_EXEC); 805 if (ret) { 806 *haddr = NULL; 807 return ret; 808 } 809 810 *haddr = (void *)addr; 811 return 0; 812 } 813 814 static struct kvm_pgtable_mm_ops kvm_user_mm_ops = { 815 /* We shouldn't need any other callback to walk the PT */ 816 .phys_to_virt = kvm_host_va, 817 }; 818 819 static int get_user_mapping_size(struct kvm *kvm, u64 addr) 820 { 821 struct kvm_pgtable pgt = { 822 .pgd = (kvm_pteref_t)kvm->mm->pgd, 823 .ia_bits = vabits_actual, 824 .start_level = (KVM_PGTABLE_LAST_LEVEL - 825 ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1), 826 .mm_ops = &kvm_user_mm_ops, 827 }; 828 unsigned long flags; 829 kvm_pte_t pte = 0; /* Keep GCC quiet... */ 830 s8 level = S8_MAX; 831 int ret; 832 833 /* 834 * Disable IRQs so that we hazard against a concurrent 835 * teardown of the userspace page tables (which relies on 836 * IPI-ing threads). 837 */ 838 local_irq_save(flags); 839 ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); 840 local_irq_restore(flags); 841 842 if (ret) 843 return ret; 844 845 /* 846 * Not seeing an error, but not updating level? Something went 847 * deeply wrong... 848 */ 849 if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL)) 850 return -EFAULT; 851 if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL)) 852 return -EFAULT; 853 854 /* Oops, the userspace PTs are gone... 
Replay the fault */ 855 if (!kvm_pte_valid(pte)) 856 return -EAGAIN; 857 858 return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); 859 } 860 861 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { 862 .zalloc_page = stage2_memcache_zalloc_page, 863 .zalloc_pages_exact = kvm_s2_zalloc_pages_exact, 864 .free_pages_exact = kvm_s2_free_pages_exact, 865 .free_unlinked_table = stage2_free_unlinked_table, 866 .get_page = kvm_host_get_page, 867 .put_page = kvm_s2_put_page, 868 .page_count = kvm_host_page_count, 869 .phys_to_virt = kvm_host_va, 870 .virt_to_phys = kvm_host_pa, 871 .dcache_clean_inval_poc = clean_dcache_guest_page, 872 .icache_inval_pou = invalidate_icache_guest_page, 873 }; 874 875 static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type) 876 { 877 u32 kvm_ipa_limit = get_kvm_ipa_limit(); 878 u64 mmfr0, mmfr1; 879 u32 phys_shift; 880 881 if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) 882 return -EINVAL; 883 884 phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); 885 if (is_protected_kvm_enabled()) { 886 phys_shift = kvm_ipa_limit; 887 } else if (phys_shift) { 888 if (phys_shift > kvm_ipa_limit || 889 phys_shift < ARM64_MIN_PARANGE_BITS) 890 return -EINVAL; 891 } else { 892 phys_shift = KVM_PHYS_SHIFT; 893 if (phys_shift > kvm_ipa_limit) { 894 pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", 895 current->comm); 896 return -EINVAL; 897 } 898 } 899 900 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); 901 mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); 902 mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); 903 904 return 0; 905 } 906 907 /* 908 * Assume that @pgt is valid and unlinked from the KVM MMU to free the 909 * page-table without taking the kvm_mmu_lock and without performing any 910 * TLB invalidations. 911 * 912 * Also, the range of addresses can be large enough to cause need_resched 913 * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke 914 * cond_resched() periodically to prevent hogging the CPU for a long time 915 * and schedule something else, if required. 916 */ 917 static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr, 918 phys_addr_t end) 919 { 920 u64 next; 921 922 do { 923 next = stage2_range_addr_end(addr, end); 924 KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr, 925 next - addr); 926 if (next != end) 927 cond_resched(); 928 } while (addr = next, addr != end); 929 } 930 931 static void kvm_stage2_destroy(struct kvm_pgtable *pgt) 932 { 933 unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr); 934 935 stage2_destroy_range(pgt, 0, BIT(ia_bits)); 936 KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt); 937 } 938 939 /** 940 * kvm_init_stage2_mmu - Initialise a S2 MMU structure 941 * @kvm: The pointer to the KVM structure 942 * @mmu: The pointer to the s2 MMU structure 943 * @type: The machine type of the virtual machine 944 * 945 * Allocates only the stage-2 HW PGD level table(s). 946 * Note we don't need locking here as this is only called in two cases: 947 * 948 * - when the VM is created, which can't race against anything 949 * 950 * - when secondary kvm_s2_mmu structures are initialised for NV 951 * guests, and the caller must hold kvm->lock as this is called on a 952 * per-vcpu basis. 
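 *
 * Return: 0 on success, a negative error code otherwise.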
 */
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
{
	int cpu, err;
	struct kvm_pgtable *pgt;

	/*
	 * If we already have our page tables in place, and that the
	 * MMU context is the canonical one, we have a bug somewhere,
	 * as this is only supposed to ever happen once per VM.
	 *
	 * Otherwise, we're building nested page tables, and that's
	 * probably because userspace called KVM_ARM_VCPU_INIT more
	 * than once on the same vcpu. Since that's actually legal,
	 * don't kick a fuss and leave gracefully.
	 */
	if (mmu->pgt != NULL) {
		if (kvm_is_nested_s2_mmu(kvm, mmu))
			return 0;

		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

	err = kvm_init_ipa_range(mmu, type);
	if (err)
		return err;

	pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
	if (!pgt)
		return -ENOMEM;

	mmu->arch = &kvm->arch;
	err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops);
	if (err)
		goto out_free_pgtable;

	mmu->pgt = pgt;
	if (is_protected_kvm_enabled())
		return 0;

	mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
	if (!mmu->last_vcpu_ran) {
		err = -ENOMEM;
		goto out_destroy_pgtable;
	}

	for_each_possible_cpu(cpu)
		*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;

	/* The eager page splitting is disabled by default */
	mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
	mmu->split_page_cache.gfp_zero = __GFP_ZERO;

	mmu->pgd_phys = __pa(pgt->pgd);

	if (kvm_is_nested_s2_mmu(kvm, mmu))
		kvm_init_nested_s2_mmu(mmu);

	return 0;

out_destroy_pgtable:
	kvm_stage2_destroy(pgt);
out_free_pgtable:
	kfree(pgt);
	return err;
}

void kvm_uninit_stage2_mmu(struct kvm *kvm)
{
	kvm_free_stage2_pgd(&kvm->arch.mmu);
	kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
}

static void stage2_unmap_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	hva_t hva = memslot->userspace_addr;
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = PAGE_SIZE * memslot->npages;
	hva_t reg_end = hva + size;

	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we should
	 * unmap any of them.
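	 * The picture below shows a memory region spanning three VMAs, with a
	 * hole between the second and the third VMA: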
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma;
		hva_t vm_start, vm_end;

		vma = find_vma_intersection(current->mm, hva, reg_end);
		if (!vma)
			break;

		/*
		 * Take the intersection of this VMA with the memory region
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (!(vma->vm_flags & VM_PFNMAP)) {
			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
			kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true);
		}
		hva = vm_end;
	} while (hva < reg_end);
}

/**
 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 * @kvm: The struct kvm pointer
 *
 * Go through the memregions and unmap any regular RAM
 * backing memory already mapped to the VM.
 */
void stage2_unmap_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx, bkt;

	idx = srcu_read_lock(&kvm->srcu);
	mmap_read_lock(current->mm);
	write_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, bkt, slots)
		stage2_unmap_memslot(kvm, memslot);

	kvm_nested_s2_unmap(kvm, true);

	write_unlock(&kvm->mmu_lock);
	mmap_read_unlock(current->mm);
	srcu_read_unlock(&kvm->srcu, idx);
}

void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	struct kvm_pgtable *pgt = NULL;

	write_lock(&kvm->mmu_lock);
	pgt = mmu->pgt;
	if (pgt) {
		mmu->pgd_phys = 0;
		mmu->pgt = NULL;
		free_percpu(mmu->last_vcpu_ran);
	}

	if (kvm_is_nested_s2_mmu(kvm, mmu))
		kvm_init_nested_s2_mmu(mmu);

	write_unlock(&kvm->mmu_lock);

	if (pgt) {
		kvm_stage2_destroy(pgt);
		kfree(pgt);
	}
}

static void hyp_mc_free_fn(void *addr, void *mc)
{
	struct kvm_hyp_memcache *memcache = mc;

	if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
		kvm_account_pgtable_pages(addr, -1);

	free_page((unsigned long)addr);
}

static void *hyp_mc_alloc_fn(void *mc)
{
	struct kvm_hyp_memcache *memcache = mc;
	void *addr;

	addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
	if (addr && memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
		kvm_account_pgtable_pages(addr, 1);

	return addr;
}

void free_hyp_memcache(struct kvm_hyp_memcache *mc)
{
	if (!is_protected_kvm_enabled())
		return;

	kfree(mc->mapping);
	__free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc);
}

int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
{
	if (!is_protected_kvm_enabled())
		return 0;

	if (!mc->mapping) {
		mc->mapping = kzalloc(sizeof(struct pkvm_mapping),
				      GFP_KERNEL_ACCOUNT);
		if (!mc->mapping)
			return -ENOMEM;
	}

	return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn,
				    kvm_host_pa, mc);
}

/**
 * kvm_phys_addr_ioremap - map a device range to guest IPA
 *
 * @kvm:	The KVM pointer
 * @guest_ipa:	The IPA at which to insert the mapping
@pa: The physical address of the device 1174 * @size: The size of the mapping 1175 * @writable: Whether or not to create a writable mapping 1176 */ 1177 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, 1178 phys_addr_t pa, unsigned long size, bool writable) 1179 { 1180 phys_addr_t addr; 1181 int ret = 0; 1182 struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO }; 1183 struct kvm_s2_mmu *mmu = &kvm->arch.mmu; 1184 struct kvm_pgtable *pgt = mmu->pgt; 1185 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE | 1186 KVM_PGTABLE_PROT_R | 1187 (writable ? KVM_PGTABLE_PROT_W : 0); 1188 1189 if (is_protected_kvm_enabled()) 1190 return -EPERM; 1191 1192 size += offset_in_page(guest_ipa); 1193 guest_ipa &= PAGE_MASK; 1194 1195 for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) { 1196 ret = kvm_mmu_topup_memory_cache(&cache, 1197 kvm_mmu_cache_min_pages(mmu)); 1198 if (ret) 1199 break; 1200 1201 write_lock(&kvm->mmu_lock); 1202 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE, 1203 pa, prot, &cache, 0); 1204 write_unlock(&kvm->mmu_lock); 1205 if (ret) 1206 break; 1207 1208 pa += PAGE_SIZE; 1209 } 1210 1211 kvm_mmu_free_memory_cache(&cache); 1212 return ret; 1213 } 1214 1215 /** 1216 * kvm_stage2_wp_range() - write protect stage2 memory region range 1217 * @mmu: The KVM stage-2 MMU pointer 1218 * @addr: Start address of range 1219 * @end: End address of range 1220 */ 1221 void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) 1222 { 1223 stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect)); 1224 } 1225 1226 /** 1227 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot 1228 * @kvm: The KVM pointer 1229 * @slot: The memory slot to write protect 1230 * 1231 * Called to start logging dirty pages after memory region 1232 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns 1233 * all present PUD, PMD and PTEs are write protected in the memory region. 1234 * Afterwards read of dirty page log can be called. 1235 * 1236 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, 1237 * serializing operations for VM memory regions. 1238 */ 1239 static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) 1240 { 1241 struct kvm_memslots *slots = kvm_memslots(kvm); 1242 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); 1243 phys_addr_t start, end; 1244 1245 if (WARN_ON_ONCE(!memslot)) 1246 return; 1247 1248 start = memslot->base_gfn << PAGE_SHIFT; 1249 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1250 1251 write_lock(&kvm->mmu_lock); 1252 kvm_stage2_wp_range(&kvm->arch.mmu, start, end); 1253 kvm_nested_s2_wp(kvm); 1254 write_unlock(&kvm->mmu_lock); 1255 kvm_flush_remote_tlbs_memslot(kvm, memslot); 1256 } 1257 1258 /** 1259 * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE 1260 * pages for memory slot 1261 * @kvm: The KVM pointer 1262 * @slot: The memory slot to split 1263 * 1264 * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired, 1265 * serializing operations for VM memory regions. 
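 *
 * Eager splitting is a no-op when the split chunk size is zero (the default).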
1266 */ 1267 static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot) 1268 { 1269 struct kvm_memslots *slots; 1270 struct kvm_memory_slot *memslot; 1271 phys_addr_t start, end; 1272 1273 lockdep_assert_held(&kvm->slots_lock); 1274 1275 slots = kvm_memslots(kvm); 1276 memslot = id_to_memslot(slots, slot); 1277 1278 start = memslot->base_gfn << PAGE_SHIFT; 1279 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1280 1281 write_lock(&kvm->mmu_lock); 1282 kvm_mmu_split_huge_pages(kvm, start, end); 1283 write_unlock(&kvm->mmu_lock); 1284 } 1285 1286 /* 1287 * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages. 1288 * @kvm: The KVM pointer 1289 * @slot: The memory slot associated with mask 1290 * @gfn_offset: The gfn offset in memory slot 1291 * @mask: The mask of pages at offset 'gfn_offset' in this memory 1292 * slot to enable dirty logging on 1293 * 1294 * Writes protect selected pages to enable dirty logging, and then 1295 * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock. 1296 */ 1297 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, 1298 struct kvm_memory_slot *slot, 1299 gfn_t gfn_offset, unsigned long mask) 1300 { 1301 phys_addr_t base_gfn = slot->base_gfn + gfn_offset; 1302 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; 1303 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; 1304 1305 lockdep_assert_held_write(&kvm->mmu_lock); 1306 1307 kvm_stage2_wp_range(&kvm->arch.mmu, start, end); 1308 1309 /* 1310 * Eager-splitting is done when manual-protect is set. We 1311 * also check for initially-all-set because we can avoid 1312 * eager-splitting if initially-all-set is false. 1313 * Initially-all-set equal false implies that huge-pages were 1314 * already split when enabling dirty logging: no need to do it 1315 * again. 1316 */ 1317 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) 1318 kvm_mmu_split_huge_pages(kvm, start, end); 1319 1320 kvm_nested_s2_wp(kvm); 1321 } 1322 1323 static void kvm_send_hwpoison_signal(unsigned long address, short lsb) 1324 { 1325 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); 1326 } 1327 1328 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, 1329 unsigned long hva, 1330 unsigned long map_size) 1331 { 1332 gpa_t gpa_start; 1333 hva_t uaddr_start, uaddr_end; 1334 size_t size; 1335 1336 /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */ 1337 if (map_size == PAGE_SIZE) 1338 return true; 1339 1340 /* pKVM only supports PMD_SIZE huge-mappings */ 1341 if (is_protected_kvm_enabled() && map_size != PMD_SIZE) 1342 return false; 1343 1344 size = memslot->npages * PAGE_SIZE; 1345 1346 gpa_start = memslot->base_gfn << PAGE_SHIFT; 1347 1348 uaddr_start = memslot->userspace_addr; 1349 uaddr_end = uaddr_start + size; 1350 1351 /* 1352 * Pages belonging to memslots that don't have the same alignment 1353 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2 1354 * PMD/PUD entries, because we'll end up mapping the wrong pages. 
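	 * In other words, the hva and the IPA must share the same offset
	 * within a block of @map_size for a block mapping to be usable.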
1355 * 1356 * Consider a layout like the following: 1357 * 1358 * memslot->userspace_addr: 1359 * +-----+--------------------+--------------------+---+ 1360 * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| 1361 * +-----+--------------------+--------------------+---+ 1362 * 1363 * memslot->base_gfn << PAGE_SHIFT: 1364 * +---+--------------------+--------------------+-----+ 1365 * |abc|def Stage-2 block | Stage-2 block |tvxyz| 1366 * +---+--------------------+--------------------+-----+ 1367 * 1368 * If we create those stage-2 blocks, we'll end up with this incorrect 1369 * mapping: 1370 * d -> f 1371 * e -> g 1372 * f -> h 1373 */ 1374 if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) 1375 return false; 1376 1377 /* 1378 * Next, let's make sure we're not trying to map anything not covered 1379 * by the memslot. This means we have to prohibit block size mappings 1380 * for the beginning and end of a non-block aligned and non-block sized 1381 * memory slot (illustrated by the head and tail parts of the 1382 * userspace view above containing pages 'abcde' and 'xyz', 1383 * respectively). 1384 * 1385 * Note that it doesn't matter if we do the check using the 1386 * userspace_addr or the base_gfn, as both are equally aligned (per 1387 * the check above) and equally sized. 1388 */ 1389 return (hva & ~(map_size - 1)) >= uaddr_start && 1390 (hva & ~(map_size - 1)) + map_size <= uaddr_end; 1391 } 1392 1393 /* 1394 * Check if the given hva is backed by a transparent huge page (THP) and 1395 * whether it can be mapped using block mapping in stage2. If so, adjust 1396 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently 1397 * supported. This will need to be updated to support other THP sizes. 1398 * 1399 * Returns the size of the mapping. 1400 */ 1401 static long 1402 transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, 1403 unsigned long hva, kvm_pfn_t *pfnp, 1404 phys_addr_t *ipap) 1405 { 1406 kvm_pfn_t pfn = *pfnp; 1407 1408 /* 1409 * Make sure the adjustment is done only for THP pages. Also make 1410 * sure that the HVA and IPA are sufficiently aligned and that the 1411 * block map is contained within the memslot. 1412 */ 1413 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { 1414 int sz = get_user_mapping_size(kvm, hva); 1415 1416 if (sz < 0) 1417 return sz; 1418 1419 if (sz < PMD_SIZE) 1420 return PAGE_SIZE; 1421 1422 *ipap &= PMD_MASK; 1423 pfn &= ~(PTRS_PER_PMD - 1); 1424 *pfnp = pfn; 1425 1426 return PMD_SIZE; 1427 } 1428 1429 /* Use page mapping if we cannot use block mapping. 
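	 * (e.g. the hva is not backed by a PMD-sized user mapping, or the
	 * alignment checks above failed).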
*/ 1430 return PAGE_SIZE; 1431 } 1432 1433 static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva) 1434 { 1435 unsigned long pa; 1436 1437 if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP)) 1438 return huge_page_shift(hstate_vma(vma)); 1439 1440 if (!(vma->vm_flags & VM_PFNMAP)) 1441 return PAGE_SHIFT; 1442 1443 VM_BUG_ON(is_vm_hugetlb_page(vma)); 1444 1445 pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start); 1446 1447 #ifndef __PAGETABLE_PMD_FOLDED 1448 if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) && 1449 ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start && 1450 ALIGN(hva, PUD_SIZE) <= vma->vm_end) 1451 return PUD_SHIFT; 1452 #endif 1453 1454 if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) && 1455 ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start && 1456 ALIGN(hva, PMD_SIZE) <= vma->vm_end) 1457 return PMD_SHIFT; 1458 1459 return PAGE_SHIFT; 1460 } 1461 1462 /* 1463 * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be 1464 * able to see the page's tags and therefore they must be initialised first. If 1465 * PG_mte_tagged is set, tags have already been initialised. 1466 * 1467 * Must be called with kvm->mmu_lock held to ensure the memory remains mapped 1468 * while the tags are zeroed. 1469 */ 1470 static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, 1471 unsigned long size) 1472 { 1473 unsigned long i, nr_pages = size >> PAGE_SHIFT; 1474 struct page *page = pfn_to_page(pfn); 1475 struct folio *folio = page_folio(page); 1476 1477 if (!kvm_has_mte(kvm)) 1478 return; 1479 1480 if (folio_test_hugetlb(folio)) { 1481 /* Hugetlb has MTE flags set on head page only */ 1482 if (folio_try_hugetlb_mte_tagging(folio)) { 1483 for (i = 0; i < nr_pages; i++, page++) 1484 mte_clear_page_tags(page_address(page)); 1485 folio_set_hugetlb_mte_tagged(folio); 1486 } 1487 return; 1488 } 1489 1490 for (i = 0; i < nr_pages; i++, page++) { 1491 if (try_page_mte_tagging(page)) { 1492 mte_clear_page_tags(page_address(page)); 1493 set_page_mte_tagged(page); 1494 } 1495 } 1496 } 1497 1498 static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) 1499 { 1500 return vma->vm_flags & VM_MTE_ALLOWED; 1501 } 1502 1503 static bool kvm_vma_is_cacheable(struct vm_area_struct *vma) 1504 { 1505 switch (FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(vma->vm_page_prot))) { 1506 case MT_NORMAL_NC: 1507 case MT_DEVICE_nGnRnE: 1508 case MT_DEVICE_nGnRE: 1509 return false; 1510 default: 1511 return true; 1512 } 1513 } 1514 1515 static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache, 1516 void **memcache) 1517 { 1518 int min_pages; 1519 1520 if (!is_protected_kvm_enabled()) 1521 *memcache = &vcpu->arch.mmu_page_cache; 1522 else 1523 *memcache = &vcpu->arch.pkvm_memcache; 1524 1525 if (!topup_memcache) 1526 return 0; 1527 1528 min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); 1529 1530 if (!is_protected_kvm_enabled()) 1531 return kvm_mmu_topup_memory_cache(*memcache, min_pages); 1532 1533 return topup_hyp_memcache(*memcache, min_pages); 1534 } 1535 1536 /* 1537 * Potentially reduce shadow S2 permissions to match the guest's own S2. For 1538 * exec faults, we'd only reach this point if the guest actually allowed it (see 1539 * kvm_s2_handle_perm_fault). 1540 * 1541 * Also encode the level of the original translation in the SW bits of the leaf 1542 * entry as a proxy for the span of that translation. This will be retrieved on 1543 * TLB invalidation from the guest and used to limit the invalidation scope if a 1544 * TTL hint or a range isn't provided. 
1545 */ 1546 static void adjust_nested_fault_perms(struct kvm_s2_trans *nested, 1547 enum kvm_pgtable_prot *prot, 1548 bool *writable) 1549 { 1550 *writable &= kvm_s2_trans_writable(nested); 1551 if (!kvm_s2_trans_readable(nested)) 1552 *prot &= ~KVM_PGTABLE_PROT_R; 1553 1554 *prot |= kvm_encode_nested_level(nested); 1555 } 1556 1557 static void adjust_nested_exec_perms(struct kvm *kvm, 1558 struct kvm_s2_trans *nested, 1559 enum kvm_pgtable_prot *prot) 1560 { 1561 if (!kvm_s2_trans_exec_el0(kvm, nested)) 1562 *prot &= ~KVM_PGTABLE_PROT_UX; 1563 if (!kvm_s2_trans_exec_el1(kvm, nested)) 1564 *prot &= ~KVM_PGTABLE_PROT_PX; 1565 } 1566 1567 static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, 1568 struct kvm_s2_trans *nested, 1569 struct kvm_memory_slot *memslot, bool is_perm) 1570 { 1571 bool write_fault, exec_fault, writable; 1572 enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED; 1573 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; 1574 struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt; 1575 unsigned long mmu_seq; 1576 struct page *page; 1577 struct kvm *kvm = vcpu->kvm; 1578 void *memcache; 1579 kvm_pfn_t pfn; 1580 gfn_t gfn; 1581 int ret; 1582 1583 ret = prepare_mmu_memcache(vcpu, true, &memcache); 1584 if (ret) 1585 return ret; 1586 1587 if (nested) 1588 gfn = kvm_s2_trans_output(nested) >> PAGE_SHIFT; 1589 else 1590 gfn = fault_ipa >> PAGE_SHIFT; 1591 1592 write_fault = kvm_is_write_fault(vcpu); 1593 exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); 1594 1595 VM_WARN_ON_ONCE(write_fault && exec_fault); 1596 1597 mmu_seq = kvm->mmu_invalidate_seq; 1598 /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ 1599 smp_rmb(); 1600 1601 ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL); 1602 if (ret) { 1603 kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE, 1604 write_fault, exec_fault, false); 1605 return ret; 1606 } 1607 1608 writable = !(memslot->flags & KVM_MEM_READONLY); 1609 1610 if (nested) 1611 adjust_nested_fault_perms(nested, &prot, &writable); 1612 1613 if (writable) 1614 prot |= KVM_PGTABLE_PROT_W; 1615 1616 if (exec_fault || cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) 1617 prot |= KVM_PGTABLE_PROT_X; 1618 1619 if (nested) 1620 adjust_nested_exec_perms(kvm, nested, &prot); 1621 1622 kvm_fault_lock(kvm); 1623 if (mmu_invalidate_retry(kvm, mmu_seq)) { 1624 ret = -EAGAIN; 1625 goto out_unlock; 1626 } 1627 1628 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, PAGE_SIZE, 1629 __pfn_to_phys(pfn), prot, 1630 memcache, flags); 1631 1632 out_unlock: 1633 kvm_release_faultin_page(kvm, page, !!ret, writable); 1634 kvm_fault_unlock(kvm); 1635 1636 if (writable && !ret) 1637 mark_page_dirty_in_slot(kvm, memslot, gfn); 1638 1639 return ret != -EAGAIN ? 
ret : 0; 1640 } 1641 1642 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, 1643 struct kvm_s2_trans *nested, 1644 struct kvm_memory_slot *memslot, unsigned long hva, 1645 bool fault_is_perm) 1646 { 1647 int ret = 0; 1648 bool topup_memcache; 1649 bool write_fault, writable; 1650 bool exec_fault, mte_allowed, is_vma_cacheable; 1651 bool s2_force_noncacheable = false, vfio_allow_any_uc = false; 1652 unsigned long mmu_seq; 1653 phys_addr_t ipa = fault_ipa; 1654 struct kvm *kvm = vcpu->kvm; 1655 struct vm_area_struct *vma; 1656 short vma_shift; 1657 void *memcache; 1658 gfn_t gfn; 1659 kvm_pfn_t pfn; 1660 bool logging_active = memslot_is_logging(memslot); 1661 bool force_pte = logging_active; 1662 long vma_pagesize, fault_granule; 1663 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; 1664 struct kvm_pgtable *pgt; 1665 struct page *page; 1666 vm_flags_t vm_flags; 1667 enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED; 1668 1669 if (fault_is_perm) 1670 fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); 1671 write_fault = kvm_is_write_fault(vcpu); 1672 exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); 1673 VM_WARN_ON_ONCE(write_fault && exec_fault); 1674 1675 /* 1676 * Permission faults just need to update the existing leaf entry, 1677 * and so normally don't require allocations from the memcache. The 1678 * only exception to this is when dirty logging is enabled at runtime 1679 * and a write fault needs to collapse a block entry into a table. 1680 */ 1681 topup_memcache = !fault_is_perm || (logging_active && write_fault); 1682 ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache); 1683 if (ret) 1684 return ret; 1685 1686 /* 1687 * Let's check if we will get back a huge page backed by hugetlbfs, or 1688 * get block mapping for device MMIO region. 1689 */ 1690 mmap_read_lock(current->mm); 1691 vma = vma_lookup(current->mm, hva); 1692 if (unlikely(!vma)) { 1693 kvm_err("Failed to find VMA for hva 0x%lx\n", hva); 1694 mmap_read_unlock(current->mm); 1695 return -EFAULT; 1696 } 1697 1698 if (force_pte) 1699 vma_shift = PAGE_SHIFT; 1700 else 1701 vma_shift = get_vma_page_shift(vma, hva); 1702 1703 switch (vma_shift) { 1704 #ifndef __PAGETABLE_PMD_FOLDED 1705 case PUD_SHIFT: 1706 if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) 1707 break; 1708 fallthrough; 1709 #endif 1710 case CONT_PMD_SHIFT: 1711 vma_shift = PMD_SHIFT; 1712 fallthrough; 1713 case PMD_SHIFT: 1714 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) 1715 break; 1716 fallthrough; 1717 case CONT_PTE_SHIFT: 1718 vma_shift = PAGE_SHIFT; 1719 force_pte = true; 1720 fallthrough; 1721 case PAGE_SHIFT: 1722 break; 1723 default: 1724 WARN_ONCE(1, "Unknown vma_shift %d", vma_shift); 1725 } 1726 1727 vma_pagesize = 1UL << vma_shift; 1728 1729 if (nested) { 1730 unsigned long max_map_size; 1731 1732 max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE; 1733 1734 ipa = kvm_s2_trans_output(nested); 1735 1736 /* 1737 * If we're about to create a shadow stage 2 entry, then we 1738 * can only create a block mapping if the guest stage 2 page 1739 * table uses at least as big a mapping. 1740 */ 1741 max_map_size = min(kvm_s2_trans_size(nested), max_map_size); 1742 1743 /* 1744 * Be careful that if the mapping size falls between 1745 * two host sizes, take the smallest of the two. 
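		 * e.g. a size that falls between PMD_SIZE and PUD_SIZE is
		 * clamped down to PMD_SIZE.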
1746 */ 1747 if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE) 1748 max_map_size = PMD_SIZE; 1749 else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE) 1750 max_map_size = PAGE_SIZE; 1751 1752 force_pte = (max_map_size == PAGE_SIZE); 1753 vma_pagesize = min_t(long, vma_pagesize, max_map_size); 1754 vma_shift = __ffs(vma_pagesize); 1755 } 1756 1757 /* 1758 * Both the canonical IPA and fault IPA must be aligned to the 1759 * mapping size to ensure we find the right PFN and lay down the 1760 * mapping in the right place. 1761 */ 1762 fault_ipa = ALIGN_DOWN(fault_ipa, vma_pagesize); 1763 ipa = ALIGN_DOWN(ipa, vma_pagesize); 1764 1765 gfn = ipa >> PAGE_SHIFT; 1766 mte_allowed = kvm_vma_mte_allowed(vma); 1767 1768 vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; 1769 1770 vm_flags = vma->vm_flags; 1771 1772 is_vma_cacheable = kvm_vma_is_cacheable(vma); 1773 1774 /* Don't use the VMA after the unlock -- it may have vanished */ 1775 vma = NULL; 1776 1777 /* 1778 * Read mmu_invalidate_seq so that KVM can detect if the results of 1779 * vma_lookup() or __kvm_faultin_pfn() become stale prior to 1780 * acquiring kvm->mmu_lock. 1781 * 1782 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs 1783 * with the smp_wmb() in kvm_mmu_invalidate_end(). 1784 */ 1785 mmu_seq = kvm->mmu_invalidate_seq; 1786 mmap_read_unlock(current->mm); 1787 1788 pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, 1789 &writable, &page); 1790 if (pfn == KVM_PFN_ERR_HWPOISON) { 1791 kvm_send_hwpoison_signal(hva, vma_shift); 1792 return 0; 1793 } 1794 if (is_error_noslot_pfn(pfn)) 1795 return -EFAULT; 1796 1797 /* 1798 * Check if this is non-struct page memory PFN, and cannot support 1799 * CMOs. It could potentially be unsafe to access as cacheable. 1800 */ 1801 if (vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(pfn)) { 1802 if (is_vma_cacheable) { 1803 /* 1804 * Whilst the VMA owner expects cacheable mapping to this 1805 * PFN, hardware also has to support the FWB and CACHE DIC 1806 * features. 1807 * 1808 * ARM64 KVM relies on kernel VA mapping to the PFN to 1809 * perform cache maintenance as the CMO instructions work on 1810 * virtual addresses. VM_PFNMAP region are not necessarily 1811 * mapped to a KVA and hence the presence of hardware features 1812 * S2FWB and CACHE DIC are mandatory to avoid the need for 1813 * cache maintenance. 1814 */ 1815 if (!kvm_supports_cacheable_pfnmap()) 1816 ret = -EFAULT; 1817 } else { 1818 /* 1819 * If the page was identified as device early by looking at 1820 * the VMA flags, vma_pagesize is already representing the 1821 * largest quantity we can map. If instead it was mapped 1822 * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE 1823 * and must not be upgraded. 1824 * 1825 * In both cases, we don't let transparent_hugepage_adjust() 1826 * change things at the last minute. 1827 */ 1828 s2_force_noncacheable = true; 1829 } 1830 } else if (logging_active && !write_fault) { 1831 /* 1832 * Only actually map the page as writable if this was a write 1833 * fault. 1834 */ 1835 writable = false; 1836 } 1837 1838 if (exec_fault && s2_force_noncacheable) 1839 ret = -ENOEXEC; 1840 1841 if (ret) 1842 goto out_put_page; 1843 1844 /* 1845 * Guest performs atomic/exclusive operations on memory with unsupported 1846 * attributes (e.g. ld64b/st64b on normal memory when no FEAT_LS64WB) 1847 * and trigger the exception here. Since the memslot is valid, inject 1848 * the fault back to the guest. 
1849 */ 1850 if (esr_fsc_is_excl_atomic_fault(kvm_vcpu_get_esr(vcpu))) { 1851 kvm_inject_dabt_excl_atomic(vcpu, kvm_vcpu_get_hfar(vcpu)); 1852 ret = 1; 1853 goto out_put_page; 1854 } 1855 1856 if (nested) 1857 adjust_nested_fault_perms(nested, &prot, &writable); 1858 1859 kvm_fault_lock(kvm); 1860 pgt = vcpu->arch.hw_mmu->pgt; 1861 if (mmu_invalidate_retry(kvm, mmu_seq)) { 1862 ret = -EAGAIN; 1863 goto out_unlock; 1864 } 1865 1866 /* 1867 * If we are not forced to use page mapping, check if we are 1868 * backed by a THP and thus use block mapping if possible. 1869 */ 1870 if (vma_pagesize == PAGE_SIZE && !(force_pte || s2_force_noncacheable)) { 1871 if (fault_is_perm && fault_granule > PAGE_SIZE) 1872 vma_pagesize = fault_granule; 1873 else 1874 vma_pagesize = transparent_hugepage_adjust(kvm, memslot, 1875 hva, &pfn, 1876 &fault_ipa); 1877 1878 if (vma_pagesize < 0) { 1879 ret = vma_pagesize; 1880 goto out_unlock; 1881 } 1882 } 1883 1884 if (!fault_is_perm && !s2_force_noncacheable && kvm_has_mte(kvm)) { 1885 /* Check the VMM hasn't introduced a new disallowed VMA */ 1886 if (mte_allowed) { 1887 sanitise_mte_tags(kvm, pfn, vma_pagesize); 1888 } else { 1889 ret = -EFAULT; 1890 goto out_unlock; 1891 } 1892 } 1893 1894 if (writable) 1895 prot |= KVM_PGTABLE_PROT_W; 1896 1897 if (exec_fault) 1898 prot |= KVM_PGTABLE_PROT_X; 1899 1900 if (s2_force_noncacheable) { 1901 if (vfio_allow_any_uc) 1902 prot |= KVM_PGTABLE_PROT_NORMAL_NC; 1903 else 1904 prot |= KVM_PGTABLE_PROT_DEVICE; 1905 } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) { 1906 prot |= KVM_PGTABLE_PROT_X; 1907 } 1908 1909 if (nested) 1910 adjust_nested_exec_perms(kvm, nested, &prot); 1911 1912 /* 1913 * Under the premise of getting a FSC_PERM fault, we just need to relax 1914 * permissions only if vma_pagesize equals fault_granule. Otherwise, 1915 * kvm_pgtable_stage2_map() should be called to change block size. 1916 */ 1917 if (fault_is_perm && vma_pagesize == fault_granule) { 1918 /* 1919 * Drop the SW bits in favour of those stored in the 1920 * PTE, which will be preserved. 1921 */ 1922 prot &= ~KVM_NV_GUEST_MAP_SZ; 1923 ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags); 1924 } else { 1925 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize, 1926 __pfn_to_phys(pfn), prot, 1927 memcache, flags); 1928 } 1929 1930 out_unlock: 1931 kvm_release_faultin_page(kvm, page, !!ret, writable); 1932 kvm_fault_unlock(kvm); 1933 1934 /* Mark the page dirty only if the fault is handled successfully */ 1935 if (writable && !ret) 1936 mark_page_dirty_in_slot(kvm, memslot, gfn); 1937 1938 return ret != -EAGAIN ? ret : 0; 1939 1940 out_put_page: 1941 kvm_release_page_unused(page); 1942 return ret; 1943 } 1944 1945 /* Resolve the access fault by making the page young again. */ 1946 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) 1947 { 1948 enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED; 1949 struct kvm_s2_mmu *mmu; 1950 1951 trace_kvm_access_fault(fault_ipa); 1952 1953 read_lock(&vcpu->kvm->mmu_lock); 1954 mmu = vcpu->arch.hw_mmu; 1955 KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags); 1956 read_unlock(&vcpu->kvm->mmu_lock); 1957 } 1958 1959 /* 1960 * Returns true if the SEA should be handled locally within KVM if the abort 1961 * is caused by a kernel memory allocation (e.g. stage-2 table memory). 
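 * When it does, KVM injects an SError into the guest rather than exiting to
 * userspace (see kvm_handle_guest_sea() below).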
	kvm_fault_lock(kvm);
	pgt = vcpu->arch.hw_mmu->pgt;
	if (mmu_invalidate_retry(kvm, mmu_seq)) {
		ret = -EAGAIN;
		goto out_unlock;
	}

	/*
	 * If we are not forced to use page mapping, check if we are
	 * backed by a THP and thus use block mapping if possible.
	 */
	if (vma_pagesize == PAGE_SIZE && !(force_pte || s2_force_noncacheable)) {
		if (fault_is_perm && fault_granule > PAGE_SIZE)
			vma_pagesize = fault_granule;
		else
			vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
								   hva, &pfn,
								   &fault_ipa);

		if (vma_pagesize < 0) {
			ret = vma_pagesize;
			goto out_unlock;
		}
	}

	if (!fault_is_perm && !s2_force_noncacheable && kvm_has_mte(kvm)) {
		/* Check the VMM hasn't introduced a new disallowed VMA */
		if (mte_allowed) {
			sanitise_mte_tags(kvm, pfn, vma_pagesize);
		} else {
			ret = -EFAULT;
			goto out_unlock;
		}
	}

	if (writable)
		prot |= KVM_PGTABLE_PROT_W;

	if (exec_fault)
		prot |= KVM_PGTABLE_PROT_X;

	if (s2_force_noncacheable) {
		if (vfio_allow_any_uc)
			prot |= KVM_PGTABLE_PROT_NORMAL_NC;
		else
			prot |= KVM_PGTABLE_PROT_DEVICE;
	} else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) {
		prot |= KVM_PGTABLE_PROT_X;
	}

	if (nested)
		adjust_nested_exec_perms(kvm, nested, &prot);

	/*
	 * On an FSC_PERM fault, we only need to relax the permissions if
	 * vma_pagesize equals fault_granule. Otherwise,
	 * kvm_pgtable_stage2_map() should be called to change the block size.
	 */
	if (fault_is_perm && vma_pagesize == fault_granule) {
		/*
		 * Drop the SW bits in favour of those stored in the
		 * PTE, which will be preserved.
		 */
		prot &= ~KVM_NV_GUEST_MAP_SZ;
		ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags);
	} else {
		ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
							 __pfn_to_phys(pfn), prot,
							 memcache, flags);
	}

out_unlock:
	kvm_release_faultin_page(kvm, page, !!ret, writable);
	kvm_fault_unlock(kvm);

	/* Mark the page dirty only if the fault is handled successfully */
	if (writable && !ret)
		mark_page_dirty_in_slot(kvm, memslot, gfn);

	return ret != -EAGAIN ? ret : 0;

out_put_page:
	kvm_release_page_unused(page);
	return ret;
}

/* Resolve the access fault by making the page young again. */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
	struct kvm_s2_mmu *mmu;

	trace_kvm_access_fault(fault_ipa);

	read_lock(&vcpu->kvm->mmu_lock);
	mmu = vcpu->arch.hw_mmu;
	KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags);
	read_unlock(&vcpu->kvm->mmu_lock);
}

/*
 * Returns true if the SEA should be handled locally within KVM, i.e. the
 * abort was caused by a host-imposed access to kernel-allocated memory
 * (e.g. stage-2 table memory).
 */
static bool host_owns_sea(struct kvm_vcpu *vcpu, u64 esr)
{
	/*
	 * Without FEAT_RAS HCR_EL2.TEA is RES0, meaning any external abort
	 * taken from a guest EL to EL2 is due to a host-imposed access (e.g.
	 * stage-2 PTW).
	 */
	if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
		return true;

	/* KVM owns the VNCR when the vCPU isn't in a nested context. */
	if (is_hyp_ctxt(vcpu) && !kvm_vcpu_trap_is_iabt(vcpu) && (esr & ESR_ELx_VNCR))
		return true;

	/*
	 * Determining whether an external abort during a table walk happened
	 * at stage-2 is only possible when S1PTW is set. Otherwise, since KVM
	 * sets HCR_EL2.TEA, SEAs due to a stage-1 walk (i.e. accessing the
	 * PA of the stage-1 descriptor) can reach here and are reported
	 * with a TTW ESR value.
	 */
	return (esr_fsc_is_sea_ttw(esr) && (esr & ESR_ELx_S1PTW));
}

int kvm_handle_guest_sea(struct kvm_vcpu *vcpu)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvm_run *run = vcpu->run;
	u64 esr = kvm_vcpu_get_esr(vcpu);
	u64 esr_mask = ESR_ELx_EC_MASK |
		       ESR_ELx_IL |
		       ESR_ELx_FnV |
		       ESR_ELx_EA |
		       ESR_ELx_CM |
		       ESR_ELx_WNR |
		       ESR_ELx_FSC;
	u64 ipa;

	/*
	 * Give APEI the opportunity to claim the abort before handling it
	 * within KVM. apei_claim_sea() expects to be called with IRQs enabled.
	 */
	lockdep_assert_irqs_enabled();
	if (apei_claim_sea(NULL) == 0)
		return 1;

	if (host_owns_sea(vcpu, esr) ||
	    !test_bit(KVM_ARCH_FLAG_EXIT_SEA, &vcpu->kvm->arch.flags))
		return kvm_inject_serror(vcpu);

	/* ESR_ELx.SET is RES0 when FEAT_RAS isn't implemented. */
	if (kvm_has_ras(kvm))
		esr_mask |= ESR_ELx_SET_MASK;

	/*
	 * Exit to userspace, and provide the faulting guest virtual and
	 * physical addresses in case userspace wants to emulate the SEA to
	 * the guest by writing to the FAR_ELx and HPFAR_ELx registers.
	 */
	memset(&run->arm_sea, 0, sizeof(run->arm_sea));
	run->exit_reason = KVM_EXIT_ARM_SEA;
	run->arm_sea.esr = esr & esr_mask;

	if (!(esr & ESR_ELx_FnV))
		run->arm_sea.gva = kvm_vcpu_get_hfar(vcpu);

	ipa = kvm_vcpu_get_fault_ipa(vcpu);
	if (ipa != INVALID_GPA) {
		run->arm_sea.flags |= KVM_EXIT_ARM_SEA_FLAG_GPA_VALID;
		run->arm_sea.gpa = ipa;
	}

	return 0;
}

/**
 * kvm_handle_guest_abort - handles all 2nd stage aborts
 * @vcpu:	the VCPU pointer
 *
 * Any abort that gets to the host is almost guaranteed to be caused by a
 * missing second stage translation table entry, which can mean that either the
 * guest simply needs more memory and we must allocate an appropriate page, or
 * the guest tried to access I/O memory, which is emulated by user space. The
 * distinction is based on the IPA causing the fault and whether this memory
 * region has been registered as standard RAM by user space.
 */
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
{
	struct kvm_s2_trans nested_trans, *nested = NULL;
	unsigned long esr;
	phys_addr_t fault_ipa;	/* The address we faulted on */
	phys_addr_t ipa;	/* Always the IPA in the L1 guest phys space */
	struct kvm_memory_slot *memslot;
	unsigned long hva;
	bool is_iabt, write_fault, writable;
	gfn_t gfn;
	int ret, idx;

	if (kvm_vcpu_abt_issea(vcpu))
		return kvm_handle_guest_sea(vcpu);

	esr = kvm_vcpu_get_esr(vcpu);

	/*
	 * The fault IPA should be reliable at this point as we're not dealing
	 * with an SEA.
	 */
	ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
	if (KVM_BUG_ON(ipa == INVALID_GPA, vcpu->kvm))
		return -EFAULT;

	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);

	if (esr_fsc_is_translation_fault(esr)) {
		/* Beyond sanitised PARange (which is the IPA limit) */
		if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
			kvm_inject_size_fault(vcpu);
			return 1;
		}

		/* Falls between the IPA range and the PARange? */
		if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) {
			fault_ipa |= FAR_TO_FIPA_OFFSET(kvm_vcpu_get_hfar(vcpu));

			return kvm_inject_sea(vcpu, is_iabt, fault_ipa);
		}
	}

	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
			      kvm_vcpu_get_hfar(vcpu), fault_ipa);

	/*
	 * Check that the stage-2 fault is a translation, permission,
	 * access flag or exclusive/atomic fault.
	 */
	if (!esr_fsc_is_translation_fault(esr) &&
	    !esr_fsc_is_permission_fault(esr) &&
	    !esr_fsc_is_access_flag_fault(esr) &&
	    !esr_fsc_is_excl_atomic_fault(esr)) {
		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
			kvm_vcpu_trap_get_class(vcpu),
			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
			(unsigned long)kvm_vcpu_get_esr(vcpu));
		return -EFAULT;
	}

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	/*
	 * We may have faulted on a shadow stage 2 page table if we are
	 * running a nested guest. In this case, we have to resolve the L2
	 * IPA to the L1 IPA first, before knowing what kind of memory should
	 * back the L1 IPA.
	 *
	 * If the shadow stage 2 page table walk faults, then we simply inject
	 * this to the guest and carry on.
	 *
	 * If there are no shadow S2 PTs because S2 is disabled, there is
	 * nothing to walk and we treat it as a 1:1 before going through the
	 * canonical translation.
	 */
	if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu) &&
	    vcpu->arch.hw_mmu->nested_stage2_enabled) {
		u32 esr;

		ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
		if (ret == -EAGAIN) {
			ret = 1;
			goto out_unlock;
		}

		if (ret) {
			esr = kvm_s2_trans_esr(&nested_trans);
			kvm_inject_s2_fault(vcpu, esr);
			goto out_unlock;
		}

		ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans);
		if (ret) {
			esr = kvm_s2_trans_esr(&nested_trans);
			kvm_inject_s2_fault(vcpu, esr);
			goto out_unlock;
		}

		ipa = kvm_s2_trans_output(&nested_trans);
		nested = &nested_trans;
	}

	gfn = ipa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(vcpu->kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
	write_fault = kvm_is_write_fault(vcpu);
	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
		/*
		 * The guest has put either its instructions or its page-tables
		 * somewhere it shouldn't have. Userspace won't be able to do
		 * anything about this (there's no syndrome for a start), so
		 * re-inject the abort back into the guest.
		 */
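
		/*
		 * Instruction fetches from MMIO or from outside any memslot
		 * cannot be emulated; flag -ENOEXEC so an external
		 * instruction abort is injected at the "out" label below.
		 */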
		if (is_iabt) {
			ret = -ENOEXEC;
			goto out;
		}

		if (kvm_vcpu_abt_iss1tw(vcpu)) {
			ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
			goto out_unlock;
		}

		/*
		 * Check for a cache maintenance operation. Since we
		 * ended-up here, we know it is outside of any memory
		 * slot. But we can't find out if that is for a device,
		 * or if the guest is just being stupid. The only thing
		 * we know for sure is that this range cannot be cached.
		 *
		 * So let's assume that the guest is just being
		 * cautious, and skip the instruction.
		 */
		if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
			kvm_incr_pc(vcpu);
			ret = 1;
			goto out_unlock;
		}

		/*
		 * The IPA is reported as [MAX:12], so we need to
		 * complement it with the bottom 12 bits from the
		 * faulting VA. This is always 12 bits, irrespective
		 * of the page size.
		 */
		ipa |= FAR_TO_FIPA_OFFSET(kvm_vcpu_get_hfar(vcpu));
		ret = io_mem_abort(vcpu, ipa);
		goto out_unlock;
	}

	/* Userspace should not be able to register out-of-bounds IPAs */
	VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu));

	if (esr_fsc_is_access_flag_fault(esr)) {
		handle_access_fault(vcpu, fault_ipa);
		ret = 1;
		goto out_unlock;
	}

	VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
			!write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu));

	if (kvm_slot_has_gmem(memslot))
		ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
				 esr_fsc_is_permission_fault(esr));
	else
		ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
				     esr_fsc_is_permission_fault(esr));
	if (ret == 0)
		ret = 1;
out:
	if (ret == -ENOEXEC)
		ret = kvm_inject_sea_iabt(vcpu, kvm_vcpu_get_hfar(vcpu));
out_unlock:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	return ret;
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	if (!kvm->arch.mmu.pgt)
		return false;

	__unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
			     (range->end - range->start) << PAGE_SHIFT,
			     range->may_block);

	kvm_nested_s2_unmap(kvm, range->may_block);
	return false;
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	u64 size = (range->end - range->start) << PAGE_SHIFT;

	if (!kvm->arch.mmu.pgt)
		return false;

	return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
							       range->start << PAGE_SHIFT,
							       size, true);
	/*
	 * TODO: Handle nested_mmu structures here using the reverse mapping in
	 * a later version of this patch series.
	 */
}
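
/*
 * As kvm_age_gfn(), but only test whether the range has been accessed;
 * the young/accessed state is left untouched.
 */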
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	u64 size = (range->end - range->start) << PAGE_SHIFT;

	if (!kvm->arch.mmu.pgt)
		return false;

	return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
							       range->start << PAGE_SHIFT,
							       size, false);
}

phys_addr_t kvm_mmu_get_httbr(void)
{
	return __pa(hyp_pgtable->pgd);
}

phys_addr_t kvm_get_idmap_vector(void)
{
	return hyp_idmap_vector;
}

static int kvm_map_idmap_text(void)
{
	unsigned long size = hyp_idmap_end - hyp_idmap_start;
	int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
					PAGE_HYP_EXEC);
	if (err)
		kvm_err("Failed to idmap %lx-%lx\n",
			hyp_idmap_start, hyp_idmap_end);

	return err;
}

static void *kvm_hyp_zalloc_page(void *arg)
{
	return (void *)get_zeroed_page(GFP_KERNEL);
}

static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
	.zalloc_page		= kvm_hyp_zalloc_page,
	.get_page		= kvm_host_get_page,
	.put_page		= kvm_host_put_page,
	.phys_to_virt		= kvm_host_va,
	.virt_to_phys		= kvm_host_pa,
};

int __init kvm_mmu_init(u32 hyp_va_bits)
{
	int err;

	hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
	hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
	hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);

	/*
	 * We rely on the linker script to ensure at build time that the HYP
	 * init code does not cross a page boundary.
	 */
	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);

	kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
	kvm_debug("HYP VA range: %lx:%lx\n",
		  kern_hyp_va(PAGE_OFFSET),
		  kern_hyp_va((unsigned long)high_memory - 1));

	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
	    hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
		/*
		 * The idmap page intersects the HYP VA space; it is not
		 * safe to continue further.
		 */
		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
		err = -EINVAL;
		goto out;
	}
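
	/*
	 * Allocate the EL2 page-table structure, initialise it, and create
	 * the identity mapping of the HYP init text within it.
	 */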
	hyp_pgtable = kzalloc_obj(*hyp_pgtable);
	if (!hyp_pgtable) {
		kvm_err("Hyp mode page-table not allocated\n");
		err = -ENOMEM;
		goto out;
	}

	err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits, &kvm_hyp_mm_ops);
	if (err)
		goto out_free_pgtable;

	err = kvm_map_idmap_text();
	if (err)
		goto out_destroy_pgtable;

	io_map_base = hyp_idmap_start;
	__hyp_va_bits = hyp_va_bits;
	return 0;

out_destroy_pgtable:
	kvm_pgtable_hyp_destroy(hyp_pgtable);
out_free_pgtable:
	kfree(hyp_pgtable);
	hyp_pgtable = NULL;
out:
	return err;
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;

	/*
	 * At this point the memslot has been committed and there is an
	 * allocated dirty_bitmap[]; dirty pages will be tracked while the
	 * memory slot is write protected.
	 */
	if (log_dirty_pages) {

		if (change == KVM_MR_DELETE)
			return;

		/*
		 * Huge and normal pages are write-protected and split
		 * on either of these two cases:
		 *
		 * 1. with initial-all-set: gradually with CLEAR ioctls,
		 */
		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
			return;
		/*
		 * or
		 * 2. without initial-all-set: all in one shot when
		 *    enabling dirty logging.
		 */
		kvm_mmu_wp_memory_region(kvm, new->id);
		kvm_mmu_split_memory_region(kvm, new->id);
	} else {
		/*
		 * Free any leftovers from the eager page splitting cache. Do
		 * this when deleting, moving, disabling dirty logging, or
		 * creating the memslot (a nop). Doing it for deletes makes
		 * sure we don't leak memory, and there's no need to keep the
		 * cache around for any of the other cases.
		 */
		kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
	}
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
				   const struct kvm_memory_slot *old,
				   struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	hva_t hva, reg_end;
	int ret = 0;

	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
	    change != KVM_MR_FLAGS_ONLY)
		return 0;

	/*
	 * Prevent userspace from creating a memory region outside of the
	 * IPA space addressable by the guest.
	 */
	if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
		return -EFAULT;

	/*
	 * Only support guest_memfd backed memslots with mappable memory, since
	 * there aren't any CoCo VMs that support only private memory on arm64.
	 */
	if (kvm_slot_has_gmem(new) && !kvm_memslot_is_gmem_only(new))
		return -EINVAL;

	hva = new->userspace_addr;
	reg_end = hva + (new->npages << PAGE_SHIFT);

	mmap_read_lock(current->mm);
	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma;

		vma = find_vma_intersection(current->mm, hva, reg_end);
		if (!vma)
			break;

		if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
			ret = -EINVAL;
			break;
		}

		if (vma->vm_flags & VM_PFNMAP) {
			/* IO region dirty page logging not allowed */
			if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
				ret = -EINVAL;
				break;
			}

			/*
			 * Cacheable PFNMAP is allowed only if the hardware
			 * supports it.
			 */
			if (kvm_vma_is_cacheable(vma) && !kvm_supports_cacheable_pfnmap()) {
				ret = -EINVAL;
				break;
			}
		}
		hva = min(reg_end, vma->vm_end);
	} while (hva < reg_end);

	mmap_read_unlock(current->mm);
	return ret;
}

void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
}

void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = slot->npages << PAGE_SHIFT;

	write_lock(&kvm->mmu_lock);
	kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true);
	kvm_nested_s2_unmap(kvm, true);
	write_unlock(&kvm->mmu_lock);
}

/*
 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
 *
 * Main problems:
 * - S/W ops are local to a CPU (not broadcast)
 * - We have line migration behind our back (speculation)
 * - System caches don't support S/W at all (damn!)
 *
 * In the face of the above, the best we can do is to try and convert
 * S/W ops to VA ops. Because the guest is not allowed to infer the
 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
 * which is a rather good thing for us.
 *
 * Also, it is only used when turning caches on/off ("The expected
 * usage of the cache maintenance instructions that operate by set/way
 * is associated with the cache maintenance instructions associated
 * with the powerdown and powerup of caches, if this is required by
 * the implementation.").
 *
 * We use the following policy:
 *
 * - If we trap a S/W operation, we enable VM trapping to detect
 *   caches being turned on/off, and do a full clean.
 *
 * - We flush the caches on both caches being turned on and off.
 *
 * - Once the caches are enabled, we stop trapping VM ops.
 */
void kvm_set_way_flush(struct kvm_vcpu *vcpu)
{
	unsigned long hcr = *vcpu_hcr(vcpu);

	/*
	 * If this is the first time we do a S/W operation
	 * (i.e. HCR_TVM not set) flush the whole memory, and set the
	 * VM trapping.
	 *
	 * Otherwise, rely on the VM trapping to wait for the MMU +
	 * Caches to be turned off. At that point, we'll be able to
	 * clean the caches again.
	 */
	if (!(hcr & HCR_TVM)) {
		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
					vcpu_has_cache_enabled(vcpu));
		stage2_flush_vm(vcpu->kvm);
		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
	}
}

void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
{
	bool now_enabled = vcpu_has_cache_enabled(vcpu);

	/*
	 * If switching the MMU+caches on, need to invalidate the caches.
	 * If switching it off, need to clean the caches.
	 * Clean + invalidate does the trick always.
	 */
	if (now_enabled != was_enabled)
		stage2_flush_vm(vcpu->kvm);

	/* Caches are now on, stop trapping VM ops (until a S/W op) */
	if (now_enabled)
		*vcpu_hcr(vcpu) &= ~HCR_TVM;

	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
}