// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/acpi.h>
#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
#include <asm/acpi.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/kvm_pkvm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>

#include "trace.h"

static struct kvm_pgtable *hyp_pgtable;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

static unsigned long __ro_after_init hyp_idmap_start;
static unsigned long __ro_after_init hyp_idmap_end;
static phys_addr_t __ro_after_init hyp_idmap_vector;

u32 __ro_after_init __hyp_va_bits;

static unsigned long __ro_after_init io_map_base;

#define KVM_PGT_FN(fn)	(!is_protected_kvm_enabled() ? fn : p ## fn)

static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
					   phys_addr_t size)
{
	phys_addr_t boundary = ALIGN_DOWN(addr + size, size);

	return (boundary - 1 < end - 1) ? boundary : end;
}

static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);

	return __stage2_range_addr_end(addr, end, size);
}

/*
 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
 * long will also starve other vCPUs. We also have to make sure that the page
 * tables are not freed while the lock is released.
 */
static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
			      phys_addr_t end,
			      int (*fn)(struct kvm_pgtable *, u64, u64),
			      bool resched)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	int ret;
	u64 next;

	do {
		struct kvm_pgtable *pgt = mmu->pgt;
		if (!pgt)
			return -EINVAL;

		next = stage2_range_addr_end(addr, end);
		ret = fn(pgt, addr, next - addr);
		if (ret)
			break;

		if (resched && next != end)
			cond_resched_rwlock_write(&kvm->mmu_lock);
	} while (addr = next, addr != end);

	return ret;
}

#define stage2_apply_range_resched(mmu, addr, end, fn)			\
	stage2_apply_range(mmu, addr, end, fn, true)
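
/*
 * Illustration of the chunking above (assuming a 2MiB minimum block size,
 * i.e. level-2 blocks with 4K pages):
 *
 *   __stage2_range_addr_end(0x40001000, 0x40600000, SZ_2M)
 *     boundary = ALIGN_DOWN(0x40001000 + SZ_2M, SZ_2M) = 0x40200000
 *
 * so the first call to fn() covers [0x40001000, 0x40200000) and the walk
 * resumes from the 2MiB boundary, dropping the lock in between if requested.
 */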

/*
 * Get the maximum number of page-tables pages needed to split a range
 * of blocks into PAGE_SIZE PTEs. It assumes the range is already
 * mapped at level 2, or at level 1 if allowed.
 */
static int kvm_mmu_split_nr_page_tables(u64 range)
{
	int n = 0;

	if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2)
		n += DIV_ROUND_UP(range, PUD_SIZE);
	n += DIV_ROUND_UP(range, PMD_SIZE);
	return n;
}

static bool need_split_memcache_topup_or_resched(struct kvm *kvm)
{
	struct kvm_mmu_memory_cache *cache;
	u64 chunk_size, min;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
		return true;

	chunk_size = kvm->arch.mmu.split_page_chunk_size;
	min = kvm_mmu_split_nr_page_tables(chunk_size);
	cache = &kvm->arch.mmu.split_page_cache;
	return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
}

static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
				    phys_addr_t end)
{
	struct kvm_mmu_memory_cache *cache;
	struct kvm_pgtable *pgt;
	int ret, cache_capacity;
	u64 next, chunk_size;

	lockdep_assert_held_write(&kvm->mmu_lock);

	chunk_size = kvm->arch.mmu.split_page_chunk_size;
	cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size);

	if (chunk_size == 0)
		return 0;

	cache = &kvm->arch.mmu.split_page_cache;

	do {
		if (need_split_memcache_topup_or_resched(kvm)) {
			write_unlock(&kvm->mmu_lock);
			cond_resched();
			/* Eager page splitting is best-effort. */
			ret = __kvm_mmu_topup_memory_cache(cache,
							   cache_capacity,
							   cache_capacity);
			write_lock(&kvm->mmu_lock);
			if (ret)
				break;
		}

		pgt = kvm->arch.mmu.pgt;
		if (!pgt)
			return -EINVAL;

		next = __stage2_range_addr_end(addr, end, chunk_size);
		ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache);
		if (ret)
			break;
	} while (addr = next, addr != end);

	return ret;
}
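
/*
 * Illustrative sizing of the split cache above (assuming 4K pages, so
 * PUD_SIZE = 1GiB and PMD_SIZE = 2MiB): a 1GiB split_page_chunk_size needs
 * DIV_ROUND_UP(1G, 2M) = 512 page-table pages, plus one more if level-1
 * blocks are allowed, and that is the capacity the memcache is topped up to
 * (outside the lock, since the allocation may sleep) before each chunk is
 * handed to kvm_pgtable_stage2_split().
 */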

static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
}

/**
 * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8
 * @kvm:	pointer to kvm structure.
 *
 * Interface to HYP function to flush all VM TLB entries
 */
int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
	if (is_protected_kvm_enabled())
		kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
	else
		kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
	return 0;
}

int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
				     gfn_t gfn, u64 nr_pages)
{
	u64 size = nr_pages << PAGE_SHIFT;
	u64 addr = gfn << PAGE_SHIFT;

	if (is_protected_kvm_enabled())
		kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
	else
		kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size);
	return 0;
}

static void *stage2_memcache_zalloc_page(void *arg)
{
	struct kvm_mmu_memory_cache *mc = arg;
	void *virt;

	/* Allocated with __GFP_ZERO, so no need to zero */
	virt = kvm_mmu_memory_cache_alloc(mc);
	if (virt)
		kvm_account_pgtable_pages(virt, 1);
	return virt;
}

static void *kvm_host_zalloc_pages_exact(size_t size)
{
	return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
}

static void *kvm_s2_zalloc_pages_exact(size_t size)
{
	void *virt = kvm_host_zalloc_pages_exact(size);

	if (virt)
		kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT));
	return virt;
}

static void kvm_s2_free_pages_exact(void *virt, size_t size)
{
	kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT));
	free_pages_exact(virt, size);
}

static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;

static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head)
{
	struct page *page = container_of(head, struct page, rcu_head);
	void *pgtable = page_to_virt(page);
	s8 level = page_private(page);

	KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level);
}

static void stage2_free_unlinked_table(void *addr, s8 level)
{
	struct page *page = virt_to_page(addr);

	set_page_private(page, (unsigned long)level);
	call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb);
}

static void kvm_host_get_page(void *addr)
{
	get_page(virt_to_page(addr));
}

static void kvm_host_put_page(void *addr)
{
	put_page(virt_to_page(addr));
}

static void kvm_s2_put_page(void *addr)
{
	struct page *p = virt_to_page(addr);
	/* Dropping last refcount, the page will be freed */
	if (page_count(p) == 1)
		kvm_account_pgtable_pages(addr, -1);
	put_page(p);
}

static int kvm_host_page_count(void *addr)
{
	return page_count(virt_to_page(addr));
}

static phys_addr_t kvm_host_pa(void *addr)
{
	return __pa(addr);
}

static void *kvm_host_va(phys_addr_t phys)
{
	return __va(phys);
}

static void clean_dcache_guest_page(void *va, size_t size)
{
	__clean_dcache_guest_page(va, size);
}

static void invalidate_icache_guest_page(void *va, size_t size)
{
	__invalidate_icache_guest_page(va, size);
}

/*
 * Unmapping vs dcache management:
 *
 * If a guest maps certain memory pages as uncached, all writes will
 * bypass the data cache and go directly to RAM. However, the CPUs
 * can still speculate reads (not writes) and fill cache lines with
 * data.
 *
 * Those cache lines will be *clean* cache lines though, so a
 * clean+invalidate operation is equivalent to an invalidate
 * operation, because no cache lines are marked dirty.
 *
 * Those clean cache lines could be filled prior to an uncached write
 * by the guest, and the cache coherent IO subsystem would therefore
 * end up writing old data to disk.
 *
 * This is why right after unmapping a page/section and invalidating
 * the corresponding TLBs, we flush to make sure the IO subsystem will
 * never hit in the cache.
 *
 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
 * we then fully enforce cacheability of RAM, no matter what the guest
 * does.
 */
/**
 * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 * @mmu:	The KVM stage-2 MMU pointer
 * @start:	The intermediate physical base address of the range to unmap
 * @size:	The size of the area to unmap
 * @may_block:	Whether or not we are permitted to block
 *
 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 * destroying the VM), otherwise another faulting VCPU may come in and mess
 * with things behind our backs.
 */
static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
				 bool may_block)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	phys_addr_t end = start + size;

	lockdep_assert_held_write(&kvm->mmu_lock);
	WARN_ON(size & ~PAGE_MASK);
	WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap),
				   may_block));
}

void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start,
			    u64 size, bool may_block)
{
	__unmap_stage2_range(mmu, start, size, may_block);
}

void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
	stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush));
}

static void stage2_flush_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;

	kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
}

/**
 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 * @kvm: The struct kvm pointer
 *
 * Go through the stage 2 page tables and invalidate any cache lines
 * backing memory already mapped to the VM.
 */
static void stage2_flush_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx, bkt;

	idx = srcu_read_lock(&kvm->srcu);
	write_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, bkt, slots)
		stage2_flush_memslot(kvm, memslot);

	kvm_nested_s2_flush(kvm);

	write_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

/**
 * free_hyp_pgds - free Hyp-mode page tables
 */
void __init free_hyp_pgds(void)
{
	mutex_lock(&kvm_hyp_pgd_mutex);
	if (hyp_pgtable) {
		kvm_pgtable_hyp_destroy(hyp_pgtable);
		kfree(hyp_pgtable);
		hyp_pgtable = NULL;
	}
	mutex_unlock(&kvm_hyp_pgd_mutex);
}

static bool kvm_host_owns_hyp_mappings(void)
{
	if (is_kernel_in_hyp_mode())
		return false;

	if (static_branch_likely(&kvm_protected_mode_initialized))
		return false;

	/*
	 * This can happen at boot time when __create_hyp_mappings() is called
	 * after the hyp protection has been enabled, but the static key has
	 * not been flipped yet.
	 */
	if (!hyp_pgtable && is_protected_kvm_enabled())
		return false;

	WARN_ON(!hyp_pgtable);

	return true;
}

int __create_hyp_mappings(unsigned long start, unsigned long size,
			  unsigned long phys, enum kvm_pgtable_prot prot)
{
	int err;

	if (WARN_ON(!kvm_host_owns_hyp_mappings()))
		return -EINVAL;

	mutex_lock(&kvm_hyp_pgd_mutex);
	err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
	mutex_unlock(&kvm_hyp_pgd_mutex);

	return err;
}

static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
{
	if (!is_vmalloc_addr(kaddr)) {
		BUG_ON(!virt_addr_valid(kaddr));
		return __pa(kaddr);
	} else {
		return page_to_phys(vmalloc_to_page(kaddr)) +
		       offset_in_page(kaddr);
	}
}

struct hyp_shared_pfn {
	u64 pfn;
	int count;
	struct rb_node node;
};

static DEFINE_MUTEX(hyp_shared_pfns_lock);
static struct rb_root hyp_shared_pfns = RB_ROOT;

static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node,
					      struct rb_node **parent)
{
	struct hyp_shared_pfn *this;

	*node = &hyp_shared_pfns.rb_node;
	*parent = NULL;
	while (**node) {
		this = container_of(**node, struct hyp_shared_pfn, node);
		*parent = **node;
		if (this->pfn < pfn)
			*node = &((**node)->rb_left);
		else if (this->pfn > pfn)
			*node = &((**node)->rb_right);
		else
			return this;
	}

	return NULL;
}

static int share_pfn_hyp(u64 pfn)
{
	struct rb_node **node, *parent;
	struct hyp_shared_pfn *this;
	int ret = 0;

	mutex_lock(&hyp_shared_pfns_lock);
	this = find_shared_pfn(pfn, &node, &parent);
	if (this) {
		this->count++;
		goto unlock;
	}

	this = kzalloc(sizeof(*this), GFP_KERNEL);
	if (!this) {
		ret = -ENOMEM;
		goto unlock;
	}

	this->pfn = pfn;
	this->count = 1;
	rb_link_node(&this->node, parent, node);
	rb_insert_color(&this->node, &hyp_shared_pfns);
	ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn);
unlock:
	mutex_unlock(&hyp_shared_pfns_lock);

	return ret;
}
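
/*
 * Note on the tracking above: shared pfns are refcounted in an rb-tree keyed
 * by pfn, so the __pkvm_host_share_hyp/__pkvm_host_unshare_hyp hypercalls are
 * only issued on the 0->1 and 1->0 transitions. Sharing a two-page object
 * twice, for instance, results in two hypercalls on the first kvm_share_hyp()
 * call and none on the second.
 */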
static int unshare_pfn_hyp(u64 pfn)
{
	struct rb_node **node, *parent;
	struct hyp_shared_pfn *this;
	int ret = 0;

	mutex_lock(&hyp_shared_pfns_lock);
	this = find_shared_pfn(pfn, &node, &parent);
	if (WARN_ON(!this)) {
		ret = -ENOENT;
		goto unlock;
	}

	this->count--;
	if (this->count)
		goto unlock;

	rb_erase(&this->node, &hyp_shared_pfns);
	kfree(this);
	ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn);
unlock:
	mutex_unlock(&hyp_shared_pfns_lock);

	return ret;
}

int kvm_share_hyp(void *from, void *to)
{
	phys_addr_t start, end, cur;
	u64 pfn;
	int ret;

	if (is_kernel_in_hyp_mode())
		return 0;

	/*
	 * The share hcall maps things in the 'fixed-offset' region of the hyp
	 * VA space, so we can only share physically contiguous data-structures
	 * for now.
	 */
	if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to))
		return -EINVAL;

	if (kvm_host_owns_hyp_mappings())
		return create_hyp_mappings(from, to, PAGE_HYP);

	start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
	end = PAGE_ALIGN(__pa(to));
	for (cur = start; cur < end; cur += PAGE_SIZE) {
		pfn = __phys_to_pfn(cur);
		ret = share_pfn_hyp(pfn);
		if (ret)
			return ret;
	}

	return 0;
}

void kvm_unshare_hyp(void *from, void *to)
{
	phys_addr_t start, end, cur;
	u64 pfn;

	if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from)
		return;

	start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
	end = PAGE_ALIGN(__pa(to));
	for (cur = start; cur < end; cur += PAGE_SIZE) {
		pfn = __phys_to_pfn(cur);
		WARN_ON(unshare_pfn_hyp(pfn));
	}
}

/**
 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 * @from:	The virtual kernel start address of the range
 * @to:		The virtual kernel end address of the range (exclusive)
 * @prot:	The protection to be applied to this range
 *
 * The same virtual address as the kernel virtual address is also used
 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 * physical pages.
 */
int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
{
	phys_addr_t phys_addr;
	unsigned long virt_addr;
	unsigned long start = kern_hyp_va((unsigned long)from);
	unsigned long end = kern_hyp_va((unsigned long)to);

	if (is_kernel_in_hyp_mode())
		return 0;

	if (!kvm_host_owns_hyp_mappings())
		return -EPERM;

	start = start & PAGE_MASK;
	end = PAGE_ALIGN(end);

	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
		int err;

		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
		err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
					    prot);
		if (err)
			return err;
	}

	return 0;
}

static int __hyp_alloc_private_va_range(unsigned long base)
{
	lockdep_assert_held(&kvm_hyp_pgd_mutex);

	if (!PAGE_ALIGNED(base))
		return -EINVAL;

	/*
	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
	 * allocating the new area, as it would indicate we've
	 * overflowed the idmap/IO address range.
	 */
	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
		return -ENOMEM;

	io_map_base = base;

	return 0;
}

/**
 * hyp_alloc_private_va_range - Allocates a private VA range.
 * @size:	The size of the VA range to reserve.
 * @haddr:	The hypervisor virtual start address of the allocation.
 *
 * The private virtual address (VA) range is allocated below io_map_base
 * and aligned based on the order of @size.
 *
 * Return: 0 on success or negative error code on failure.
 */
int hyp_alloc_private_va_range(size_t size, unsigned long *haddr)
{
	unsigned long base;
	int ret = 0;

	mutex_lock(&kvm_hyp_pgd_mutex);

	/*
	 * This assumes that we have enough space below the idmap
	 * page to allocate our VAs. If not, the check in
	 * __hyp_alloc_private_va_range() will kick in. A potential
	 * alternative would be to detect that overflow and switch
	 * to an allocation above the idmap.
	 *
	 * The allocated size is always a multiple of PAGE_SIZE.
	 */
	size = PAGE_ALIGN(size);
	base = io_map_base - size;
	ret = __hyp_alloc_private_va_range(base);

	mutex_unlock(&kvm_hyp_pgd_mutex);

	if (!ret)
		*haddr = base;

	return ret;
}

static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
					unsigned long *haddr,
					enum kvm_pgtable_prot prot)
{
	unsigned long addr;
	int ret = 0;

	if (!kvm_host_owns_hyp_mappings()) {
		addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
					 phys_addr, size, prot);
		if (IS_ERR_VALUE(addr))
			return addr;
		*haddr = addr;

		return 0;
	}

	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
	ret = hyp_alloc_private_va_range(size, &addr);
	if (ret)
		return ret;

	ret = __create_hyp_mappings(addr, size, phys_addr, prot);
	if (ret)
		return ret;

	*haddr = addr + offset_in_page(phys_addr);
	return ret;
}
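
/*
 * Example of the offset handling above (assuming 4K pages): mapping 0x1000
 * bytes of a device starting at phys 0xabcde800 rounds the allocation up to
 * two pages (PAGE_ALIGN(0x1000 + 0x800)), maps them at a page-aligned private
 * VA, and returns that VA plus the 0x800 intra-page offset in *haddr.
 */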
int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr)
{
	unsigned long base;
	size_t size;
	int ret;

	mutex_lock(&kvm_hyp_pgd_mutex);
	/*
	 * Efficient stack verification using the NVHE_STACK_SHIFT bit implies
	 * an alignment of our allocation on the order of the size.
	 */
	size = NVHE_STACK_SIZE * 2;
	base = ALIGN_DOWN(io_map_base - size, size);

	ret = __hyp_alloc_private_va_range(base);

	mutex_unlock(&kvm_hyp_pgd_mutex);

	if (ret) {
		kvm_err("Cannot allocate hyp stack guard page\n");
		return ret;
	}

	/*
	 * Since the stack grows downwards, map the stack to the page
	 * at the higher address and leave the lower guard page
	 * unbacked.
	 *
	 * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1
	 * and addresses corresponding to the guard page have the
	 * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection.
	 */
	ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE,
				    phys_addr, PAGE_HYP);
	if (ret)
		kvm_err("Cannot map hyp stack\n");

	*haddr = base + size;

	return ret;
}
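
/*
 * Resulting layout of the allocation above (the stack grows downwards):
 *
 *   base + 2 * NVHE_STACK_SIZE	<- *haddr (initial stack pointer)
 *   base + NVHE_STACK_SIZE	<- backed stack page(s)
 *   base			<- unbacked guard page, so an overflow
 *				   faults instead of corrupting memory
 */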
/**
 * create_hyp_io_mappings - Map IO into both kernel and HYP
 * @phys_addr:	The physical start address which gets mapped
 * @size:	Size of the region being mapped
 * @kaddr:	Kernel VA for this mapping
 * @haddr:	HYP VA for this mapping
 */
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
			   void __iomem **kaddr,
			   void __iomem **haddr)
{
	unsigned long addr;
	int ret;

	if (is_protected_kvm_enabled())
		return -EPERM;

	*kaddr = ioremap(phys_addr, size);
	if (!*kaddr)
		return -ENOMEM;

	if (is_kernel_in_hyp_mode()) {
		*haddr = *kaddr;
		return 0;
	}

	ret = __create_hyp_private_mapping(phys_addr, size,
					   &addr, PAGE_HYP_DEVICE);
	if (ret) {
		iounmap(*kaddr);
		*kaddr = NULL;
		*haddr = NULL;
		return ret;
	}

	*haddr = (void __iomem *)addr;
	return 0;
}

/**
 * create_hyp_exec_mappings - Map an executable range into HYP
 * @phys_addr:	The physical start address which gets mapped
 * @size:	Size of the region being mapped
 * @haddr:	HYP VA for this mapping
 */
int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
			     void **haddr)
{
	unsigned long addr;
	int ret;

	BUG_ON(is_kernel_in_hyp_mode());

	ret = __create_hyp_private_mapping(phys_addr, size,
					   &addr, PAGE_HYP_EXEC);
	if (ret) {
		*haddr = NULL;
		return ret;
	}

	*haddr = (void *)addr;
	return 0;
}

static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
	/* We shouldn't need any other callback to walk the PT */
	.phys_to_virt		= kvm_host_va,
};

static int get_user_mapping_size(struct kvm *kvm, u64 addr)
{
	struct kvm_pgtable pgt = {
		.pgd		= (kvm_pteref_t)kvm->mm->pgd,
		.ia_bits	= vabits_actual,
		.start_level	= (KVM_PGTABLE_LAST_LEVEL -
				   ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1),
		.mm_ops		= &kvm_user_mm_ops,
	};
	unsigned long flags;
	kvm_pte_t pte = 0;	/* Keep GCC quiet... */
	s8 level = S8_MAX;
	int ret;

	/*
	 * Disable IRQs so that we hazard against a concurrent
	 * teardown of the userspace page tables (which relies on
	 * IPI-ing threads).
	 */
	local_irq_save(flags);
	ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
	local_irq_restore(flags);

	if (ret)
		return ret;

	/*
	 * Not seeing an error, but not updating level? Something went
	 * deeply wrong...
	 */
	if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL))
		return -EFAULT;
	if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL))
		return -EFAULT;

	/* Oops, the userspace PTs are gone... Replay the fault */
	if (!kvm_pte_valid(pte))
		return -EAGAIN;

	return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
}

static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
	.zalloc_page		= stage2_memcache_zalloc_page,
	.zalloc_pages_exact	= kvm_s2_zalloc_pages_exact,
	.free_pages_exact	= kvm_s2_free_pages_exact,
	.free_unlinked_table	= stage2_free_unlinked_table,
	.get_page		= kvm_host_get_page,
	.put_page		= kvm_s2_put_page,
	.page_count		= kvm_host_page_count,
	.phys_to_virt		= kvm_host_va,
	.virt_to_phys		= kvm_host_pa,
	.dcache_clean_inval_poc	= clean_dcache_guest_page,
	.icache_inval_pou	= invalidate_icache_guest_page,
};

static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
{
	u32 kvm_ipa_limit = get_kvm_ipa_limit();
	u64 mmfr0, mmfr1;
	u32 phys_shift;

	if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
		return -EINVAL;

	phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
	if (is_protected_kvm_enabled()) {
		phys_shift = kvm_ipa_limit;
	} else if (phys_shift) {
		if (phys_shift > kvm_ipa_limit ||
		    phys_shift < ARM64_MIN_PARANGE_BITS)
			return -EINVAL;
	} else {
		phys_shift = KVM_PHYS_SHIFT;
		if (phys_shift > kvm_ipa_limit) {
			pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n",
				     current->comm);
			return -EINVAL;
		}
	}

	mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
	mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
	mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);

	return 0;
}

/*
 * Assume that @pgt is valid and unlinked from the KVM MMU to free the
 * page-table without taking the kvm_mmu_lock and without performing any
 * TLB invalidations.
 *
 * Also, the range of addresses can be large enough to cause need_resched
 * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke
 * cond_resched() periodically to prevent hogging the CPU for a long time
 * and schedule something else, if required.
 */
static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr,
				 phys_addr_t end)
{
	u64 next;

	do {
		next = stage2_range_addr_end(addr, end);
		KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr,
							     next - addr);
		if (next != end)
			cond_resched();
	} while (addr = next, addr != end);
}

static void kvm_stage2_destroy(struct kvm_pgtable *pgt)
{
	unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr);

	stage2_destroy_range(pgt, 0, BIT(ia_bits));
	KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt);
}

/**
 * kvm_init_stage2_mmu - Initialise a S2 MMU structure
 * @kvm:	The pointer to the KVM structure
 * @mmu:	The pointer to the s2 MMU structure
 * @type:	The machine type of the virtual machine
 *
 * Allocates only the stage-2 HW PGD level table(s).
 * Note we don't need locking here as this is only called in two cases:
 *
 * - when the VM is created, which can't race against anything
 *
 * - when secondary kvm_s2_mmu structures are initialised for NV
 *   guests, and the caller must hold kvm->lock as this is called on a
 *   per-vcpu basis.
 */
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
{
	int cpu, err;
	struct kvm_pgtable *pgt;

	/*
	 * If we already have our page tables in place, and the MMU context
	 * is the canonical one, we have a bug somewhere, as this is only
	 * supposed to ever happen once per VM.
	 *
	 * Otherwise, we're building nested page tables, and that's
	 * probably because userspace called KVM_ARM_VCPU_INIT more
	 * than once on the same vcpu. Since that's actually legal,
	 * don't kick up a fuss and leave gracefully.
	 */
	if (mmu->pgt != NULL) {
		if (kvm_is_nested_s2_mmu(kvm, mmu))
			return 0;

		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

	err = kvm_init_ipa_range(mmu, type);
	if (err)
		return err;

	pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
	if (!pgt)
		return -ENOMEM;

	mmu->arch = &kvm->arch;
	err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops);
	if (err)
		goto out_free_pgtable;

	mmu->pgt = pgt;
	if (is_protected_kvm_enabled())
		return 0;

	mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
	if (!mmu->last_vcpu_ran) {
		err = -ENOMEM;
		goto out_destroy_pgtable;
	}

	for_each_possible_cpu(cpu)
		*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;

	/* The eager page splitting is disabled by default */
	mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
	mmu->split_page_cache.gfp_zero = __GFP_ZERO;

	mmu->pgd_phys = __pa(pgt->pgd);

	if (kvm_is_nested_s2_mmu(kvm, mmu))
		kvm_init_nested_s2_mmu(mmu);

	return 0;

out_destroy_pgtable:
	kvm_stage2_destroy(pgt);
out_free_pgtable:
	kfree(pgt);
	return err;
}

void kvm_uninit_stage2_mmu(struct kvm *kvm)
{
	kvm_free_stage2_pgd(&kvm->arch.mmu);
	kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
}

static void stage2_unmap_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	hva_t hva = memslot->userspace_addr;
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = PAGE_SIZE * memslot->npages;
	hva_t reg_end = hva + size;

	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we should
	 * unmap any of them.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma;
		hva_t vm_start, vm_end;

		vma = find_vma_intersection(current->mm, hva, reg_end);
		if (!vma)
			break;

		/*
		 * Take the intersection of this VMA with the memory region
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (!(vma->vm_flags & VM_PFNMAP)) {
			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
			kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true);
		}
		hva = vm_end;
	} while (hva < reg_end);
}

/**
 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 * @kvm: The struct kvm pointer
 *
 * Go through the memregions and unmap any regular RAM
 * backing memory already mapped to the VM.
 */
void stage2_unmap_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx, bkt;

	idx = srcu_read_lock(&kvm->srcu);
	mmap_read_lock(current->mm);
	write_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, bkt, slots)
		stage2_unmap_memslot(kvm, memslot);

	kvm_nested_s2_unmap(kvm, true);

	write_unlock(&kvm->mmu_lock);
	mmap_read_unlock(current->mm);
	srcu_read_unlock(&kvm->srcu, idx);
}

void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	struct kvm_pgtable *pgt = NULL;

	write_lock(&kvm->mmu_lock);
	pgt = mmu->pgt;
	if (pgt) {
		mmu->pgd_phys = 0;
		mmu->pgt = NULL;
		free_percpu(mmu->last_vcpu_ran);
	}

	if (kvm_is_nested_s2_mmu(kvm, mmu))
		kvm_init_nested_s2_mmu(mmu);

	write_unlock(&kvm->mmu_lock);

	if (pgt) {
		kvm_stage2_destroy(pgt);
		kfree(pgt);
	}
}

static void hyp_mc_free_fn(void *addr, void *mc)
{
	struct kvm_hyp_memcache *memcache = mc;

	if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
		kvm_account_pgtable_pages(addr, -1);

	free_page((unsigned long)addr);
}

static void *hyp_mc_alloc_fn(void *mc)
{
	struct kvm_hyp_memcache *memcache = mc;
	void *addr;

	addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
	if (addr && memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
		kvm_account_pgtable_pages(addr, 1);

	return addr;
}

void free_hyp_memcache(struct kvm_hyp_memcache *mc)
{
	if (!is_protected_kvm_enabled())
		return;

	kfree(mc->mapping);
	__free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc);
}

int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
{
	if (!is_protected_kvm_enabled())
		return 0;

	if (!mc->mapping) {
		mc->mapping = kzalloc(sizeof(struct pkvm_mapping), GFP_KERNEL_ACCOUNT);
		if (!mc->mapping)
			return -ENOMEM;
	}

	return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn,
				    kvm_host_pa, mc);
}

/**
 * kvm_phys_addr_ioremap - map a device range to guest IPA
 *
 * @kvm:	The KVM pointer
 * @guest_ipa:	The IPA at which to insert the mapping
 * @pa:		The physical address of the device
 * @size:	The size of the mapping
 * @writable:	Whether or not to create a writable mapping
 */
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
			  phys_addr_t pa, unsigned long size, bool writable)
{
	phys_addr_t addr;
	int ret = 0;
	struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
	struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
	struct kvm_pgtable *pgt = mmu->pgt;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
				     KVM_PGTABLE_PROT_R |
				     (writable ? KVM_PGTABLE_PROT_W : 0);

	if (is_protected_kvm_enabled())
		return -EPERM;

	size += offset_in_page(guest_ipa);
	guest_ipa &= PAGE_MASK;

	for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
		ret = kvm_mmu_topup_memory_cache(&cache,
						 kvm_mmu_cache_min_pages(mmu));
		if (ret)
			break;

		write_lock(&kvm->mmu_lock);
		ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE,
							 pa, prot, &cache, 0);
		write_unlock(&kvm->mmu_lock);
		if (ret)
			break;

		pa += PAGE_SIZE;
	}

	kvm_mmu_free_memory_cache(&cache);
	return ret;
}
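
/*
 * Illustration of the loop above (assuming 4K pages): a 16KiB device region
 * at an IPA with a 0x800 intra-page offset is first expanded to cover the
 * offset (size becomes 0x4800) and then mapped one page at a time, i.e. five
 * stage-2 PTEs, topping up the memcache outside the mmu_lock before each
 * iteration.
 */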
/**
 * kvm_stage2_wp_range() - write protect stage2 memory region range
 * @mmu:	The KVM stage-2 MMU pointer
 * @addr:	Start address of range
 * @end:	End address of range
 */
void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
	stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect));
}

/**
 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
 * @kvm:	The KVM pointer
 * @slot:	The memory slot to write protect
 *
 * Called to start logging dirty pages after memory region
 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
 * all present PUD, PMD and PTEs are write protected in the memory region.
 * Afterwards, the dirty page log can be read.
 *
 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
 * serializing operations for VM memory regions.
 */
static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
	phys_addr_t start, end;

	if (WARN_ON_ONCE(!memslot))
		return;

	start = memslot->base_gfn << PAGE_SHIFT;
	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

	write_lock(&kvm->mmu_lock);
	kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
	kvm_nested_s2_wp(kvm);
	write_unlock(&kvm->mmu_lock);
	kvm_flush_remote_tlbs_memslot(kvm, memslot);
}

/**
 * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE
 *				   pages for memory slot
 * @kvm:	The KVM pointer
 * @slot:	The memory slot to split
 *
 * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired,
 * serializing operations for VM memory regions.
 */
static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	phys_addr_t start, end;

	lockdep_assert_held(&kvm->slots_lock);

	slots = kvm_memslots(kvm);
	memslot = id_to_memslot(slots, slot);

	start = memslot->base_gfn << PAGE_SHIFT;
	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

	write_lock(&kvm->mmu_lock);
	kvm_mmu_split_huge_pages(kvm, start, end);
	write_unlock(&kvm->mmu_lock);
}

/*
 * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages.
 * @kvm:	The KVM pointer
 * @slot:	The memory slot associated with mask
 * @gfn_offset:	The gfn offset in memory slot
 * @mask:	The mask of pages at offset 'gfn_offset' in this memory
 *		slot to enable dirty logging on
 *
 * Write-protects selected pages to enable dirty logging, and then
 * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
					     struct kvm_memory_slot *slot,
					     gfn_t gfn_offset, unsigned long mask)
{
	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
	phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;

	lockdep_assert_held_write(&kvm->mmu_lock);

	kvm_stage2_wp_range(&kvm->arch.mmu, start, end);

	/*
	 * Eager-splitting is done when manual-protect is set. We
	 * also check for initially-all-set because we can avoid
	 * eager-splitting if initially-all-set is false.
	 * Initially-all-set equal false implies that huge-pages were
	 * already split when enabling dirty logging: no need to do it
	 * again.
	 */
	if (kvm_dirty_log_manual_protect_and_init_set(kvm))
		kvm_mmu_split_huge_pages(kvm, start, end);

	kvm_nested_s2_wp(kvm);
}
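
/*
 * Example of the range computed above: with gfn_offset = 0 and mask = 0x00f0,
 * __ffs(mask) = 4 and __fls(mask) = 7, so the write protection (and the
 * optional eager split) covers the four pages at gfns
 * [slot->base_gfn + 4, slot->base_gfn + 8).
 */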
static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
{
	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
}

static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
					       unsigned long hva,
					       unsigned long map_size)
{
	gpa_t gpa_start;
	hva_t uaddr_start, uaddr_end;
	size_t size;

	/* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
	if (map_size == PAGE_SIZE)
		return true;

	/* pKVM only supports PMD_SIZE huge-mappings */
	if (is_protected_kvm_enabled() && map_size != PMD_SIZE)
		return false;

	size = memslot->npages * PAGE_SIZE;

	gpa_start = memslot->base_gfn << PAGE_SHIFT;

	uaddr_start = memslot->userspace_addr;
	uaddr_end = uaddr_start + size;

	/*
	 * Pages belonging to memslots that don't have the same alignment
	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
	 *
	 * Consider a layout like the following:
	 *
	 *    memslot->userspace_addr:
	 *    +-----+--------------------+--------------------+---+
	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
	 *    +-----+--------------------+--------------------+---+
	 *
	 *    memslot->base_gfn << PAGE_SHIFT:
	 *      +---+--------------------+--------------------+-----+
	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
	 *      +---+--------------------+--------------------+-----+
	 *
	 * If we create those stage-2 blocks, we'll end up with this incorrect
	 * mapping:
	 *   d -> f
	 *   e -> g
	 *   f -> h
	 */
	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
		return false;

	/*
	 * Next, let's make sure we're not trying to map anything not covered
	 * by the memslot. This means we have to prohibit block size mappings
	 * for the beginning and end of a non-block aligned and non-block sized
	 * memory slot (illustrated by the head and tail parts of the
	 * userspace view above containing pages 'abcde' and 'xyz',
	 * respectively).
	 *
	 * Note that it doesn't matter if we do the check using the
	 * userspace_addr or the base_gfn, as both are equally aligned (per
	 * the check above) and equally sized.
	 */
	return (hva & ~(map_size - 1)) >= uaddr_start &&
	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
}
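
/*
 * Concrete instance of the alignment check above (4K pages, PMD_SIZE = 2MiB):
 * a slot with userspace_addr = 0x7f1240001000 and
 * base_gfn << PAGE_SHIFT = 0x80200000 sits at offsets 0x1000 and 0x0 within a
 * 2MiB block respectively, so PMD-sized stage-2 mappings are refused and the
 * slot falls back to page mappings.
 */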

/*
 * Check if the given hva is backed by a transparent huge page (THP) and
 * whether it can be mapped using block mapping in stage2. If so, adjust
 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
 * supported. This will need to be updated to support other THP sizes.
 *
 * Returns the size of the mapping.
 */
static long
transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
			    unsigned long hva, kvm_pfn_t *pfnp,
			    phys_addr_t *ipap)
{
	kvm_pfn_t pfn = *pfnp;

	/*
	 * Make sure the adjustment is done only for THP pages. Also make
	 * sure that the HVA and IPA are sufficiently aligned and that the
	 * block map is contained within the memslot.
	 */
	if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
		int sz = get_user_mapping_size(kvm, hva);

		if (sz < 0)
			return sz;

		if (sz < PMD_SIZE)
			return PAGE_SIZE;

		*ipap &= PMD_MASK;
		pfn &= ~(PTRS_PER_PMD - 1);
		*pfnp = pfn;

		return PMD_SIZE;
	}

	/* Use page mapping if we cannot use block mapping. */
	return PAGE_SIZE;
}

static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
{
	unsigned long pa;

	if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
		return huge_page_shift(hstate_vma(vma));

	if (!(vma->vm_flags & VM_PFNMAP))
		return PAGE_SHIFT;

	VM_BUG_ON(is_vm_hugetlb_page(vma));

	pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);

#ifndef __PAGETABLE_PMD_FOLDED
	if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
	    ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
	    ALIGN(hva, PUD_SIZE) <= vma->vm_end)
		return PUD_SHIFT;
#endif

	if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
	    ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
	    ALIGN(hva, PMD_SIZE) <= vma->vm_end)
		return PMD_SHIFT;

	return PAGE_SHIFT;
}

/*
 * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be
 * able to see the page's tags and therefore they must be initialised first. If
 * PG_mte_tagged is set, tags have already been initialised.
 *
 * Must be called with kvm->mmu_lock held to ensure the memory remains mapped
 * while the tags are zeroed.
 */
static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
			      unsigned long size)
{
	unsigned long i, nr_pages = size >> PAGE_SHIFT;
	struct page *page = pfn_to_page(pfn);
	struct folio *folio = page_folio(page);

	if (!kvm_has_mte(kvm))
		return;

	if (folio_test_hugetlb(folio)) {
		/* Hugetlb has MTE flags set on head page only */
		if (folio_try_hugetlb_mte_tagging(folio)) {
			for (i = 0; i < nr_pages; i++, page++)
				mte_clear_page_tags(page_address(page));
			folio_set_hugetlb_mte_tagged(folio);
		}
		return;
	}

	for (i = 0; i < nr_pages; i++, page++) {
		if (try_page_mte_tagging(page)) {
			mte_clear_page_tags(page_address(page));
			set_page_mte_tagged(page);
		}
	}
}

static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
{
	return vma->vm_flags & VM_MTE_ALLOWED;
}

static bool kvm_vma_is_cacheable(struct vm_area_struct *vma)
{
	switch (FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(vma->vm_page_prot))) {
	case MT_NORMAL_NC:
	case MT_DEVICE_nGnRnE:
	case MT_DEVICE_nGnRE:
		return false;
	default:
		return true;
	}
}
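
/*
 * The default case above covers the cacheable memory types (MT_NORMAL and
 * MT_NORMAL_TAGGED); only Normal-NC and the Device attributes are reported
 * as non-cacheable to the fault handler below.
 */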

static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache,
				void **memcache)
{
	int min_pages;

	if (!is_protected_kvm_enabled())
		*memcache = &vcpu->arch.mmu_page_cache;
	else
		*memcache = &vcpu->arch.pkvm_memcache;

	if (!topup_memcache)
		return 0;

	min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);

	if (!is_protected_kvm_enabled())
		return kvm_mmu_topup_memory_cache(*memcache, min_pages);

	return topup_hyp_memcache(*memcache, min_pages);
}

/*
 * Potentially reduce shadow S2 permissions to match the guest's own S2. For
 * exec faults, we'd only reach this point if the guest actually allowed it (see
 * kvm_s2_handle_perm_fault).
 *
 * Also encode the level of the original translation in the SW bits of the leaf
 * entry as a proxy for the span of that translation. This will be retrieved on
 * TLB invalidation from the guest and used to limit the invalidation scope if a
 * TTL hint or a range isn't provided.
 */
static void adjust_nested_fault_perms(struct kvm_s2_trans *nested,
				      enum kvm_pgtable_prot *prot,
				      bool *writable)
{
	*writable &= kvm_s2_trans_writable(nested);
	if (!kvm_s2_trans_readable(nested))
		*prot &= ~KVM_PGTABLE_PROT_R;

	*prot |= kvm_encode_nested_level(nested);
}

static void adjust_nested_exec_perms(struct kvm *kvm,
				     struct kvm_s2_trans *nested,
				     enum kvm_pgtable_prot *prot)
{
	if (!kvm_s2_trans_exec_el0(kvm, nested))
		*prot &= ~KVM_PGTABLE_PROT_UX;
	if (!kvm_s2_trans_exec_el1(kvm, nested))
		*prot &= ~KVM_PGTABLE_PROT_PX;
}

static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
		      struct kvm_s2_trans *nested,
		      struct kvm_memory_slot *memslot, bool is_perm)
{
	bool write_fault, exec_fault, writable;
	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
	struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
	unsigned long mmu_seq;
	struct page *page;
	struct kvm *kvm = vcpu->kvm;
	void *memcache;
	kvm_pfn_t pfn;
	gfn_t gfn;
	int ret;

	ret = prepare_mmu_memcache(vcpu, true, &memcache);
	if (ret)
		return ret;

	if (nested)
		gfn = kvm_s2_trans_output(nested) >> PAGE_SHIFT;
	else
		gfn = fault_ipa >> PAGE_SHIFT;

	write_fault = kvm_is_write_fault(vcpu);
	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);

	VM_WARN_ON_ONCE(write_fault && exec_fault);

	mmu_seq = kvm->mmu_invalidate_seq;
	/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
	smp_rmb();

	ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL);
	if (ret) {
		kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE,
					      write_fault, exec_fault, false);
		return ret;
	}

	writable = !(memslot->flags & KVM_MEM_READONLY);

	if (nested)
		adjust_nested_fault_perms(nested, &prot, &writable);

	if (writable)
		prot |= KVM_PGTABLE_PROT_W;

	if (exec_fault || cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
		prot |= KVM_PGTABLE_PROT_X;

	if (nested)
		adjust_nested_exec_perms(kvm, nested, &prot);

	kvm_fault_lock(kvm);
	if (mmu_invalidate_retry(kvm, mmu_seq)) {
		ret = -EAGAIN;
		goto out_unlock;
	}

	ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, PAGE_SIZE,
						 __pfn_to_phys(pfn), prot,
						 memcache, flags);

out_unlock:
	kvm_release_faultin_page(kvm, page, !!ret, writable);
	kvm_fault_unlock(kvm);

	if (writable && !ret)
		mark_page_dirty_in_slot(kvm, memslot, gfn);

	return ret != -EAGAIN ? ret : 0;
}
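
/*
 * Unlike user_mem_abort() below, guest_memfd-backed faults are resolved via
 * kvm_gmem_get_pfn() rather than a userspace VMA, and are always mapped a
 * single page at a time.
 */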

static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			  struct kvm_s2_trans *nested,
			  struct kvm_memory_slot *memslot, unsigned long hva,
			  bool fault_is_perm)
{
	int ret = 0;
	bool topup_memcache;
	bool write_fault, writable;
	bool exec_fault, mte_allowed, is_vma_cacheable;
	bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
	unsigned long mmu_seq;
	phys_addr_t ipa = fault_ipa;
	struct kvm *kvm = vcpu->kvm;
	struct vm_area_struct *vma;
	short vma_shift;
	void *memcache;
	gfn_t gfn;
	kvm_pfn_t pfn;
	bool logging_active = memslot_is_logging(memslot);
	bool force_pte = logging_active;
	long vma_pagesize, fault_granule;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
	struct kvm_pgtable *pgt;
	struct page *page;
	vm_flags_t vm_flags;
	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;

	if (fault_is_perm)
		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
	write_fault = kvm_is_write_fault(vcpu);
	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
	VM_WARN_ON_ONCE(write_fault && exec_fault);

	/*
	 * Permission faults just need to update the existing leaf entry,
	 * and so normally don't require allocations from the memcache. The
	 * only exception to this is when dirty logging is enabled at runtime
	 * and a write fault needs to collapse a block entry into a table.
	 */
	topup_memcache = !fault_is_perm || (logging_active && write_fault);
	ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache);
	if (ret)
		return ret;

	/*
	 * Let's check if we will get back a huge page backed by hugetlbfs, or
	 * get block mapping for device MMIO region.
	 */
	mmap_read_lock(current->mm);
	vma = vma_lookup(current->mm, hva);
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
		mmap_read_unlock(current->mm);
		return -EFAULT;
	}

	if (force_pte)
		vma_shift = PAGE_SHIFT;
	else
		vma_shift = get_vma_page_shift(vma, hva);

	switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
	case PUD_SHIFT:
		if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
			break;
		fallthrough;
#endif
	case CONT_PMD_SHIFT:
		vma_shift = PMD_SHIFT;
		fallthrough;
	case PMD_SHIFT:
		if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
			break;
		fallthrough;
	case CONT_PTE_SHIFT:
		vma_shift = PAGE_SHIFT;
		force_pte = true;
		fallthrough;
	case PAGE_SHIFT:
		break;
	default:
		WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
	}

	vma_pagesize = 1UL << vma_shift;

	if (nested) {
		unsigned long max_map_size;

		max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE;

		ipa = kvm_s2_trans_output(nested);

		/*
		 * If we're about to create a shadow stage 2 entry, then we
		 * can only create a block mapping if the guest stage 2 page
		 * table uses at least as big a mapping.
		 */
		max_map_size = min(kvm_s2_trans_size(nested), max_map_size);

		/*
		 * Be careful: if the mapping size falls between
		 * two host sizes, take the smaller of the two.
		 */
		if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE)
			max_map_size = PMD_SIZE;
		else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE)
			max_map_size = PAGE_SIZE;

		force_pte = (max_map_size == PAGE_SIZE);
		vma_pagesize = min_t(long, vma_pagesize, max_map_size);
	}

	/*
	 * Both the canonical IPA and fault IPA must be hugepage-aligned to
	 * ensure we find the right PFN and lay down the mapping in the right
	 * place.
	 */
	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) {
		fault_ipa &= ~(vma_pagesize - 1);
		ipa &= ~(vma_pagesize - 1);
	}

	gfn = ipa >> PAGE_SHIFT;
	mte_allowed = kvm_vma_mte_allowed(vma);

	vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;

	vm_flags = vma->vm_flags;

	is_vma_cacheable = kvm_vma_is_cacheable(vma);

	/* Don't use the VMA after the unlock -- it may have vanished */
	vma = NULL;

	/*
	 * Read mmu_invalidate_seq so that KVM can detect if the results of
	 * vma_lookup() or __kvm_faultin_pfn() become stale prior to
	 * acquiring kvm->mmu_lock.
	 *
	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
	 * with the smp_wmb() in kvm_mmu_invalidate_end().
	 */
	mmu_seq = kvm->mmu_invalidate_seq;
	mmap_read_unlock(current->mm);

	pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
				&writable, &page);
	if (pfn == KVM_PFN_ERR_HWPOISON) {
		kvm_send_hwpoison_signal(hva, vma_shift);
		return 0;
	}
	if (is_error_noslot_pfn(pfn))
		return -EFAULT;

	/*
	 * Check if this is non-struct page memory PFN, and cannot support
	 * CMOs. It could potentially be unsafe to access as cacheable.
	 */
	if (vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(pfn)) {
		if (is_vma_cacheable) {
			/*
			 * Whilst the VMA owner expects a cacheable mapping
			 * to this PFN, hardware also has to support the FWB
			 * and CACHE DIC features.
			 *
			 * ARM64 KVM relies on kernel VA mapping to the PFN to
			 * perform cache maintenance as the CMO instructions
			 * work on virtual addresses. VM_PFNMAP regions are
			 * not necessarily mapped to a KVA and hence the
			 * presence of hardware features S2FWB and CACHE DIC
			 * are mandatory to avoid the need for cache
			 * maintenance.
			 */
			if (!kvm_supports_cacheable_pfnmap())
				ret = -EFAULT;
		} else {
			/*
			 * If the page was identified as device early by looking at
			 * the VMA flags, vma_pagesize is already representing the
			 * largest quantity we can map. If instead it was mapped
			 * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
			 * and must not be upgraded.
			 *
			 * In both cases, we don't let transparent_hugepage_adjust()
			 * change things at the last minute.
			 */
			s2_force_noncacheable = true;
		}
	} else if (logging_active && !write_fault) {
		/*
		 * Only actually map the page as writable if this was a write
		 * fault.
		 */
		writable = false;
	}

	if (exec_fault && s2_force_noncacheable)
		ret = -ENOEXEC;

	if (ret) {
		kvm_release_page_unused(page);
		return ret;
	}

	if (nested)
		adjust_nested_fault_perms(nested, &prot, &writable);

	kvm_fault_lock(kvm);
	pgt = vcpu->arch.hw_mmu->pgt;
	if (mmu_invalidate_retry(kvm, mmu_seq)) {
		ret = -EAGAIN;
		goto out_unlock;
	}

	/*
	 * If we are not forced to use page mapping, check if we are
	 * backed by a THP and thus use block mapping if possible.
	 */
	if (vma_pagesize == PAGE_SIZE && !(force_pte || s2_force_noncacheable)) {
		if (fault_is_perm && fault_granule > PAGE_SIZE)
			vma_pagesize = fault_granule;
		else
			vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
								   hva, &pfn,
								   &fault_ipa);

		if (vma_pagesize < 0) {
			ret = vma_pagesize;
			goto out_unlock;
		}
	}

	if (!fault_is_perm && !s2_force_noncacheable && kvm_has_mte(kvm)) {
		/* Check the VMM hasn't introduced a new disallowed VMA */
		if (mte_allowed) {
			sanitise_mte_tags(kvm, pfn, vma_pagesize);
		} else {
			ret = -EFAULT;
			goto out_unlock;
		}
	}

	if (writable)
		prot |= KVM_PGTABLE_PROT_W;

	if (exec_fault)
		prot |= KVM_PGTABLE_PROT_X;

	if (s2_force_noncacheable) {
		if (vfio_allow_any_uc)
			prot |= KVM_PGTABLE_PROT_NORMAL_NC;
		else
			prot |= KVM_PGTABLE_PROT_DEVICE;
	} else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) {
		prot |= KVM_PGTABLE_PROT_X;
	}

	if (nested)
		adjust_nested_exec_perms(kvm, nested, &prot);

	/*
	 * Under the premise of getting a FSC_PERM fault, we just need to relax
	 * permissions only if vma_pagesize equals fault_granule. Otherwise,
	 * kvm_pgtable_stage2_map() should be called to change block size.
	 */
	if (fault_is_perm && vma_pagesize == fault_granule) {
		/*
		 * Drop the SW bits in favour of those stored in the
		 * PTE, which will be preserved.
		 */
		prot &= ~KVM_NV_GUEST_MAP_SZ;
		ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags);
	} else {
		ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
							 __pfn_to_phys(pfn), prot,
							 memcache, flags);
	}

out_unlock:
	kvm_release_faultin_page(kvm, page, !!ret, writable);
	kvm_fault_unlock(kvm);

	/* Mark the page dirty only if the fault is handled successfully */
	if (writable && !ret)
		mark_page_dirty_in_slot(kvm, memslot, gfn);

	return ret != -EAGAIN ? ret : 0;
}

/* Resolve the access fault by making the page young again. */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
	struct kvm_s2_mmu *mmu;

	trace_kvm_access_fault(fault_ipa);

	read_lock(&vcpu->kvm->mmu_lock);
	mmu = vcpu->arch.hw_mmu;
	KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags);
	read_unlock(&vcpu->kvm->mmu_lock);
}

/*
 * Returns true if the SEA should be handled locally within KVM, which is the
 * case when the abort was caused by host-owned memory (e.g. stage-2 table
 * memory).

/*
 * Returns true if the SEA should be handled locally within KVM, e.g. when
 * the abort is caused by an access to kernel-allocated memory such as
 * stage-2 table memory.
 */
static bool host_owns_sea(struct kvm_vcpu *vcpu, u64 esr)
{
	/*
	 * Without FEAT_RAS, HCR_EL2.TEA is RES0, meaning any external abort
	 * taken from a guest EL to EL2 is due to a host-imposed access (e.g.
	 * stage-2 PTW).
	 */
	if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
		return true;

	/* KVM owns the VNCR when the vCPU isn't in a nested context. */
	if (is_hyp_ctxt(vcpu) && !kvm_vcpu_trap_is_iabt(vcpu) && (esr & ESR_ELx_VNCR))
		return true;

	/*
	 * Determining if an external abort during a table walk happened at
	 * stage-2 is only possible when S1PTW is set. Otherwise, since KVM
	 * sets HCR_EL2.TEA, SEAs due to a stage-1 walk (i.e. accessing the
	 * PA of the stage-1 descriptor) can reach here and are reported
	 * with a TTW ESR value.
	 */
	return (esr_fsc_is_sea_ttw(esr) && (esr & ESR_ELx_S1PTW));
}

int kvm_handle_guest_sea(struct kvm_vcpu *vcpu)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvm_run *run = vcpu->run;
	u64 esr = kvm_vcpu_get_esr(vcpu);
	u64 esr_mask = ESR_ELx_EC_MASK |
		       ESR_ELx_IL |
		       ESR_ELx_FnV |
		       ESR_ELx_EA |
		       ESR_ELx_CM |
		       ESR_ELx_WNR |
		       ESR_ELx_FSC;
	u64 ipa;

	/*
	 * Give APEI the opportunity to claim the abort before handling it
	 * within KVM. apei_claim_sea() expects to be called with IRQs enabled.
	 */
	lockdep_assert_irqs_enabled();
	if (apei_claim_sea(NULL) == 0)
		return 1;

	if (host_owns_sea(vcpu, esr) ||
	    !test_bit(KVM_ARCH_FLAG_EXIT_SEA, &vcpu->kvm->arch.flags))
		return kvm_inject_serror(vcpu);

	/* ESR_ELx.SET is RES0 when FEAT_RAS isn't implemented. */
	if (kvm_has_ras(kvm))
		esr_mask |= ESR_ELx_SET_MASK;

	/*
	 * Exit to userspace, and provide the faulting guest virtual and
	 * physical addresses in case userspace wants to emulate the SEA for
	 * the guest by writing to the FAR_ELx and HPFAR_ELx registers.
	 */
	memset(&run->arm_sea, 0, sizeof(run->arm_sea));
	run->exit_reason = KVM_EXIT_ARM_SEA;
	run->arm_sea.esr = esr & esr_mask;

	if (!(esr & ESR_ELx_FnV))
		run->arm_sea.gva = kvm_vcpu_get_hfar(vcpu);

	ipa = kvm_vcpu_get_fault_ipa(vcpu);
	if (ipa != INVALID_GPA) {
		run->arm_sea.flags |= KVM_EXIT_ARM_SEA_FLAG_GPA_VALID;
		run->arm_sea.gpa = ipa;
	}

	return 0;
}
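
/*
 * Illustrative sketch (not kernel code): a VMM that has opted in to SEA
 * exits could consume KVM_EXIT_ARM_SEA from its KVM_RUN loop roughly as
 * follows; the ESR_ELx_* masks refer to the architectural ESR layout and
 * the local variables are the VMM's own.
 *
 *	case KVM_EXIT_ARM_SEA:
 *		esr = run->arm_sea.esr;
 *		if (!(esr & ESR_ELx_FnV))
 *			gva = run->arm_sea.gva;
 *		if (run->arm_sea.flags & KVM_EXIT_ARM_SEA_FLAG_GPA_VALID)
 *			gpa = run->arm_sea.gpa;
 *		// Emulate an external abort for the guest, or report it.
 *		break;
 */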

/**
 * kvm_handle_guest_abort - handles all 2nd stage aborts
 * @vcpu:	the VCPU pointer
 *
 * Any abort that gets to the host is almost guaranteed to be caused by a
 * missing second stage translation table entry, which can either mean that
 * the guest simply needs more memory and we must allocate an appropriate
 * page, or that the guest tried to access I/O memory, which is emulated by
 * user space. The distinction is based on the IPA causing the fault and on
 * whether this memory region has been registered as standard RAM by user
 * space.
 *
 * Return: 1 if the guest can resume execution, 0 if the exit must be
 * handled by userspace (e.g. MMIO emulation or an SEA exit), or a negative
 * error code.
 */
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
{
	struct kvm_s2_trans nested_trans, *nested = NULL;
	unsigned long esr;
	phys_addr_t fault_ipa; /* The address we faulted on */
	phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */
	struct kvm_memory_slot *memslot;
	unsigned long hva;
	bool is_iabt, write_fault, writable;
	gfn_t gfn;
	int ret, idx;

	if (kvm_vcpu_abt_issea(vcpu))
		return kvm_handle_guest_sea(vcpu);

	esr = kvm_vcpu_get_esr(vcpu);

	/*
	 * The fault IPA should be reliable at this point as we're not dealing
	 * with an SEA.
	 */
	ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
	if (KVM_BUG_ON(ipa == INVALID_GPA, vcpu->kvm))
		return -EFAULT;

	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);

	if (esr_fsc_is_translation_fault(esr)) {
		/* Beyond sanitised PARange (which is the IPA limit) */
		if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
			kvm_inject_size_fault(vcpu);
			return 1;
		}

		/* Falls between the IPA range and the PARange? */
		if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) {
			fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);

			return kvm_inject_sea(vcpu, is_iabt, fault_ipa);
		}
	}

	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
			      kvm_vcpu_get_hfar(vcpu), fault_ipa);

	/*
	 * Check that the stage-2 fault is a translation, permission or
	 * access flag fault.
	 */
	if (!esr_fsc_is_translation_fault(esr) &&
	    !esr_fsc_is_permission_fault(esr) &&
	    !esr_fsc_is_access_flag_fault(esr)) {
		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
			kvm_vcpu_trap_get_class(vcpu),
			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
			(unsigned long)kvm_vcpu_get_esr(vcpu));
		return -EFAULT;
	}

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	/*
	 * We may have faulted on a shadow stage 2 page table if we are
	 * running a nested guest.  In this case, we have to resolve the L2
	 * IPA to the L1 IPA first, before knowing what kind of memory should
	 * back the L1 IPA.
	 *
	 * If the shadow stage 2 page table walk faults, then we simply inject
	 * this to the guest and carry on.
	 *
	 * If there are no shadow S2 PTs because S2 is disabled, there is
	 * nothing to walk and we treat it as a 1:1 before going through the
	 * canonical translation.
	 */
	if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu) &&
	    vcpu->arch.hw_mmu->nested_stage2_enabled) {
		u32 esr;

		ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
		if (ret == -EAGAIN) {
			ret = 1;
			goto out_unlock;
		}

		if (ret) {
			esr = kvm_s2_trans_esr(&nested_trans);
			kvm_inject_s2_fault(vcpu, esr);
			goto out_unlock;
		}

		ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans);
		if (ret) {
			esr = kvm_s2_trans_esr(&nested_trans);
			kvm_inject_s2_fault(vcpu, esr);
			goto out_unlock;
		}

		ipa = kvm_s2_trans_output(&nested_trans);
		nested = &nested_trans;
	}
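
	/*
	 * From here on, "ipa" is the canonical (L1) IPA: for a nested fault
	 * the walk above produced the L1 IPA that backs the faulting L2 IPA,
	 * and it is that address which selects the memslot below, while
	 * fault_ipa continues to address the shadow stage-2 tables.
	 */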

	gfn = ipa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(vcpu->kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
	write_fault = kvm_is_write_fault(vcpu);
	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
		/*
		 * The guest has put either its instructions or its page-tables
		 * somewhere it shouldn't have. Userspace won't be able to do
		 * anything about this (there's no syndrome for a start), so
		 * inject the abort back into the guest.
		 */
		if (is_iabt) {
			ret = -ENOEXEC;
			goto out;
		}

		if (kvm_vcpu_abt_iss1tw(vcpu)) {
			ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
			goto out_unlock;
		}

		/*
		 * Check for a cache maintenance operation. Since we
		 * ended up here, we know it is outside of any memory
		 * slot. But we can't find out if that is for a device,
		 * or if the guest is just being stupid. The only thing
		 * we know for sure is that this range cannot be cached.
		 *
		 * So let's assume that the guest is just being
		 * cautious, and skip the instruction.
		 */
		if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
			kvm_incr_pc(vcpu);
			ret = 1;
			goto out_unlock;
		}

		/*
		 * The IPA is reported as [MAX:12], so we need to
		 * complement it with the bottom 12 bits from the
		 * faulting VA. This is always 12 bits, irrespective
		 * of the page size.
		 */
		ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
		ret = io_mem_abort(vcpu, ipa);
		goto out_unlock;
	}

	/* Userspace should not be able to register out-of-bounds IPAs */
	VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu));

	if (esr_fsc_is_access_flag_fault(esr)) {
		handle_access_fault(vcpu, fault_ipa);
		ret = 1;
		goto out_unlock;
	}

	VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
			!write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu));

	if (kvm_slot_has_gmem(memslot))
		ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
				 esr_fsc_is_permission_fault(esr));
	else
		ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
				     esr_fsc_is_permission_fault(esr));
	if (ret == 0)
		ret = 1;
out:
	if (ret == -ENOEXEC)
		ret = kvm_inject_sea_iabt(vcpu, kvm_vcpu_get_hfar(vcpu));
out_unlock:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	return ret;
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	if (!kvm->arch.mmu.pgt)
		return false;

	__unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
			     (range->end - range->start) << PAGE_SHIFT,
			     range->may_block);

	kvm_nested_s2_unmap(kvm, range->may_block);
	return false;
}
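
/*
 * Returning false tells the generic MMU notifier code that no additional
 * TLB flush is required: __unmap_stage2_range() already performs the
 * necessary stage-2 TLB invalidation as the entries are torn down.
 */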

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	u64 size = (range->end - range->start) << PAGE_SHIFT;

	if (!kvm->arch.mmu.pgt)
		return false;

	return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
							       range->start << PAGE_SHIFT,
							       size, true);
	/*
	 * TODO: Handle nested_mmu structures here using the reverse mapping in
	 * a later version of the patch series.
	 */
}

bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	u64 size = (range->end - range->start) << PAGE_SHIFT;

	if (!kvm->arch.mmu.pgt)
		return false;

	return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
							       range->start << PAGE_SHIFT,
							       size, false);
}

phys_addr_t kvm_mmu_get_httbr(void)
{
	return __pa(hyp_pgtable->pgd);
}

phys_addr_t kvm_get_idmap_vector(void)
{
	return hyp_idmap_vector;
}

static int kvm_map_idmap_text(void)
{
	unsigned long size = hyp_idmap_end - hyp_idmap_start;
	int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
					PAGE_HYP_EXEC);
	if (err)
		kvm_err("Failed to idmap %lx-%lx\n",
			hyp_idmap_start, hyp_idmap_end);

	return err;
}

static void *kvm_hyp_zalloc_page(void *arg)
{
	return (void *)get_zeroed_page(GFP_KERNEL);
}

static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
	.zalloc_page = kvm_hyp_zalloc_page,
	.get_page = kvm_host_get_page,
	.put_page = kvm_host_put_page,
	.phys_to_virt = kvm_host_va,
	.virt_to_phys = kvm_host_pa,
};

int __init kvm_mmu_init(u32 *hyp_va_bits)
{
	int err;
	u32 idmap_bits;
	u32 kernel_bits;

	hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
	hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
	hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);

	/*
	 * We rely on the linker script to ensure at build time that the HYP
	 * init code does not cross a page boundary.
	 */
	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);

	/*
	 * The ID map is always configured for 48 bits of translation, which
	 * may be fewer than the number of VA bits used by the regular kernel
	 * stage 1, when VA_BITS=52.
	 *
	 * At EL2, there is only one TTBR register, and we can't switch between
	 * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom
	 * line: we need to use the extended range with *both* our translation
	 * tables.
	 *
	 * So use the maximum of the idmap VA bits and the regular kernel stage
	 * 1 VA bits to ensure that the hypervisor can both ID map its code page
	 * and map any kernel memory.
	 */
	idmap_bits = IDMAP_VA_BITS;
	kernel_bits = vabits_actual;
	*hyp_va_bits = max(idmap_bits, kernel_bits);
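
	/*
	 * For example, with the ID map fixed at 48 bits: a 48-bit VA kernel
	 * gives *hyp_va_bits == 48, while a VA_BITS=52 kernel running with
	 * vabits_actual == 52 gives *hyp_va_bits == 52, so both the ID map
	 * and the kernel linear map fit in the EL2 VA range.
	 */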

	kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
	kvm_debug("HYP VA range: %lx:%lx\n",
		  kern_hyp_va(PAGE_OFFSET),
		  kern_hyp_va((unsigned long)high_memory - 1));

	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
	    hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) &&
	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
		/*
		 * The idmap page intersects with the HYP VA space; it is not
		 * safe to continue further.
		 */
		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
		err = -EINVAL;
		goto out;
	}

	hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
	if (!hyp_pgtable) {
		kvm_err("Hyp mode page-table not allocated\n");
		err = -ENOMEM;
		goto out;
	}

	err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops);
	if (err)
		goto out_free_pgtable;

	err = kvm_map_idmap_text();
	if (err)
		goto out_destroy_pgtable;

	io_map_base = hyp_idmap_start;
	__hyp_va_bits = *hyp_va_bits;
	return 0;

out_destroy_pgtable:
	kvm_pgtable_hyp_destroy(hyp_pgtable);
out_free_pgtable:
	kfree(hyp_pgtable);
	hyp_pgtable = NULL;
out:
	return err;
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;

	/*
	 * At this point the memslot has been committed and there is an
	 * allocated dirty_bitmap[]; dirty pages will be tracked while the
	 * memory slot is write protected.
	 */
	if (log_dirty_pages) {

		if (change == KVM_MR_DELETE)
			return;

		/*
		 * Huge and normal pages are write-protected and split
		 * in either of these two cases:
		 *
		 * 1. with initial-all-set: gradually with CLEAR ioctls,
		 */
		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
			return;
		/*
		 * or
		 * 2. without initial-all-set: all in one shot when
		 *    enabling dirty logging.
		 */
		kvm_mmu_wp_memory_region(kvm, new->id);
		kvm_mmu_split_memory_region(kvm, new->id);
	} else {
		/*
		 * Free any leftovers from the eager page splitting cache. Do
		 * this when deleting, moving, disabling dirty logging, or
		 * creating the memslot (a nop). Doing it for deletes makes
		 * sure we don't leak memory, and there's no need to keep the
		 * cache around for any of the other cases.
		 */
		kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
	}
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
				   const struct kvm_memory_slot *old,
				   struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	hva_t hva, reg_end;
	int ret = 0;

	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
	    change != KVM_MR_FLAGS_ONLY)
		return 0;

	/*
	 * Prevent userspace from creating a memory region outside of the IPA
	 * space addressable by the KVM guest.
	 */
	if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
		return -EFAULT;

	/*
	 * Only support guest_memfd-backed memslots with mappable memory, since
	 * there aren't any CoCo VMs that support only private memory on arm64.
	 */
	if (kvm_slot_has_gmem(new) && !kvm_memslot_is_gmem_only(new))
		return -EINVAL;

	hva = new->userspace_addr;
	reg_end = hva + (new->npages << PAGE_SHIFT);

	mmap_read_lock(current->mm);
	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them.
2444 * 2445 * +--------------------------------------------+ 2446 * +---------------+----------------+ +----------------+ 2447 * | : VMA 1 | VMA 2 | | VMA 3 : | 2448 * +---------------+----------------+ +----------------+ 2449 * | memory region | 2450 * +--------------------------------------------+ 2451 */ 2452 do { 2453 struct vm_area_struct *vma; 2454 2455 vma = find_vma_intersection(current->mm, hva, reg_end); 2456 if (!vma) 2457 break; 2458 2459 if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) { 2460 ret = -EINVAL; 2461 break; 2462 } 2463 2464 if (vma->vm_flags & VM_PFNMAP) { 2465 /* IO region dirty page logging not allowed */ 2466 if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { 2467 ret = -EINVAL; 2468 break; 2469 } 2470 2471 /* 2472 * Cacheable PFNMAP is allowed only if the hardware 2473 * supports it. 2474 */ 2475 if (kvm_vma_is_cacheable(vma) && !kvm_supports_cacheable_pfnmap()) { 2476 ret = -EINVAL; 2477 break; 2478 } 2479 } 2480 hva = min(reg_end, vma->vm_end); 2481 } while (hva < reg_end); 2482 2483 mmap_read_unlock(current->mm); 2484 return ret; 2485 } 2486 2487 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) 2488 { 2489 } 2490 2491 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) 2492 { 2493 } 2494 2495 void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 2496 struct kvm_memory_slot *slot) 2497 { 2498 gpa_t gpa = slot->base_gfn << PAGE_SHIFT; 2499 phys_addr_t size = slot->npages << PAGE_SHIFT; 2500 2501 write_lock(&kvm->mmu_lock); 2502 kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true); 2503 kvm_nested_s2_unmap(kvm, true); 2504 write_unlock(&kvm->mmu_lock); 2505 } 2506 2507 /* 2508 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). 2509 * 2510 * Main problems: 2511 * - S/W ops are local to a CPU (not broadcast) 2512 * - We have line migration behind our back (speculation) 2513 * - System caches don't support S/W at all (damn!) 2514 * 2515 * In the face of the above, the best we can do is to try and convert 2516 * S/W ops to VA ops. Because the guest is not allowed to infer the 2517 * S/W to PA mapping, it can only use S/W to nuke the whole cache, 2518 * which is a rather good thing for us. 2519 * 2520 * Also, it is only used when turning caches on/off ("The expected 2521 * usage of the cache maintenance instructions that operate by set/way 2522 * is associated with the cache maintenance instructions associated 2523 * with the powerdown and powerup of caches, if this is required by 2524 * the implementation."). 2525 * 2526 * We use the following policy: 2527 * 2528 * - If we trap a S/W operation, we enable VM trapping to detect 2529 * caches being turned on/off, and do a full clean. 2530 * 2531 * - We flush the caches on both caches being turned on and off. 2532 * 2533 * - Once the caches are enabled, we stop trapping VM ops. 2534 */ 2535 void kvm_set_way_flush(struct kvm_vcpu *vcpu) 2536 { 2537 unsigned long hcr = *vcpu_hcr(vcpu); 2538 2539 /* 2540 * If this is the first time we do a S/W operation 2541 * (i.e. HCR_TVM not set) flush the whole memory, and set the 2542 * VM trapping. 2543 * 2544 * Otherwise, rely on the VM trapping to wait for the MMU + 2545 * Caches to be turned off. At that point, we'll be able to 2546 * clean the caches again. 
void kvm_set_way_flush(struct kvm_vcpu *vcpu)
{
	unsigned long hcr = *vcpu_hcr(vcpu);

	/*
	 * If this is the first time we do a S/W operation
	 * (i.e. HCR_TVM is not set), flush the whole memory and enable
	 * VM trapping.
	 *
	 * Otherwise, rely on the VM trapping to wait for the MMU +
	 * caches to be turned off. At that point, we'll be able to
	 * clean the caches again.
	 */
	if (!(hcr & HCR_TVM)) {
		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
					vcpu_has_cache_enabled(vcpu));
		stage2_flush_vm(vcpu->kvm);
		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
	}
}

void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
{
	bool now_enabled = vcpu_has_cache_enabled(vcpu);

	/*
	 * If switching the MMU+caches on, need to invalidate the caches.
	 * If switching it off, need to clean the caches.
	 * Clean + invalidate always does the trick.
	 */
	if (now_enabled != was_enabled)
		stage2_flush_vm(vcpu->kvm);

	/* Caches are now on, stop trapping VM ops (until a S/W op) */
	if (now_enabled)
		*vcpu_hcr(vcpu) &= ~HCR_TVM;

	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
}