// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/acpi.h>
#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
#include <asm/acpi.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/kvm_pkvm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>

#include "trace.h"

static struct kvm_pgtable *hyp_pgtable;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

static unsigned long __ro_after_init hyp_idmap_start;
static unsigned long __ro_after_init hyp_idmap_end;
static phys_addr_t __ro_after_init hyp_idmap_vector;

u32 __ro_after_init __hyp_va_bits;

static unsigned long __ro_after_init io_map_base;

#define KVM_PGT_FN(fn)	(!is_protected_kvm_enabled() ? fn : p ## fn)

static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
					   phys_addr_t size)
{
	phys_addr_t boundary = ALIGN_DOWN(addr + size, size);

	return (boundary - 1 < end - 1) ? boundary : end;
}

static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);

	return __stage2_range_addr_end(addr, end, size);
}

/*
 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
 * long will also starve other vCPUs. We have to also make sure that the page
 * tables are not freed while we released the lock.
 */
static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
			      phys_addr_t end,
			      int (*fn)(struct kvm_pgtable *, u64, u64),
			      bool resched)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	int ret;
	u64 next;

	do {
		struct kvm_pgtable *pgt = mmu->pgt;
		if (!pgt)
			return -EINVAL;

		next = stage2_range_addr_end(addr, end);
		ret = fn(pgt, addr, next - addr);
		if (ret)
			break;

		if (resched && next != end)
			cond_resched_rwlock_write(&kvm->mmu_lock);
	} while (addr = next, addr != end);

	return ret;
}

#define stage2_apply_range_resched(mmu, addr, end, fn)			\
	stage2_apply_range(mmu, addr, end, fn, true)

/*
 * Get the maximum number of page-tables pages needed to split a range
 * of blocks into PAGE_SIZE PTEs. It assumes the range is already
 * mapped at level 2, or at level 1 if allowed.
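 *
 * For example, with 4K pages (PMD_SIZE == 2M) a 1G chunk needs
 * DIV_ROUND_UP(1G, 2M) == 512 tables for the PTE level, plus one more
 * table per 1G when level-1 blocks are in use
 * (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2). This worst case is what
 * kvm_mmu_split_huge_pages() tops split_page_cache up to before calling
 * kvm_pgtable_stage2_split().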
 */
static int kvm_mmu_split_nr_page_tables(u64 range)
{
	int n = 0;

	if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2)
		n += DIV_ROUND_UP(range, PUD_SIZE);
	n += DIV_ROUND_UP(range, PMD_SIZE);
	return n;
}

static bool need_split_memcache_topup_or_resched(struct kvm *kvm)
{
	struct kvm_mmu_memory_cache *cache;
	u64 chunk_size, min;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
		return true;

	chunk_size = kvm->arch.mmu.split_page_chunk_size;
	min = kvm_mmu_split_nr_page_tables(chunk_size);
	cache = &kvm->arch.mmu.split_page_cache;
	return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
}

static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
				    phys_addr_t end)
{
	struct kvm_mmu_memory_cache *cache;
	struct kvm_pgtable *pgt;
	int ret, cache_capacity;
	u64 next, chunk_size;

	lockdep_assert_held_write(&kvm->mmu_lock);

	chunk_size = kvm->arch.mmu.split_page_chunk_size;
	cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size);

	if (chunk_size == 0)
		return 0;

	cache = &kvm->arch.mmu.split_page_cache;

	do {
		if (need_split_memcache_topup_or_resched(kvm)) {
			write_unlock(&kvm->mmu_lock);
			cond_resched();
			/* Eager page splitting is best-effort. */
			ret = __kvm_mmu_topup_memory_cache(cache,
							   cache_capacity,
							   cache_capacity);
			write_lock(&kvm->mmu_lock);
			if (ret)
				break;
		}

		pgt = kvm->arch.mmu.pgt;
		if (!pgt)
			return -EINVAL;

		next = __stage2_range_addr_end(addr, end, chunk_size);
		ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache);
		if (ret)
			break;
	} while (addr = next, addr != end);

	return ret;
}

static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
}

/**
 * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8
 * @kvm:	pointer to kvm structure.
172 * 173 * Interface to HYP function to flush all VM TLB entries 174 */ 175 int kvm_arch_flush_remote_tlbs(struct kvm *kvm) 176 { 177 if (is_protected_kvm_enabled()) 178 kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); 179 else 180 kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); 181 return 0; 182 } 183 184 int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, 185 gfn_t gfn, u64 nr_pages) 186 { 187 u64 size = nr_pages << PAGE_SHIFT; 188 u64 addr = gfn << PAGE_SHIFT; 189 190 if (is_protected_kvm_enabled()) 191 kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); 192 else 193 kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size); 194 return 0; 195 } 196 197 static void *stage2_memcache_zalloc_page(void *arg) 198 { 199 struct kvm_mmu_memory_cache *mc = arg; 200 void *virt; 201 202 /* Allocated with __GFP_ZERO, so no need to zero */ 203 virt = kvm_mmu_memory_cache_alloc(mc); 204 if (virt) 205 kvm_account_pgtable_pages(virt, 1); 206 return virt; 207 } 208 209 static void *kvm_host_zalloc_pages_exact(size_t size) 210 { 211 return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); 212 } 213 214 static void *kvm_s2_zalloc_pages_exact(size_t size) 215 { 216 void *virt = kvm_host_zalloc_pages_exact(size); 217 218 if (virt) 219 kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT)); 220 return virt; 221 } 222 223 static void kvm_s2_free_pages_exact(void *virt, size_t size) 224 { 225 kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT)); 226 free_pages_exact(virt, size); 227 } 228 229 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops; 230 231 static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head) 232 { 233 struct page *page = container_of(head, struct page, rcu_head); 234 void *pgtable = page_to_virt(page); 235 s8 level = page_private(page); 236 237 KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level); 238 } 239 240 static void stage2_free_unlinked_table(void *addr, s8 level) 241 { 242 struct page *page = virt_to_page(addr); 243 244 set_page_private(page, (unsigned long)level); 245 call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb); 246 } 247 248 static void kvm_host_get_page(void *addr) 249 { 250 get_page(virt_to_page(addr)); 251 } 252 253 static void kvm_host_put_page(void *addr) 254 { 255 put_page(virt_to_page(addr)); 256 } 257 258 static void kvm_s2_put_page(void *addr) 259 { 260 struct page *p = virt_to_page(addr); 261 /* Dropping last refcount, the page will be freed */ 262 if (page_count(p) == 1) 263 kvm_account_pgtable_pages(addr, -1); 264 put_page(p); 265 } 266 267 static int kvm_host_page_count(void *addr) 268 { 269 return page_count(virt_to_page(addr)); 270 } 271 272 static phys_addr_t kvm_host_pa(void *addr) 273 { 274 return __pa(addr); 275 } 276 277 static void *kvm_host_va(phys_addr_t phys) 278 { 279 return __va(phys); 280 } 281 282 static void clean_dcache_guest_page(void *va, size_t size) 283 { 284 __clean_dcache_guest_page(va, size); 285 } 286 287 static void invalidate_icache_guest_page(void *va, size_t size) 288 { 289 __invalidate_icache_guest_page(va, size); 290 } 291 292 /* 293 * Unmapping vs dcache management: 294 * 295 * If a guest maps certain memory pages as uncached, all writes will 296 * bypass the data cache and go directly to RAM. However, the CPUs 297 * can still speculate reads (not writes) and fill cache lines with 298 * data. 
299 * 300 * Those cache lines will be *clean* cache lines though, so a 301 * clean+invalidate operation is equivalent to an invalidate 302 * operation, because no cache lines are marked dirty. 303 * 304 * Those clean cache lines could be filled prior to an uncached write 305 * by the guest, and the cache coherent IO subsystem would therefore 306 * end up writing old data to disk. 307 * 308 * This is why right after unmapping a page/section and invalidating 309 * the corresponding TLBs, we flush to make sure the IO subsystem will 310 * never hit in the cache. 311 * 312 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as 313 * we then fully enforce cacheability of RAM, no matter what the guest 314 * does. 315 */ 316 /** 317 * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range 318 * @mmu: The KVM stage-2 MMU pointer 319 * @start: The intermediate physical base address of the range to unmap 320 * @size: The size of the area to unmap 321 * @may_block: Whether or not we are permitted to block 322 * 323 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must 324 * be called while holding mmu_lock (unless for freeing the stage2 pgd before 325 * destroying the VM), otherwise another faulting VCPU may come in and mess 326 * with things behind our backs. 327 */ 328 static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size, 329 bool may_block) 330 { 331 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 332 phys_addr_t end = start + size; 333 334 lockdep_assert_held_write(&kvm->mmu_lock); 335 WARN_ON(size & ~PAGE_MASK); 336 WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap), 337 may_block)); 338 } 339 340 void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, 341 u64 size, bool may_block) 342 { 343 __unmap_stage2_range(mmu, start, size, may_block); 344 } 345 346 void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) 347 { 348 stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush)); 349 } 350 351 static void stage2_flush_memslot(struct kvm *kvm, 352 struct kvm_memory_slot *memslot) 353 { 354 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; 355 phys_addr_t end = addr + PAGE_SIZE * memslot->npages; 356 357 kvm_stage2_flush_range(&kvm->arch.mmu, addr, end); 358 } 359 360 /** 361 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2 362 * @kvm: The struct kvm pointer 363 * 364 * Go through the stage 2 page tables and invalidate any cache lines 365 * backing memory already mapped to the VM. 
366 */ 367 static void stage2_flush_vm(struct kvm *kvm) 368 { 369 struct kvm_memslots *slots; 370 struct kvm_memory_slot *memslot; 371 int idx, bkt; 372 373 idx = srcu_read_lock(&kvm->srcu); 374 write_lock(&kvm->mmu_lock); 375 376 slots = kvm_memslots(kvm); 377 kvm_for_each_memslot(memslot, bkt, slots) 378 stage2_flush_memslot(kvm, memslot); 379 380 kvm_nested_s2_flush(kvm); 381 382 write_unlock(&kvm->mmu_lock); 383 srcu_read_unlock(&kvm->srcu, idx); 384 } 385 386 /** 387 * free_hyp_pgds - free Hyp-mode page tables 388 */ 389 void __init free_hyp_pgds(void) 390 { 391 mutex_lock(&kvm_hyp_pgd_mutex); 392 if (hyp_pgtable) { 393 kvm_pgtable_hyp_destroy(hyp_pgtable); 394 kfree(hyp_pgtable); 395 hyp_pgtable = NULL; 396 } 397 mutex_unlock(&kvm_hyp_pgd_mutex); 398 } 399 400 static bool kvm_host_owns_hyp_mappings(void) 401 { 402 if (is_kernel_in_hyp_mode()) 403 return false; 404 405 if (static_branch_likely(&kvm_protected_mode_initialized)) 406 return false; 407 408 /* 409 * This can happen at boot time when __create_hyp_mappings() is called 410 * after the hyp protection has been enabled, but the static key has 411 * not been flipped yet. 412 */ 413 if (!hyp_pgtable && is_protected_kvm_enabled()) 414 return false; 415 416 WARN_ON(!hyp_pgtable); 417 418 return true; 419 } 420 421 int __create_hyp_mappings(unsigned long start, unsigned long size, 422 unsigned long phys, enum kvm_pgtable_prot prot) 423 { 424 int err; 425 426 if (WARN_ON(!kvm_host_owns_hyp_mappings())) 427 return -EINVAL; 428 429 mutex_lock(&kvm_hyp_pgd_mutex); 430 err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot); 431 mutex_unlock(&kvm_hyp_pgd_mutex); 432 433 return err; 434 } 435 436 static phys_addr_t kvm_kaddr_to_phys(void *kaddr) 437 { 438 if (!is_vmalloc_addr(kaddr)) { 439 BUG_ON(!virt_addr_valid(kaddr)); 440 return __pa(kaddr); 441 } else { 442 return page_to_phys(vmalloc_to_page(kaddr)) + 443 offset_in_page(kaddr); 444 } 445 } 446 447 struct hyp_shared_pfn { 448 u64 pfn; 449 int count; 450 struct rb_node node; 451 }; 452 453 static DEFINE_MUTEX(hyp_shared_pfns_lock); 454 static struct rb_root hyp_shared_pfns = RB_ROOT; 455 456 static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node, 457 struct rb_node **parent) 458 { 459 struct hyp_shared_pfn *this; 460 461 *node = &hyp_shared_pfns.rb_node; 462 *parent = NULL; 463 while (**node) { 464 this = container_of(**node, struct hyp_shared_pfn, node); 465 *parent = **node; 466 if (this->pfn < pfn) 467 *node = &((**node)->rb_left); 468 else if (this->pfn > pfn) 469 *node = &((**node)->rb_right); 470 else 471 return this; 472 } 473 474 return NULL; 475 } 476 477 static int share_pfn_hyp(u64 pfn) 478 { 479 struct rb_node **node, *parent; 480 struct hyp_shared_pfn *this; 481 int ret = 0; 482 483 mutex_lock(&hyp_shared_pfns_lock); 484 this = find_shared_pfn(pfn, &node, &parent); 485 if (this) { 486 this->count++; 487 goto unlock; 488 } 489 490 this = kzalloc(sizeof(*this), GFP_KERNEL); 491 if (!this) { 492 ret = -ENOMEM; 493 goto unlock; 494 } 495 496 this->pfn = pfn; 497 this->count = 1; 498 rb_link_node(&this->node, parent, node); 499 rb_insert_color(&this->node, &hyp_shared_pfns); 500 ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn, 1); 501 unlock: 502 mutex_unlock(&hyp_shared_pfns_lock); 503 504 return ret; 505 } 506 507 static int unshare_pfn_hyp(u64 pfn) 508 { 509 struct rb_node **node, *parent; 510 struct hyp_shared_pfn *this; 511 int ret = 0; 512 513 mutex_lock(&hyp_shared_pfns_lock); 514 this = find_shared_pfn(pfn, &node, &parent); 515 if 
(WARN_ON(!this)) { 516 ret = -ENOENT; 517 goto unlock; 518 } 519 520 this->count--; 521 if (this->count) 522 goto unlock; 523 524 rb_erase(&this->node, &hyp_shared_pfns); 525 kfree(this); 526 ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn, 1); 527 unlock: 528 mutex_unlock(&hyp_shared_pfns_lock); 529 530 return ret; 531 } 532 533 int kvm_share_hyp(void *from, void *to) 534 { 535 phys_addr_t start, end, cur; 536 u64 pfn; 537 int ret; 538 539 if (is_kernel_in_hyp_mode()) 540 return 0; 541 542 /* 543 * The share hcall maps things in the 'fixed-offset' region of the hyp 544 * VA space, so we can only share physically contiguous data-structures 545 * for now. 546 */ 547 if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to)) 548 return -EINVAL; 549 550 if (kvm_host_owns_hyp_mappings()) 551 return create_hyp_mappings(from, to, PAGE_HYP); 552 553 start = ALIGN_DOWN(__pa(from), PAGE_SIZE); 554 end = PAGE_ALIGN(__pa(to)); 555 for (cur = start; cur < end; cur += PAGE_SIZE) { 556 pfn = __phys_to_pfn(cur); 557 ret = share_pfn_hyp(pfn); 558 if (ret) 559 return ret; 560 } 561 562 return 0; 563 } 564 565 void kvm_unshare_hyp(void *from, void *to) 566 { 567 phys_addr_t start, end, cur; 568 u64 pfn; 569 570 if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from) 571 return; 572 573 start = ALIGN_DOWN(__pa(from), PAGE_SIZE); 574 end = PAGE_ALIGN(__pa(to)); 575 for (cur = start; cur < end; cur += PAGE_SIZE) { 576 pfn = __phys_to_pfn(cur); 577 WARN_ON(unshare_pfn_hyp(pfn)); 578 } 579 } 580 581 /** 582 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode 583 * @from: The virtual kernel start address of the range 584 * @to: The virtual kernel end address of the range (exclusive) 585 * @prot: The protection to be applied to this range 586 * 587 * The same virtual address as the kernel virtual address is also used 588 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying 589 * physical pages. 590 */ 591 int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) 592 { 593 phys_addr_t phys_addr; 594 unsigned long virt_addr; 595 unsigned long start = kern_hyp_va((unsigned long)from); 596 unsigned long end = kern_hyp_va((unsigned long)to); 597 598 if (is_kernel_in_hyp_mode()) 599 return 0; 600 601 if (!kvm_host_owns_hyp_mappings()) 602 return -EPERM; 603 604 start = start & PAGE_MASK; 605 end = PAGE_ALIGN(end); 606 607 for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { 608 int err; 609 610 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); 611 err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr, 612 prot); 613 if (err) 614 return err; 615 } 616 617 return 0; 618 } 619 620 static int __hyp_alloc_private_va_range(unsigned long base) 621 { 622 lockdep_assert_held(&kvm_hyp_pgd_mutex); 623 624 if (!PAGE_ALIGNED(base)) 625 return -EINVAL; 626 627 /* 628 * Verify that BIT(VA_BITS - 1) hasn't been flipped by 629 * allocating the new area, as it would indicate we've 630 * overflowed the idmap/IO address range. 631 */ 632 if ((base ^ io_map_base) & BIT(VA_BITS - 1)) 633 return -ENOMEM; 634 635 io_map_base = base; 636 637 return 0; 638 } 639 640 /** 641 * hyp_alloc_private_va_range - Allocates a private VA range. 642 * @size: The size of the VA range to reserve. 643 * @haddr: The hypervisor virtual start address of the allocation. 644 * 645 * The private virtual address (VA) range is allocated below io_map_base 646 * and aligned based on the order of @size. 
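 *
 * For example, with a page-aligned io_map_base a one-page request simply
 * moves io_map_base down by PAGE_SIZE and hands the new base back through
 * @haddr; __hyp_alloc_private_va_range() only checks that BIT(VA_BITS - 1)
 * did not flip, i.e. that the allocation has not overflowed the idmap/IO
 * address range.
 *
 * A typical private mapping is then built roughly as follows (sketch of
 * the pattern used by __create_hyp_private_mapping() below, not a new
 * interface):
 *
 *	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
 *	ret = hyp_alloc_private_va_range(size, &addr);
 *	if (!ret)
 *		ret = __create_hyp_mappings(addr, size, phys_addr, prot);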
647 * 648 * Return: 0 on success or negative error code on failure. 649 */ 650 int hyp_alloc_private_va_range(size_t size, unsigned long *haddr) 651 { 652 unsigned long base; 653 int ret = 0; 654 655 mutex_lock(&kvm_hyp_pgd_mutex); 656 657 /* 658 * This assumes that we have enough space below the idmap 659 * page to allocate our VAs. If not, the check in 660 * __hyp_alloc_private_va_range() will kick. A potential 661 * alternative would be to detect that overflow and switch 662 * to an allocation above the idmap. 663 * 664 * The allocated size is always a multiple of PAGE_SIZE. 665 */ 666 size = PAGE_ALIGN(size); 667 base = io_map_base - size; 668 ret = __hyp_alloc_private_va_range(base); 669 670 mutex_unlock(&kvm_hyp_pgd_mutex); 671 672 if (!ret) 673 *haddr = base; 674 675 return ret; 676 } 677 678 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, 679 unsigned long *haddr, 680 enum kvm_pgtable_prot prot) 681 { 682 unsigned long addr; 683 int ret = 0; 684 685 if (!kvm_host_owns_hyp_mappings()) { 686 addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping, 687 phys_addr, size, prot); 688 if (IS_ERR_VALUE(addr)) 689 return addr; 690 *haddr = addr; 691 692 return 0; 693 } 694 695 size = PAGE_ALIGN(size + offset_in_page(phys_addr)); 696 ret = hyp_alloc_private_va_range(size, &addr); 697 if (ret) 698 return ret; 699 700 ret = __create_hyp_mappings(addr, size, phys_addr, prot); 701 if (ret) 702 return ret; 703 704 *haddr = addr + offset_in_page(phys_addr); 705 return ret; 706 } 707 708 int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr) 709 { 710 unsigned long base; 711 size_t size; 712 int ret; 713 714 mutex_lock(&kvm_hyp_pgd_mutex); 715 /* 716 * Efficient stack verification using the NVHE_STACK_SHIFT bit implies 717 * an alignment of our allocation on the order of the size. 718 */ 719 size = NVHE_STACK_SIZE * 2; 720 base = ALIGN_DOWN(io_map_base - size, size); 721 722 ret = __hyp_alloc_private_va_range(base); 723 724 mutex_unlock(&kvm_hyp_pgd_mutex); 725 726 if (ret) { 727 kvm_err("Cannot allocate hyp stack guard page\n"); 728 return ret; 729 } 730 731 /* 732 * Since the stack grows downwards, map the stack to the page 733 * at the higher address and leave the lower guard page 734 * unbacked. 735 * 736 * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1 737 * and addresses corresponding to the guard page have the 738 * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection. 
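 *
 * Illustrative layout, assuming NVHE_STACK_SIZE == PAGE_SIZE == 4K (so
 * size == 8K and base is 8K aligned):
 *
 *	[base      , base + 4K)	guard page, left unmapped
 *	[base + 4K , base + 8K)	stack page, mapped PAGE_HYP below
 *
 * *haddr is set to base + 8K, the initial (empty) stack pointer, so an
 * overflow off the bottom of the stack lands in the unmapped guard page
 * and faults instead of silently corrupting neighbouring hyp mappings.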
739 */ 740 ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE, 741 phys_addr, PAGE_HYP); 742 if (ret) 743 kvm_err("Cannot map hyp stack\n"); 744 745 *haddr = base + size; 746 747 return ret; 748 } 749 750 /** 751 * create_hyp_io_mappings - Map IO into both kernel and HYP 752 * @phys_addr: The physical start address which gets mapped 753 * @size: Size of the region being mapped 754 * @kaddr: Kernel VA for this mapping 755 * @haddr: HYP VA for this mapping 756 */ 757 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, 758 void __iomem **kaddr, 759 void __iomem **haddr) 760 { 761 unsigned long addr; 762 int ret; 763 764 if (is_protected_kvm_enabled()) 765 return -EPERM; 766 767 *kaddr = ioremap(phys_addr, size); 768 if (!*kaddr) 769 return -ENOMEM; 770 771 if (is_kernel_in_hyp_mode()) { 772 *haddr = *kaddr; 773 return 0; 774 } 775 776 ret = __create_hyp_private_mapping(phys_addr, size, 777 &addr, PAGE_HYP_DEVICE); 778 if (ret) { 779 iounmap(*kaddr); 780 *kaddr = NULL; 781 *haddr = NULL; 782 return ret; 783 } 784 785 *haddr = (void __iomem *)addr; 786 return 0; 787 } 788 789 /** 790 * create_hyp_exec_mappings - Map an executable range into HYP 791 * @phys_addr: The physical start address which gets mapped 792 * @size: Size of the region being mapped 793 * @haddr: HYP VA for this mapping 794 */ 795 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, 796 void **haddr) 797 { 798 unsigned long addr; 799 int ret; 800 801 BUG_ON(is_kernel_in_hyp_mode()); 802 803 ret = __create_hyp_private_mapping(phys_addr, size, 804 &addr, PAGE_HYP_EXEC); 805 if (ret) { 806 *haddr = NULL; 807 return ret; 808 } 809 810 *haddr = (void *)addr; 811 return 0; 812 } 813 814 static struct kvm_pgtable_mm_ops kvm_user_mm_ops = { 815 /* We shouldn't need any other callback to walk the PT */ 816 .phys_to_virt = kvm_host_va, 817 }; 818 819 static int get_user_mapping_size(struct kvm *kvm, u64 addr) 820 { 821 struct kvm_pgtable pgt = { 822 .pgd = (kvm_pteref_t)kvm->mm->pgd, 823 .ia_bits = vabits_actual, 824 .start_level = (KVM_PGTABLE_LAST_LEVEL - 825 ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1), 826 .mm_ops = &kvm_user_mm_ops, 827 }; 828 unsigned long flags; 829 kvm_pte_t pte = 0; /* Keep GCC quiet... */ 830 s8 level = S8_MAX; 831 int ret; 832 833 /* 834 * Disable IRQs so that we hazard against a concurrent 835 * teardown of the userspace page tables (which relies on 836 * IPI-ing threads). 837 */ 838 local_irq_save(flags); 839 ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); 840 local_irq_restore(flags); 841 842 if (ret) 843 return ret; 844 845 /* 846 * Not seeing an error, but not updating level? Something went 847 * deeply wrong... 848 */ 849 if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL)) 850 return -EFAULT; 851 if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL)) 852 return -EFAULT; 853 854 /* Oops, the userspace PTs are gone... 
Replay the fault */ 855 if (!kvm_pte_valid(pte)) 856 return -EAGAIN; 857 858 return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); 859 } 860 861 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { 862 .zalloc_page = stage2_memcache_zalloc_page, 863 .zalloc_pages_exact = kvm_s2_zalloc_pages_exact, 864 .free_pages_exact = kvm_s2_free_pages_exact, 865 .free_unlinked_table = stage2_free_unlinked_table, 866 .get_page = kvm_host_get_page, 867 .put_page = kvm_s2_put_page, 868 .page_count = kvm_host_page_count, 869 .phys_to_virt = kvm_host_va, 870 .virt_to_phys = kvm_host_pa, 871 .dcache_clean_inval_poc = clean_dcache_guest_page, 872 .icache_inval_pou = invalidate_icache_guest_page, 873 }; 874 875 static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type) 876 { 877 u32 kvm_ipa_limit = get_kvm_ipa_limit(); 878 u64 mmfr0, mmfr1; 879 u32 phys_shift; 880 881 if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) 882 return -EINVAL; 883 884 phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); 885 if (is_protected_kvm_enabled()) { 886 phys_shift = kvm_ipa_limit; 887 } else if (phys_shift) { 888 if (phys_shift > kvm_ipa_limit || 889 phys_shift < ARM64_MIN_PARANGE_BITS) 890 return -EINVAL; 891 } else { 892 phys_shift = KVM_PHYS_SHIFT; 893 if (phys_shift > kvm_ipa_limit) { 894 pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", 895 current->comm); 896 return -EINVAL; 897 } 898 } 899 900 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); 901 mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); 902 mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); 903 904 return 0; 905 } 906 907 /** 908 * kvm_init_stage2_mmu - Initialise a S2 MMU structure 909 * @kvm: The pointer to the KVM structure 910 * @mmu: The pointer to the s2 MMU structure 911 * @type: The machine type of the virtual machine 912 * 913 * Allocates only the stage-2 HW PGD level table(s). 914 * Note we don't need locking here as this is only called in two cases: 915 * 916 * - when the VM is created, which can't race against anything 917 * 918 * - when secondary kvm_s2_mmu structures are initialised for NV 919 * guests, and the caller must hold kvm->lock as this is called on a 920 * per-vcpu basis. 921 */ 922 int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) 923 { 924 int cpu, err; 925 struct kvm_pgtable *pgt; 926 927 /* 928 * If we already have our page tables in place, and that the 929 * MMU context is the canonical one, we have a bug somewhere, 930 * as this is only supposed to ever happen once per VM. 931 * 932 * Otherwise, we're building nested page tables, and that's 933 * probably because userspace called KVM_ARM_VCPU_INIT more 934 * than once on the same vcpu. Since that's actually legal, 935 * don't kick a fuss and leave gracefully. 
936 */ 937 if (mmu->pgt != NULL) { 938 if (kvm_is_nested_s2_mmu(kvm, mmu)) 939 return 0; 940 941 kvm_err("kvm_arch already initialized?\n"); 942 return -EINVAL; 943 } 944 945 err = kvm_init_ipa_range(mmu, type); 946 if (err) 947 return err; 948 949 pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT); 950 if (!pgt) 951 return -ENOMEM; 952 953 mmu->arch = &kvm->arch; 954 err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops); 955 if (err) 956 goto out_free_pgtable; 957 958 mmu->pgt = pgt; 959 if (is_protected_kvm_enabled()) 960 return 0; 961 962 mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran)); 963 if (!mmu->last_vcpu_ran) { 964 err = -ENOMEM; 965 goto out_destroy_pgtable; 966 } 967 968 for_each_possible_cpu(cpu) 969 *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1; 970 971 /* The eager page splitting is disabled by default */ 972 mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; 973 mmu->split_page_cache.gfp_zero = __GFP_ZERO; 974 975 mmu->pgd_phys = __pa(pgt->pgd); 976 977 if (kvm_is_nested_s2_mmu(kvm, mmu)) 978 kvm_init_nested_s2_mmu(mmu); 979 980 return 0; 981 982 out_destroy_pgtable: 983 KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); 984 out_free_pgtable: 985 kfree(pgt); 986 return err; 987 } 988 989 void kvm_uninit_stage2_mmu(struct kvm *kvm) 990 { 991 kvm_free_stage2_pgd(&kvm->arch.mmu); 992 kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); 993 } 994 995 static void stage2_unmap_memslot(struct kvm *kvm, 996 struct kvm_memory_slot *memslot) 997 { 998 hva_t hva = memslot->userspace_addr; 999 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; 1000 phys_addr_t size = PAGE_SIZE * memslot->npages; 1001 hva_t reg_end = hva + size; 1002 1003 /* 1004 * A memory region could potentially cover multiple VMAs, and any holes 1005 * between them, so iterate over all of them to find out if we should 1006 * unmap any of them. 1007 * 1008 * +--------------------------------------------+ 1009 * +---------------+----------------+ +----------------+ 1010 * | : VMA 1 | VMA 2 | | VMA 3 : | 1011 * +---------------+----------------+ +----------------+ 1012 * | memory region | 1013 * +--------------------------------------------+ 1014 */ 1015 do { 1016 struct vm_area_struct *vma; 1017 hva_t vm_start, vm_end; 1018 1019 vma = find_vma_intersection(current->mm, hva, reg_end); 1020 if (!vma) 1021 break; 1022 1023 /* 1024 * Take the intersection of this VMA with the memory region 1025 */ 1026 vm_start = max(hva, vma->vm_start); 1027 vm_end = min(reg_end, vma->vm_end); 1028 1029 if (!(vma->vm_flags & VM_PFNMAP)) { 1030 gpa_t gpa = addr + (vm_start - memslot->userspace_addr); 1031 kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true); 1032 } 1033 hva = vm_end; 1034 } while (hva < reg_end); 1035 } 1036 1037 /** 1038 * stage2_unmap_vm - Unmap Stage-2 RAM mappings 1039 * @kvm: The struct kvm pointer 1040 * 1041 * Go through the memregions and unmap any regular RAM 1042 * backing memory already mapped to the VM. 
1043 */ 1044 void stage2_unmap_vm(struct kvm *kvm) 1045 { 1046 struct kvm_memslots *slots; 1047 struct kvm_memory_slot *memslot; 1048 int idx, bkt; 1049 1050 idx = srcu_read_lock(&kvm->srcu); 1051 mmap_read_lock(current->mm); 1052 write_lock(&kvm->mmu_lock); 1053 1054 slots = kvm_memslots(kvm); 1055 kvm_for_each_memslot(memslot, bkt, slots) 1056 stage2_unmap_memslot(kvm, memslot); 1057 1058 kvm_nested_s2_unmap(kvm, true); 1059 1060 write_unlock(&kvm->mmu_lock); 1061 mmap_read_unlock(current->mm); 1062 srcu_read_unlock(&kvm->srcu, idx); 1063 } 1064 1065 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) 1066 { 1067 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 1068 struct kvm_pgtable *pgt = NULL; 1069 1070 write_lock(&kvm->mmu_lock); 1071 pgt = mmu->pgt; 1072 if (pgt) { 1073 mmu->pgd_phys = 0; 1074 mmu->pgt = NULL; 1075 free_percpu(mmu->last_vcpu_ran); 1076 } 1077 1078 if (kvm_is_nested_s2_mmu(kvm, mmu)) 1079 kvm_init_nested_s2_mmu(mmu); 1080 1081 write_unlock(&kvm->mmu_lock); 1082 1083 if (pgt) { 1084 KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); 1085 kfree(pgt); 1086 } 1087 } 1088 1089 static void hyp_mc_free_fn(void *addr, void *mc) 1090 { 1091 struct kvm_hyp_memcache *memcache = mc; 1092 1093 if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) 1094 kvm_account_pgtable_pages(addr, -1); 1095 1096 free_page((unsigned long)addr); 1097 } 1098 1099 static void *hyp_mc_alloc_fn(void *mc) 1100 { 1101 struct kvm_hyp_memcache *memcache = mc; 1102 void *addr; 1103 1104 addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); 1105 if (addr && memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) 1106 kvm_account_pgtable_pages(addr, 1); 1107 1108 return addr; 1109 } 1110 1111 void free_hyp_memcache(struct kvm_hyp_memcache *mc) 1112 { 1113 if (!is_protected_kvm_enabled()) 1114 return; 1115 1116 kfree(mc->mapping); 1117 __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc); 1118 } 1119 1120 int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) 1121 { 1122 if (!is_protected_kvm_enabled()) 1123 return 0; 1124 1125 if (!mc->mapping) { 1126 mc->mapping = kzalloc(sizeof(struct pkvm_mapping), GFP_KERNEL_ACCOUNT); 1127 if (!mc->mapping) 1128 return -ENOMEM; 1129 } 1130 1131 return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn, 1132 kvm_host_pa, mc); 1133 } 1134 1135 /** 1136 * kvm_phys_addr_ioremap - map a device range to guest IPA 1137 * 1138 * @kvm: The KVM pointer 1139 * @guest_ipa: The IPA at which to insert the mapping 1140 * @pa: The physical address of the device 1141 * @size: The size of the mapping 1142 * @writable: Whether or not to create a writable mapping 1143 */ 1144 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, 1145 phys_addr_t pa, unsigned long size, bool writable) 1146 { 1147 phys_addr_t addr; 1148 int ret = 0; 1149 struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO }; 1150 struct kvm_s2_mmu *mmu = &kvm->arch.mmu; 1151 struct kvm_pgtable *pgt = mmu->pgt; 1152 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE | 1153 KVM_PGTABLE_PROT_R | 1154 (writable ? 
KVM_PGTABLE_PROT_W : 0); 1155 1156 if (is_protected_kvm_enabled()) 1157 return -EPERM; 1158 1159 size += offset_in_page(guest_ipa); 1160 guest_ipa &= PAGE_MASK; 1161 1162 for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) { 1163 ret = kvm_mmu_topup_memory_cache(&cache, 1164 kvm_mmu_cache_min_pages(mmu)); 1165 if (ret) 1166 break; 1167 1168 write_lock(&kvm->mmu_lock); 1169 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE, 1170 pa, prot, &cache, 0); 1171 write_unlock(&kvm->mmu_lock); 1172 if (ret) 1173 break; 1174 1175 pa += PAGE_SIZE; 1176 } 1177 1178 kvm_mmu_free_memory_cache(&cache); 1179 return ret; 1180 } 1181 1182 /** 1183 * kvm_stage2_wp_range() - write protect stage2 memory region range 1184 * @mmu: The KVM stage-2 MMU pointer 1185 * @addr: Start address of range 1186 * @end: End address of range 1187 */ 1188 void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) 1189 { 1190 stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect)); 1191 } 1192 1193 /** 1194 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot 1195 * @kvm: The KVM pointer 1196 * @slot: The memory slot to write protect 1197 * 1198 * Called to start logging dirty pages after memory region 1199 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns 1200 * all present PUD, PMD and PTEs are write protected in the memory region. 1201 * Afterwards read of dirty page log can be called. 1202 * 1203 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, 1204 * serializing operations for VM memory regions. 1205 */ 1206 static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) 1207 { 1208 struct kvm_memslots *slots = kvm_memslots(kvm); 1209 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); 1210 phys_addr_t start, end; 1211 1212 if (WARN_ON_ONCE(!memslot)) 1213 return; 1214 1215 start = memslot->base_gfn << PAGE_SHIFT; 1216 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1217 1218 write_lock(&kvm->mmu_lock); 1219 kvm_stage2_wp_range(&kvm->arch.mmu, start, end); 1220 kvm_nested_s2_wp(kvm); 1221 write_unlock(&kvm->mmu_lock); 1222 kvm_flush_remote_tlbs_memslot(kvm, memslot); 1223 } 1224 1225 /** 1226 * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE 1227 * pages for memory slot 1228 * @kvm: The KVM pointer 1229 * @slot: The memory slot to split 1230 * 1231 * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired, 1232 * serializing operations for VM memory regions. 1233 */ 1234 static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot) 1235 { 1236 struct kvm_memslots *slots; 1237 struct kvm_memory_slot *memslot; 1238 phys_addr_t start, end; 1239 1240 lockdep_assert_held(&kvm->slots_lock); 1241 1242 slots = kvm_memslots(kvm); 1243 memslot = id_to_memslot(slots, slot); 1244 1245 start = memslot->base_gfn << PAGE_SHIFT; 1246 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1247 1248 write_lock(&kvm->mmu_lock); 1249 kvm_mmu_split_huge_pages(kvm, start, end); 1250 write_unlock(&kvm->mmu_lock); 1251 } 1252 1253 /* 1254 * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages. 
1255 * @kvm: The KVM pointer 1256 * @slot: The memory slot associated with mask 1257 * @gfn_offset: The gfn offset in memory slot 1258 * @mask: The mask of pages at offset 'gfn_offset' in this memory 1259 * slot to enable dirty logging on 1260 * 1261 * Writes protect selected pages to enable dirty logging, and then 1262 * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock. 1263 */ 1264 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, 1265 struct kvm_memory_slot *slot, 1266 gfn_t gfn_offset, unsigned long mask) 1267 { 1268 phys_addr_t base_gfn = slot->base_gfn + gfn_offset; 1269 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; 1270 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; 1271 1272 lockdep_assert_held_write(&kvm->mmu_lock); 1273 1274 kvm_stage2_wp_range(&kvm->arch.mmu, start, end); 1275 1276 /* 1277 * Eager-splitting is done when manual-protect is set. We 1278 * also check for initially-all-set because we can avoid 1279 * eager-splitting if initially-all-set is false. 1280 * Initially-all-set equal false implies that huge-pages were 1281 * already split when enabling dirty logging: no need to do it 1282 * again. 1283 */ 1284 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) 1285 kvm_mmu_split_huge_pages(kvm, start, end); 1286 1287 kvm_nested_s2_wp(kvm); 1288 } 1289 1290 static void kvm_send_hwpoison_signal(unsigned long address, short lsb) 1291 { 1292 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); 1293 } 1294 1295 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, 1296 unsigned long hva, 1297 unsigned long map_size) 1298 { 1299 gpa_t gpa_start; 1300 hva_t uaddr_start, uaddr_end; 1301 size_t size; 1302 1303 /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */ 1304 if (map_size == PAGE_SIZE) 1305 return true; 1306 1307 /* pKVM only supports PMD_SIZE huge-mappings */ 1308 if (is_protected_kvm_enabled() && map_size != PMD_SIZE) 1309 return false; 1310 1311 size = memslot->npages * PAGE_SIZE; 1312 1313 gpa_start = memslot->base_gfn << PAGE_SHIFT; 1314 1315 uaddr_start = memslot->userspace_addr; 1316 uaddr_end = uaddr_start + size; 1317 1318 /* 1319 * Pages belonging to memslots that don't have the same alignment 1320 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2 1321 * PMD/PUD entries, because we'll end up mapping the wrong pages. 1322 * 1323 * Consider a layout like the following: 1324 * 1325 * memslot->userspace_addr: 1326 * +-----+--------------------+--------------------+---+ 1327 * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| 1328 * +-----+--------------------+--------------------+---+ 1329 * 1330 * memslot->base_gfn << PAGE_SHIFT: 1331 * +---+--------------------+--------------------+-----+ 1332 * |abc|def Stage-2 block | Stage-2 block |tvxyz| 1333 * +---+--------------------+--------------------+-----+ 1334 * 1335 * If we create those stage-2 blocks, we'll end up with this incorrect 1336 * mapping: 1337 * d -> f 1338 * e -> g 1339 * f -> h 1340 */ 1341 if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) 1342 return false; 1343 1344 /* 1345 * Next, let's make sure we're not trying to map anything not covered 1346 * by the memslot. This means we have to prohibit block size mappings 1347 * for the beginning and end of a non-block aligned and non-block sized 1348 * memory slot (illustrated by the head and tail parts of the 1349 * userspace view above containing pages 'abcde' and 'xyz', 1350 * respectively). 
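 *
 * As a concrete example of the head case (assuming 4K pages and
 * map_size == PMD_SIZE == 2M): for uaddr_start == 0x2ff000, a fault at
 * hva == 0x2ff000 rounds down to 0x200000, which lies below uaddr_start,
 * so the check below fails and the head is mapped with PAGE_SIZE PTEs
 * instead of a block.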
1351 * 1352 * Note that it doesn't matter if we do the check using the 1353 * userspace_addr or the base_gfn, as both are equally aligned (per 1354 * the check above) and equally sized. 1355 */ 1356 return (hva & ~(map_size - 1)) >= uaddr_start && 1357 (hva & ~(map_size - 1)) + map_size <= uaddr_end; 1358 } 1359 1360 /* 1361 * Check if the given hva is backed by a transparent huge page (THP) and 1362 * whether it can be mapped using block mapping in stage2. If so, adjust 1363 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently 1364 * supported. This will need to be updated to support other THP sizes. 1365 * 1366 * Returns the size of the mapping. 1367 */ 1368 static long 1369 transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, 1370 unsigned long hva, kvm_pfn_t *pfnp, 1371 phys_addr_t *ipap) 1372 { 1373 kvm_pfn_t pfn = *pfnp; 1374 1375 /* 1376 * Make sure the adjustment is done only for THP pages. Also make 1377 * sure that the HVA and IPA are sufficiently aligned and that the 1378 * block map is contained within the memslot. 1379 */ 1380 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { 1381 int sz = get_user_mapping_size(kvm, hva); 1382 1383 if (sz < 0) 1384 return sz; 1385 1386 if (sz < PMD_SIZE) 1387 return PAGE_SIZE; 1388 1389 *ipap &= PMD_MASK; 1390 pfn &= ~(PTRS_PER_PMD - 1); 1391 *pfnp = pfn; 1392 1393 return PMD_SIZE; 1394 } 1395 1396 /* Use page mapping if we cannot use block mapping. */ 1397 return PAGE_SIZE; 1398 } 1399 1400 static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva) 1401 { 1402 unsigned long pa; 1403 1404 if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP)) 1405 return huge_page_shift(hstate_vma(vma)); 1406 1407 if (!(vma->vm_flags & VM_PFNMAP)) 1408 return PAGE_SHIFT; 1409 1410 VM_BUG_ON(is_vm_hugetlb_page(vma)); 1411 1412 pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start); 1413 1414 #ifndef __PAGETABLE_PMD_FOLDED 1415 if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) && 1416 ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start && 1417 ALIGN(hva, PUD_SIZE) <= vma->vm_end) 1418 return PUD_SHIFT; 1419 #endif 1420 1421 if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) && 1422 ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start && 1423 ALIGN(hva, PMD_SIZE) <= vma->vm_end) 1424 return PMD_SHIFT; 1425 1426 return PAGE_SHIFT; 1427 } 1428 1429 /* 1430 * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be 1431 * able to see the page's tags and therefore they must be initialised first. If 1432 * PG_mte_tagged is set, tags have already been initialised. 1433 * 1434 * Must be called with kvm->mmu_lock held to ensure the memory remains mapped 1435 * while the tags are zeroed. 
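 *
 * Note that for a PMD_SIZE mapping with 4K pages this walks all 512
 * constituent pages, clearing tags and setting PG_mte_tagged on each,
 * whereas hugetlb folios track the tagged state on the head page only,
 * hence the separate folio path below.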
1436 */ 1437 static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, 1438 unsigned long size) 1439 { 1440 unsigned long i, nr_pages = size >> PAGE_SHIFT; 1441 struct page *page = pfn_to_page(pfn); 1442 struct folio *folio = page_folio(page); 1443 1444 if (!kvm_has_mte(kvm)) 1445 return; 1446 1447 if (folio_test_hugetlb(folio)) { 1448 /* Hugetlb has MTE flags set on head page only */ 1449 if (folio_try_hugetlb_mte_tagging(folio)) { 1450 for (i = 0; i < nr_pages; i++, page++) 1451 mte_clear_page_tags(page_address(page)); 1452 folio_set_hugetlb_mte_tagged(folio); 1453 } 1454 return; 1455 } 1456 1457 for (i = 0; i < nr_pages; i++, page++) { 1458 if (try_page_mte_tagging(page)) { 1459 mte_clear_page_tags(page_address(page)); 1460 set_page_mte_tagged(page); 1461 } 1462 } 1463 } 1464 1465 static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) 1466 { 1467 return vma->vm_flags & VM_MTE_ALLOWED; 1468 } 1469 1470 static bool kvm_vma_is_cacheable(struct vm_area_struct *vma) 1471 { 1472 switch (FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(vma->vm_page_prot))) { 1473 case MT_NORMAL_NC: 1474 case MT_DEVICE_nGnRnE: 1475 case MT_DEVICE_nGnRE: 1476 return false; 1477 default: 1478 return true; 1479 } 1480 } 1481 1482 static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache, 1483 void **memcache) 1484 { 1485 int min_pages; 1486 1487 if (!is_protected_kvm_enabled()) 1488 *memcache = &vcpu->arch.mmu_page_cache; 1489 else 1490 *memcache = &vcpu->arch.pkvm_memcache; 1491 1492 if (!topup_memcache) 1493 return 0; 1494 1495 min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); 1496 1497 if (!is_protected_kvm_enabled()) 1498 return kvm_mmu_topup_memory_cache(*memcache, min_pages); 1499 1500 return topup_hyp_memcache(*memcache, min_pages); 1501 } 1502 1503 /* 1504 * Potentially reduce shadow S2 permissions to match the guest's own S2. For 1505 * exec faults, we'd only reach this point if the guest actually allowed it (see 1506 * kvm_s2_handle_perm_fault). 1507 * 1508 * Also encode the level of the original translation in the SW bits of the leaf 1509 * entry as a proxy for the span of that translation. This will be retrieved on 1510 * TLB invalidation from the guest and used to limit the invalidation scope if a 1511 * TTL hint or a range isn't provided. 
1512 */ 1513 static void adjust_nested_fault_perms(struct kvm_s2_trans *nested, 1514 enum kvm_pgtable_prot *prot, 1515 bool *writable) 1516 { 1517 *writable &= kvm_s2_trans_writable(nested); 1518 if (!kvm_s2_trans_readable(nested)) 1519 *prot &= ~KVM_PGTABLE_PROT_R; 1520 1521 *prot |= kvm_encode_nested_level(nested); 1522 } 1523 1524 #define KVM_PGTABLE_WALK_MEMABORT_FLAGS (KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED) 1525 1526 static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, 1527 struct kvm_s2_trans *nested, 1528 struct kvm_memory_slot *memslot, bool is_perm) 1529 { 1530 bool write_fault, exec_fault, writable; 1531 enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS; 1532 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; 1533 struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt; 1534 unsigned long mmu_seq; 1535 struct page *page; 1536 struct kvm *kvm = vcpu->kvm; 1537 void *memcache; 1538 kvm_pfn_t pfn; 1539 gfn_t gfn; 1540 int ret; 1541 1542 ret = prepare_mmu_memcache(vcpu, true, &memcache); 1543 if (ret) 1544 return ret; 1545 1546 if (nested) 1547 gfn = kvm_s2_trans_output(nested) >> PAGE_SHIFT; 1548 else 1549 gfn = fault_ipa >> PAGE_SHIFT; 1550 1551 write_fault = kvm_is_write_fault(vcpu); 1552 exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); 1553 1554 VM_WARN_ON_ONCE(write_fault && exec_fault); 1555 1556 mmu_seq = kvm->mmu_invalidate_seq; 1557 /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ 1558 smp_rmb(); 1559 1560 ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL); 1561 if (ret) { 1562 kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE, 1563 write_fault, exec_fault, false); 1564 return ret; 1565 } 1566 1567 writable = !(memslot->flags & KVM_MEM_READONLY); 1568 1569 if (nested) 1570 adjust_nested_fault_perms(nested, &prot, &writable); 1571 1572 if (writable) 1573 prot |= KVM_PGTABLE_PROT_W; 1574 1575 if (exec_fault || 1576 (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) && 1577 (!nested || kvm_s2_trans_executable(nested)))) 1578 prot |= KVM_PGTABLE_PROT_X; 1579 1580 kvm_fault_lock(kvm); 1581 if (mmu_invalidate_retry(kvm, mmu_seq)) { 1582 ret = -EAGAIN; 1583 goto out_unlock; 1584 } 1585 1586 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, PAGE_SIZE, 1587 __pfn_to_phys(pfn), prot, 1588 memcache, flags); 1589 1590 out_unlock: 1591 kvm_release_faultin_page(kvm, page, !!ret, writable); 1592 kvm_fault_unlock(kvm); 1593 1594 if (writable && !ret) 1595 mark_page_dirty_in_slot(kvm, memslot, gfn); 1596 1597 return ret != -EAGAIN ? 
ret : 0; 1598 } 1599 1600 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, 1601 struct kvm_s2_trans *nested, 1602 struct kvm_memory_slot *memslot, unsigned long hva, 1603 bool fault_is_perm) 1604 { 1605 int ret = 0; 1606 bool topup_memcache; 1607 bool write_fault, writable; 1608 bool exec_fault, mte_allowed, is_vma_cacheable; 1609 bool s2_force_noncacheable = false, vfio_allow_any_uc = false; 1610 unsigned long mmu_seq; 1611 phys_addr_t ipa = fault_ipa; 1612 struct kvm *kvm = vcpu->kvm; 1613 struct vm_area_struct *vma; 1614 short vma_shift; 1615 void *memcache; 1616 gfn_t gfn; 1617 kvm_pfn_t pfn; 1618 bool logging_active = memslot_is_logging(memslot); 1619 bool force_pte = logging_active; 1620 long vma_pagesize, fault_granule; 1621 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; 1622 struct kvm_pgtable *pgt; 1623 struct page *page; 1624 vm_flags_t vm_flags; 1625 enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS; 1626 1627 if (fault_is_perm) 1628 fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); 1629 write_fault = kvm_is_write_fault(vcpu); 1630 exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); 1631 VM_WARN_ON_ONCE(write_fault && exec_fault); 1632 1633 /* 1634 * Permission faults just need to update the existing leaf entry, 1635 * and so normally don't require allocations from the memcache. The 1636 * only exception to this is when dirty logging is enabled at runtime 1637 * and a write fault needs to collapse a block entry into a table. 1638 */ 1639 topup_memcache = !fault_is_perm || (logging_active && write_fault); 1640 ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache); 1641 if (ret) 1642 return ret; 1643 1644 /* 1645 * Let's check if we will get back a huge page backed by hugetlbfs, or 1646 * get block mapping for device MMIO region. 1647 */ 1648 mmap_read_lock(current->mm); 1649 vma = vma_lookup(current->mm, hva); 1650 if (unlikely(!vma)) { 1651 kvm_err("Failed to find VMA for hva 0x%lx\n", hva); 1652 mmap_read_unlock(current->mm); 1653 return -EFAULT; 1654 } 1655 1656 if (force_pte) 1657 vma_shift = PAGE_SHIFT; 1658 else 1659 vma_shift = get_vma_page_shift(vma, hva); 1660 1661 switch (vma_shift) { 1662 #ifndef __PAGETABLE_PMD_FOLDED 1663 case PUD_SHIFT: 1664 if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) 1665 break; 1666 fallthrough; 1667 #endif 1668 case CONT_PMD_SHIFT: 1669 vma_shift = PMD_SHIFT; 1670 fallthrough; 1671 case PMD_SHIFT: 1672 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) 1673 break; 1674 fallthrough; 1675 case CONT_PTE_SHIFT: 1676 vma_shift = PAGE_SHIFT; 1677 force_pte = true; 1678 fallthrough; 1679 case PAGE_SHIFT: 1680 break; 1681 default: 1682 WARN_ONCE(1, "Unknown vma_shift %d", vma_shift); 1683 } 1684 1685 vma_pagesize = 1UL << vma_shift; 1686 1687 if (nested) { 1688 unsigned long max_map_size; 1689 1690 max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE; 1691 1692 ipa = kvm_s2_trans_output(nested); 1693 1694 /* 1695 * If we're about to create a shadow stage 2 entry, then we 1696 * can only create a block mapping if the guest stage 2 page 1697 * table uses at least as big a mapping. 1698 */ 1699 max_map_size = min(kvm_s2_trans_size(nested), max_map_size); 1700 1701 /* 1702 * Be careful that if the mapping size falls between 1703 * two host sizes, take the smallest of the two. 
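 *
 * For example, a guest stage-2 built with a different granule may hand
 * us a 32M block; on a 4K host that falls between PMD_SIZE (2M) and
 * PUD_SIZE (1G), so it is clamped down to PMD_SIZE rather than rounded
 * up beyond what the guest has actually mapped.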
1704 */ 1705 if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE) 1706 max_map_size = PMD_SIZE; 1707 else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE) 1708 max_map_size = PAGE_SIZE; 1709 1710 force_pte = (max_map_size == PAGE_SIZE); 1711 vma_pagesize = min_t(long, vma_pagesize, max_map_size); 1712 } 1713 1714 /* 1715 * Both the canonical IPA and fault IPA must be hugepage-aligned to 1716 * ensure we find the right PFN and lay down the mapping in the right 1717 * place. 1718 */ 1719 if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) { 1720 fault_ipa &= ~(vma_pagesize - 1); 1721 ipa &= ~(vma_pagesize - 1); 1722 } 1723 1724 gfn = ipa >> PAGE_SHIFT; 1725 mte_allowed = kvm_vma_mte_allowed(vma); 1726 1727 vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; 1728 1729 vm_flags = vma->vm_flags; 1730 1731 is_vma_cacheable = kvm_vma_is_cacheable(vma); 1732 1733 /* Don't use the VMA after the unlock -- it may have vanished */ 1734 vma = NULL; 1735 1736 /* 1737 * Read mmu_invalidate_seq so that KVM can detect if the results of 1738 * vma_lookup() or __kvm_faultin_pfn() become stale prior to 1739 * acquiring kvm->mmu_lock. 1740 * 1741 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs 1742 * with the smp_wmb() in kvm_mmu_invalidate_end(). 1743 */ 1744 mmu_seq = kvm->mmu_invalidate_seq; 1745 mmap_read_unlock(current->mm); 1746 1747 pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, 1748 &writable, &page); 1749 if (pfn == KVM_PFN_ERR_HWPOISON) { 1750 kvm_send_hwpoison_signal(hva, vma_shift); 1751 return 0; 1752 } 1753 if (is_error_noslot_pfn(pfn)) 1754 return -EFAULT; 1755 1756 /* 1757 * Check if this is non-struct page memory PFN, and cannot support 1758 * CMOs. It could potentially be unsafe to access as cachable. 1759 */ 1760 if (vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(pfn)) { 1761 if (is_vma_cacheable) { 1762 /* 1763 * Whilst the VMA owner expects cacheable mapping to this 1764 * PFN, hardware also has to support the FWB and CACHE DIC 1765 * features. 1766 * 1767 * ARM64 KVM relies on kernel VA mapping to the PFN to 1768 * perform cache maintenance as the CMO instructions work on 1769 * virtual addresses. VM_PFNMAP region are not necessarily 1770 * mapped to a KVA and hence the presence of hardware features 1771 * S2FWB and CACHE DIC are mandatory to avoid the need for 1772 * cache maintenance. 1773 */ 1774 if (!kvm_supports_cacheable_pfnmap()) 1775 ret = -EFAULT; 1776 } else { 1777 /* 1778 * If the page was identified as device early by looking at 1779 * the VMA flags, vma_pagesize is already representing the 1780 * largest quantity we can map. If instead it was mapped 1781 * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE 1782 * and must not be upgraded. 1783 * 1784 * In both cases, we don't let transparent_hugepage_adjust() 1785 * change things at the last minute. 1786 */ 1787 s2_force_noncacheable = true; 1788 } 1789 } else if (logging_active && !write_fault) { 1790 /* 1791 * Only actually map the page as writable if this was a write 1792 * fault. 
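 *
 * Leaving read faults mapped read-only while dirty logging is active
 * means the first subsequent write still takes a permission fault,
 * which is how the page ends up marked dirty via
 * mark_page_dirty_in_slot() below.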
1793 */ 1794 writable = false; 1795 } 1796 1797 if (exec_fault && s2_force_noncacheable) 1798 ret = -ENOEXEC; 1799 1800 if (ret) { 1801 kvm_release_page_unused(page); 1802 return ret; 1803 } 1804 1805 if (nested) 1806 adjust_nested_fault_perms(nested, &prot, &writable); 1807 1808 kvm_fault_lock(kvm); 1809 pgt = vcpu->arch.hw_mmu->pgt; 1810 if (mmu_invalidate_retry(kvm, mmu_seq)) { 1811 ret = -EAGAIN; 1812 goto out_unlock; 1813 } 1814 1815 /* 1816 * If we are not forced to use page mapping, check if we are 1817 * backed by a THP and thus use block mapping if possible. 1818 */ 1819 if (vma_pagesize == PAGE_SIZE && !(force_pte || s2_force_noncacheable)) { 1820 if (fault_is_perm && fault_granule > PAGE_SIZE) 1821 vma_pagesize = fault_granule; 1822 else 1823 vma_pagesize = transparent_hugepage_adjust(kvm, memslot, 1824 hva, &pfn, 1825 &fault_ipa); 1826 1827 if (vma_pagesize < 0) { 1828 ret = vma_pagesize; 1829 goto out_unlock; 1830 } 1831 } 1832 1833 if (!fault_is_perm && !s2_force_noncacheable && kvm_has_mte(kvm)) { 1834 /* Check the VMM hasn't introduced a new disallowed VMA */ 1835 if (mte_allowed) { 1836 sanitise_mte_tags(kvm, pfn, vma_pagesize); 1837 } else { 1838 ret = -EFAULT; 1839 goto out_unlock; 1840 } 1841 } 1842 1843 if (writable) 1844 prot |= KVM_PGTABLE_PROT_W; 1845 1846 if (exec_fault) 1847 prot |= KVM_PGTABLE_PROT_X; 1848 1849 if (s2_force_noncacheable) { 1850 if (vfio_allow_any_uc) 1851 prot |= KVM_PGTABLE_PROT_NORMAL_NC; 1852 else 1853 prot |= KVM_PGTABLE_PROT_DEVICE; 1854 } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) && 1855 (!nested || kvm_s2_trans_executable(nested))) { 1856 prot |= KVM_PGTABLE_PROT_X; 1857 } 1858 1859 /* 1860 * Under the premise of getting a FSC_PERM fault, we just need to relax 1861 * permissions only if vma_pagesize equals fault_granule. Otherwise, 1862 * kvm_pgtable_stage2_map() should be called to change block size. 1863 */ 1864 if (fault_is_perm && vma_pagesize == fault_granule) { 1865 /* 1866 * Drop the SW bits in favour of those stored in the 1867 * PTE, which will be preserved. 1868 */ 1869 prot &= ~KVM_NV_GUEST_MAP_SZ; 1870 ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags); 1871 } else { 1872 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize, 1873 __pfn_to_phys(pfn), prot, 1874 memcache, flags); 1875 } 1876 1877 out_unlock: 1878 kvm_release_faultin_page(kvm, page, !!ret, writable); 1879 kvm_fault_unlock(kvm); 1880 1881 /* Mark the page dirty only if the fault is handled successfully */ 1882 if (writable && !ret) 1883 mark_page_dirty_in_slot(kvm, memslot, gfn); 1884 1885 return ret != -EAGAIN ? ret : 0; 1886 } 1887 1888 /* Resolve the access fault by making the page young again. */ 1889 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) 1890 { 1891 enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED; 1892 struct kvm_s2_mmu *mmu; 1893 1894 trace_kvm_access_fault(fault_ipa); 1895 1896 read_lock(&vcpu->kvm->mmu_lock); 1897 mmu = vcpu->arch.hw_mmu; 1898 KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags); 1899 read_unlock(&vcpu->kvm->mmu_lock); 1900 } 1901 1902 int kvm_handle_guest_sea(struct kvm_vcpu *vcpu) 1903 { 1904 /* 1905 * Give APEI the opportunity to claim the abort before handling it 1906 * within KVM. apei_claim_sea() expects to be called with IRQs enabled. 
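 *
 * If APEI does claim the abort, the error is handled by the host's
 * APEI/GHES machinery and we simply resume the guest; otherwise an
 * SError is injected into the guest below.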
1907 */ 1908 lockdep_assert_irqs_enabled(); 1909 if (apei_claim_sea(NULL) == 0) 1910 return 1; 1911 1912 return kvm_inject_serror(vcpu); 1913 } 1914 1915 /** 1916 * kvm_handle_guest_abort - handles all 2nd stage aborts 1917 * @vcpu: the VCPU pointer 1918 * 1919 * Any abort that gets to the host is almost guaranteed to be caused by a 1920 * missing second stage translation table entry, which can mean that either the 1921 * guest simply needs more memory and we must allocate an appropriate page or it 1922 * can mean that the guest tried to access I/O memory, which is emulated by user 1923 * space. The distinction is based on the IPA causing the fault and whether this 1924 * memory region has been registered as standard RAM by user space. 1925 */ 1926 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) 1927 { 1928 struct kvm_s2_trans nested_trans, *nested = NULL; 1929 unsigned long esr; 1930 phys_addr_t fault_ipa; /* The address we faulted on */ 1931 phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */ 1932 struct kvm_memory_slot *memslot; 1933 unsigned long hva; 1934 bool is_iabt, write_fault, writable; 1935 gfn_t gfn; 1936 int ret, idx; 1937 1938 if (kvm_vcpu_abt_issea(vcpu)) 1939 return kvm_handle_guest_sea(vcpu); 1940 1941 esr = kvm_vcpu_get_esr(vcpu); 1942 1943 /* 1944 * The fault IPA should be reliable at this point as we're not dealing 1945 * with an SEA. 1946 */ 1947 ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); 1948 if (KVM_BUG_ON(ipa == INVALID_GPA, vcpu->kvm)) 1949 return -EFAULT; 1950 1951 is_iabt = kvm_vcpu_trap_is_iabt(vcpu); 1952 1953 if (esr_fsc_is_translation_fault(esr)) { 1954 /* Beyond sanitised PARange (which is the IPA limit) */ 1955 if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) { 1956 kvm_inject_size_fault(vcpu); 1957 return 1; 1958 } 1959 1960 /* Falls between the IPA range and the PARange? */ 1961 if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) { 1962 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); 1963 1964 return kvm_inject_sea(vcpu, is_iabt, fault_ipa); 1965 } 1966 } 1967 1968 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu), 1969 kvm_vcpu_get_hfar(vcpu), fault_ipa); 1970 1971 /* Check the stage-2 fault is trans. fault or write fault */ 1972 if (!esr_fsc_is_translation_fault(esr) && 1973 !esr_fsc_is_permission_fault(esr) && 1974 !esr_fsc_is_access_flag_fault(esr)) { 1975 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", 1976 kvm_vcpu_trap_get_class(vcpu), 1977 (unsigned long)kvm_vcpu_trap_get_fault(vcpu), 1978 (unsigned long)kvm_vcpu_get_esr(vcpu)); 1979 return -EFAULT; 1980 } 1981 1982 idx = srcu_read_lock(&vcpu->kvm->srcu); 1983 1984 /* 1985 * We may have faulted on a shadow stage 2 page table if we are 1986 * running a nested guest. In this case, we have to resolve the L2 1987 * IPA to the L1 IPA first, before knowing what kind of memory should 1988 * back the L1 IPA. 1989 * 1990 * If the shadow stage 2 page table walk faults, then we simply inject 1991 * this to the guest and carry on. 1992 * 1993 * If there are no shadow S2 PTs because S2 is disabled, there is 1994 * nothing to walk and we treat it as a 1:1 before going through the 1995 * canonical translation. 
	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
			      kvm_vcpu_get_hfar(vcpu), fault_ipa);

	/*
	 * Check that the stage-2 fault is a translation, permission or
	 * access flag fault.
	 */
	if (!esr_fsc_is_translation_fault(esr) &&
	    !esr_fsc_is_permission_fault(esr) &&
	    !esr_fsc_is_access_flag_fault(esr)) {
		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
			kvm_vcpu_trap_get_class(vcpu),
			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
			(unsigned long)kvm_vcpu_get_esr(vcpu));
		return -EFAULT;
	}

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	/*
	 * We may have faulted on a shadow stage 2 page table if we are
	 * running a nested guest. In this case, we have to resolve the L2
	 * IPA to the L1 IPA first, before knowing what kind of memory should
	 * back the L1 IPA.
	 *
	 * If the shadow stage 2 page table walk faults, then we simply inject
	 * this to the guest and carry on.
	 *
	 * If there are no shadow S2 PTs because S2 is disabled, there is
	 * nothing to walk and we treat it as a 1:1 before going through the
	 * canonical translation.
	 */
	if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu) &&
	    vcpu->arch.hw_mmu->nested_stage2_enabled) {
		u32 esr;

		ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
		if (ret) {
			esr = kvm_s2_trans_esr(&nested_trans);
			kvm_inject_s2_fault(vcpu, esr);
			goto out_unlock;
		}

		ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans);
		if (ret) {
			esr = kvm_s2_trans_esr(&nested_trans);
			kvm_inject_s2_fault(vcpu, esr);
			goto out_unlock;
		}

		ipa = kvm_s2_trans_output(&nested_trans);
		nested = &nested_trans;
	}

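	/*
	 * For instance (made-up addresses): if the L2 guest faults at L2 IPA
	 * 0x80000000 and the L1 guest's stage-2 tables map that to L1 IPA
	 * 0x40000000, the translation above rewrites ipa so that the memslot
	 * lookup below is done with gfn 0x40000 rather than 0x80000.
	 */
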
	gfn = ipa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(vcpu->kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
	write_fault = kvm_is_write_fault(vcpu);
	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
		/*
		 * The guest has put either its instructions or its page-tables
		 * somewhere it shouldn't have. Userspace won't be able to do
		 * anything about this (there's no syndrome for a start), so
		 * re-inject the abort back into the guest.
		 */
		if (is_iabt) {
			ret = -ENOEXEC;
			goto out;
		}

		if (kvm_vcpu_abt_iss1tw(vcpu)) {
			ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
			goto out_unlock;
		}

		/*
		 * Check for a cache maintenance operation. Since we
		 * ended up here, we know it is outside of any memory
		 * slot. But we can't find out if that is for a device,
		 * or if the guest is just being stupid. The only thing
		 * we know for sure is that this range cannot be cached.
		 *
		 * So let's assume that the guest is just being
		 * cautious, and skip the instruction.
		 */
		if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
			kvm_incr_pc(vcpu);
			ret = 1;
			goto out_unlock;
		}

		/*
		 * The IPA is reported as [MAX:12], so we need to
		 * complement it with the bottom 12 bits from the
		 * faulting VA. This is always 12 bits, irrespective
		 * of the page size.
		 */
		ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
		ret = io_mem_abort(vcpu, ipa);
		goto out_unlock;
	}

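	/*
	 * Example of the address reconstruction above (illustrative values):
	 * if the aborting IPA page is reported as 0x3f000000 and the low 12
	 * bits of the faulting VA are 0x040, the MMIO access handed to the
	 * emulation path is at IPA 0x3f000040.
	 */
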
	/* Userspace should not be able to register out-of-bounds IPAs */
	VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu));

	if (esr_fsc_is_access_flag_fault(esr)) {
		handle_access_fault(vcpu, fault_ipa);
		ret = 1;
		goto out_unlock;
	}

	VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
			!write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu));

	if (kvm_slot_has_gmem(memslot))
		ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
				 esr_fsc_is_permission_fault(esr));
	else
		ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
				     esr_fsc_is_permission_fault(esr));
	if (ret == 0)
		ret = 1;
out:
	if (ret == -ENOEXEC)
		ret = kvm_inject_sea_iabt(vcpu, kvm_vcpu_get_hfar(vcpu));
out_unlock:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	return ret;
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	if (!kvm->arch.mmu.pgt)
		return false;

	__unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
			     (range->end - range->start) << PAGE_SHIFT,
			     range->may_block);

	kvm_nested_s2_unmap(kvm, range->may_block);
	return false;
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	u64 size = (range->end - range->start) << PAGE_SHIFT;

	if (!kvm->arch.mmu.pgt)
		return false;

	return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
							       range->start << PAGE_SHIFT,
							       size, true);
	/*
	 * TODO: Handle nested_mmu structures here using the reverse mapping in
	 * a later version of the patch series.
	 */
}

bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	u64 size = (range->end - range->start) << PAGE_SHIFT;

	if (!kvm->arch.mmu.pgt)
		return false;

	return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
							       range->start << PAGE_SHIFT,
							       size, false);
}

phys_addr_t kvm_mmu_get_httbr(void)
{
	return __pa(hyp_pgtable->pgd);
}

phys_addr_t kvm_get_idmap_vector(void)
{
	return hyp_idmap_vector;
}

static int kvm_map_idmap_text(void)
{
	unsigned long size = hyp_idmap_end - hyp_idmap_start;
	int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
					PAGE_HYP_EXEC);
	if (err)
		kvm_err("Failed to idmap %lx-%lx\n",
			hyp_idmap_start, hyp_idmap_end);

	return err;
}

static void *kvm_hyp_zalloc_page(void *arg)
{
	return (void *)get_zeroed_page(GFP_KERNEL);
}

static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
	.zalloc_page = kvm_hyp_zalloc_page,
	.get_page = kvm_host_get_page,
	.put_page = kvm_host_put_page,
	.phys_to_virt = kvm_host_va,
	.virt_to_phys = kvm_host_pa,
};

int __init kvm_mmu_init(u32 *hyp_va_bits)
{
	int err;
	u32 idmap_bits;
	u32 kernel_bits;

	hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
	hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
	hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);

	/*
	 * We rely on the linker script to ensure at build time that the HYP
	 * init code does not cross a page boundary.
	 */
	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);

	/*
	 * The ID map is always configured for 48 bits of translation, which
	 * may be fewer than the number of VA bits used by the regular kernel
	 * stage 1, when VA_BITS=52.
	 *
	 * At EL2, there is only one TTBR register, and we can't switch between
	 * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom
	 * line: we need to use the extended range with *both* our translation
	 * tables.
	 *
	 * So use the maximum of the idmap VA bits and the regular kernel stage
	 * 1 VA bits to ensure that the hypervisor can both ID map its code page
	 * and map any kernel memory.
	 */
	idmap_bits = IDMAP_VA_BITS;
	kernel_bits = vabits_actual;
	*hyp_va_bits = max(idmap_bits, kernel_bits);

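	/*
	 * For example (values depend on the kernel configuration): a 52-bit
	 * VA kernel on hardware with 52-bit VA support has vabits_actual ==
	 * 52 while the ID map still only needs 48 bits, so EL2 runs with
	 * 52-bit virtual addressing; with VA_BITS <= 48 the 48-bit ID map
	 * requirement is what max() returns.
	 */
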
	kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
	kvm_debug("HYP VA range: %lx:%lx\n",
		  kern_hyp_va(PAGE_OFFSET),
		  kern_hyp_va((unsigned long)high_memory - 1));

	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
	    hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) &&
	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
		/*
		 * The idmap page is intersecting with the VA space,
		 * it is not safe to continue further.
		 */
		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
		err = -EINVAL;
		goto out;
	}

	hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
	if (!hyp_pgtable) {
		kvm_err("Hyp mode page-table not allocated\n");
		err = -ENOMEM;
		goto out;
	}

	err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops);
	if (err)
		goto out_free_pgtable;

	err = kvm_map_idmap_text();
	if (err)
		goto out_destroy_pgtable;

	io_map_base = hyp_idmap_start;
	__hyp_va_bits = *hyp_va_bits;
	return 0;

out_destroy_pgtable:
	kvm_pgtable_hyp_destroy(hyp_pgtable);
out_free_pgtable:
	kfree(hyp_pgtable);
	hyp_pgtable = NULL;
out:
	return err;
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;

	/*
	 * At this point the memslot has been committed and there is an
	 * allocated dirty_bitmap[]; dirty pages will be tracked while the
	 * memory slot is write protected.
	 */
	if (log_dirty_pages) {

		if (change == KVM_MR_DELETE)
			return;

		/*
		 * Huge and normal pages are write-protected and split
		 * in one of two ways:
		 *
		 * 1. with initial-all-set: gradually with CLEAR ioctls,
		 */
		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
			return;
		/*
		 * or
		 * 2. without initial-all-set: all in one shot when
		 *    enabling dirty logging.
		 */
		kvm_mmu_wp_memory_region(kvm, new->id);
		kvm_mmu_split_memory_region(kvm, new->id);
	} else {
		/*
		 * Free any leftovers from the eager page splitting cache. Do
		 * this when deleting, moving, disabling dirty logging, or
		 * creating the memslot (a nop). Doing it for deletes makes
		 * sure we don't leak memory, and there's no need to keep the
		 * cache around for any of the other cases.
		 */
		kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
	}
}

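/*
 * A rough sketch of the userspace side that drives the path above (names and
 * numbers are illustrative, not taken from this file): dirty logging is
 * turned on by setting KVM_MEM_LOG_DIRTY_PAGES on the memslot, and, when the
 * KVM_DIRTY_LOG_INITIALLY_SET mode is in use, ranges are then write-protected
 * piecemeal with KVM_CLEAR_DIRTY_LOG, e.g.:
 *
 *	struct kvm_clear_dirty_log cl = {
 *		.slot		= slot_id,
 *		.first_page	= 0,
 *		.num_pages	= 512,
 *		.dirty_bitmap	= bitmap,
 *	};
 *	ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &cl);
 *
 * Each such clear write-protects just that range, which is why the
 * initial-all-set case above returns early instead of write-protecting and
 * splitting the whole slot here.
 */
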
int kvm_arch_prepare_memory_region(struct kvm *kvm,
				   const struct kvm_memory_slot *old,
				   struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	hva_t hva, reg_end;
	int ret = 0;

	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
	    change != KVM_MR_FLAGS_ONLY)
		return 0;

	/*
	 * Prevent userspace from creating a memory region outside of the IPA
	 * space addressable by the guest.
	 */
	if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
		return -EFAULT;

	/*
	 * Only support guest_memfd backed memslots with mappable memory, since
	 * there aren't any CoCo VMs that support only private memory on arm64.
	 */
	if (kvm_slot_has_gmem(new) && !kvm_memslot_is_gmem_only(new))
		return -EINVAL;

	hva = new->userspace_addr;
	reg_end = hva + (new->npages << PAGE_SHIFT);

	mmap_read_lock(current->mm);
	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma;

		vma = find_vma_intersection(current->mm, hva, reg_end);
		if (!vma)
			break;

		if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
			ret = -EINVAL;
			break;
		}

		if (vma->vm_flags & VM_PFNMAP) {
			/* IO region dirty page logging not allowed */
			if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
				ret = -EINVAL;
				break;
			}

			/*
			 * Cacheable PFNMAP is allowed only if the hardware
			 * supports it.
			 */
			if (kvm_vma_is_cacheable(vma) && !kvm_supports_cacheable_pfnmap()) {
				ret = -EINVAL;
				break;
			}
		}
		hva = min(reg_end, vma->vm_end);
	} while (hva < reg_end);

	mmap_read_unlock(current->mm);
	return ret;
}

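/*
 * Tracing the loop above against the diagram (an illustrative layout, not a
 * requirement): find_vma_intersection() first returns VMA 1, then VMA 2, then
 * skips the unmapped gap and returns VMA 3; once hva reaches reg_end the loop
 * ends. A hole is therefore not rejected here, it simply contributes nothing
 * to check.
 */
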
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
}

void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = slot->npages << PAGE_SHIFT;

	write_lock(&kvm->mmu_lock);
	kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true);
	kvm_nested_s2_unmap(kvm, true);
	write_unlock(&kvm->mmu_lock);
}

/*
 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
 *
 * Main problems:
 * - S/W ops are local to a CPU (not broadcast)
 * - We have line migration behind our back (speculation)
 * - System caches don't support S/W at all (damn!)
 *
 * In the face of the above, the best we can do is to try and convert
 * S/W ops to VA ops. Because the guest is not allowed to infer the
 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
 * which is a rather good thing for us.
 *
 * Also, it is only used when turning caches on/off ("The expected
 * usage of the cache maintenance instructions that operate by set/way
 * is associated with the cache maintenance instructions associated
 * with the powerdown and powerup of caches, if this is required by
 * the implementation.").
 *
 * We use the following policy:
 *
 * - If we trap a S/W operation, we enable VM trapping to detect
 *   caches being turned on/off, and do a full clean.
 *
 * - We flush the caches on both caches being turned on and off.
 *
 * - Once the caches are enabled, we stop trapping VM ops.
 */
void kvm_set_way_flush(struct kvm_vcpu *vcpu)
{
	unsigned long hcr = *vcpu_hcr(vcpu);

	/*
	 * If this is the first time we do a S/W operation
	 * (i.e. HCR_TVM not set) flush the whole memory, and set the
	 * VM trapping.
	 *
	 * Otherwise, rely on the VM trapping to wait for the MMU +
	 * Caches to be turned off. At that point, we'll be able to
	 * clean the caches again.
	 */
	if (!(hcr & HCR_TVM)) {
		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
					vcpu_has_cache_enabled(vcpu));
		stage2_flush_vm(vcpu->kvm);
		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
	}
}

void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
{
	bool now_enabled = vcpu_has_cache_enabled(vcpu);

	/*
	 * If switching the MMU+caches on, need to invalidate the caches.
	 * If switching it off, need to clean the caches.
	 * Clean + invalidate does the trick always.
	 */
	if (now_enabled != was_enabled)
		stage2_flush_vm(vcpu->kvm);

	/* Caches are now on, stop trapping VM ops (until a S/W op) */
	if (now_enabled)
		*vcpu_hcr(vcpu) &= ~HCR_TVM;

	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
}
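
/*
 * Putting the two handlers above together, a typical guest sequence around a
 * cache/MMU toggle (a sketch of the expected flow, not an exhaustive list)
 * goes: the first set/way operation traps and kvm_set_way_flush() cleans and
 * invalidates the whole stage-2 range and sets HCR_TVM; the SCTLR_EL1 write
 * turning the caches off then traps via TVM and kvm_toggle_cache() flushes
 * again; the write turning them back on flushes once more and clears HCR_TVM,
 * so those register writes stop trapping until the next set/way operation.
 */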