// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/acpi.h>
#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
#include <asm/acpi.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/kvm_pkvm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>

#include "trace.h"

static struct kvm_pgtable *hyp_pgtable;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

static unsigned long __ro_after_init hyp_idmap_start;
static unsigned long __ro_after_init hyp_idmap_end;
static phys_addr_t __ro_after_init hyp_idmap_vector;

u32 __ro_after_init __hyp_va_bits;

static unsigned long __ro_after_init io_map_base;

#define KVM_PGT_FN(fn)	(!is_protected_kvm_enabled() ? fn : p ## fn)

static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
					   phys_addr_t size)
{
	phys_addr_t boundary = ALIGN_DOWN(addr + size, size);

	return (boundary - 1 < end - 1) ? boundary : end;
}

static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);

	return __stage2_range_addr_end(addr, end, size);
}

/*
 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
 * long will also starve other vCPUs. We have to also make sure that the page
 * tables are not freed while we released the lock.
 */
static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
			      phys_addr_t end,
			      int (*fn)(struct kvm_pgtable *, u64, u64),
			      bool resched)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	int ret;
	u64 next;

	do {
		struct kvm_pgtable *pgt = mmu->pgt;
		if (!pgt)
			return -EINVAL;

		next = stage2_range_addr_end(addr, end);
		ret = fn(pgt, addr, next - addr);
		if (ret)
			break;

		if (resched && next != end)
			cond_resched_rwlock_write(&kvm->mmu_lock);
	} while (addr = next, addr != end);

	return ret;
}

#define stage2_apply_range_resched(mmu, addr, end, fn)			\
	stage2_apply_range(mmu, addr, end, fn, true)
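/*
 * Worked example for the range helpers above (an illustrative sketch only,
 * not used by the rest of this file; it assumes 4KiB pages, where PMD_SIZE
 * is 2MiB): starting at 0x12345000 with a 2MiB granule, the next boundary is
 * ALIGN_DOWN(0x12345000 + PMD_SIZE, PMD_SIZE) == 0x12400000, so the first
 * chunk passed to fn() covers [0x12345000, 0x12400000) and subsequent chunks
 * are whole 2MiB blocks. The "- 1" in the comparison keeps the result
 * correct if the aligned boundary wraps past the top of the address range,
 * in which case @end is returned instead.
 */
static inline phys_addr_t __stage2_range_addr_end_example(void)
{
	phys_addr_t addr = 0x12345000, end = 0x20000000;

	/* Returns 0x12400000 for a 2MiB granule. */
	return __stage2_range_addr_end(addr, end, PMD_SIZE);
}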
/*
 * Get the maximum number of page-tables pages needed to split a range
 * of blocks into PAGE_SIZE PTEs. It assumes the range is already
 * mapped at level 2, or at level 1 if allowed.
 */
static int kvm_mmu_split_nr_page_tables(u64 range)
{
	int n = 0;

	if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2)
		n += DIV_ROUND_UP(range, PUD_SIZE);
	n += DIV_ROUND_UP(range, PMD_SIZE);
	return n;
}
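/*
 * Worked example for kvm_mmu_split_nr_page_tables() (an illustrative sketch
 * only, not used elsewhere; it assumes 4KiB pages, so PUD_SIZE is 1GiB and
 * PMD_SIZE is 2MiB): splitting a 1GiB chunk needs 512 tables to turn its
 * 2MiB blocks into PAGE_SIZE PTEs, plus one more table to break the 1GiB
 * block itself into 2MiB blocks when level-1 blocks are in use
 * (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2). This is the capacity the
 * split_page_cache is topped up to before each chunk is split.
 */
static inline int kvm_mmu_split_nr_page_tables_example(void)
{
	/* 512, or 513 if KVM_PGTABLE_MIN_BLOCK_LEVEL < 2 */
	return kvm_mmu_split_nr_page_tables(PUD_SIZE);
}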
static bool need_split_memcache_topup_or_resched(struct kvm *kvm)
{
	struct kvm_mmu_memory_cache *cache;
	u64 chunk_size, min;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
		return true;

	chunk_size = kvm->arch.mmu.split_page_chunk_size;
	min = kvm_mmu_split_nr_page_tables(chunk_size);
	cache = &kvm->arch.mmu.split_page_cache;
	return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
}

static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
				    phys_addr_t end)
{
	struct kvm_mmu_memory_cache *cache;
	struct kvm_pgtable *pgt;
	int ret, cache_capacity;
	u64 next, chunk_size;

	lockdep_assert_held_write(&kvm->mmu_lock);

	chunk_size = kvm->arch.mmu.split_page_chunk_size;
	cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size);

	if (chunk_size == 0)
		return 0;

	cache = &kvm->arch.mmu.split_page_cache;

	do {
		if (need_split_memcache_topup_or_resched(kvm)) {
			write_unlock(&kvm->mmu_lock);
			cond_resched();
			/* Eager page splitting is best-effort. */
			ret = __kvm_mmu_topup_memory_cache(cache,
							   cache_capacity,
							   cache_capacity);
			write_lock(&kvm->mmu_lock);
			if (ret)
				break;
		}

		pgt = kvm->arch.mmu.pgt;
		if (!pgt)
			return -EINVAL;

		next = __stage2_range_addr_end(addr, end, chunk_size);
		ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache);
		if (ret)
			break;
	} while (addr = next, addr != end);

	return ret;
}

static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
}

/**
 * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8
 * @kvm:	pointer to kvm structure.
 *
 * Interface to HYP function to flush all VM TLB entries
 */
int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
	if (is_protected_kvm_enabled())
		kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
	else
		kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
	return 0;
}

int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
				     gfn_t gfn, u64 nr_pages)
{
	u64 size = nr_pages << PAGE_SHIFT;
	u64 addr = gfn << PAGE_SHIFT;

	if (is_protected_kvm_enabled())
		kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
	else
		kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size);
	return 0;
}

static void *stage2_memcache_zalloc_page(void *arg)
{
	struct kvm_mmu_memory_cache *mc = arg;
	void *virt;

	/* Allocated with __GFP_ZERO, so no need to zero */
	virt = kvm_mmu_memory_cache_alloc(mc);
	if (virt)
		kvm_account_pgtable_pages(virt, 1);
	return virt;
}

static void *kvm_host_zalloc_pages_exact(size_t size)
{
	return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
}

static void *kvm_s2_zalloc_pages_exact(size_t size)
{
	void *virt = kvm_host_zalloc_pages_exact(size);

	if (virt)
		kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT));
	return virt;
}

static void kvm_s2_free_pages_exact(void *virt, size_t size)
{
	kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT));
	free_pages_exact(virt, size);
}

static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;

static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head)
{
	struct page *page = container_of(head, struct page, rcu_head);
	void *pgtable = page_to_virt(page);
	s8 level = page_private(page);

	KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level);
}

static void stage2_free_unlinked_table(void *addr, s8 level)
{
	struct page *page = virt_to_page(addr);

	set_page_private(page, (unsigned long)level);
	call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb);
}

static void kvm_host_get_page(void *addr)
{
	get_page(virt_to_page(addr));
}

static void kvm_host_put_page(void *addr)
{
	put_page(virt_to_page(addr));
}

static void kvm_s2_put_page(void *addr)
{
	struct page *p = virt_to_page(addr);
	/* Dropping last refcount, the page will be freed */
	if (page_count(p) == 1)
		kvm_account_pgtable_pages(addr, -1);
	put_page(p);
}

static int kvm_host_page_count(void *addr)
{
	return page_count(virt_to_page(addr));
}

static phys_addr_t kvm_host_pa(void *addr)
{
	return __pa(addr);
}

static void *kvm_host_va(phys_addr_t phys)
{
	return __va(phys);
}

static void clean_dcache_guest_page(void *va, size_t size)
{
	__clean_dcache_guest_page(va, size);
}

static void invalidate_icache_guest_page(void *va, size_t size)
{
	__invalidate_icache_guest_page(va, size);
}
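/*
 * Usage sketch for the two CMO helpers above (illustrative only, not wired
 * up here; the real calls are made by the page-table code through the
 * kvm_s2_mm_ops callbacks below): before a new stage-2 mapping is handed to
 * the guest, the data cache is cleaned to the PoC so the guest sees
 * up-to-date memory even if it accesses it with caching disabled, and the
 * instruction cache is invalidated to the PoU before the mapping may be
 * executed from.
 */
static inline void stage2_cmo_example(void *va, size_t size, bool exec)
{
	clean_dcache_guest_page(va, size);
	if (exec)
		invalidate_icache_guest_page(va, size);
}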
/*
 * Unmapping vs dcache management:
 *
 * If a guest maps certain memory pages as uncached, all writes will
 * bypass the data cache and go directly to RAM. However, the CPUs
 * can still speculate reads (not writes) and fill cache lines with
 * data.
 *
 * Those cache lines will be *clean* cache lines though, so a
 * clean+invalidate operation is equivalent to an invalidate
 * operation, because no cache lines are marked dirty.
 *
 * Those clean cache lines could be filled prior to an uncached write
 * by the guest, and the cache coherent IO subsystem would therefore
 * end up writing old data to disk.
 *
 * This is why right after unmapping a page/section and invalidating
 * the corresponding TLBs, we flush to make sure the IO subsystem will
 * never hit in the cache.
 *
 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
 * we then fully enforce cacheability of RAM, no matter what the guest
 * does.
 */
/**
 * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 * @mmu:   The KVM stage-2 MMU pointer
 * @start: The intermediate physical base address of the range to unmap
 * @size:  The size of the area to unmap
 * @may_block: Whether or not we are permitted to block
 *
 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 * destroying the VM), otherwise another faulting VCPU may come in and mess
 * with things behind our backs.
 */
static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
				 bool may_block)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	phys_addr_t end = start + size;

	lockdep_assert_held_write(&kvm->mmu_lock);
	WARN_ON(size & ~PAGE_MASK);
	WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap),
				   may_block));
}

void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start,
			    u64 size, bool may_block)
{
	__unmap_stage2_range(mmu, start, size, may_block);
}

void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
	stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush));
}

static void stage2_flush_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;

	kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
}

/**
 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 * @kvm: The struct kvm pointer
 *
 * Go through the stage 2 page tables and invalidate any cache lines
 * backing memory already mapped to the VM.
366 */ 367 static void stage2_flush_vm(struct kvm *kvm) 368 { 369 struct kvm_memslots *slots; 370 struct kvm_memory_slot *memslot; 371 int idx, bkt; 372 373 idx = srcu_read_lock(&kvm->srcu); 374 write_lock(&kvm->mmu_lock); 375 376 slots = kvm_memslots(kvm); 377 kvm_for_each_memslot(memslot, bkt, slots) 378 stage2_flush_memslot(kvm, memslot); 379 380 kvm_nested_s2_flush(kvm); 381 382 write_unlock(&kvm->mmu_lock); 383 srcu_read_unlock(&kvm->srcu, idx); 384 } 385 386 /** 387 * free_hyp_pgds - free Hyp-mode page tables 388 */ 389 void __init free_hyp_pgds(void) 390 { 391 mutex_lock(&kvm_hyp_pgd_mutex); 392 if (hyp_pgtable) { 393 kvm_pgtable_hyp_destroy(hyp_pgtable); 394 kfree(hyp_pgtable); 395 hyp_pgtable = NULL; 396 } 397 mutex_unlock(&kvm_hyp_pgd_mutex); 398 } 399 400 static bool kvm_host_owns_hyp_mappings(void) 401 { 402 if (is_kernel_in_hyp_mode()) 403 return false; 404 405 if (static_branch_likely(&kvm_protected_mode_initialized)) 406 return false; 407 408 /* 409 * This can happen at boot time when __create_hyp_mappings() is called 410 * after the hyp protection has been enabled, but the static key has 411 * not been flipped yet. 412 */ 413 if (!hyp_pgtable && is_protected_kvm_enabled()) 414 return false; 415 416 WARN_ON(!hyp_pgtable); 417 418 return true; 419 } 420 421 int __create_hyp_mappings(unsigned long start, unsigned long size, 422 unsigned long phys, enum kvm_pgtable_prot prot) 423 { 424 int err; 425 426 if (WARN_ON(!kvm_host_owns_hyp_mappings())) 427 return -EINVAL; 428 429 mutex_lock(&kvm_hyp_pgd_mutex); 430 err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot); 431 mutex_unlock(&kvm_hyp_pgd_mutex); 432 433 return err; 434 } 435 436 static phys_addr_t kvm_kaddr_to_phys(void *kaddr) 437 { 438 if (!is_vmalloc_addr(kaddr)) { 439 BUG_ON(!virt_addr_valid(kaddr)); 440 return __pa(kaddr); 441 } else { 442 return page_to_phys(vmalloc_to_page(kaddr)) + 443 offset_in_page(kaddr); 444 } 445 } 446 447 struct hyp_shared_pfn { 448 u64 pfn; 449 int count; 450 struct rb_node node; 451 }; 452 453 static DEFINE_MUTEX(hyp_shared_pfns_lock); 454 static struct rb_root hyp_shared_pfns = RB_ROOT; 455 456 static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node, 457 struct rb_node **parent) 458 { 459 struct hyp_shared_pfn *this; 460 461 *node = &hyp_shared_pfns.rb_node; 462 *parent = NULL; 463 while (**node) { 464 this = container_of(**node, struct hyp_shared_pfn, node); 465 *parent = **node; 466 if (this->pfn < pfn) 467 *node = &((**node)->rb_left); 468 else if (this->pfn > pfn) 469 *node = &((**node)->rb_right); 470 else 471 return this; 472 } 473 474 return NULL; 475 } 476 477 static int share_pfn_hyp(u64 pfn) 478 { 479 struct rb_node **node, *parent; 480 struct hyp_shared_pfn *this; 481 int ret = 0; 482 483 mutex_lock(&hyp_shared_pfns_lock); 484 this = find_shared_pfn(pfn, &node, &parent); 485 if (this) { 486 this->count++; 487 goto unlock; 488 } 489 490 this = kzalloc(sizeof(*this), GFP_KERNEL); 491 if (!this) { 492 ret = -ENOMEM; 493 goto unlock; 494 } 495 496 this->pfn = pfn; 497 this->count = 1; 498 rb_link_node(&this->node, parent, node); 499 rb_insert_color(&this->node, &hyp_shared_pfns); 500 ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn, 1); 501 unlock: 502 mutex_unlock(&hyp_shared_pfns_lock); 503 504 return ret; 505 } 506 507 static int unshare_pfn_hyp(u64 pfn) 508 { 509 struct rb_node **node, *parent; 510 struct hyp_shared_pfn *this; 511 int ret = 0; 512 513 mutex_lock(&hyp_shared_pfns_lock); 514 this = find_shared_pfn(pfn, &node, &parent); 515 if 
(WARN_ON(!this)) { 516 ret = -ENOENT; 517 goto unlock; 518 } 519 520 this->count--; 521 if (this->count) 522 goto unlock; 523 524 rb_erase(&this->node, &hyp_shared_pfns); 525 kfree(this); 526 ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn, 1); 527 unlock: 528 mutex_unlock(&hyp_shared_pfns_lock); 529 530 return ret; 531 } 532 533 int kvm_share_hyp(void *from, void *to) 534 { 535 phys_addr_t start, end, cur; 536 u64 pfn; 537 int ret; 538 539 if (is_kernel_in_hyp_mode()) 540 return 0; 541 542 /* 543 * The share hcall maps things in the 'fixed-offset' region of the hyp 544 * VA space, so we can only share physically contiguous data-structures 545 * for now. 546 */ 547 if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to)) 548 return -EINVAL; 549 550 if (kvm_host_owns_hyp_mappings()) 551 return create_hyp_mappings(from, to, PAGE_HYP); 552 553 start = ALIGN_DOWN(__pa(from), PAGE_SIZE); 554 end = PAGE_ALIGN(__pa(to)); 555 for (cur = start; cur < end; cur += PAGE_SIZE) { 556 pfn = __phys_to_pfn(cur); 557 ret = share_pfn_hyp(pfn); 558 if (ret) 559 return ret; 560 } 561 562 return 0; 563 } 564 565 void kvm_unshare_hyp(void *from, void *to) 566 { 567 phys_addr_t start, end, cur; 568 u64 pfn; 569 570 if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from) 571 return; 572 573 start = ALIGN_DOWN(__pa(from), PAGE_SIZE); 574 end = PAGE_ALIGN(__pa(to)); 575 for (cur = start; cur < end; cur += PAGE_SIZE) { 576 pfn = __phys_to_pfn(cur); 577 WARN_ON(unshare_pfn_hyp(pfn)); 578 } 579 } 580 581 /** 582 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode 583 * @from: The virtual kernel start address of the range 584 * @to: The virtual kernel end address of the range (exclusive) 585 * @prot: The protection to be applied to this range 586 * 587 * The same virtual address as the kernel virtual address is also used 588 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying 589 * physical pages. 590 */ 591 int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) 592 { 593 phys_addr_t phys_addr; 594 unsigned long virt_addr; 595 unsigned long start = kern_hyp_va((unsigned long)from); 596 unsigned long end = kern_hyp_va((unsigned long)to); 597 598 if (is_kernel_in_hyp_mode()) 599 return 0; 600 601 if (!kvm_host_owns_hyp_mappings()) 602 return -EPERM; 603 604 start = start & PAGE_MASK; 605 end = PAGE_ALIGN(end); 606 607 for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { 608 int err; 609 610 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); 611 err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr, 612 prot); 613 if (err) 614 return err; 615 } 616 617 return 0; 618 } 619 620 static int __hyp_alloc_private_va_range(unsigned long base) 621 { 622 lockdep_assert_held(&kvm_hyp_pgd_mutex); 623 624 if (!PAGE_ALIGNED(base)) 625 return -EINVAL; 626 627 /* 628 * Verify that BIT(VA_BITS - 1) hasn't been flipped by 629 * allocating the new area, as it would indicate we've 630 * overflowed the idmap/IO address range. 631 */ 632 if ((base ^ io_map_base) & BIT(VA_BITS - 1)) 633 return -ENOMEM; 634 635 io_map_base = base; 636 637 return 0; 638 } 639 640 /** 641 * hyp_alloc_private_va_range - Allocates a private VA range. 642 * @size: The size of the VA range to reserve. 643 * @haddr: The hypervisor virtual start address of the allocation. 644 * 645 * The private virtual address (VA) range is allocated below io_map_base 646 * and aligned based on the order of @size. 
647 * 648 * Return: 0 on success or negative error code on failure. 649 */ 650 int hyp_alloc_private_va_range(size_t size, unsigned long *haddr) 651 { 652 unsigned long base; 653 int ret = 0; 654 655 mutex_lock(&kvm_hyp_pgd_mutex); 656 657 /* 658 * This assumes that we have enough space below the idmap 659 * page to allocate our VAs. If not, the check in 660 * __hyp_alloc_private_va_range() will kick. A potential 661 * alternative would be to detect that overflow and switch 662 * to an allocation above the idmap. 663 * 664 * The allocated size is always a multiple of PAGE_SIZE. 665 */ 666 size = PAGE_ALIGN(size); 667 base = io_map_base - size; 668 ret = __hyp_alloc_private_va_range(base); 669 670 mutex_unlock(&kvm_hyp_pgd_mutex); 671 672 if (!ret) 673 *haddr = base; 674 675 return ret; 676 } 677 678 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, 679 unsigned long *haddr, 680 enum kvm_pgtable_prot prot) 681 { 682 unsigned long addr; 683 int ret = 0; 684 685 if (!kvm_host_owns_hyp_mappings()) { 686 addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping, 687 phys_addr, size, prot); 688 if (IS_ERR_VALUE(addr)) 689 return addr; 690 *haddr = addr; 691 692 return 0; 693 } 694 695 size = PAGE_ALIGN(size + offset_in_page(phys_addr)); 696 ret = hyp_alloc_private_va_range(size, &addr); 697 if (ret) 698 return ret; 699 700 ret = __create_hyp_mappings(addr, size, phys_addr, prot); 701 if (ret) 702 return ret; 703 704 *haddr = addr + offset_in_page(phys_addr); 705 return ret; 706 } 707 708 int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr) 709 { 710 unsigned long base; 711 size_t size; 712 int ret; 713 714 mutex_lock(&kvm_hyp_pgd_mutex); 715 /* 716 * Efficient stack verification using the NVHE_STACK_SHIFT bit implies 717 * an alignment of our allocation on the order of the size. 718 */ 719 size = NVHE_STACK_SIZE * 2; 720 base = ALIGN_DOWN(io_map_base - size, size); 721 722 ret = __hyp_alloc_private_va_range(base); 723 724 mutex_unlock(&kvm_hyp_pgd_mutex); 725 726 if (ret) { 727 kvm_err("Cannot allocate hyp stack guard page\n"); 728 return ret; 729 } 730 731 /* 732 * Since the stack grows downwards, map the stack to the page 733 * at the higher address and leave the lower guard page 734 * unbacked. 735 * 736 * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1 737 * and addresses corresponding to the guard page have the 738 * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection. 
739 */ 740 ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE, 741 phys_addr, PAGE_HYP); 742 if (ret) 743 kvm_err("Cannot map hyp stack\n"); 744 745 *haddr = base + size; 746 747 return ret; 748 } 749 750 /** 751 * create_hyp_io_mappings - Map IO into both kernel and HYP 752 * @phys_addr: The physical start address which gets mapped 753 * @size: Size of the region being mapped 754 * @kaddr: Kernel VA for this mapping 755 * @haddr: HYP VA for this mapping 756 */ 757 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, 758 void __iomem **kaddr, 759 void __iomem **haddr) 760 { 761 unsigned long addr; 762 int ret; 763 764 if (is_protected_kvm_enabled()) 765 return -EPERM; 766 767 *kaddr = ioremap(phys_addr, size); 768 if (!*kaddr) 769 return -ENOMEM; 770 771 if (is_kernel_in_hyp_mode()) { 772 *haddr = *kaddr; 773 return 0; 774 } 775 776 ret = __create_hyp_private_mapping(phys_addr, size, 777 &addr, PAGE_HYP_DEVICE); 778 if (ret) { 779 iounmap(*kaddr); 780 *kaddr = NULL; 781 *haddr = NULL; 782 return ret; 783 } 784 785 *haddr = (void __iomem *)addr; 786 return 0; 787 } 788 789 /** 790 * create_hyp_exec_mappings - Map an executable range into HYP 791 * @phys_addr: The physical start address which gets mapped 792 * @size: Size of the region being mapped 793 * @haddr: HYP VA for this mapping 794 */ 795 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, 796 void **haddr) 797 { 798 unsigned long addr; 799 int ret; 800 801 BUG_ON(is_kernel_in_hyp_mode()); 802 803 ret = __create_hyp_private_mapping(phys_addr, size, 804 &addr, PAGE_HYP_EXEC); 805 if (ret) { 806 *haddr = NULL; 807 return ret; 808 } 809 810 *haddr = (void *)addr; 811 return 0; 812 } 813 814 static struct kvm_pgtable_mm_ops kvm_user_mm_ops = { 815 /* We shouldn't need any other callback to walk the PT */ 816 .phys_to_virt = kvm_host_va, 817 }; 818 819 static int get_user_mapping_size(struct kvm *kvm, u64 addr) 820 { 821 struct kvm_pgtable pgt = { 822 .pgd = (kvm_pteref_t)kvm->mm->pgd, 823 .ia_bits = vabits_actual, 824 .start_level = (KVM_PGTABLE_LAST_LEVEL - 825 ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1), 826 .mm_ops = &kvm_user_mm_ops, 827 }; 828 unsigned long flags; 829 kvm_pte_t pte = 0; /* Keep GCC quiet... */ 830 s8 level = S8_MAX; 831 int ret; 832 833 /* 834 * Disable IRQs so that we hazard against a concurrent 835 * teardown of the userspace page tables (which relies on 836 * IPI-ing threads). 837 */ 838 local_irq_save(flags); 839 ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); 840 local_irq_restore(flags); 841 842 if (ret) 843 return ret; 844 845 /* 846 * Not seeing an error, but not updating level? Something went 847 * deeply wrong... 848 */ 849 if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL)) 850 return -EFAULT; 851 if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL)) 852 return -EFAULT; 853 854 /* Oops, the userspace PTs are gone... 
Replay the fault */ 855 if (!kvm_pte_valid(pte)) 856 return -EAGAIN; 857 858 return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); 859 } 860 861 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { 862 .zalloc_page = stage2_memcache_zalloc_page, 863 .zalloc_pages_exact = kvm_s2_zalloc_pages_exact, 864 .free_pages_exact = kvm_s2_free_pages_exact, 865 .free_unlinked_table = stage2_free_unlinked_table, 866 .get_page = kvm_host_get_page, 867 .put_page = kvm_s2_put_page, 868 .page_count = kvm_host_page_count, 869 .phys_to_virt = kvm_host_va, 870 .virt_to_phys = kvm_host_pa, 871 .dcache_clean_inval_poc = clean_dcache_guest_page, 872 .icache_inval_pou = invalidate_icache_guest_page, 873 }; 874 875 static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type) 876 { 877 u32 kvm_ipa_limit = get_kvm_ipa_limit(); 878 u64 mmfr0, mmfr1; 879 u32 phys_shift; 880 881 if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) 882 return -EINVAL; 883 884 phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); 885 if (is_protected_kvm_enabled()) { 886 phys_shift = kvm_ipa_limit; 887 } else if (phys_shift) { 888 if (phys_shift > kvm_ipa_limit || 889 phys_shift < ARM64_MIN_PARANGE_BITS) 890 return -EINVAL; 891 } else { 892 phys_shift = KVM_PHYS_SHIFT; 893 if (phys_shift > kvm_ipa_limit) { 894 pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", 895 current->comm); 896 return -EINVAL; 897 } 898 } 899 900 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); 901 mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); 902 mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); 903 904 return 0; 905 } 906 907 /** 908 * kvm_init_stage2_mmu - Initialise a S2 MMU structure 909 * @kvm: The pointer to the KVM structure 910 * @mmu: The pointer to the s2 MMU structure 911 * @type: The machine type of the virtual machine 912 * 913 * Allocates only the stage-2 HW PGD level table(s). 914 * Note we don't need locking here as this is only called in two cases: 915 * 916 * - when the VM is created, which can't race against anything 917 * 918 * - when secondary kvm_s2_mmu structures are initialised for NV 919 * guests, and the caller must hold kvm->lock as this is called on a 920 * per-vcpu basis. 921 */ 922 int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) 923 { 924 int cpu, err; 925 struct kvm_pgtable *pgt; 926 927 /* 928 * If we already have our page tables in place, and that the 929 * MMU context is the canonical one, we have a bug somewhere, 930 * as this is only supposed to ever happen once per VM. 931 * 932 * Otherwise, we're building nested page tables, and that's 933 * probably because userspace called KVM_ARM_VCPU_INIT more 934 * than once on the same vcpu. Since that's actually legal, 935 * don't kick a fuss and leave gracefully. 
936 */ 937 if (mmu->pgt != NULL) { 938 if (kvm_is_nested_s2_mmu(kvm, mmu)) 939 return 0; 940 941 kvm_err("kvm_arch already initialized?\n"); 942 return -EINVAL; 943 } 944 945 err = kvm_init_ipa_range(mmu, type); 946 if (err) 947 return err; 948 949 pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT); 950 if (!pgt) 951 return -ENOMEM; 952 953 mmu->arch = &kvm->arch; 954 err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops); 955 if (err) 956 goto out_free_pgtable; 957 958 mmu->pgt = pgt; 959 if (is_protected_kvm_enabled()) 960 return 0; 961 962 mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran)); 963 if (!mmu->last_vcpu_ran) { 964 err = -ENOMEM; 965 goto out_destroy_pgtable; 966 } 967 968 for_each_possible_cpu(cpu) 969 *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1; 970 971 /* The eager page splitting is disabled by default */ 972 mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; 973 mmu->split_page_cache.gfp_zero = __GFP_ZERO; 974 975 mmu->pgd_phys = __pa(pgt->pgd); 976 977 if (kvm_is_nested_s2_mmu(kvm, mmu)) 978 kvm_init_nested_s2_mmu(mmu); 979 980 return 0; 981 982 out_destroy_pgtable: 983 KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); 984 out_free_pgtable: 985 kfree(pgt); 986 return err; 987 } 988 989 void kvm_uninit_stage2_mmu(struct kvm *kvm) 990 { 991 kvm_free_stage2_pgd(&kvm->arch.mmu); 992 kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); 993 } 994 995 static void stage2_unmap_memslot(struct kvm *kvm, 996 struct kvm_memory_slot *memslot) 997 { 998 hva_t hva = memslot->userspace_addr; 999 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; 1000 phys_addr_t size = PAGE_SIZE * memslot->npages; 1001 hva_t reg_end = hva + size; 1002 1003 /* 1004 * A memory region could potentially cover multiple VMAs, and any holes 1005 * between them, so iterate over all of them to find out if we should 1006 * unmap any of them. 1007 * 1008 * +--------------------------------------------+ 1009 * +---------------+----------------+ +----------------+ 1010 * | : VMA 1 | VMA 2 | | VMA 3 : | 1011 * +---------------+----------------+ +----------------+ 1012 * | memory region | 1013 * +--------------------------------------------+ 1014 */ 1015 do { 1016 struct vm_area_struct *vma; 1017 hva_t vm_start, vm_end; 1018 1019 vma = find_vma_intersection(current->mm, hva, reg_end); 1020 if (!vma) 1021 break; 1022 1023 /* 1024 * Take the intersection of this VMA with the memory region 1025 */ 1026 vm_start = max(hva, vma->vm_start); 1027 vm_end = min(reg_end, vma->vm_end); 1028 1029 if (!(vma->vm_flags & VM_PFNMAP)) { 1030 gpa_t gpa = addr + (vm_start - memslot->userspace_addr); 1031 kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true); 1032 } 1033 hva = vm_end; 1034 } while (hva < reg_end); 1035 } 1036 1037 /** 1038 * stage2_unmap_vm - Unmap Stage-2 RAM mappings 1039 * @kvm: The struct kvm pointer 1040 * 1041 * Go through the memregions and unmap any regular RAM 1042 * backing memory already mapped to the VM. 
1043 */ 1044 void stage2_unmap_vm(struct kvm *kvm) 1045 { 1046 struct kvm_memslots *slots; 1047 struct kvm_memory_slot *memslot; 1048 int idx, bkt; 1049 1050 idx = srcu_read_lock(&kvm->srcu); 1051 mmap_read_lock(current->mm); 1052 write_lock(&kvm->mmu_lock); 1053 1054 slots = kvm_memslots(kvm); 1055 kvm_for_each_memslot(memslot, bkt, slots) 1056 stage2_unmap_memslot(kvm, memslot); 1057 1058 kvm_nested_s2_unmap(kvm, true); 1059 1060 write_unlock(&kvm->mmu_lock); 1061 mmap_read_unlock(current->mm); 1062 srcu_read_unlock(&kvm->srcu, idx); 1063 } 1064 1065 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) 1066 { 1067 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 1068 struct kvm_pgtable *pgt = NULL; 1069 1070 write_lock(&kvm->mmu_lock); 1071 pgt = mmu->pgt; 1072 if (pgt) { 1073 mmu->pgd_phys = 0; 1074 mmu->pgt = NULL; 1075 free_percpu(mmu->last_vcpu_ran); 1076 } 1077 1078 if (kvm_is_nested_s2_mmu(kvm, mmu)) 1079 kvm_init_nested_s2_mmu(mmu); 1080 1081 write_unlock(&kvm->mmu_lock); 1082 1083 if (pgt) { 1084 KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); 1085 kfree(pgt); 1086 } 1087 } 1088 1089 static void hyp_mc_free_fn(void *addr, void *mc) 1090 { 1091 struct kvm_hyp_memcache *memcache = mc; 1092 1093 if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) 1094 kvm_account_pgtable_pages(addr, -1); 1095 1096 free_page((unsigned long)addr); 1097 } 1098 1099 static void *hyp_mc_alloc_fn(void *mc) 1100 { 1101 struct kvm_hyp_memcache *memcache = mc; 1102 void *addr; 1103 1104 addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); 1105 if (addr && memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) 1106 kvm_account_pgtable_pages(addr, 1); 1107 1108 return addr; 1109 } 1110 1111 void free_hyp_memcache(struct kvm_hyp_memcache *mc) 1112 { 1113 if (!is_protected_kvm_enabled()) 1114 return; 1115 1116 kfree(mc->mapping); 1117 __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc); 1118 } 1119 1120 int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) 1121 { 1122 if (!is_protected_kvm_enabled()) 1123 return 0; 1124 1125 if (!mc->mapping) { 1126 mc->mapping = kzalloc(sizeof(struct pkvm_mapping), GFP_KERNEL_ACCOUNT); 1127 if (!mc->mapping) 1128 return -ENOMEM; 1129 } 1130 1131 return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn, 1132 kvm_host_pa, mc); 1133 } 1134 1135 /** 1136 * kvm_phys_addr_ioremap - map a device range to guest IPA 1137 * 1138 * @kvm: The KVM pointer 1139 * @guest_ipa: The IPA at which to insert the mapping 1140 * @pa: The physical address of the device 1141 * @size: The size of the mapping 1142 * @writable: Whether or not to create a writable mapping 1143 */ 1144 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, 1145 phys_addr_t pa, unsigned long size, bool writable) 1146 { 1147 phys_addr_t addr; 1148 int ret = 0; 1149 struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO }; 1150 struct kvm_s2_mmu *mmu = &kvm->arch.mmu; 1151 struct kvm_pgtable *pgt = mmu->pgt; 1152 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE | 1153 KVM_PGTABLE_PROT_R | 1154 (writable ? 
KVM_PGTABLE_PROT_W : 0); 1155 1156 if (is_protected_kvm_enabled()) 1157 return -EPERM; 1158 1159 size += offset_in_page(guest_ipa); 1160 guest_ipa &= PAGE_MASK; 1161 1162 for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) { 1163 ret = kvm_mmu_topup_memory_cache(&cache, 1164 kvm_mmu_cache_min_pages(mmu)); 1165 if (ret) 1166 break; 1167 1168 write_lock(&kvm->mmu_lock); 1169 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE, 1170 pa, prot, &cache, 0); 1171 write_unlock(&kvm->mmu_lock); 1172 if (ret) 1173 break; 1174 1175 pa += PAGE_SIZE; 1176 } 1177 1178 kvm_mmu_free_memory_cache(&cache); 1179 return ret; 1180 } 1181 1182 /** 1183 * kvm_stage2_wp_range() - write protect stage2 memory region range 1184 * @mmu: The KVM stage-2 MMU pointer 1185 * @addr: Start address of range 1186 * @end: End address of range 1187 */ 1188 void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) 1189 { 1190 stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect)); 1191 } 1192 1193 /** 1194 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot 1195 * @kvm: The KVM pointer 1196 * @slot: The memory slot to write protect 1197 * 1198 * Called to start logging dirty pages after memory region 1199 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns 1200 * all present PUD, PMD and PTEs are write protected in the memory region. 1201 * Afterwards read of dirty page log can be called. 1202 * 1203 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, 1204 * serializing operations for VM memory regions. 1205 */ 1206 static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) 1207 { 1208 struct kvm_memslots *slots = kvm_memslots(kvm); 1209 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); 1210 phys_addr_t start, end; 1211 1212 if (WARN_ON_ONCE(!memslot)) 1213 return; 1214 1215 start = memslot->base_gfn << PAGE_SHIFT; 1216 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1217 1218 write_lock(&kvm->mmu_lock); 1219 kvm_stage2_wp_range(&kvm->arch.mmu, start, end); 1220 kvm_nested_s2_wp(kvm); 1221 write_unlock(&kvm->mmu_lock); 1222 kvm_flush_remote_tlbs_memslot(kvm, memslot); 1223 } 1224 1225 /** 1226 * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE 1227 * pages for memory slot 1228 * @kvm: The KVM pointer 1229 * @slot: The memory slot to split 1230 * 1231 * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired, 1232 * serializing operations for VM memory regions. 1233 */ 1234 static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot) 1235 { 1236 struct kvm_memslots *slots; 1237 struct kvm_memory_slot *memslot; 1238 phys_addr_t start, end; 1239 1240 lockdep_assert_held(&kvm->slots_lock); 1241 1242 slots = kvm_memslots(kvm); 1243 memslot = id_to_memslot(slots, slot); 1244 1245 start = memslot->base_gfn << PAGE_SHIFT; 1246 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1247 1248 write_lock(&kvm->mmu_lock); 1249 kvm_mmu_split_huge_pages(kvm, start, end); 1250 write_unlock(&kvm->mmu_lock); 1251 } 1252 1253 /* 1254 * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages. 
1255 * @kvm: The KVM pointer 1256 * @slot: The memory slot associated with mask 1257 * @gfn_offset: The gfn offset in memory slot 1258 * @mask: The mask of pages at offset 'gfn_offset' in this memory 1259 * slot to enable dirty logging on 1260 * 1261 * Writes protect selected pages to enable dirty logging, and then 1262 * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock. 1263 */ 1264 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, 1265 struct kvm_memory_slot *slot, 1266 gfn_t gfn_offset, unsigned long mask) 1267 { 1268 phys_addr_t base_gfn = slot->base_gfn + gfn_offset; 1269 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; 1270 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; 1271 1272 lockdep_assert_held_write(&kvm->mmu_lock); 1273 1274 kvm_stage2_wp_range(&kvm->arch.mmu, start, end); 1275 1276 /* 1277 * Eager-splitting is done when manual-protect is set. We 1278 * also check for initially-all-set because we can avoid 1279 * eager-splitting if initially-all-set is false. 1280 * Initially-all-set equal false implies that huge-pages were 1281 * already split when enabling dirty logging: no need to do it 1282 * again. 1283 */ 1284 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) 1285 kvm_mmu_split_huge_pages(kvm, start, end); 1286 1287 kvm_nested_s2_wp(kvm); 1288 } 1289 1290 static void kvm_send_hwpoison_signal(unsigned long address, short lsb) 1291 { 1292 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); 1293 } 1294 1295 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, 1296 unsigned long hva, 1297 unsigned long map_size) 1298 { 1299 gpa_t gpa_start; 1300 hva_t uaddr_start, uaddr_end; 1301 size_t size; 1302 1303 /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */ 1304 if (map_size == PAGE_SIZE) 1305 return true; 1306 1307 /* pKVM only supports PMD_SIZE huge-mappings */ 1308 if (is_protected_kvm_enabled() && map_size != PMD_SIZE) 1309 return false; 1310 1311 size = memslot->npages * PAGE_SIZE; 1312 1313 gpa_start = memslot->base_gfn << PAGE_SHIFT; 1314 1315 uaddr_start = memslot->userspace_addr; 1316 uaddr_end = uaddr_start + size; 1317 1318 /* 1319 * Pages belonging to memslots that don't have the same alignment 1320 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2 1321 * PMD/PUD entries, because we'll end up mapping the wrong pages. 1322 * 1323 * Consider a layout like the following: 1324 * 1325 * memslot->userspace_addr: 1326 * +-----+--------------------+--------------------+---+ 1327 * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| 1328 * +-----+--------------------+--------------------+---+ 1329 * 1330 * memslot->base_gfn << PAGE_SHIFT: 1331 * +---+--------------------+--------------------+-----+ 1332 * |abc|def Stage-2 block | Stage-2 block |tvxyz| 1333 * +---+--------------------+--------------------+-----+ 1334 * 1335 * If we create those stage-2 blocks, we'll end up with this incorrect 1336 * mapping: 1337 * d -> f 1338 * e -> g 1339 * f -> h 1340 */ 1341 if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) 1342 return false; 1343 1344 /* 1345 * Next, let's make sure we're not trying to map anything not covered 1346 * by the memslot. This means we have to prohibit block size mappings 1347 * for the beginning and end of a non-block aligned and non-block sized 1348 * memory slot (illustrated by the head and tail parts of the 1349 * userspace view above containing pages 'abcde' and 'xyz', 1350 * respectively). 
1351 * 1352 * Note that it doesn't matter if we do the check using the 1353 * userspace_addr or the base_gfn, as both are equally aligned (per 1354 * the check above) and equally sized. 1355 */ 1356 return (hva & ~(map_size - 1)) >= uaddr_start && 1357 (hva & ~(map_size - 1)) + map_size <= uaddr_end; 1358 } 1359 1360 /* 1361 * Check if the given hva is backed by a transparent huge page (THP) and 1362 * whether it can be mapped using block mapping in stage2. If so, adjust 1363 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently 1364 * supported. This will need to be updated to support other THP sizes. 1365 * 1366 * Returns the size of the mapping. 1367 */ 1368 static long 1369 transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, 1370 unsigned long hva, kvm_pfn_t *pfnp, 1371 phys_addr_t *ipap) 1372 { 1373 kvm_pfn_t pfn = *pfnp; 1374 1375 /* 1376 * Make sure the adjustment is done only for THP pages. Also make 1377 * sure that the HVA and IPA are sufficiently aligned and that the 1378 * block map is contained within the memslot. 1379 */ 1380 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { 1381 int sz = get_user_mapping_size(kvm, hva); 1382 1383 if (sz < 0) 1384 return sz; 1385 1386 if (sz < PMD_SIZE) 1387 return PAGE_SIZE; 1388 1389 *ipap &= PMD_MASK; 1390 pfn &= ~(PTRS_PER_PMD - 1); 1391 *pfnp = pfn; 1392 1393 return PMD_SIZE; 1394 } 1395 1396 /* Use page mapping if we cannot use block mapping. */ 1397 return PAGE_SIZE; 1398 } 1399 1400 static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva) 1401 { 1402 unsigned long pa; 1403 1404 if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP)) 1405 return huge_page_shift(hstate_vma(vma)); 1406 1407 if (!(vma->vm_flags & VM_PFNMAP)) 1408 return PAGE_SHIFT; 1409 1410 VM_BUG_ON(is_vm_hugetlb_page(vma)); 1411 1412 pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start); 1413 1414 #ifndef __PAGETABLE_PMD_FOLDED 1415 if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) && 1416 ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start && 1417 ALIGN(hva, PUD_SIZE) <= vma->vm_end) 1418 return PUD_SHIFT; 1419 #endif 1420 1421 if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) && 1422 ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start && 1423 ALIGN(hva, PMD_SIZE) <= vma->vm_end) 1424 return PMD_SHIFT; 1425 1426 return PAGE_SHIFT; 1427 } 1428 1429 /* 1430 * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be 1431 * able to see the page's tags and therefore they must be initialised first. If 1432 * PG_mte_tagged is set, tags have already been initialised. 
 *
 * The race in the test/set of the PG_mte_tagged flag is handled by:
 * - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs
 *   racing to sanitise the same page
 * - mmap_lock protects between a VM faulting a page in and the VMM performing
 *   an mprotect() to add VM_MTE
 */
static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
			      unsigned long size)
{
	unsigned long i, nr_pages = size >> PAGE_SHIFT;
	struct page *page = pfn_to_page(pfn);
	struct folio *folio = page_folio(page);

	if (!kvm_has_mte(kvm))
		return;

	if (folio_test_hugetlb(folio)) {
		/* Hugetlb has MTE flags set on head page only */
		if (folio_try_hugetlb_mte_tagging(folio)) {
			for (i = 0; i < nr_pages; i++, page++)
				mte_clear_page_tags(page_address(page));
			folio_set_hugetlb_mte_tagged(folio);
		}
		return;
	}

	for (i = 0; i < nr_pages; i++, page++) {
		if (try_page_mte_tagging(page)) {
			mte_clear_page_tags(page_address(page));
			set_page_mte_tagged(page);
		}
	}
}

static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
{
	return vma->vm_flags & VM_MTE_ALLOWED;
}

static bool kvm_vma_is_cacheable(struct vm_area_struct *vma)
{
	switch (FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(vma->vm_page_prot))) {
	case MT_NORMAL_NC:
	case MT_DEVICE_nGnRnE:
	case MT_DEVICE_nGnRE:
		return false;
	default:
		return true;
	}
}

static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			  struct kvm_s2_trans *nested,
			  struct kvm_memory_slot *memslot, unsigned long hva,
			  bool fault_is_perm)
{
	int ret = 0;
	bool write_fault, writable, force_pte = false;
	bool exec_fault, mte_allowed, is_vma_cacheable;
	bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
	unsigned long mmu_seq;
	phys_addr_t ipa = fault_ipa;
	struct kvm *kvm = vcpu->kvm;
	struct vm_area_struct *vma;
	short vma_shift;
	void *memcache;
	gfn_t gfn;
	kvm_pfn_t pfn;
	bool logging_active = memslot_is_logging(memslot);
	long vma_pagesize, fault_granule;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
	struct kvm_pgtable *pgt;
	struct page *page;
	vm_flags_t vm_flags;
	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;

	if (fault_is_perm)
		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
	write_fault = kvm_is_write_fault(vcpu);
	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
	VM_BUG_ON(write_fault && exec_fault);

	if (!is_protected_kvm_enabled())
		memcache = &vcpu->arch.mmu_page_cache;
	else
		memcache = &vcpu->arch.pkvm_memcache;

	/*
	 * Permission faults just need to update the existing leaf entry,
	 * and so normally don't require allocations from the memcache. The
	 * only exception to this is when dirty logging is enabled at runtime
	 * and a write fault needs to collapse a block entry into a table.
1526 */ 1527 if (!fault_is_perm || (logging_active && write_fault)) { 1528 int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); 1529 1530 if (!is_protected_kvm_enabled()) 1531 ret = kvm_mmu_topup_memory_cache(memcache, min_pages); 1532 else 1533 ret = topup_hyp_memcache(memcache, min_pages); 1534 1535 if (ret) 1536 return ret; 1537 } 1538 1539 /* 1540 * Let's check if we will get back a huge page backed by hugetlbfs, or 1541 * get block mapping for device MMIO region. 1542 */ 1543 mmap_read_lock(current->mm); 1544 vma = vma_lookup(current->mm, hva); 1545 if (unlikely(!vma)) { 1546 kvm_err("Failed to find VMA for hva 0x%lx\n", hva); 1547 mmap_read_unlock(current->mm); 1548 return -EFAULT; 1549 } 1550 1551 /* 1552 * logging_active is guaranteed to never be true for VM_PFNMAP 1553 * memslots. 1554 */ 1555 if (logging_active) { 1556 force_pte = true; 1557 vma_shift = PAGE_SHIFT; 1558 } else { 1559 vma_shift = get_vma_page_shift(vma, hva); 1560 } 1561 1562 switch (vma_shift) { 1563 #ifndef __PAGETABLE_PMD_FOLDED 1564 case PUD_SHIFT: 1565 if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) 1566 break; 1567 fallthrough; 1568 #endif 1569 case CONT_PMD_SHIFT: 1570 vma_shift = PMD_SHIFT; 1571 fallthrough; 1572 case PMD_SHIFT: 1573 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) 1574 break; 1575 fallthrough; 1576 case CONT_PTE_SHIFT: 1577 vma_shift = PAGE_SHIFT; 1578 force_pte = true; 1579 fallthrough; 1580 case PAGE_SHIFT: 1581 break; 1582 default: 1583 WARN_ONCE(1, "Unknown vma_shift %d", vma_shift); 1584 } 1585 1586 vma_pagesize = 1UL << vma_shift; 1587 1588 if (nested) { 1589 unsigned long max_map_size; 1590 1591 max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE; 1592 1593 ipa = kvm_s2_trans_output(nested); 1594 1595 /* 1596 * If we're about to create a shadow stage 2 entry, then we 1597 * can only create a block mapping if the guest stage 2 page 1598 * table uses at least as big a mapping. 1599 */ 1600 max_map_size = min(kvm_s2_trans_size(nested), max_map_size); 1601 1602 /* 1603 * Be careful that if the mapping size falls between 1604 * two host sizes, take the smallest of the two. 1605 */ 1606 if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE) 1607 max_map_size = PMD_SIZE; 1608 else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE) 1609 max_map_size = PAGE_SIZE; 1610 1611 force_pte = (max_map_size == PAGE_SIZE); 1612 vma_pagesize = min(vma_pagesize, (long)max_map_size); 1613 } 1614 1615 /* 1616 * Both the canonical IPA and fault IPA must be hugepage-aligned to 1617 * ensure we find the right PFN and lay down the mapping in the right 1618 * place. 1619 */ 1620 if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) { 1621 fault_ipa &= ~(vma_pagesize - 1); 1622 ipa &= ~(vma_pagesize - 1); 1623 } 1624 1625 gfn = ipa >> PAGE_SHIFT; 1626 mte_allowed = kvm_vma_mte_allowed(vma); 1627 1628 vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; 1629 1630 vm_flags = vma->vm_flags; 1631 1632 is_vma_cacheable = kvm_vma_is_cacheable(vma); 1633 1634 /* Don't use the VMA after the unlock -- it may have vanished */ 1635 vma = NULL; 1636 1637 /* 1638 * Read mmu_invalidate_seq so that KVM can detect if the results of 1639 * vma_lookup() or __kvm_faultin_pfn() become stale prior to 1640 * acquiring kvm->mmu_lock. 1641 * 1642 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs 1643 * with the smp_wmb() in kvm_mmu_invalidate_end(). 
1644 */ 1645 mmu_seq = vcpu->kvm->mmu_invalidate_seq; 1646 mmap_read_unlock(current->mm); 1647 1648 pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, 1649 &writable, &page); 1650 if (pfn == KVM_PFN_ERR_HWPOISON) { 1651 kvm_send_hwpoison_signal(hva, vma_shift); 1652 return 0; 1653 } 1654 if (is_error_noslot_pfn(pfn)) 1655 return -EFAULT; 1656 1657 /* 1658 * Check if this is non-struct page memory PFN, and cannot support 1659 * CMOs. It could potentially be unsafe to access as cachable. 1660 */ 1661 if (vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(pfn)) { 1662 if (is_vma_cacheable) { 1663 /* 1664 * Whilst the VMA owner expects cacheable mapping to this 1665 * PFN, hardware also has to support the FWB and CACHE DIC 1666 * features. 1667 * 1668 * ARM64 KVM relies on kernel VA mapping to the PFN to 1669 * perform cache maintenance as the CMO instructions work on 1670 * virtual addresses. VM_PFNMAP region are not necessarily 1671 * mapped to a KVA and hence the presence of hardware features 1672 * S2FWB and CACHE DIC are mandatory to avoid the need for 1673 * cache maintenance. 1674 */ 1675 if (!kvm_supports_cacheable_pfnmap()) 1676 return -EFAULT; 1677 } else { 1678 /* 1679 * If the page was identified as device early by looking at 1680 * the VMA flags, vma_pagesize is already representing the 1681 * largest quantity we can map. If instead it was mapped 1682 * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE 1683 * and must not be upgraded. 1684 * 1685 * In both cases, we don't let transparent_hugepage_adjust() 1686 * change things at the last minute. 1687 */ 1688 s2_force_noncacheable = true; 1689 } 1690 } else if (logging_active && !write_fault) { 1691 /* 1692 * Only actually map the page as writable if this was a write 1693 * fault. 1694 */ 1695 writable = false; 1696 } 1697 1698 if (exec_fault && s2_force_noncacheable) 1699 return -ENOEXEC; 1700 1701 /* 1702 * Potentially reduce shadow S2 permissions to match the guest's own 1703 * S2. For exec faults, we'd only reach this point if the guest 1704 * actually allowed it (see kvm_s2_handle_perm_fault). 1705 * 1706 * Also encode the level of the original translation in the SW bits 1707 * of the leaf entry as a proxy for the span of that translation. 1708 * This will be retrieved on TLB invalidation from the guest and 1709 * used to limit the invalidation scope if a TTL hint or a range 1710 * isn't provided. 1711 */ 1712 if (nested) { 1713 writable &= kvm_s2_trans_writable(nested); 1714 if (!kvm_s2_trans_readable(nested)) 1715 prot &= ~KVM_PGTABLE_PROT_R; 1716 1717 prot |= kvm_encode_nested_level(nested); 1718 } 1719 1720 kvm_fault_lock(kvm); 1721 pgt = vcpu->arch.hw_mmu->pgt; 1722 if (mmu_invalidate_retry(kvm, mmu_seq)) { 1723 ret = -EAGAIN; 1724 goto out_unlock; 1725 } 1726 1727 /* 1728 * If we are not forced to use page mapping, check if we are 1729 * backed by a THP and thus use block mapping if possible. 
1730 */ 1731 if (vma_pagesize == PAGE_SIZE && !(force_pte || s2_force_noncacheable)) { 1732 if (fault_is_perm && fault_granule > PAGE_SIZE) 1733 vma_pagesize = fault_granule; 1734 else 1735 vma_pagesize = transparent_hugepage_adjust(kvm, memslot, 1736 hva, &pfn, 1737 &fault_ipa); 1738 1739 if (vma_pagesize < 0) { 1740 ret = vma_pagesize; 1741 goto out_unlock; 1742 } 1743 } 1744 1745 if (!fault_is_perm && !s2_force_noncacheable && kvm_has_mte(kvm)) { 1746 /* Check the VMM hasn't introduced a new disallowed VMA */ 1747 if (mte_allowed) { 1748 sanitise_mte_tags(kvm, pfn, vma_pagesize); 1749 } else { 1750 ret = -EFAULT; 1751 goto out_unlock; 1752 } 1753 } 1754 1755 if (writable) 1756 prot |= KVM_PGTABLE_PROT_W; 1757 1758 if (exec_fault) 1759 prot |= KVM_PGTABLE_PROT_X; 1760 1761 if (s2_force_noncacheable) { 1762 if (vfio_allow_any_uc) 1763 prot |= KVM_PGTABLE_PROT_NORMAL_NC; 1764 else 1765 prot |= KVM_PGTABLE_PROT_DEVICE; 1766 } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) && 1767 (!nested || kvm_s2_trans_executable(nested))) { 1768 prot |= KVM_PGTABLE_PROT_X; 1769 } 1770 1771 /* 1772 * Under the premise of getting a FSC_PERM fault, we just need to relax 1773 * permissions only if vma_pagesize equals fault_granule. Otherwise, 1774 * kvm_pgtable_stage2_map() should be called to change block size. 1775 */ 1776 if (fault_is_perm && vma_pagesize == fault_granule) { 1777 /* 1778 * Drop the SW bits in favour of those stored in the 1779 * PTE, which will be preserved. 1780 */ 1781 prot &= ~KVM_NV_GUEST_MAP_SZ; 1782 ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags); 1783 } else { 1784 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize, 1785 __pfn_to_phys(pfn), prot, 1786 memcache, flags); 1787 } 1788 1789 out_unlock: 1790 kvm_release_faultin_page(kvm, page, !!ret, writable); 1791 kvm_fault_unlock(kvm); 1792 1793 /* Mark the page dirty only if the fault is handled successfully */ 1794 if (writable && !ret) 1795 mark_page_dirty_in_slot(kvm, memslot, gfn); 1796 1797 return ret != -EAGAIN ? ret : 0; 1798 } 1799 1800 /* Resolve the access fault by making the page young again. */ 1801 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) 1802 { 1803 enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED; 1804 struct kvm_s2_mmu *mmu; 1805 1806 trace_kvm_access_fault(fault_ipa); 1807 1808 read_lock(&vcpu->kvm->mmu_lock); 1809 mmu = vcpu->arch.hw_mmu; 1810 KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags); 1811 read_unlock(&vcpu->kvm->mmu_lock); 1812 } 1813 1814 int kvm_handle_guest_sea(struct kvm_vcpu *vcpu) 1815 { 1816 /* 1817 * Give APEI the opportunity to claim the abort before handling it 1818 * within KVM. apei_claim_sea() expects to be called with IRQs enabled. 1819 */ 1820 lockdep_assert_irqs_enabled(); 1821 if (apei_claim_sea(NULL) == 0) 1822 return 1; 1823 1824 return kvm_inject_serror(vcpu); 1825 } 1826 1827 /** 1828 * kvm_handle_guest_abort - handles all 2nd stage aborts 1829 * @vcpu: the VCPU pointer 1830 * 1831 * Any abort that gets to the host is almost guaranteed to be caused by a 1832 * missing second stage translation table entry, which can mean that either the 1833 * guest simply needs more memory and we must allocate an appropriate page or it 1834 * can mean that the guest tried to access I/O memory, which is emulated by user 1835 * space. 
The distinction is based on the IPA causing the fault and whether this 1836 * memory region has been registered as standard RAM by user space. 1837 */ 1838 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) 1839 { 1840 struct kvm_s2_trans nested_trans, *nested = NULL; 1841 unsigned long esr; 1842 phys_addr_t fault_ipa; /* The address we faulted on */ 1843 phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */ 1844 struct kvm_memory_slot *memslot; 1845 unsigned long hva; 1846 bool is_iabt, write_fault, writable; 1847 gfn_t gfn; 1848 int ret, idx; 1849 1850 if (kvm_vcpu_abt_issea(vcpu)) 1851 return kvm_handle_guest_sea(vcpu); 1852 1853 esr = kvm_vcpu_get_esr(vcpu); 1854 1855 /* 1856 * The fault IPA should be reliable at this point as we're not dealing 1857 * with an SEA. 1858 */ 1859 ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); 1860 if (KVM_BUG_ON(ipa == INVALID_GPA, vcpu->kvm)) 1861 return -EFAULT; 1862 1863 is_iabt = kvm_vcpu_trap_is_iabt(vcpu); 1864 1865 if (esr_fsc_is_translation_fault(esr)) { 1866 /* Beyond sanitised PARange (which is the IPA limit) */ 1867 if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) { 1868 kvm_inject_size_fault(vcpu); 1869 return 1; 1870 } 1871 1872 /* Falls between the IPA range and the PARange? */ 1873 if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) { 1874 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); 1875 1876 return kvm_inject_sea(vcpu, is_iabt, fault_ipa); 1877 } 1878 } 1879 1880 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu), 1881 kvm_vcpu_get_hfar(vcpu), fault_ipa); 1882 1883 /* Check the stage-2 fault is trans. fault or write fault */ 1884 if (!esr_fsc_is_translation_fault(esr) && 1885 !esr_fsc_is_permission_fault(esr) && 1886 !esr_fsc_is_access_flag_fault(esr)) { 1887 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", 1888 kvm_vcpu_trap_get_class(vcpu), 1889 (unsigned long)kvm_vcpu_trap_get_fault(vcpu), 1890 (unsigned long)kvm_vcpu_get_esr(vcpu)); 1891 return -EFAULT; 1892 } 1893 1894 idx = srcu_read_lock(&vcpu->kvm->srcu); 1895 1896 /* 1897 * We may have faulted on a shadow stage 2 page table if we are 1898 * running a nested guest. In this case, we have to resolve the L2 1899 * IPA to the L1 IPA first, before knowing what kind of memory should 1900 * back the L1 IPA. 1901 * 1902 * If the shadow stage 2 page table walk faults, then we simply inject 1903 * this to the guest and carry on. 1904 * 1905 * If there are no shadow S2 PTs because S2 is disabled, there is 1906 * nothing to walk and we treat it as a 1:1 before going through the 1907 * canonical translation. 
	 */
	if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu) &&
	    vcpu->arch.hw_mmu->nested_stage2_enabled) {
		u32 esr;

		ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
		if (ret) {
			esr = kvm_s2_trans_esr(&nested_trans);
			kvm_inject_s2_fault(vcpu, esr);
			goto out_unlock;
		}

		ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans);
		if (ret) {
			esr = kvm_s2_trans_esr(&nested_trans);
			kvm_inject_s2_fault(vcpu, esr);
			goto out_unlock;
		}

		ipa = kvm_s2_trans_output(&nested_trans);
		nested = &nested_trans;
	}

	gfn = ipa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(vcpu->kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
	write_fault = kvm_is_write_fault(vcpu);
	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
		/*
		 * The guest has put either its instructions or its page-tables
		 * somewhere it shouldn't have. Userspace won't be able to do
		 * anything about this (there's no syndrome for a start), so
		 * re-inject the abort back into the guest.
		 */
		if (is_iabt) {
			ret = -ENOEXEC;
			goto out;
		}

		if (kvm_vcpu_abt_iss1tw(vcpu)) {
			ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
			goto out_unlock;
		}

		/*
		 * Check for a cache maintenance operation. Since we
		 * ended-up here, we know it is outside of any memory
		 * slot. But we can't find out if that is for a device,
		 * or if the guest is just being stupid. The only thing
		 * we know for sure is that this range cannot be cached.
		 *
		 * So let's assume that the guest is just being
		 * cautious, and skip the instruction.
		 */
		if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
			kvm_incr_pc(vcpu);
			ret = 1;
			goto out_unlock;
		}

		/*
		 * The IPA is reported as [MAX:12], so we need to
		 * complement it with the bottom 12 bits from the
		 * faulting VA. This is always 12 bits, irrespective
		 * of the page size.
		 */
		ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
		ret = io_mem_abort(vcpu, ipa);
		goto out_unlock;
	}

	/* Userspace should not be able to register out-of-bounds IPAs */
	VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu));

	if (esr_fsc_is_access_flag_fault(esr)) {
		handle_access_fault(vcpu, fault_ipa);
		ret = 1;
		goto out_unlock;
	}

	ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
			     esr_fsc_is_permission_fault(esr));
	if (ret == 0)
		ret = 1;
out:
	if (ret == -ENOEXEC)
		ret = kvm_inject_sea_iabt(vcpu, kvm_vcpu_get_hfar(vcpu));
out_unlock:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	return ret;
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	if (!kvm->arch.mmu.pgt)
		return false;

	__unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
			     (range->end - range->start) << PAGE_SHIFT,
			     range->may_block);

	kvm_nested_s2_unmap(kvm, range->may_block);
	return false;
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	u64 size = (range->end - range->start) << PAGE_SHIFT;

	if (!kvm->arch.mmu.pgt)
		return false;

	return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
							       range->start << PAGE_SHIFT,
							       size, true);
	/*
	 * TODO: Handle nested_mmu structures here using the reverse mapping in
	 * a later version of patch series.
	 */
}

bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	u64 size = (range->end - range->start) << PAGE_SHIFT;

	if (!kvm->arch.mmu.pgt)
		return false;

	return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
							       range->start << PAGE_SHIFT,
							       size, false);
}

phys_addr_t kvm_mmu_get_httbr(void)
{
	return __pa(hyp_pgtable->pgd);
}

phys_addr_t kvm_get_idmap_vector(void)
{
	return hyp_idmap_vector;
}

static int kvm_map_idmap_text(void)
{
	unsigned long size = hyp_idmap_end - hyp_idmap_start;
	int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
					PAGE_HYP_EXEC);
	if (err)
		kvm_err("Failed to idmap %lx-%lx\n",
			hyp_idmap_start, hyp_idmap_end);

	return err;
}

static void *kvm_hyp_zalloc_page(void *arg)
{
	return (void *)get_zeroed_page(GFP_KERNEL);
}

static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
	.zalloc_page		= kvm_hyp_zalloc_page,
	.get_page		= kvm_host_get_page,
	.put_page		= kvm_host_put_page,
	.phys_to_virt		= kvm_host_va,
	.virt_to_phys		= kvm_host_pa,
};

int __init kvm_mmu_init(u32 *hyp_va_bits)
{
	int err;
	u32 idmap_bits;
	u32 kernel_bits;

	hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
	hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
	hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);

	/*
	 * We rely on the linker script to ensure at build time that the HYP
	 * init code does not cross a page boundary.
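	 * The BUG_ON() below relies on the fact that the XOR of the first
	 * and last byte addresses of the idmap text can only have bits set
	 * above the page offset if the two addresses live in different
	 * pages.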
	 */
	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);

	/*
	 * The ID map is always configured for 48 bits of translation, which
	 * may be fewer than the number of VA bits used by the regular kernel
	 * stage 1, when VA_BITS=52.
	 *
	 * At EL2, there is only one TTBR register, and we can't switch between
	 * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom
	 * line: we need to use the extended range with *both* our translation
	 * tables.
	 *
	 * So use the maximum of the idmap VA bits and the regular kernel stage
	 * 1 VA bits to ensure that the hypervisor can both ID map its code page
	 * and map any kernel memory.
	 */
	idmap_bits = IDMAP_VA_BITS;
	kernel_bits = vabits_actual;
	*hyp_va_bits = max(idmap_bits, kernel_bits);

	kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
	kvm_debug("HYP VA range: %lx:%lx\n",
		  kern_hyp_va(PAGE_OFFSET),
		  kern_hyp_va((unsigned long)high_memory - 1));

	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
	    hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) &&
	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
		/*
		 * The idmap page is intersecting with the VA space;
		 * it is not safe to continue further.
		 */
		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
		err = -EINVAL;
		goto out;
	}

	hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
	if (!hyp_pgtable) {
		kvm_err("Hyp mode page-table not allocated\n");
		err = -ENOMEM;
		goto out;
	}

	err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops);
	if (err)
		goto out_free_pgtable;

	err = kvm_map_idmap_text();
	if (err)
		goto out_destroy_pgtable;

	io_map_base = hyp_idmap_start;
	__hyp_va_bits = *hyp_va_bits;
	return 0;

out_destroy_pgtable:
	kvm_pgtable_hyp_destroy(hyp_pgtable);
out_free_pgtable:
	kfree(hyp_pgtable);
	hyp_pgtable = NULL;
out:
	return err;
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;

	/*
	 * At this point the memslot has been committed and there is an
	 * allocated dirty_bitmap[]; dirty pages will be tracked while the
	 * memory slot is write protected.
	 */
	if (log_dirty_pages) {

		if (change == KVM_MR_DELETE)
			return;

		/*
		 * Huge and normal pages are write-protected and split
		 * in either of these two cases:
		 *
		 * 1. with initial-all-set: gradually with CLEAR ioctls,
		 */
		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
			return;
		/*
		 * or
		 * 2. without initial-all-set: all in one shot when
		 *    enabling dirty logging.
		 */
		kvm_mmu_wp_memory_region(kvm, new->id);
		kvm_mmu_split_memory_region(kvm, new->id);
	} else {
		/*
		 * Free any leftovers from the eager page splitting cache. Do
		 * this when deleting, moving, disabling dirty logging, or
		 * creating the memslot (a nop). Doing it for deletes makes
		 * sure we don't leak memory, and there's no need to keep the
		 * cache around for any of the other cases.
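		 * kvm_mmu_free_memory_cache() is a no-op on an empty cache,
		 * and the cache is topped up again on demand the next time
		 * eager page splitting runs.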
		 */
		kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
	}
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
				   const struct kvm_memory_slot *old,
				   struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	hva_t hva, reg_end;
	int ret = 0;

	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
	    change != KVM_MR_FLAGS_ONLY)
		return 0;

	/*
	 * Prevent userspace from creating a memory region outside of the IPA
	 * space addressable by the guest.
	 */
	if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
		return -EFAULT;

	hva = new->userspace_addr;
	reg_end = hva + (new->npages << PAGE_SHIFT);

	mmap_read_lock(current->mm);
	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma;

		vma = find_vma_intersection(current->mm, hva, reg_end);
		if (!vma)
			break;

		if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
			ret = -EINVAL;
			break;
		}

		if (vma->vm_flags & VM_PFNMAP) {
			/* IO region dirty page logging not allowed */
			if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
				ret = -EINVAL;
				break;
			}

			/*
			 * Cacheable PFNMAP is allowed only if the hardware
			 * supports it.
			 */
			if (kvm_vma_is_cacheable(vma) && !kvm_supports_cacheable_pfnmap()) {
				ret = -EINVAL;
				break;
			}
		}
		hva = min(reg_end, vma->vm_end);
	} while (hva < reg_end);

	mmap_read_unlock(current->mm);
	return ret;
}

void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
}

void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = slot->npages << PAGE_SHIFT;

	write_lock(&kvm->mmu_lock);
	kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true);
	kvm_nested_s2_unmap(kvm, true);
	write_unlock(&kvm->mmu_lock);
}

/*
 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
 *
 * Main problems:
 * - S/W ops are local to a CPU (not broadcast)
 * - We have line migration behind our back (speculation)
 * - System caches don't support S/W at all (damn!)
 *
 * In the face of the above, the best we can do is to try and convert
 * S/W ops to VA ops. Because the guest is not allowed to infer the
 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
 * which is a rather good thing for us.
 *
 * Also, it is only used when turning caches on/off ("The expected
 * usage of the cache maintenance instructions that operate by set/way
 * is associated with the cache maintenance instructions associated
 * with the powerdown and powerup of caches, if this is required by
 * the implementation.").
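 *
 * (The operations in question are the DC ISW/CSW/CISW instructions, or
 * their AArch32 counterparts DCISW/DCCSW/DCCISW.)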
 *
 * We use the following policy:
 *
 * - If we trap a S/W operation, we enable VM trapping to detect
 *   caches being turned on/off, and do a full clean.
 *
 * - We flush the caches on both caches being turned on and off.
 *
 * - Once the caches are enabled, we stop trapping VM ops.
 */
void kvm_set_way_flush(struct kvm_vcpu *vcpu)
{
	unsigned long hcr = *vcpu_hcr(vcpu);

	/*
	 * If this is the first time we do a S/W operation
	 * (i.e. HCR_TVM not set), flush the whole of the guest's
	 * memory and enable VM trapping.
	 *
	 * Otherwise, rely on the VM trapping to wait for the MMU +
	 * caches to be turned off. At that point, we'll be able to
	 * clean the caches again.
	 */
	if (!(hcr & HCR_TVM)) {
		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
					vcpu_has_cache_enabled(vcpu));
		stage2_flush_vm(vcpu->kvm);
		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
	}
}

void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
{
	bool now_enabled = vcpu_has_cache_enabled(vcpu);

	/*
	 * If switching the MMU+caches on, we need to invalidate the caches.
	 * If switching them off, we need to clean the caches.
	 * Clean + invalidate does the trick always.
	 */
	if (now_enabled != was_enabled)
		stage2_flush_vm(vcpu->kvm);

	/* Caches are now on, stop trapping VM ops (until a S/W op) */
	if (now_enabled)
		*vcpu_hcr(vcpu) &= ~HCR_TVM;

	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
}