// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/acpi.h>
#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
#include <asm/acpi.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/kvm_pkvm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>

#include "trace.h"

static struct kvm_pgtable *hyp_pgtable;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

static unsigned long __ro_after_init hyp_idmap_start;
static unsigned long __ro_after_init hyp_idmap_end;
static phys_addr_t __ro_after_init hyp_idmap_vector;

u32 __ro_after_init __hyp_va_bits;

static unsigned long __ro_after_init io_map_base;

/* Dispatch to the pKVM ("p"-prefixed) variant of @fn when protected KVM is enabled. */
#define KVM_PGT_FN(fn)	(!is_protected_kvm_enabled() ? fn : p ## fn)

static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
					   phys_addr_t size)
{
	phys_addr_t boundary = ALIGN_DOWN(addr + size, size);

	return (boundary - 1 < end - 1) ? boundary : end;
}

static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);

	return __stage2_range_addr_end(addr, end, size);
}

/*
 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
 * long will also starve other vCPUs. We have to also make sure that the page
 * tables are not freed while we released the lock.
 */
static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
			      phys_addr_t end,
			      int (*fn)(struct kvm_pgtable *, u64, u64),
			      bool resched)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	int ret;
	u64 next;

	do {
		struct kvm_pgtable *pgt = mmu->pgt;
		if (!pgt)
			return -EINVAL;

		next = stage2_range_addr_end(addr, end);
		ret = fn(pgt, addr, next - addr);
		if (ret)
			break;

		if (resched && next != end)
			cond_resched_rwlock_write(&kvm->mmu_lock);
	} while (addr = next, addr != end);

	return ret;
}

#define stage2_apply_range_resched(mmu, addr, end, fn)			\
	stage2_apply_range(mmu, addr, end, fn, true)

/*
 * Get the maximum number of page-tables pages needed to split a range
 * of blocks into PAGE_SIZE PTEs. It assumes the range is already
 * mapped at level 2, or at level 1 if allowed.
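 *
 * For example, with 4KB pages (PUD_SIZE = 1GiB, PMD_SIZE = 2MiB), splitting
 * a 1GiB block-mapped range needs at most one page for the level-2 table
 * replacing the level-1 block, plus 512 pages for the level-3 tables
 * replacing the level-2 blocks, i.e. 513 page-table pages.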
 */
static int kvm_mmu_split_nr_page_tables(u64 range)
{
	int n = 0;

	if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2)
		n += DIV_ROUND_UP(range, PUD_SIZE);
	n += DIV_ROUND_UP(range, PMD_SIZE);
	return n;
}

static bool need_split_memcache_topup_or_resched(struct kvm *kvm)
{
	struct kvm_mmu_memory_cache *cache;
	u64 chunk_size, min;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
		return true;

	chunk_size = kvm->arch.mmu.split_page_chunk_size;
	min = kvm_mmu_split_nr_page_tables(chunk_size);
	cache = &kvm->arch.mmu.split_page_cache;
	return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
}

static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
				    phys_addr_t end)
{
	struct kvm_mmu_memory_cache *cache;
	struct kvm_pgtable *pgt;
	int ret, cache_capacity;
	u64 next, chunk_size;

	lockdep_assert_held_write(&kvm->mmu_lock);

	chunk_size = kvm->arch.mmu.split_page_chunk_size;
	cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size);

	if (chunk_size == 0)
		return 0;

	cache = &kvm->arch.mmu.split_page_cache;

	do {
		if (need_split_memcache_topup_or_resched(kvm)) {
			write_unlock(&kvm->mmu_lock);
			cond_resched();
			/* Eager page splitting is best-effort. */
			ret = __kvm_mmu_topup_memory_cache(cache,
							   cache_capacity,
							   cache_capacity);
			write_lock(&kvm->mmu_lock);
			if (ret)
				break;
		}

		pgt = kvm->arch.mmu.pgt;
		if (!pgt)
			return -EINVAL;

		next = __stage2_range_addr_end(addr, end, chunk_size);
		ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache);
		if (ret)
			break;
	} while (addr = next, addr != end);

	return ret;
}

static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
}

/**
 * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8
 * @kvm:	pointer to kvm structure.
 *
 * Interface to HYP function to flush all VM TLB entries
 */
int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
	if (is_protected_kvm_enabled())
		kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
	else
		kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
	return 0;
}

int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
				     gfn_t gfn, u64 nr_pages)
{
	u64 size = nr_pages << PAGE_SHIFT;
	u64 addr = gfn << PAGE_SHIFT;

	if (is_protected_kvm_enabled())
		kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
	else
		kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size);
	return 0;
}

static void *stage2_memcache_zalloc_page(void *arg)
{
	struct kvm_mmu_memory_cache *mc = arg;
	void *virt;

	/* Allocated with __GFP_ZERO, so no need to zero */
	virt = kvm_mmu_memory_cache_alloc(mc);
	if (virt)
		kvm_account_pgtable_pages(virt, 1);
	return virt;
}

static void *kvm_host_zalloc_pages_exact(size_t size)
{
	return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
}

static void *kvm_s2_zalloc_pages_exact(size_t size)
{
	void *virt = kvm_host_zalloc_pages_exact(size);

	if (virt)
		kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT));
	return virt;
}

static void kvm_s2_free_pages_exact(void *virt, size_t size)
{
	kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT));
	free_pages_exact(virt, size);
}

static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;

static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head)
{
	struct page *page = container_of(head, struct page, rcu_head);
	void *pgtable = page_to_virt(page);
	s8 level = page_private(page);

	KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level);
}

static void stage2_free_unlinked_table(void *addr, s8 level)
{
	struct page *page = virt_to_page(addr);

	set_page_private(page, (unsigned long)level);
	call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb);
}

static void kvm_host_get_page(void *addr)
{
	get_page(virt_to_page(addr));
}

static void kvm_host_put_page(void *addr)
{
	put_page(virt_to_page(addr));
}

static void kvm_s2_put_page(void *addr)
{
	struct page *p = virt_to_page(addr);
	/* Dropping last refcount, the page will be freed */
	if (page_count(p) == 1)
		kvm_account_pgtable_pages(addr, -1);
	put_page(p);
}

static int kvm_host_page_count(void *addr)
{
	return page_count(virt_to_page(addr));
}

static phys_addr_t kvm_host_pa(void *addr)
{
	return __pa(addr);
}

static void *kvm_host_va(phys_addr_t phys)
{
	return __va(phys);
}

static void clean_dcache_guest_page(void *va, size_t size)
{
	__clean_dcache_guest_page(va, size);
}

static void invalidate_icache_guest_page(void *va, size_t size)
{
	__invalidate_icache_guest_page(va, size);
}

/*
 * Unmapping vs dcache management:
 *
 * If a guest maps certain memory pages as uncached, all writes will
 * bypass the data cache and go directly to RAM.  However, the CPUs
 * can still speculate reads (not writes) and fill cache lines with
 * data.
 *
 * Those cache lines will be *clean* cache lines though, so a
 * clean+invalidate operation is equivalent to an invalidate
 * operation, because no cache lines are marked dirty.
 *
 * Those clean cache lines could be filled prior to an uncached write
 * by the guest, and the cache coherent IO subsystem would therefore
 * end up writing old data to disk.
 *
 * This is why right after unmapping a page/section and invalidating
 * the corresponding TLBs, we flush to make sure the IO subsystem will
 * never hit in the cache.
 *
 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
 * we then fully enforce cacheability of RAM, no matter what the guest
 * does.
 */
/**
 * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 * @mmu:   The KVM stage-2 MMU pointer
 * @start: The intermediate physical base address of the range to unmap
 * @size:  The size of the area to unmap
 * @may_block: Whether or not we are permitted to block
 *
 * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 * destroying the VM), otherwise another faulting VCPU may come in and mess
 * with things behind our backs.
 */
static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
				 bool may_block)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	phys_addr_t end = start + size;

	lockdep_assert_held_write(&kvm->mmu_lock);
	WARN_ON(size & ~PAGE_MASK);
	WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap),
				   may_block));
}

void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start,
			    u64 size, bool may_block)
{
	__unmap_stage2_range(mmu, start, size, may_block);
}

void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
	stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush));
}

static void stage2_flush_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;

	kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
}

/**
 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 * @kvm: The struct kvm pointer
 *
 * Go through the stage 2 page tables and invalidate any cache lines
 * backing memory already mapped to the VM.
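 *
 * The walk takes the MMU lock for write and yields it periodically via
 * stage2_apply_range_resched() when the region is large.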
 */
static void stage2_flush_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx, bkt;

	idx = srcu_read_lock(&kvm->srcu);
	write_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, bkt, slots)
		stage2_flush_memslot(kvm, memslot);

	kvm_nested_s2_flush(kvm);

	write_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

/**
 * free_hyp_pgds - free Hyp-mode page tables
 */
void __init free_hyp_pgds(void)
{
	mutex_lock(&kvm_hyp_pgd_mutex);
	if (hyp_pgtable) {
		kvm_pgtable_hyp_destroy(hyp_pgtable);
		kfree(hyp_pgtable);
		hyp_pgtable = NULL;
	}
	mutex_unlock(&kvm_hyp_pgd_mutex);
}

static bool kvm_host_owns_hyp_mappings(void)
{
	if (is_kernel_in_hyp_mode())
		return false;

	if (static_branch_likely(&kvm_protected_mode_initialized))
		return false;

	/*
	 * This can happen at boot time when __create_hyp_mappings() is called
	 * after the hyp protection has been enabled, but the static key has
	 * not been flipped yet.
	 */
	if (!hyp_pgtable && is_protected_kvm_enabled())
		return false;

	WARN_ON(!hyp_pgtable);

	return true;
}

int __create_hyp_mappings(unsigned long start, unsigned long size,
			  unsigned long phys, enum kvm_pgtable_prot prot)
{
	int err;

	if (WARN_ON(!kvm_host_owns_hyp_mappings()))
		return -EINVAL;

	mutex_lock(&kvm_hyp_pgd_mutex);
	err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
	mutex_unlock(&kvm_hyp_pgd_mutex);

	return err;
}

static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
{
	if (!is_vmalloc_addr(kaddr)) {
		BUG_ON(!virt_addr_valid(kaddr));
		return __pa(kaddr);
	} else {
		return page_to_phys(vmalloc_to_page(kaddr)) +
		       offset_in_page(kaddr);
	}
}

struct hyp_shared_pfn {
	u64 pfn;
	int count;
	struct rb_node node;
};

static DEFINE_MUTEX(hyp_shared_pfns_lock);
static struct rb_root hyp_shared_pfns = RB_ROOT;

static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node,
					      struct rb_node **parent)
{
	struct hyp_shared_pfn *this;

	*node = &hyp_shared_pfns.rb_node;
	*parent = NULL;
	while (**node) {
		this = container_of(**node, struct hyp_shared_pfn, node);
		*parent = **node;
		if (this->pfn < pfn)
			*node = &((**node)->rb_left);
		else if (this->pfn > pfn)
			*node = &((**node)->rb_right);
		else
			return this;
	}

	return NULL;
}

static int share_pfn_hyp(u64 pfn)
{
	struct rb_node **node, *parent;
	struct hyp_shared_pfn *this;
	int ret = 0;

	mutex_lock(&hyp_shared_pfns_lock);
	this = find_shared_pfn(pfn, &node, &parent);
	if (this) {
		this->count++;
		goto unlock;
	}

	this = kzalloc(sizeof(*this), GFP_KERNEL);
	if (!this) {
		ret = -ENOMEM;
		goto unlock;
	}

	this->pfn = pfn;
	this->count = 1;
	rb_link_node(&this->node, parent, node);
	rb_insert_color(&this->node, &hyp_shared_pfns);
	ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn, 1);
unlock:
	mutex_unlock(&hyp_shared_pfns_lock);

	return ret;
}

static int unshare_pfn_hyp(u64 pfn)
{
	struct rb_node **node, *parent;
	struct hyp_shared_pfn *this;
	int ret = 0;

	mutex_lock(&hyp_shared_pfns_lock);
	this = find_shared_pfn(pfn, &node, &parent);
	if (WARN_ON(!this)) {
		ret = -ENOENT;
		goto unlock;
	}

	this->count--;
	if (this->count)
		goto unlock;

	rb_erase(&this->node, &hyp_shared_pfns);
	kfree(this);
	ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn, 1);
unlock:
	mutex_unlock(&hyp_shared_pfns_lock);

	return ret;
}

int kvm_share_hyp(void *from, void *to)
{
	phys_addr_t start, end, cur;
	u64 pfn;
	int ret;

	if (is_kernel_in_hyp_mode())
		return 0;

	/*
	 * The share hcall maps things in the 'fixed-offset' region of the hyp
	 * VA space, so we can only share physically contiguous data-structures
	 * for now.
	 */
	if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to))
		return -EINVAL;

	if (kvm_host_owns_hyp_mappings())
		return create_hyp_mappings(from, to, PAGE_HYP);

	start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
	end = PAGE_ALIGN(__pa(to));
	for (cur = start; cur < end; cur += PAGE_SIZE) {
		pfn = __phys_to_pfn(cur);
		ret = share_pfn_hyp(pfn);
		if (ret)
			return ret;
	}

	return 0;
}

void kvm_unshare_hyp(void *from, void *to)
{
	phys_addr_t start, end, cur;
	u64 pfn;

	if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from)
		return;

	start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
	end = PAGE_ALIGN(__pa(to));
	for (cur = start; cur < end; cur += PAGE_SIZE) {
		pfn = __phys_to_pfn(cur);
		WARN_ON(unshare_pfn_hyp(pfn));
	}
}

/**
 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 * @from:	The virtual kernel start address of the range
 * @to:		The virtual kernel end address of the range (exclusive)
 * @prot:	The protection to be applied to this range
 *
 * The same virtual address as the kernel virtual address is also used
 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 * physical pages.
 */
int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
{
	phys_addr_t phys_addr;
	unsigned long virt_addr;
	unsigned long start = kern_hyp_va((unsigned long)from);
	unsigned long end = kern_hyp_va((unsigned long)to);

	if (is_kernel_in_hyp_mode())
		return 0;

	if (!kvm_host_owns_hyp_mappings())
		return -EPERM;

	start = start & PAGE_MASK;
	end = PAGE_ALIGN(end);

	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
		int err;

		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
		err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
					    prot);
		if (err)
			return err;
	}

	return 0;
}

static int __hyp_alloc_private_va_range(unsigned long base)
{
	lockdep_assert_held(&kvm_hyp_pgd_mutex);

	if (!PAGE_ALIGNED(base))
		return -EINVAL;

	/*
	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
	 * allocating the new area, as it would indicate we've
	 * overflowed the idmap/IO address range.
	 */
	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
		return -ENOMEM;

	io_map_base = base;

	return 0;
}

/**
 * hyp_alloc_private_va_range - Allocates a private VA range.
 * @size:	The size of the VA range to reserve.
 * @haddr:	The hypervisor virtual start address of the allocation.
 *
 * The private virtual address (VA) range is allocated below io_map_base
 * and aligned based on the order of @size.
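 *
 * Note that this only reserves the range in the hyp VA space; the caller
 * still needs to install a mapping there, typically via
 * __create_hyp_mappings().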
 *
 * Return: 0 on success or negative error code on failure.
 */
int hyp_alloc_private_va_range(size_t size, unsigned long *haddr)
{
	unsigned long base;
	int ret = 0;

	mutex_lock(&kvm_hyp_pgd_mutex);

	/*
	 * This assumes that we have enough space below the idmap
	 * page to allocate our VAs. If not, the check in
	 * __hyp_alloc_private_va_range() will kick. A potential
	 * alternative would be to detect that overflow and switch
	 * to an allocation above the idmap.
	 *
	 * The allocated size is always a multiple of PAGE_SIZE.
	 */
	size = PAGE_ALIGN(size);
	base = io_map_base - size;
	ret = __hyp_alloc_private_va_range(base);

	mutex_unlock(&kvm_hyp_pgd_mutex);

	if (!ret)
		*haddr = base;

	return ret;
}

static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
					unsigned long *haddr,
					enum kvm_pgtable_prot prot)
{
	unsigned long addr;
	int ret = 0;

	if (!kvm_host_owns_hyp_mappings()) {
		addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
					 phys_addr, size, prot);
		if (IS_ERR_VALUE(addr))
			return addr;
		*haddr = addr;

		return 0;
	}

	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
	ret = hyp_alloc_private_va_range(size, &addr);
	if (ret)
		return ret;

	ret = __create_hyp_mappings(addr, size, phys_addr, prot);
	if (ret)
		return ret;

	*haddr = addr + offset_in_page(phys_addr);
	return ret;
}

int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr)
{
	unsigned long base;
	size_t size;
	int ret;

	mutex_lock(&kvm_hyp_pgd_mutex);
	/*
	 * Efficient stack verification using the NVHE_STACK_SHIFT bit implies
	 * an alignment of our allocation on the order of the size.
	 */
	size = NVHE_STACK_SIZE * 2;
	base = ALIGN_DOWN(io_map_base - size, size);

	ret = __hyp_alloc_private_va_range(base);

	mutex_unlock(&kvm_hyp_pgd_mutex);

	if (ret) {
		kvm_err("Cannot allocate hyp stack guard page\n");
		return ret;
	}

	/*
	 * Since the stack grows downwards, map the stack to the page
	 * at the higher address and leave the lower guard page
	 * unbacked.
	 *
	 * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1
	 * and addresses corresponding to the guard page have the
	 * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection.
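	 *
	 * Since 'base' is aligned to 2 * NVHE_STACK_SIZE, addresses in
	 * [base, base + NVHE_STACK_SIZE) (the guard page) have that bit
	 * clear, while addresses in [base + NVHE_STACK_SIZE, base + size)
	 * (the stack itself) have it set.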
	 */
	ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE,
				    phys_addr, PAGE_HYP);
	if (ret)
		kvm_err("Cannot map hyp stack\n");

	*haddr = base + size;

	return ret;
}

/**
 * create_hyp_io_mappings - Map IO into both kernel and HYP
 * @phys_addr:	The physical start address which gets mapped
 * @size:	Size of the region being mapped
 * @kaddr:	Kernel VA for this mapping
 * @haddr:	HYP VA for this mapping
 */
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
			   void __iomem **kaddr,
			   void __iomem **haddr)
{
	unsigned long addr;
	int ret;

	if (is_protected_kvm_enabled())
		return -EPERM;

	*kaddr = ioremap(phys_addr, size);
	if (!*kaddr)
		return -ENOMEM;

	if (is_kernel_in_hyp_mode()) {
		*haddr = *kaddr;
		return 0;
	}

	ret = __create_hyp_private_mapping(phys_addr, size,
					   &addr, PAGE_HYP_DEVICE);
	if (ret) {
		iounmap(*kaddr);
		*kaddr = NULL;
		*haddr = NULL;
		return ret;
	}

	*haddr = (void __iomem *)addr;
	return 0;
}

/**
 * create_hyp_exec_mappings - Map an executable range into HYP
 * @phys_addr:	The physical start address which gets mapped
 * @size:	Size of the region being mapped
 * @haddr:	HYP VA for this mapping
 */
int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
			     void **haddr)
{
	unsigned long addr;
	int ret;

	BUG_ON(is_kernel_in_hyp_mode());

	ret = __create_hyp_private_mapping(phys_addr, size,
					   &addr, PAGE_HYP_EXEC);
	if (ret) {
		*haddr = NULL;
		return ret;
	}

	*haddr = (void *)addr;
	return 0;
}

static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
	/* We shouldn't need any other callback to walk the PT */
	.phys_to_virt		= kvm_host_va,
};

static int get_user_mapping_size(struct kvm *kvm, u64 addr)
{
	struct kvm_pgtable pgt = {
		.pgd		= (kvm_pteref_t)kvm->mm->pgd,
		.ia_bits	= vabits_actual,
		.start_level	= (KVM_PGTABLE_LAST_LEVEL -
				   ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1),
		.mm_ops		= &kvm_user_mm_ops,
	};
	unsigned long flags;
	kvm_pte_t pte = 0;	/* Keep GCC quiet... */
	s8 level = S8_MAX;
	int ret;

	/*
	 * Disable IRQs so that we hazard against a concurrent
	 * teardown of the userspace page tables (which relies on
	 * IPI-ing threads).
	 */
	local_irq_save(flags);
	ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
	local_irq_restore(flags);

	if (ret)
		return ret;

	/*
	 * Not seeing an error, but not updating level? Something went
	 * deeply wrong...
	 */
	if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL))
		return -EFAULT;
	if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL))
		return -EFAULT;

	/* Oops, the userspace PTs are gone... Replay the fault */
	if (!kvm_pte_valid(pte))
		return -EAGAIN;

	return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
}

static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
	.zalloc_page		= stage2_memcache_zalloc_page,
	.zalloc_pages_exact	= kvm_s2_zalloc_pages_exact,
	.free_pages_exact	= kvm_s2_free_pages_exact,
	.free_unlinked_table	= stage2_free_unlinked_table,
	.get_page		= kvm_host_get_page,
	.put_page		= kvm_s2_put_page,
	.page_count		= kvm_host_page_count,
	.phys_to_virt		= kvm_host_va,
	.virt_to_phys		= kvm_host_pa,
	.dcache_clean_inval_poc	= clean_dcache_guest_page,
	.icache_inval_pou	= invalidate_icache_guest_page,
};

static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
{
	u32 kvm_ipa_limit = get_kvm_ipa_limit();
	u64 mmfr0, mmfr1;
	u32 phys_shift;

	if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
		return -EINVAL;

	phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
	if (is_protected_kvm_enabled()) {
		phys_shift = kvm_ipa_limit;
	} else if (phys_shift) {
		if (phys_shift > kvm_ipa_limit ||
		    phys_shift < ARM64_MIN_PARANGE_BITS)
			return -EINVAL;
	} else {
		phys_shift = KVM_PHYS_SHIFT;
		if (phys_shift > kvm_ipa_limit) {
			pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n",
				     current->comm);
			return -EINVAL;
		}
	}

	mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
	mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
	mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);

	return 0;
}

/*
 * Assume that @pgt is valid and unlinked from the KVM MMU to free the
 * page-table without taking the kvm_mmu_lock and without performing any
 * TLB invalidations.
 *
 * Also, the range of addresses can be large enough to cause need_resched
 * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke
 * cond_resched() periodically to prevent hogging the CPU for a long time
 * and schedule something else, if required.
 */
static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr,
				 phys_addr_t end)
{
	u64 next;

	do {
		next = stage2_range_addr_end(addr, end);
		KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr,
							     next - addr);
		if (next != end)
			cond_resched();
	} while (addr = next, addr != end);
}

static void kvm_stage2_destroy(struct kvm_pgtable *pgt)
{
	unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr);

	stage2_destroy_range(pgt, 0, BIT(ia_bits));
	KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt);
}

/**
 * kvm_init_stage2_mmu - Initialise a S2 MMU structure
 * @kvm:	The pointer to the KVM structure
 * @mmu:	The pointer to the s2 MMU structure
 * @type:	The machine type of the virtual machine
 *
 * Allocates only the stage-2 HW PGD level table(s).
 * Note we don't need locking here as this is only called in two cases:
 *
 * - when the VM is created, which can't race against anything
 *
 * - when secondary kvm_s2_mmu structures are initialised for NV
 *   guests, and the caller must hold kvm->lock as this is called on a
 *   per-vcpu basis.
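 *
 * Return: 0 on success, a negative error code otherwise.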
 */
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
{
	int cpu, err;
	struct kvm_pgtable *pgt;

	/*
	 * If we already have our page tables in place, and that the
	 * MMU context is the canonical one, we have a bug somewhere,
	 * as this is only supposed to ever happen once per VM.
	 *
	 * Otherwise, we're building nested page tables, and that's
	 * probably because userspace called KVM_ARM_VCPU_INIT more
	 * than once on the same vcpu. Since that's actually legal,
	 * don't kick up a fuss and leave gracefully.
	 */
	if (mmu->pgt != NULL) {
		if (kvm_is_nested_s2_mmu(kvm, mmu))
			return 0;

		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

	err = kvm_init_ipa_range(mmu, type);
	if (err)
		return err;

	pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
	if (!pgt)
		return -ENOMEM;

	mmu->arch = &kvm->arch;
	err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops);
	if (err)
		goto out_free_pgtable;

	mmu->pgt = pgt;
	if (is_protected_kvm_enabled())
		return 0;

	mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
	if (!mmu->last_vcpu_ran) {
		err = -ENOMEM;
		goto out_destroy_pgtable;
	}

	for_each_possible_cpu(cpu)
		*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;

	/* The eager page splitting is disabled by default */
	mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
	mmu->split_page_cache.gfp_zero = __GFP_ZERO;

	mmu->pgd_phys = __pa(pgt->pgd);

	if (kvm_is_nested_s2_mmu(kvm, mmu))
		kvm_init_nested_s2_mmu(mmu);

	return 0;

out_destroy_pgtable:
	kvm_stage2_destroy(pgt);
out_free_pgtable:
	kfree(pgt);
	return err;
}

void kvm_uninit_stage2_mmu(struct kvm *kvm)
{
	kvm_free_stage2_pgd(&kvm->arch.mmu);
	kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
}

static void stage2_unmap_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	hva_t hva = memslot->userspace_addr;
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = PAGE_SIZE * memslot->npages;
	hva_t reg_end = hva + size;

	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we should
	 * unmap any of them.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma;
		hva_t vm_start, vm_end;

		vma = find_vma_intersection(current->mm, hva, reg_end);
		if (!vma)
			break;

		/*
		 * Take the intersection of this VMA with the memory region
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (!(vma->vm_flags & VM_PFNMAP)) {
			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
			kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true);
		}
		hva = vm_end;
	} while (hva < reg_end);
}

/**
 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 * @kvm: The struct kvm pointer
 *
 * Go through the memregions and unmap any regular RAM
 * backing memory already mapped to the VM.
 */
void stage2_unmap_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx, bkt;

	idx = srcu_read_lock(&kvm->srcu);
	mmap_read_lock(current->mm);
	write_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, bkt, slots)
		stage2_unmap_memslot(kvm, memslot);

	kvm_nested_s2_unmap(kvm, true);

	write_unlock(&kvm->mmu_lock);
	mmap_read_unlock(current->mm);
	srcu_read_unlock(&kvm->srcu, idx);
}

void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	struct kvm_pgtable *pgt = NULL;

	write_lock(&kvm->mmu_lock);
	pgt = mmu->pgt;
	if (pgt) {
		mmu->pgd_phys = 0;
		mmu->pgt = NULL;
		free_percpu(mmu->last_vcpu_ran);
	}
	write_unlock(&kvm->mmu_lock);

	if (pgt) {
		kvm_stage2_destroy(pgt);
		kfree(pgt);
	}
}

static void hyp_mc_free_fn(void *addr, void *mc)
{
	struct kvm_hyp_memcache *memcache = mc;

	if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
		kvm_account_pgtable_pages(addr, -1);

	free_page((unsigned long)addr);
}

static void *hyp_mc_alloc_fn(void *mc)
{
	struct kvm_hyp_memcache *memcache = mc;
	void *addr;

	addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
	if (addr && memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
		kvm_account_pgtable_pages(addr, 1);

	return addr;
}

void free_hyp_memcache(struct kvm_hyp_memcache *mc)
{
	if (!is_protected_kvm_enabled())
		return;

	kfree(mc->mapping);
	__free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc);
}

int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
{
	if (!is_protected_kvm_enabled())
		return 0;

	if (!mc->mapping) {
		mc->mapping = kzalloc(sizeof(struct pkvm_mapping), GFP_KERNEL_ACCOUNT);
		if (!mc->mapping)
			return -ENOMEM;
	}

	return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn,
				    kvm_host_pa, mc);
}

/**
 * kvm_phys_addr_ioremap - map a device range to guest IPA
 *
 * @kvm:	The KVM pointer
 * @guest_ipa:	The IPA at which to insert the mapping
 * @pa:		The physical address of the device
 * @size:	The size of the mapping
 * @writable:	Whether or not to create a writable mapping
 */
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
			  phys_addr_t pa, unsigned long size, bool writable)
{
	phys_addr_t addr;
	int ret = 0;
	struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
	struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
	struct kvm_pgtable *pgt = mmu->pgt;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
				     KVM_PGTABLE_PROT_R |
				     (writable ? KVM_PGTABLE_PROT_W : 0);

	if (is_protected_kvm_enabled())
		return -EPERM;

	size += offset_in_page(guest_ipa);
	guest_ipa &= PAGE_MASK;

	for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
		ret = kvm_mmu_topup_memory_cache(&cache,
						 kvm_mmu_cache_min_pages(mmu));
		if (ret)
			break;

		write_lock(&kvm->mmu_lock);
		ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE,
							 pa, prot, &cache, 0);
		write_unlock(&kvm->mmu_lock);
		if (ret)
			break;

		pa += PAGE_SIZE;
	}

	kvm_mmu_free_memory_cache(&cache);
	return ret;
}

/**
 * kvm_stage2_wp_range() - write protect stage2 memory region range
 * @mmu:	The KVM stage-2 MMU pointer
 * @addr:	Start address of range
 * @end:	End address of range
 */
void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
	stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect));
}

/**
 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
 * @kvm:	The KVM pointer
 * @slot:	The memory slot to write protect
 *
 * Called to start logging dirty pages after memory region
 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
 * all present PUD, PMD and PTEs are write protected in the memory region.
 * Afterwards read of dirty page log can be called.
 *
 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
 * serializing operations for VM memory regions.
 */
static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
	phys_addr_t start, end;

	if (WARN_ON_ONCE(!memslot))
		return;

	start = memslot->base_gfn << PAGE_SHIFT;
	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

	write_lock(&kvm->mmu_lock);
	kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
	kvm_nested_s2_wp(kvm);
	write_unlock(&kvm->mmu_lock);
	kvm_flush_remote_tlbs_memslot(kvm, memslot);
}

/**
 * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE
 *				   pages for memory slot
 * @kvm:	The KVM pointer
 * @slot:	The memory slot to split
 *
 * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired,
 * serializing operations for VM memory regions.
 */
static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	phys_addr_t start, end;

	lockdep_assert_held(&kvm->slots_lock);

	slots = kvm_memslots(kvm);
	memslot = id_to_memslot(slots, slot);

	start = memslot->base_gfn << PAGE_SHIFT;
	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

	write_lock(&kvm->mmu_lock);
	kvm_mmu_split_huge_pages(kvm, start, end);
	write_unlock(&kvm->mmu_lock);
}

/*
 * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages.
 * @kvm:	The KVM pointer
 * @slot:	The memory slot associated with mask
 * @gfn_offset:	The gfn offset in memory slot
 * @mask:	The mask of pages at offset 'gfn_offset' in this memory
 *		slot to enable dirty logging on
 *
 * Write-protects selected pages to enable dirty logging, and then
 * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
					     struct kvm_memory_slot *slot,
					     gfn_t gfn_offset, unsigned long mask)
{
	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
	phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;

	lockdep_assert_held_write(&kvm->mmu_lock);

	kvm_stage2_wp_range(&kvm->arch.mmu, start, end);

	/*
	 * Eager-splitting is done when manual-protect is set.  We
	 * also check for initially-all-set because we can avoid
	 * eager-splitting if initially-all-set is false.
	 * Initially-all-set equal false implies that huge-pages were
	 * already split when enabling dirty logging: no need to do it
	 * again.
	 */
	if (kvm_dirty_log_manual_protect_and_init_set(kvm))
		kvm_mmu_split_huge_pages(kvm, start, end);

	kvm_nested_s2_wp(kvm);
}

static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
{
	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
}

static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
					       unsigned long hva,
					       unsigned long map_size)
{
	gpa_t gpa_start;
	hva_t uaddr_start, uaddr_end;
	size_t size;

	/* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
	if (map_size == PAGE_SIZE)
		return true;

	/* pKVM only supports PMD_SIZE huge-mappings */
	if (is_protected_kvm_enabled() && map_size != PMD_SIZE)
		return false;

	size = memslot->npages * PAGE_SIZE;

	gpa_start = memslot->base_gfn << PAGE_SHIFT;

	uaddr_start = memslot->userspace_addr;
	uaddr_end = uaddr_start + size;

	/*
	 * Pages belonging to memslots that don't have the same alignment
	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
	 *
	 * Consider a layout like the following:
	 *
	 *    memslot->userspace_addr:
	 *    +-----+--------------------+--------------------+---+
	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
	 *    +-----+--------------------+--------------------+---+
	 *
	 *    memslot->base_gfn << PAGE_SHIFT:
	 *      +---+--------------------+--------------------+-----+
	 *      |abc|def  Stage-2 block  |    Stage-2 block    |tvxyz|
	 *      +---+--------------------+--------------------+-----+
	 *
	 * If we create those stage-2 blocks, we'll end up with this incorrect
	 * mapping:
	 *   d -> f
	 *   e -> g
	 *   f -> h
	 */
	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
		return false;

	/*
	 * Next, let's make sure we're not trying to map anything not covered
	 * by the memslot. This means we have to prohibit block size mappings
	 * for the beginning and end of a non-block aligned and non-block sized
	 * memory slot (illustrated by the head and tail parts of the
	 * userspace view above containing pages 'abcde' and 'xyz',
	 * respectively).
	 *
	 * Note that it doesn't matter if we do the check using the
	 * userspace_addr or the base_gfn, as both are equally aligned (per
	 * the check above) and equally sized.
	 */
	return (hva & ~(map_size - 1)) >= uaddr_start &&
	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
}

/*
 * Check if the given hva is backed by a transparent huge page (THP) and
 * whether it can be mapped using block mapping in stage2. If so, adjust
 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
 * supported. This will need to be updated to support other THP sizes.
 *
 * Returns the size of the mapping.
 */
static long
transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
			    unsigned long hva, kvm_pfn_t *pfnp,
			    phys_addr_t *ipap)
{
	kvm_pfn_t pfn = *pfnp;

	/*
	 * Make sure the adjustment is done only for THP pages. Also make
	 * sure that the HVA and IPA are sufficiently aligned and that the
	 * block map is contained within the memslot.
	 */
	if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
		int sz = get_user_mapping_size(kvm, hva);

		if (sz < 0)
			return sz;

		if (sz < PMD_SIZE)
			return PAGE_SIZE;

		*ipap &= PMD_MASK;
		pfn &= ~(PTRS_PER_PMD - 1);
		*pfnp = pfn;

		return PMD_SIZE;
	}

	/* Use page mapping if we cannot use block mapping. */
	return PAGE_SIZE;
}

static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
{
	unsigned long pa;

	if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
		return huge_page_shift(hstate_vma(vma));

	if (!(vma->vm_flags & VM_PFNMAP))
		return PAGE_SHIFT;

	VM_BUG_ON(is_vm_hugetlb_page(vma));

	pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);

#ifndef __PAGETABLE_PMD_FOLDED
	if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
	    ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
	    ALIGN(hva, PUD_SIZE) <= vma->vm_end)
		return PUD_SHIFT;
#endif

	if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
	    ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
	    ALIGN(hva, PMD_SIZE) <= vma->vm_end)
		return PMD_SHIFT;

	return PAGE_SHIFT;
}

/*
 * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be
 * able to see the page's tags and therefore they must be initialised first. If
 * PG_mte_tagged is set, tags have already been initialised.
 *
 * The race in the test/set of the PG_mte_tagged flag is handled by:
 * - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs
 *   racing to sanitise the same page
 * - mmap_lock protects between a VM faulting a page in and the VMM performing
 *   an mprotect() to add VM_MTE
 */
static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
			      unsigned long size)
{
	unsigned long i, nr_pages = size >> PAGE_SHIFT;
	struct page *page = pfn_to_page(pfn);
	struct folio *folio = page_folio(page);

	if (!kvm_has_mte(kvm))
		return;

	if (folio_test_hugetlb(folio)) {
		/* Hugetlb has MTE flags set on head page only */
		if (folio_try_hugetlb_mte_tagging(folio)) {
			for (i = 0; i < nr_pages; i++, page++)
				mte_clear_page_tags(page_address(page));
			folio_set_hugetlb_mte_tagged(folio);
		}
		return;
	}

	for (i = 0; i < nr_pages; i++, page++) {
		if (try_page_mte_tagging(page)) {
			mte_clear_page_tags(page_address(page));
			set_page_mte_tagged(page);
		}
	}
}

static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
{
	return vma->vm_flags & VM_MTE_ALLOWED;
}

static bool kvm_vma_is_cacheable(struct vm_area_struct *vma)
{
	switch (FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(vma->vm_page_prot))) {
	case MT_NORMAL_NC:
	case MT_DEVICE_nGnRnE:
	case MT_DEVICE_nGnRE:
		return false;
	default:
		return true;
	}
}
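
/*
 * Handle a stage-2 abort on memory backed by a memslot: resolve the backing
 * page, work out the largest mapping size that the memslot, the VMA and any
 * nested translation allow, and install the mapping (or relax permissions on
 * an existing one) under the stage-2 fault lock.
 */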
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			  struct kvm_s2_trans *nested,
			  struct kvm_memory_slot *memslot, unsigned long hva,
			  bool fault_is_perm)
{
	int ret = 0;
	bool write_fault, writable, force_pte = false;
	bool exec_fault, mte_allowed, is_vma_cacheable;
	bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
	unsigned long mmu_seq;
	phys_addr_t ipa = fault_ipa;
	struct kvm *kvm = vcpu->kvm;
	struct vm_area_struct *vma;
	short vma_shift;
	void *memcache;
	gfn_t gfn;
	kvm_pfn_t pfn;
	bool logging_active = memslot_is_logging(memslot);
	long vma_pagesize, fault_granule;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
	struct kvm_pgtable *pgt;
	struct page *page;
	vm_flags_t vm_flags;
	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;

	if (fault_is_perm)
		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
	write_fault = kvm_is_write_fault(vcpu);
	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
	VM_BUG_ON(write_fault && exec_fault);

	if (fault_is_perm && !write_fault && !exec_fault) {
		kvm_err("Unexpected L2 read permission error\n");
		return -EFAULT;
	}

	if (!is_protected_kvm_enabled())
		memcache = &vcpu->arch.mmu_page_cache;
	else
		memcache = &vcpu->arch.pkvm_memcache;

	/*
	 * Permission faults just need to update the existing leaf entry,
	 * and so normally don't require allocations from the memcache. The
	 * only exception to this is when dirty logging is enabled at runtime
	 * and a write fault needs to collapse a block entry into a table.
	 */
	if (!fault_is_perm || (logging_active && write_fault)) {
		int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);

		if (!is_protected_kvm_enabled())
			ret = kvm_mmu_topup_memory_cache(memcache, min_pages);
		else
			ret = topup_hyp_memcache(memcache, min_pages);

		if (ret)
			return ret;
	}

	/*
	 * Let's check if we will get back a huge page backed by hugetlbfs, or
	 * get block mapping for device MMIO region.
	 */
	mmap_read_lock(current->mm);
	vma = vma_lookup(current->mm, hva);
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
		mmap_read_unlock(current->mm);
		return -EFAULT;
	}

	/*
	 * logging_active is guaranteed to never be true for VM_PFNMAP
	 * memslots.
	 */
	if (logging_active) {
		force_pte = true;
		vma_shift = PAGE_SHIFT;
	} else {
		vma_shift = get_vma_page_shift(vma, hva);
	}

	switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
	case PUD_SHIFT:
		if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
			break;
		fallthrough;
#endif
	case CONT_PMD_SHIFT:
		vma_shift = PMD_SHIFT;
		fallthrough;
	case PMD_SHIFT:
		if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
			break;
		fallthrough;
	case CONT_PTE_SHIFT:
		vma_shift = PAGE_SHIFT;
		force_pte = true;
		fallthrough;
	case PAGE_SHIFT:
		break;
	default:
		WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
	}

	vma_pagesize = 1UL << vma_shift;

	if (nested) {
		unsigned long max_map_size;

		max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE;

		ipa = kvm_s2_trans_output(nested);

		/*
		 * If we're about to create a shadow stage 2 entry, then we
		 * can only create a block mapping if the guest stage 2 page
		 * table uses at least as big a mapping.
		 */
		max_map_size = min(kvm_s2_trans_size(nested), max_map_size);

		/*
		 * Be careful that if the mapping size falls between
		 * two host sizes, take the smallest of the two.
		 */
		if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE)
			max_map_size = PMD_SIZE;
		else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE)
			max_map_size = PAGE_SIZE;

		force_pte = (max_map_size == PAGE_SIZE);
		vma_pagesize = min(vma_pagesize, (long)max_map_size);
	}

	/*
	 * Both the canonical IPA and fault IPA must be hugepage-aligned to
	 * ensure we find the right PFN and lay down the mapping in the right
	 * place.
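	 *
	 * For example, with 4KB pages a PMD-sized mapping requires both
	 * addresses to be rounded down to a 2MiB boundary here.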
	 */
	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) {
		fault_ipa &= ~(vma_pagesize - 1);
		ipa &= ~(vma_pagesize - 1);
	}

	gfn = ipa >> PAGE_SHIFT;
	mte_allowed = kvm_vma_mte_allowed(vma);

	vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;

	vm_flags = vma->vm_flags;

	is_vma_cacheable = kvm_vma_is_cacheable(vma);

	/* Don't use the VMA after the unlock -- it may have vanished */
	vma = NULL;

	/*
	 * Read mmu_invalidate_seq so that KVM can detect if the results of
	 * vma_lookup() or __kvm_faultin_pfn() become stale prior to
	 * acquiring kvm->mmu_lock.
	 *
	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
	 * with the smp_wmb() in kvm_mmu_invalidate_end().
	 */
	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
	mmap_read_unlock(current->mm);

	pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
				&writable, &page);
	if (pfn == KVM_PFN_ERR_HWPOISON) {
		kvm_send_hwpoison_signal(hva, vma_shift);
		return 0;
	}
	if (is_error_noslot_pfn(pfn))
		return -EFAULT;

	/*
	 * Check if this is non-struct page memory PFN, and cannot support
	 * CMOs. It could potentially be unsafe to access as cacheable.
	 */
	if (vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(pfn)) {
		if (is_vma_cacheable) {
			/*
			 * Whilst the VMA owner expects a cacheable mapping of
			 * this PFN, hardware also has to support the FWB and
			 * CACHE DIC features.
			 *
			 * ARM64 KVM relies on kernel VA mapping to the PFN to
			 * perform cache maintenance as the CMO instructions work on
			 * virtual addresses. VM_PFNMAP regions are not necessarily
			 * mapped to a KVA and hence the presence of hardware features
			 * S2FWB and CACHE DIC is mandatory to avoid the need for
			 * cache maintenance.
			 */
			if (!kvm_supports_cacheable_pfnmap())
				return -EFAULT;
		} else {
			/*
			 * If the page was identified as device early by looking at
			 * the VMA flags, vma_pagesize is already representing the
			 * largest quantity we can map.  If instead it was mapped
			 * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
			 * and must not be upgraded.
			 *
			 * In both cases, we don't let transparent_hugepage_adjust()
			 * change things at the last minute.
			 */
			s2_force_noncacheable = true;
		}
	} else if (logging_active && !write_fault) {
		/*
		 * Only actually map the page as writable if this was a write
		 * fault.
		 */
		writable = false;
	}

	if (exec_fault && s2_force_noncacheable)
		return -ENOEXEC;

	/*
	 * Potentially reduce shadow S2 permissions to match the guest's own
	 * S2. For exec faults, we'd only reach this point if the guest
	 * actually allowed it (see kvm_s2_handle_perm_fault).
	 *
	 * Also encode the level of the original translation in the SW bits
	 * of the leaf entry as a proxy for the span of that translation.
	 * This will be retrieved on TLB invalidation from the guest and
	 * used to limit the invalidation scope if a TTL hint or a range
	 * isn't provided.
	 */
	if (nested) {
		writable &= kvm_s2_trans_writable(nested);
		if (!kvm_s2_trans_readable(nested))
			prot &= ~KVM_PGTABLE_PROT_R;

		prot |= kvm_encode_nested_level(nested);
	}

	kvm_fault_lock(kvm);
	pgt = vcpu->arch.hw_mmu->pgt;
	if (mmu_invalidate_retry(kvm, mmu_seq)) {
		ret = -EAGAIN;
		goto out_unlock;
	}

	/*
	 * If we are not forced to use page mapping, check if we are
	 * backed by a THP and thus use block mapping if possible.
	 */
	if (vma_pagesize == PAGE_SIZE && !(force_pte || s2_force_noncacheable)) {
		if (fault_is_perm && fault_granule > PAGE_SIZE)
			vma_pagesize = fault_granule;
		else
			vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
								   hva, &pfn,
								   &fault_ipa);

		if (vma_pagesize < 0) {
			ret = vma_pagesize;
			goto out_unlock;
		}
	}

	if (!fault_is_perm && !s2_force_noncacheable && kvm_has_mte(kvm)) {
		/* Check the VMM hasn't introduced a new disallowed VMA */
		if (mte_allowed) {
			sanitise_mte_tags(kvm, pfn, vma_pagesize);
		} else {
			ret = -EFAULT;
			goto out_unlock;
		}
	}

	if (writable)
		prot |= KVM_PGTABLE_PROT_W;

	if (exec_fault)
		prot |= KVM_PGTABLE_PROT_X;

	if (s2_force_noncacheable) {
		if (vfio_allow_any_uc)
			prot |= KVM_PGTABLE_PROT_NORMAL_NC;
		else
			prot |= KVM_PGTABLE_PROT_DEVICE;
	} else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
		   (!nested || kvm_s2_trans_executable(nested))) {
		prot |= KVM_PGTABLE_PROT_X;
	}

	/*
	 * Under the premise of getting a FSC_PERM fault, we just need to relax
	 * permissions only if vma_pagesize equals fault_granule. Otherwise,
	 * kvm_pgtable_stage2_map() should be called to change block size.
	 */
	if (fault_is_perm && vma_pagesize == fault_granule) {
		/*
		 * Drop the SW bits in favour of those stored in the
		 * PTE, which will be preserved.
		 */
		prot &= ~KVM_NV_GUEST_MAP_SZ;
		ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags);
	} else {
		ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
							 __pfn_to_phys(pfn), prot,
							 memcache, flags);
	}

out_unlock:
	kvm_release_faultin_page(kvm, page, !!ret, writable);
	kvm_fault_unlock(kvm);

	/* Mark the page dirty only if the fault is handled successfully */
	if (writable && !ret)
		mark_page_dirty_in_slot(kvm, memslot, gfn);

	return ret != -EAGAIN ? ret : 0;
}

/* Resolve the access fault by making the page young again. */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
	struct kvm_s2_mmu *mmu;

	trace_kvm_access_fault(fault_ipa);

	read_lock(&vcpu->kvm->mmu_lock);
	mmu = vcpu->arch.hw_mmu;
	KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags);
	read_unlock(&vcpu->kvm->mmu_lock);
}

int kvm_handle_guest_sea(struct kvm_vcpu *vcpu)
{
	/*
	 * Give APEI the opportunity to claim the abort before handling it
	 * within KVM. apei_claim_sea() expects to be called with IRQs enabled.
	 */
	lockdep_assert_irqs_enabled();
	if (apei_claim_sea(NULL) == 0)
		return 1;

	return kvm_inject_serror(vcpu);
}

/**
 * kvm_handle_guest_abort - handles all 2nd stage aborts
 * @vcpu:	the VCPU pointer
 *
 * Any abort that gets to the host is almost guaranteed to be caused by a
 * missing second stage translation table entry, which can mean that either the
 * guest simply needs more memory and we must allocate an appropriate page or it
 * can mean that the guest tried to access I/O memory, which is emulated by user
 * space. The distinction is based on the IPA causing the fault and whether this
 * memory region has been registered as standard RAM by user space.
 */
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
{
	struct kvm_s2_trans nested_trans, *nested = NULL;
	unsigned long esr;
	phys_addr_t fault_ipa; /* The address we faulted on */
	phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */
	struct kvm_memory_slot *memslot;
	unsigned long hva;
	bool is_iabt, write_fault, writable;
	gfn_t gfn;
	int ret, idx;

	if (kvm_vcpu_abt_issea(vcpu))
		return kvm_handle_guest_sea(vcpu);

	esr = kvm_vcpu_get_esr(vcpu);

	/*
	 * The fault IPA should be reliable at this point as we're not dealing
	 * with an SEA.
	 */
	ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
	if (KVM_BUG_ON(ipa == INVALID_GPA, vcpu->kvm))
		return -EFAULT;

	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);

	if (esr_fsc_is_translation_fault(esr)) {
		/* Beyond sanitised PARange (which is the IPA limit) */
		if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
			kvm_inject_size_fault(vcpu);
			return 1;
		}

		/* Falls between the IPA range and the PARange? */
		if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) {
			fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);

			return kvm_inject_sea(vcpu, is_iabt, fault_ipa);
		}
	}

	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
			      kvm_vcpu_get_hfar(vcpu), fault_ipa);

	/* Check the stage-2 fault is trans. fault or write fault */
	if (!esr_fsc_is_translation_fault(esr) &&
	    !esr_fsc_is_permission_fault(esr) &&
	    !esr_fsc_is_access_flag_fault(esr)) {
		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
			kvm_vcpu_trap_get_class(vcpu),
			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
			(unsigned long)kvm_vcpu_get_esr(vcpu));
		return -EFAULT;
	}

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	/*
	 * We may have faulted on a shadow stage 2 page table if we are
	 * running a nested guest.  In this case, we have to resolve the L2
	 * IPA to the L1 IPA first, before knowing what kind of memory should
	 * back the L1 IPA.
	 *
	 * If the shadow stage 2 page table walk faults, then we simply inject
	 * this to the guest and carry on.
	 *
	 * If there are no shadow S2 PTs because S2 is disabled, there is
	 * nothing to walk and we treat it as a 1:1 before going through the
	 * canonical translation.
         */
        if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu) &&
            vcpu->arch.hw_mmu->nested_stage2_enabled) {
                u32 esr;

                ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
                if (ret) {
                        esr = kvm_s2_trans_esr(&nested_trans);
                        kvm_inject_s2_fault(vcpu, esr);
                        goto out_unlock;
                }

                ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans);
                if (ret) {
                        esr = kvm_s2_trans_esr(&nested_trans);
                        kvm_inject_s2_fault(vcpu, esr);
                        goto out_unlock;
                }

                ipa = kvm_s2_trans_output(&nested_trans);
                nested = &nested_trans;
        }

        gfn = ipa >> PAGE_SHIFT;
        memslot = gfn_to_memslot(vcpu->kvm, gfn);
        hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
        write_fault = kvm_is_write_fault(vcpu);
        if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
                /*
                 * The guest has put either its instructions or its page-tables
                 * somewhere it shouldn't have. Userspace won't be able to do
                 * anything about this (there's no syndrome for a start), so
                 * re-inject the abort back into the guest.
                 */
                if (is_iabt) {
                        ret = -ENOEXEC;
                        goto out;
                }

                if (kvm_vcpu_abt_iss1tw(vcpu)) {
                        ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
                        goto out_unlock;
                }

                /*
                 * Check for a cache maintenance operation. Since we
                 * ended-up here, we know it is outside of any memory
                 * slot. But we can't find out if that is for a device,
                 * or if the guest is just being stupid. The only thing
                 * we know for sure is that this range cannot be cached.
                 *
                 * So let's assume that the guest is just being
                 * cautious, and skip the instruction.
                 */
                if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
                        kvm_incr_pc(vcpu);
                        ret = 1;
                        goto out_unlock;
                }

                /*
                 * The IPA is reported as [MAX:12], so we need to
                 * complement it with the bottom 12 bits from the
                 * faulting VA. This is always 12 bits, irrespective
                 * of the page size.
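                 * The completed address is then handed to io_mem_abort() so
                 * that the access can be emulated, either in-kernel or by
                 * userspace.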
                 */
                ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
                ret = io_mem_abort(vcpu, ipa);
                goto out_unlock;
        }

        /* Userspace should not be able to register out-of-bounds IPAs */
        VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu));

        if (esr_fsc_is_access_flag_fault(esr)) {
                handle_access_fault(vcpu, fault_ipa);
                ret = 1;
                goto out_unlock;
        }

        ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
                             esr_fsc_is_permission_fault(esr));
        if (ret == 0)
                ret = 1;
out:
        if (ret == -ENOEXEC)
                ret = kvm_inject_sea_iabt(vcpu, kvm_vcpu_get_hfar(vcpu));
out_unlock:
        srcu_read_unlock(&vcpu->kvm->srcu, idx);
        return ret;
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
        if (!kvm->arch.mmu.pgt)
                return false;

        __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
                             (range->end - range->start) << PAGE_SHIFT,
                             range->may_block);

        kvm_nested_s2_unmap(kvm, range->may_block);
        return false;
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
        u64 size = (range->end - range->start) << PAGE_SHIFT;

        if (!kvm->arch.mmu.pgt)
                return false;

        return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
                                                               range->start << PAGE_SHIFT,
                                                               size, true);
        /*
         * TODO: Handle nested_mmu structures here using the reverse mapping in
         * a later version of the patch series.
         */
}

bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
        u64 size = (range->end - range->start) << PAGE_SHIFT;

        if (!kvm->arch.mmu.pgt)
                return false;

        return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
                                                               range->start << PAGE_SHIFT,
                                                               size, false);
}

phys_addr_t kvm_mmu_get_httbr(void)
{
        return __pa(hyp_pgtable->pgd);
}

phys_addr_t kvm_get_idmap_vector(void)
{
        return hyp_idmap_vector;
}

static int kvm_map_idmap_text(void)
{
        unsigned long size = hyp_idmap_end - hyp_idmap_start;
        int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
                                        PAGE_HYP_EXEC);
        if (err)
                kvm_err("Failed to idmap %lx-%lx\n",
                        hyp_idmap_start, hyp_idmap_end);

        return err;
}

static void *kvm_hyp_zalloc_page(void *arg)
{
        return (void *)get_zeroed_page(GFP_KERNEL);
}

static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
        .zalloc_page = kvm_hyp_zalloc_page,
        .get_page = kvm_host_get_page,
        .put_page = kvm_host_put_page,
        .phys_to_virt = kvm_host_va,
        .virt_to_phys = kvm_host_pa,
};

int __init kvm_mmu_init(u32 *hyp_va_bits)
{
        int err;
        u32 idmap_bits;
        u32 kernel_bits;

        hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
        hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
        hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
        hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
        hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);

        /*
         * We rely on the linker script to ensure at build time that the HYP
         * init code does not cross a page boundary.
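         * The BUG_ON() below checks exactly that: if hyp_idmap_start and
         * (hyp_idmap_end - 1) differ in any bit covered by PAGE_MASK, the
         * idmap text would span more than one page.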
         */
        BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);

        /*
         * The ID map is always configured for 48 bits of translation, which
         * may be fewer than the number of VA bits used by the regular kernel
         * stage 1, when VA_BITS=52.
         *
         * At EL2, there is only one TTBR register, and we can't switch between
         * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom
         * line: we need to use the extended range with *both* our translation
         * tables.
         *
         * So use the maximum of the idmap VA bits and the regular kernel stage
         * 1 VA bits to ensure that the hypervisor can both ID map its code page
         * and map any kernel memory.
         */
        idmap_bits = IDMAP_VA_BITS;
        kernel_bits = vabits_actual;
        *hyp_va_bits = max(idmap_bits, kernel_bits);

        kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
        kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
        kvm_debug("HYP VA range: %lx:%lx\n",
                  kern_hyp_va(PAGE_OFFSET),
                  kern_hyp_va((unsigned long)high_memory - 1));

        if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
            hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) &&
            hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
                /*
                 * The idmap page intersects with the VA space; it is not safe
                 * to continue further.
                 */
                kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
                err = -EINVAL;
                goto out;
        }

        hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
        if (!hyp_pgtable) {
                kvm_err("Hyp mode page-table not allocated\n");
                err = -ENOMEM;
                goto out;
        }

        err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops);
        if (err)
                goto out_free_pgtable;

        err = kvm_map_idmap_text();
        if (err)
                goto out_destroy_pgtable;

        io_map_base = hyp_idmap_start;
        __hyp_va_bits = *hyp_va_bits;
        return 0;

out_destroy_pgtable:
        kvm_pgtable_hyp_destroy(hyp_pgtable);
out_free_pgtable:
        kfree(hyp_pgtable);
        hyp_pgtable = NULL;
out:
        return err;
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
                                   struct kvm_memory_slot *old,
                                   const struct kvm_memory_slot *new,
                                   enum kvm_mr_change change)
{
        bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;

        /*
         * At this point the memslot has been committed and there is an
         * allocated dirty_bitmap[]; dirty pages will be tracked while the
         * memory slot is write protected.
         */
        if (log_dirty_pages) {

                if (change == KVM_MR_DELETE)
                        return;

                /*
                 * Huge and normal pages are write-protected and split
                 * in either of these two cases:
                 *
                 * 1. with initial-all-set: gradually with CLEAR ioctls,
                 */
                if (kvm_dirty_log_manual_protect_and_init_set(kvm))
                        return;
                /*
                 * or
                 * 2. without initial-all-set: all in one shot when
                 *    enabling dirty logging.
                 */
                kvm_mmu_wp_memory_region(kvm, new->id);
                kvm_mmu_split_memory_region(kvm, new->id);
        } else {
                /*
                 * Free any leftovers from the eager page splitting cache. Do
                 * this when deleting, moving, disabling dirty logging, or
                 * creating the memslot (a nop). Doing it for deletes makes
                 * sure we don't leak memory, and there's no need to keep the
                 * cache around for any of the other cases.
                 */
                kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
        }
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                   const struct kvm_memory_slot *old,
                                   struct kvm_memory_slot *new,
                                   enum kvm_mr_change change)
{
        hva_t hva, reg_end;
        int ret = 0;

        if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
            change != KVM_MR_FLAGS_ONLY)
                return 0;

        /*
         * Prevent userspace from creating a memory region outside of the IPA
         * space addressable by the guest.
         */
        if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
                return -EFAULT;

        hva = new->userspace_addr;
        reg_end = hva + (new->npages << PAGE_SHIFT);

        mmap_read_lock(current->mm);
        /*
         * A memory region could potentially cover multiple VMAs, and any holes
         * between them, so iterate over all of them.
         *
         *     +--------------------------------------------+
         * +---------------+----------------+   +----------------+
         * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
         * +---------------+----------------+   +----------------+
         *     |               memory region                |
         *     +--------------------------------------------+
         */
        do {
                struct vm_area_struct *vma;

                vma = find_vma_intersection(current->mm, hva, reg_end);
                if (!vma)
                        break;

                if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
                        ret = -EINVAL;
                        break;
                }

                if (vma->vm_flags & VM_PFNMAP) {
                        /* IO region dirty page logging not allowed */
                        if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
                                ret = -EINVAL;
                                break;
                        }

                        /*
                         * Cacheable PFNMAP is allowed only if the hardware
                         * supports it.
                         */
                        if (kvm_vma_is_cacheable(vma) && !kvm_supports_cacheable_pfnmap()) {
                                ret = -EINVAL;
                                break;
                        }
                }
                hva = min(reg_end, vma->vm_end);
        } while (hva < reg_end);

        mmap_read_unlock(current->mm);
        return ret;
}

void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
}

void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
                                   struct kvm_memory_slot *slot)
{
        gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
        phys_addr_t size = slot->npages << PAGE_SHIFT;

        write_lock(&kvm->mmu_lock);
        kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true);
        kvm_nested_s2_unmap(kvm, true);
        write_unlock(&kvm->mmu_lock);
}

/*
 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
 *
 * Main problems:
 * - S/W ops are local to a CPU (not broadcast)
 * - We have line migration behind our back (speculation)
 * - System caches don't support S/W at all (damn!)
 *
 * In the face of the above, the best we can do is to try and convert
 * S/W ops to VA ops. Because the guest is not allowed to infer the
 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
 * which is a rather good thing for us.
 *
 * Also, it is only used when turning caches on/off ("The expected
 * usage of the cache maintenance instructions that operate by set/way
 * is associated with the cache maintenance instructions associated
 * with the powerdown and powerup of caches, if this is required by
 * the implementation.").
 *
 * We use the following policy:
 *
 * - If we trap a S/W operation, we enable VM trapping to detect
 *   caches being turned on/off, and do a full clean.
 *
 * - We flush the caches both when they are being turned on and when
 *   they are being turned off.
 *
 * - Once the caches are enabled, we stop trapping VM ops.
 */
void kvm_set_way_flush(struct kvm_vcpu *vcpu)
{
        unsigned long hcr = *vcpu_hcr(vcpu);

        /*
         * If this is the first time we see a S/W operation
         * (i.e. HCR_TVM is not set), flush the whole guest memory and
         * enable VM trapping.
         *
         * Otherwise, rely on the VM trapping to wait for the MMU +
         * caches to be turned off. At that point, we'll be able to
         * clean the caches again.
         */
        if (!(hcr & HCR_TVM)) {
                trace_kvm_set_way_flush(*vcpu_pc(vcpu),
                                        vcpu_has_cache_enabled(vcpu));
                stage2_flush_vm(vcpu->kvm);
                *vcpu_hcr(vcpu) = hcr | HCR_TVM;
        }
}

void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
{
        bool now_enabled = vcpu_has_cache_enabled(vcpu);

        /*
         * If switching the MMU+caches on, we need to invalidate the caches.
         * If switching them off, we need to clean the caches.
         * Clean + invalidate does the trick always.
         */
        if (now_enabled != was_enabled)
                stage2_flush_vm(vcpu->kvm);

        /* Caches are now on; stop trapping VM ops (until a S/W op) */
        if (now_enabled)
                *vcpu_hcr(vcpu) &= ~HCR_TVM;

        trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
}