// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/kvm_pkvm.h>
#include <asm/kvm_ras.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>

#include "trace.h"

static struct kvm_pgtable *hyp_pgtable;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

static unsigned long __ro_after_init hyp_idmap_start;
static unsigned long __ro_after_init hyp_idmap_end;
static phys_addr_t __ro_after_init hyp_idmap_vector;

u32 __ro_after_init __hyp_va_bits;

static unsigned long __ro_after_init io_map_base;

#define KVM_PGT_FN(fn)	(!is_protected_kvm_enabled() ? fn : p ## fn)

static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
					   phys_addr_t size)
{
	phys_addr_t boundary = ALIGN_DOWN(addr + size, size);

	return (boundary - 1 < end - 1) ? boundary : end;
}

static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);

	return __stage2_range_addr_end(addr, end, size);
}

/*
 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
 * long will also starve other vCPUs. We also have to make sure that the page
 * tables are not freed while we release the lock.
 */
static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
			      phys_addr_t end,
			      int (*fn)(struct kvm_pgtable *, u64, u64),
			      bool resched)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	int ret;
	u64 next;

	do {
		struct kvm_pgtable *pgt = mmu->pgt;
		if (!pgt)
			return -EINVAL;

		next = stage2_range_addr_end(addr, end);
		ret = fn(pgt, addr, next - addr);
		if (ret)
			break;

		if (resched && next != end)
			cond_resched_rwlock_write(&kvm->mmu_lock);
	} while (addr = next, addr != end);

	return ret;
}

#define stage2_apply_range_resched(mmu, addr, end, fn)			\
	stage2_apply_range(mmu, addr, end, fn, true)

/*
 * Get the maximum number of page-tables pages needed to split a range
 * of blocks into PAGE_SIZE PTEs. It assumes the range is already
 * mapped at level 2, or at level 1 if allowed.
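 *
 * For example (a sketch assuming 4KiB granules, i.e. 2MiB level-2 and 1GiB
 * level-1 blocks): splitting a 32MiB chunk needs at most 16 level-3 tables,
 * plus one level-2 table when level-1 blocks are allowed, so 16 or 17 pages.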
 */
static int kvm_mmu_split_nr_page_tables(u64 range)
{
	int n = 0;

	if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2)
		n += DIV_ROUND_UP(range, PUD_SIZE);
	n += DIV_ROUND_UP(range, PMD_SIZE);
	return n;
}

static bool need_split_memcache_topup_or_resched(struct kvm *kvm)
{
	struct kvm_mmu_memory_cache *cache;
	u64 chunk_size, min;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
		return true;

	chunk_size = kvm->arch.mmu.split_page_chunk_size;
	min = kvm_mmu_split_nr_page_tables(chunk_size);
	cache = &kvm->arch.mmu.split_page_cache;
	return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
}

static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
				    phys_addr_t end)
{
	struct kvm_mmu_memory_cache *cache;
	struct kvm_pgtable *pgt;
	int ret, cache_capacity;
	u64 next, chunk_size;

	lockdep_assert_held_write(&kvm->mmu_lock);

	chunk_size = kvm->arch.mmu.split_page_chunk_size;
	cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size);

	if (chunk_size == 0)
		return 0;

	cache = &kvm->arch.mmu.split_page_cache;

	do {
		if (need_split_memcache_topup_or_resched(kvm)) {
			write_unlock(&kvm->mmu_lock);
			cond_resched();
			/* Eager page splitting is best-effort. */
			ret = __kvm_mmu_topup_memory_cache(cache,
							   cache_capacity,
							   cache_capacity);
			write_lock(&kvm->mmu_lock);
			if (ret)
				break;
		}

		pgt = kvm->arch.mmu.pgt;
		if (!pgt)
			return -EINVAL;

		next = __stage2_range_addr_end(addr, end, chunk_size);
		ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache);
		if (ret)
			break;
	} while (addr = next, addr != end);

	return ret;
}

static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
}

/**
 * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8
 * @kvm:	pointer to kvm structure.
 *
 * Interface to HYP function to flush all VM TLB entries
 */
int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
	if (is_protected_kvm_enabled())
		kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
	else
		kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
	return 0;
}

int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
				     gfn_t gfn, u64 nr_pages)
{
	u64 size = nr_pages << PAGE_SHIFT;
	u64 addr = gfn << PAGE_SHIFT;

	if (is_protected_kvm_enabled())
		kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
	else
		kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size);
	return 0;
}

static bool kvm_is_device_pfn(unsigned long pfn)
{
	return !pfn_is_map_memory(pfn);
}

static void *stage2_memcache_zalloc_page(void *arg)
{
	struct kvm_mmu_memory_cache *mc = arg;
	void *virt;

	/* Allocated with __GFP_ZERO, so no need to zero */
	virt = kvm_mmu_memory_cache_alloc(mc);
	if (virt)
		kvm_account_pgtable_pages(virt, 1);
	return virt;
}

static void *kvm_host_zalloc_pages_exact(size_t size)
{
	return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
}

static void *kvm_s2_zalloc_pages_exact(size_t size)
{
	void *virt = kvm_host_zalloc_pages_exact(size);

	if (virt)
		kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT));
	return virt;
}

static void kvm_s2_free_pages_exact(void *virt, size_t size)
{
	kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT));
	free_pages_exact(virt, size);
}

static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;

static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head)
{
	struct page *page = container_of(head, struct page, rcu_head);
	void *pgtable = page_to_virt(page);
	s8 level = page_private(page);

	KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level);
}

static void stage2_free_unlinked_table(void *addr, s8 level)
{
	struct page *page = virt_to_page(addr);

	set_page_private(page, (unsigned long)level);
	call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb);
}

static void kvm_host_get_page(void *addr)
{
	get_page(virt_to_page(addr));
}

static void kvm_host_put_page(void *addr)
{
	put_page(virt_to_page(addr));
}

static void kvm_s2_put_page(void *addr)
{
	struct page *p = virt_to_page(addr);
	/* Dropping last refcount, the page will be freed */
	if (page_count(p) == 1)
		kvm_account_pgtable_pages(addr, -1);
	put_page(p);
}

static int kvm_host_page_count(void *addr)
{
	return page_count(virt_to_page(addr));
}

static phys_addr_t kvm_host_pa(void *addr)
{
	return __pa(addr);
}

static void *kvm_host_va(phys_addr_t phys)
{
	return __va(phys);
}

static void clean_dcache_guest_page(void *va, size_t size)
{
	__clean_dcache_guest_page(va, size);
}

static void invalidate_icache_guest_page(void *va, size_t size)
{
	__invalidate_icache_guest_page(va, size);
}

/*
 * Unmapping vs dcache management:
 *
 * If a guest maps certain memory pages as uncached, all writes will
 * bypass the data cache and go directly to RAM. However, the CPUs
 * can still speculate reads (not writes) and fill cache lines with
 * data.
 *
 * Those cache lines will be *clean* cache lines though, so a
 * clean+invalidate operation is equivalent to an invalidate
 * operation, because no cache lines are marked dirty.
 *
 * Those clean cache lines could be filled prior to an uncached write
 * by the guest, and the cache coherent IO subsystem would therefore
 * end up writing old data to disk.
 *
 * This is why right after unmapping a page/section and invalidating
 * the corresponding TLBs, we flush to make sure the IO subsystem will
 * never hit in the cache.
 *
 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
 * we then fully enforce cacheability of RAM, no matter what the guest
 * does.
 */
/**
 * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 * @mmu:   The KVM stage-2 MMU pointer
 * @start: The intermediate physical base address of the range to unmap
 * @size:  The size of the area to unmap
 * @may_block: Whether or not we are permitted to block
 *
 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 * destroying the VM), otherwise another faulting VCPU may come in and mess
 * with things behind our backs.
 */
static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
				 bool may_block)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	phys_addr_t end = start + size;

	lockdep_assert_held_write(&kvm->mmu_lock);
	WARN_ON(size & ~PAGE_MASK);
	WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap),
				   may_block));
}

void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start,
			    u64 size, bool may_block)
{
	__unmap_stage2_range(mmu, start, size, may_block);
}

void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
	stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush));
}

static void stage2_flush_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;

	kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
}

/**
 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 * @kvm: The struct kvm pointer
 *
 * Go through the stage 2 page tables and invalidate any cache lines
 * backing memory already mapped to the VM.
370 */ 371 static void stage2_flush_vm(struct kvm *kvm) 372 { 373 struct kvm_memslots *slots; 374 struct kvm_memory_slot *memslot; 375 int idx, bkt; 376 377 idx = srcu_read_lock(&kvm->srcu); 378 write_lock(&kvm->mmu_lock); 379 380 slots = kvm_memslots(kvm); 381 kvm_for_each_memslot(memslot, bkt, slots) 382 stage2_flush_memslot(kvm, memslot); 383 384 kvm_nested_s2_flush(kvm); 385 386 write_unlock(&kvm->mmu_lock); 387 srcu_read_unlock(&kvm->srcu, idx); 388 } 389 390 /** 391 * free_hyp_pgds - free Hyp-mode page tables 392 */ 393 void __init free_hyp_pgds(void) 394 { 395 mutex_lock(&kvm_hyp_pgd_mutex); 396 if (hyp_pgtable) { 397 kvm_pgtable_hyp_destroy(hyp_pgtable); 398 kfree(hyp_pgtable); 399 hyp_pgtable = NULL; 400 } 401 mutex_unlock(&kvm_hyp_pgd_mutex); 402 } 403 404 static bool kvm_host_owns_hyp_mappings(void) 405 { 406 if (is_kernel_in_hyp_mode()) 407 return false; 408 409 if (static_branch_likely(&kvm_protected_mode_initialized)) 410 return false; 411 412 /* 413 * This can happen at boot time when __create_hyp_mappings() is called 414 * after the hyp protection has been enabled, but the static key has 415 * not been flipped yet. 416 */ 417 if (!hyp_pgtable && is_protected_kvm_enabled()) 418 return false; 419 420 WARN_ON(!hyp_pgtable); 421 422 return true; 423 } 424 425 int __create_hyp_mappings(unsigned long start, unsigned long size, 426 unsigned long phys, enum kvm_pgtable_prot prot) 427 { 428 int err; 429 430 if (WARN_ON(!kvm_host_owns_hyp_mappings())) 431 return -EINVAL; 432 433 mutex_lock(&kvm_hyp_pgd_mutex); 434 err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot); 435 mutex_unlock(&kvm_hyp_pgd_mutex); 436 437 return err; 438 } 439 440 static phys_addr_t kvm_kaddr_to_phys(void *kaddr) 441 { 442 if (!is_vmalloc_addr(kaddr)) { 443 BUG_ON(!virt_addr_valid(kaddr)); 444 return __pa(kaddr); 445 } else { 446 return page_to_phys(vmalloc_to_page(kaddr)) + 447 offset_in_page(kaddr); 448 } 449 } 450 451 struct hyp_shared_pfn { 452 u64 pfn; 453 int count; 454 struct rb_node node; 455 }; 456 457 static DEFINE_MUTEX(hyp_shared_pfns_lock); 458 static struct rb_root hyp_shared_pfns = RB_ROOT; 459 460 static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node, 461 struct rb_node **parent) 462 { 463 struct hyp_shared_pfn *this; 464 465 *node = &hyp_shared_pfns.rb_node; 466 *parent = NULL; 467 while (**node) { 468 this = container_of(**node, struct hyp_shared_pfn, node); 469 *parent = **node; 470 if (this->pfn < pfn) 471 *node = &((**node)->rb_left); 472 else if (this->pfn > pfn) 473 *node = &((**node)->rb_right); 474 else 475 return this; 476 } 477 478 return NULL; 479 } 480 481 static int share_pfn_hyp(u64 pfn) 482 { 483 struct rb_node **node, *parent; 484 struct hyp_shared_pfn *this; 485 int ret = 0; 486 487 mutex_lock(&hyp_shared_pfns_lock); 488 this = find_shared_pfn(pfn, &node, &parent); 489 if (this) { 490 this->count++; 491 goto unlock; 492 } 493 494 this = kzalloc(sizeof(*this), GFP_KERNEL); 495 if (!this) { 496 ret = -ENOMEM; 497 goto unlock; 498 } 499 500 this->pfn = pfn; 501 this->count = 1; 502 rb_link_node(&this->node, parent, node); 503 rb_insert_color(&this->node, &hyp_shared_pfns); 504 ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn, 1); 505 unlock: 506 mutex_unlock(&hyp_shared_pfns_lock); 507 508 return ret; 509 } 510 511 static int unshare_pfn_hyp(u64 pfn) 512 { 513 struct rb_node **node, *parent; 514 struct hyp_shared_pfn *this; 515 int ret = 0; 516 517 mutex_lock(&hyp_shared_pfns_lock); 518 this = find_shared_pfn(pfn, &node, &parent); 519 if 
(WARN_ON(!this)) { 520 ret = -ENOENT; 521 goto unlock; 522 } 523 524 this->count--; 525 if (this->count) 526 goto unlock; 527 528 rb_erase(&this->node, &hyp_shared_pfns); 529 kfree(this); 530 ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn, 1); 531 unlock: 532 mutex_unlock(&hyp_shared_pfns_lock); 533 534 return ret; 535 } 536 537 int kvm_share_hyp(void *from, void *to) 538 { 539 phys_addr_t start, end, cur; 540 u64 pfn; 541 int ret; 542 543 if (is_kernel_in_hyp_mode()) 544 return 0; 545 546 /* 547 * The share hcall maps things in the 'fixed-offset' region of the hyp 548 * VA space, so we can only share physically contiguous data-structures 549 * for now. 550 */ 551 if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to)) 552 return -EINVAL; 553 554 if (kvm_host_owns_hyp_mappings()) 555 return create_hyp_mappings(from, to, PAGE_HYP); 556 557 start = ALIGN_DOWN(__pa(from), PAGE_SIZE); 558 end = PAGE_ALIGN(__pa(to)); 559 for (cur = start; cur < end; cur += PAGE_SIZE) { 560 pfn = __phys_to_pfn(cur); 561 ret = share_pfn_hyp(pfn); 562 if (ret) 563 return ret; 564 } 565 566 return 0; 567 } 568 569 void kvm_unshare_hyp(void *from, void *to) 570 { 571 phys_addr_t start, end, cur; 572 u64 pfn; 573 574 if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from) 575 return; 576 577 start = ALIGN_DOWN(__pa(from), PAGE_SIZE); 578 end = PAGE_ALIGN(__pa(to)); 579 for (cur = start; cur < end; cur += PAGE_SIZE) { 580 pfn = __phys_to_pfn(cur); 581 WARN_ON(unshare_pfn_hyp(pfn)); 582 } 583 } 584 585 /** 586 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode 587 * @from: The virtual kernel start address of the range 588 * @to: The virtual kernel end address of the range (exclusive) 589 * @prot: The protection to be applied to this range 590 * 591 * The same virtual address as the kernel virtual address is also used 592 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying 593 * physical pages. 594 */ 595 int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) 596 { 597 phys_addr_t phys_addr; 598 unsigned long virt_addr; 599 unsigned long start = kern_hyp_va((unsigned long)from); 600 unsigned long end = kern_hyp_va((unsigned long)to); 601 602 if (is_kernel_in_hyp_mode()) 603 return 0; 604 605 if (!kvm_host_owns_hyp_mappings()) 606 return -EPERM; 607 608 start = start & PAGE_MASK; 609 end = PAGE_ALIGN(end); 610 611 for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { 612 int err; 613 614 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); 615 err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr, 616 prot); 617 if (err) 618 return err; 619 } 620 621 return 0; 622 } 623 624 static int __hyp_alloc_private_va_range(unsigned long base) 625 { 626 lockdep_assert_held(&kvm_hyp_pgd_mutex); 627 628 if (!PAGE_ALIGNED(base)) 629 return -EINVAL; 630 631 /* 632 * Verify that BIT(VA_BITS - 1) hasn't been flipped by 633 * allocating the new area, as it would indicate we've 634 * overflowed the idmap/IO address range. 635 */ 636 if ((base ^ io_map_base) & BIT(VA_BITS - 1)) 637 return -ENOMEM; 638 639 io_map_base = base; 640 641 return 0; 642 } 643 644 /** 645 * hyp_alloc_private_va_range - Allocates a private VA range. 646 * @size: The size of the VA range to reserve. 647 * @haddr: The hypervisor virtual start address of the allocation. 648 * 649 * The private virtual address (VA) range is allocated below io_map_base 650 * and aligned based on the order of @size. 
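 *
 * In other words (a sketch of the usual flow): each call moves io_map_base
 * down by the page-aligned @size, so successive allocations return
 * non-overlapping ranges growing towards lower addresses, until
 * __hyp_alloc_private_va_range() detects that BIT(VA_BITS - 1) would flip
 * and fails with -ENOMEM.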
651 * 652 * Return: 0 on success or negative error code on failure. 653 */ 654 int hyp_alloc_private_va_range(size_t size, unsigned long *haddr) 655 { 656 unsigned long base; 657 int ret = 0; 658 659 mutex_lock(&kvm_hyp_pgd_mutex); 660 661 /* 662 * This assumes that we have enough space below the idmap 663 * page to allocate our VAs. If not, the check in 664 * __hyp_alloc_private_va_range() will kick. A potential 665 * alternative would be to detect that overflow and switch 666 * to an allocation above the idmap. 667 * 668 * The allocated size is always a multiple of PAGE_SIZE. 669 */ 670 size = PAGE_ALIGN(size); 671 base = io_map_base - size; 672 ret = __hyp_alloc_private_va_range(base); 673 674 mutex_unlock(&kvm_hyp_pgd_mutex); 675 676 if (!ret) 677 *haddr = base; 678 679 return ret; 680 } 681 682 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, 683 unsigned long *haddr, 684 enum kvm_pgtable_prot prot) 685 { 686 unsigned long addr; 687 int ret = 0; 688 689 if (!kvm_host_owns_hyp_mappings()) { 690 addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping, 691 phys_addr, size, prot); 692 if (IS_ERR_VALUE(addr)) 693 return addr; 694 *haddr = addr; 695 696 return 0; 697 } 698 699 size = PAGE_ALIGN(size + offset_in_page(phys_addr)); 700 ret = hyp_alloc_private_va_range(size, &addr); 701 if (ret) 702 return ret; 703 704 ret = __create_hyp_mappings(addr, size, phys_addr, prot); 705 if (ret) 706 return ret; 707 708 *haddr = addr + offset_in_page(phys_addr); 709 return ret; 710 } 711 712 int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr) 713 { 714 unsigned long base; 715 size_t size; 716 int ret; 717 718 mutex_lock(&kvm_hyp_pgd_mutex); 719 /* 720 * Efficient stack verification using the NVHE_STACK_SHIFT bit implies 721 * an alignment of our allocation on the order of the size. 722 */ 723 size = NVHE_STACK_SIZE * 2; 724 base = ALIGN_DOWN(io_map_base - size, size); 725 726 ret = __hyp_alloc_private_va_range(base); 727 728 mutex_unlock(&kvm_hyp_pgd_mutex); 729 730 if (ret) { 731 kvm_err("Cannot allocate hyp stack guard page\n"); 732 return ret; 733 } 734 735 /* 736 * Since the stack grows downwards, map the stack to the page 737 * at the higher address and leave the lower guard page 738 * unbacked. 739 * 740 * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1 741 * and addresses corresponding to the guard page have the 742 * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection. 
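	 *
	 * For example (a sketch assuming NVHE_STACK_SIZE == PAGE_SIZE): the
	 * two-page window is [base, base + 2 * PAGE_SIZE), only its upper
	 * page is backed, and @haddr returns its top, base + 2 * PAGE_SIZE.
	 * An overflowing stack walks down into the unbacked guard page,
	 * where the NVHE_STACK_SHIFT bit of the address is clear, which the
	 * overflow detection code can test cheaply.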
743 */ 744 ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE, 745 phys_addr, PAGE_HYP); 746 if (ret) 747 kvm_err("Cannot map hyp stack\n"); 748 749 *haddr = base + size; 750 751 return ret; 752 } 753 754 /** 755 * create_hyp_io_mappings - Map IO into both kernel and HYP 756 * @phys_addr: The physical start address which gets mapped 757 * @size: Size of the region being mapped 758 * @kaddr: Kernel VA for this mapping 759 * @haddr: HYP VA for this mapping 760 */ 761 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, 762 void __iomem **kaddr, 763 void __iomem **haddr) 764 { 765 unsigned long addr; 766 int ret; 767 768 if (is_protected_kvm_enabled()) 769 return -EPERM; 770 771 *kaddr = ioremap(phys_addr, size); 772 if (!*kaddr) 773 return -ENOMEM; 774 775 if (is_kernel_in_hyp_mode()) { 776 *haddr = *kaddr; 777 return 0; 778 } 779 780 ret = __create_hyp_private_mapping(phys_addr, size, 781 &addr, PAGE_HYP_DEVICE); 782 if (ret) { 783 iounmap(*kaddr); 784 *kaddr = NULL; 785 *haddr = NULL; 786 return ret; 787 } 788 789 *haddr = (void __iomem *)addr; 790 return 0; 791 } 792 793 /** 794 * create_hyp_exec_mappings - Map an executable range into HYP 795 * @phys_addr: The physical start address which gets mapped 796 * @size: Size of the region being mapped 797 * @haddr: HYP VA for this mapping 798 */ 799 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, 800 void **haddr) 801 { 802 unsigned long addr; 803 int ret; 804 805 BUG_ON(is_kernel_in_hyp_mode()); 806 807 ret = __create_hyp_private_mapping(phys_addr, size, 808 &addr, PAGE_HYP_EXEC); 809 if (ret) { 810 *haddr = NULL; 811 return ret; 812 } 813 814 *haddr = (void *)addr; 815 return 0; 816 } 817 818 static struct kvm_pgtable_mm_ops kvm_user_mm_ops = { 819 /* We shouldn't need any other callback to walk the PT */ 820 .phys_to_virt = kvm_host_va, 821 }; 822 823 static int get_user_mapping_size(struct kvm *kvm, u64 addr) 824 { 825 struct kvm_pgtable pgt = { 826 .pgd = (kvm_pteref_t)kvm->mm->pgd, 827 .ia_bits = vabits_actual, 828 .start_level = (KVM_PGTABLE_LAST_LEVEL - 829 ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1), 830 .mm_ops = &kvm_user_mm_ops, 831 }; 832 unsigned long flags; 833 kvm_pte_t pte = 0; /* Keep GCC quiet... */ 834 s8 level = S8_MAX; 835 int ret; 836 837 /* 838 * Disable IRQs so that we hazard against a concurrent 839 * teardown of the userspace page tables (which relies on 840 * IPI-ing threads). 841 */ 842 local_irq_save(flags); 843 ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); 844 local_irq_restore(flags); 845 846 if (ret) 847 return ret; 848 849 /* 850 * Not seeing an error, but not updating level? Something went 851 * deeply wrong... 852 */ 853 if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL)) 854 return -EFAULT; 855 if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL)) 856 return -EFAULT; 857 858 /* Oops, the userspace PTs are gone... 
Replay the fault */ 859 if (!kvm_pte_valid(pte)) 860 return -EAGAIN; 861 862 return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); 863 } 864 865 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { 866 .zalloc_page = stage2_memcache_zalloc_page, 867 .zalloc_pages_exact = kvm_s2_zalloc_pages_exact, 868 .free_pages_exact = kvm_s2_free_pages_exact, 869 .free_unlinked_table = stage2_free_unlinked_table, 870 .get_page = kvm_host_get_page, 871 .put_page = kvm_s2_put_page, 872 .page_count = kvm_host_page_count, 873 .phys_to_virt = kvm_host_va, 874 .virt_to_phys = kvm_host_pa, 875 .dcache_clean_inval_poc = clean_dcache_guest_page, 876 .icache_inval_pou = invalidate_icache_guest_page, 877 }; 878 879 static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type) 880 { 881 u32 kvm_ipa_limit = get_kvm_ipa_limit(); 882 u64 mmfr0, mmfr1; 883 u32 phys_shift; 884 885 if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) 886 return -EINVAL; 887 888 phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); 889 if (is_protected_kvm_enabled()) { 890 phys_shift = kvm_ipa_limit; 891 } else if (phys_shift) { 892 if (phys_shift > kvm_ipa_limit || 893 phys_shift < ARM64_MIN_PARANGE_BITS) 894 return -EINVAL; 895 } else { 896 phys_shift = KVM_PHYS_SHIFT; 897 if (phys_shift > kvm_ipa_limit) { 898 pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", 899 current->comm); 900 return -EINVAL; 901 } 902 } 903 904 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); 905 mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); 906 mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); 907 908 return 0; 909 } 910 911 /** 912 * kvm_init_stage2_mmu - Initialise a S2 MMU structure 913 * @kvm: The pointer to the KVM structure 914 * @mmu: The pointer to the s2 MMU structure 915 * @type: The machine type of the virtual machine 916 * 917 * Allocates only the stage-2 HW PGD level table(s). 918 * Note we don't need locking here as this is only called in two cases: 919 * 920 * - when the VM is created, which can't race against anything 921 * 922 * - when secondary kvm_s2_mmu structures are initialised for NV 923 * guests, and the caller must hold kvm->lock as this is called on a 924 * per-vcpu basis. 925 */ 926 int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) 927 { 928 int cpu, err; 929 struct kvm_pgtable *pgt; 930 931 /* 932 * If we already have our page tables in place, and that the 933 * MMU context is the canonical one, we have a bug somewhere, 934 * as this is only supposed to ever happen once per VM. 935 * 936 * Otherwise, we're building nested page tables, and that's 937 * probably because userspace called KVM_ARM_VCPU_INIT more 938 * than once on the same vcpu. Since that's actually legal, 939 * don't kick a fuss and leave gracefully. 
940 */ 941 if (mmu->pgt != NULL) { 942 if (kvm_is_nested_s2_mmu(kvm, mmu)) 943 return 0; 944 945 kvm_err("kvm_arch already initialized?\n"); 946 return -EINVAL; 947 } 948 949 err = kvm_init_ipa_range(mmu, type); 950 if (err) 951 return err; 952 953 pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT); 954 if (!pgt) 955 return -ENOMEM; 956 957 mmu->arch = &kvm->arch; 958 err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops); 959 if (err) 960 goto out_free_pgtable; 961 962 mmu->pgt = pgt; 963 if (is_protected_kvm_enabled()) 964 return 0; 965 966 mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran)); 967 if (!mmu->last_vcpu_ran) { 968 err = -ENOMEM; 969 goto out_destroy_pgtable; 970 } 971 972 for_each_possible_cpu(cpu) 973 *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1; 974 975 /* The eager page splitting is disabled by default */ 976 mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; 977 mmu->split_page_cache.gfp_zero = __GFP_ZERO; 978 979 mmu->pgd_phys = __pa(pgt->pgd); 980 981 if (kvm_is_nested_s2_mmu(kvm, mmu)) 982 kvm_init_nested_s2_mmu(mmu); 983 984 return 0; 985 986 out_destroy_pgtable: 987 KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); 988 out_free_pgtable: 989 kfree(pgt); 990 return err; 991 } 992 993 void kvm_uninit_stage2_mmu(struct kvm *kvm) 994 { 995 kvm_free_stage2_pgd(&kvm->arch.mmu); 996 kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); 997 } 998 999 static void stage2_unmap_memslot(struct kvm *kvm, 1000 struct kvm_memory_slot *memslot) 1001 { 1002 hva_t hva = memslot->userspace_addr; 1003 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; 1004 phys_addr_t size = PAGE_SIZE * memslot->npages; 1005 hva_t reg_end = hva + size; 1006 1007 /* 1008 * A memory region could potentially cover multiple VMAs, and any holes 1009 * between them, so iterate over all of them to find out if we should 1010 * unmap any of them. 1011 * 1012 * +--------------------------------------------+ 1013 * +---------------+----------------+ +----------------+ 1014 * | : VMA 1 | VMA 2 | | VMA 3 : | 1015 * +---------------+----------------+ +----------------+ 1016 * | memory region | 1017 * +--------------------------------------------+ 1018 */ 1019 do { 1020 struct vm_area_struct *vma; 1021 hva_t vm_start, vm_end; 1022 1023 vma = find_vma_intersection(current->mm, hva, reg_end); 1024 if (!vma) 1025 break; 1026 1027 /* 1028 * Take the intersection of this VMA with the memory region 1029 */ 1030 vm_start = max(hva, vma->vm_start); 1031 vm_end = min(reg_end, vma->vm_end); 1032 1033 if (!(vma->vm_flags & VM_PFNMAP)) { 1034 gpa_t gpa = addr + (vm_start - memslot->userspace_addr); 1035 kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true); 1036 } 1037 hva = vm_end; 1038 } while (hva < reg_end); 1039 } 1040 1041 /** 1042 * stage2_unmap_vm - Unmap Stage-2 RAM mappings 1043 * @kvm: The struct kvm pointer 1044 * 1045 * Go through the memregions and unmap any regular RAM 1046 * backing memory already mapped to the VM. 
1047 */ 1048 void stage2_unmap_vm(struct kvm *kvm) 1049 { 1050 struct kvm_memslots *slots; 1051 struct kvm_memory_slot *memslot; 1052 int idx, bkt; 1053 1054 idx = srcu_read_lock(&kvm->srcu); 1055 mmap_read_lock(current->mm); 1056 write_lock(&kvm->mmu_lock); 1057 1058 slots = kvm_memslots(kvm); 1059 kvm_for_each_memslot(memslot, bkt, slots) 1060 stage2_unmap_memslot(kvm, memslot); 1061 1062 kvm_nested_s2_unmap(kvm, true); 1063 1064 write_unlock(&kvm->mmu_lock); 1065 mmap_read_unlock(current->mm); 1066 srcu_read_unlock(&kvm->srcu, idx); 1067 } 1068 1069 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) 1070 { 1071 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 1072 struct kvm_pgtable *pgt = NULL; 1073 1074 write_lock(&kvm->mmu_lock); 1075 pgt = mmu->pgt; 1076 if (pgt) { 1077 mmu->pgd_phys = 0; 1078 mmu->pgt = NULL; 1079 free_percpu(mmu->last_vcpu_ran); 1080 } 1081 write_unlock(&kvm->mmu_lock); 1082 1083 if (pgt) { 1084 KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); 1085 kfree(pgt); 1086 } 1087 } 1088 1089 static void hyp_mc_free_fn(void *addr, void *unused) 1090 { 1091 free_page((unsigned long)addr); 1092 } 1093 1094 static void *hyp_mc_alloc_fn(void *unused) 1095 { 1096 return (void *)__get_free_page(GFP_KERNEL_ACCOUNT); 1097 } 1098 1099 void free_hyp_memcache(struct kvm_hyp_memcache *mc) 1100 { 1101 if (!is_protected_kvm_enabled()) 1102 return; 1103 1104 kfree(mc->mapping); 1105 __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, NULL); 1106 } 1107 1108 int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) 1109 { 1110 if (!is_protected_kvm_enabled()) 1111 return 0; 1112 1113 if (!mc->mapping) { 1114 mc->mapping = kzalloc(sizeof(struct pkvm_mapping), GFP_KERNEL_ACCOUNT); 1115 if (!mc->mapping) 1116 return -ENOMEM; 1117 } 1118 1119 return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn, 1120 kvm_host_pa, NULL); 1121 } 1122 1123 /** 1124 * kvm_phys_addr_ioremap - map a device range to guest IPA 1125 * 1126 * @kvm: The KVM pointer 1127 * @guest_ipa: The IPA at which to insert the mapping 1128 * @pa: The physical address of the device 1129 * @size: The size of the mapping 1130 * @writable: Whether or not to create a writable mapping 1131 */ 1132 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, 1133 phys_addr_t pa, unsigned long size, bool writable) 1134 { 1135 phys_addr_t addr; 1136 int ret = 0; 1137 struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO }; 1138 struct kvm_s2_mmu *mmu = &kvm->arch.mmu; 1139 struct kvm_pgtable *pgt = mmu->pgt; 1140 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE | 1141 KVM_PGTABLE_PROT_R | 1142 (writable ? 
KVM_PGTABLE_PROT_W : 0); 1143 1144 if (is_protected_kvm_enabled()) 1145 return -EPERM; 1146 1147 size += offset_in_page(guest_ipa); 1148 guest_ipa &= PAGE_MASK; 1149 1150 for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) { 1151 ret = kvm_mmu_topup_memory_cache(&cache, 1152 kvm_mmu_cache_min_pages(mmu)); 1153 if (ret) 1154 break; 1155 1156 write_lock(&kvm->mmu_lock); 1157 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE, 1158 pa, prot, &cache, 0); 1159 write_unlock(&kvm->mmu_lock); 1160 if (ret) 1161 break; 1162 1163 pa += PAGE_SIZE; 1164 } 1165 1166 kvm_mmu_free_memory_cache(&cache); 1167 return ret; 1168 } 1169 1170 /** 1171 * kvm_stage2_wp_range() - write protect stage2 memory region range 1172 * @mmu: The KVM stage-2 MMU pointer 1173 * @addr: Start address of range 1174 * @end: End address of range 1175 */ 1176 void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) 1177 { 1178 stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect)); 1179 } 1180 1181 /** 1182 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot 1183 * @kvm: The KVM pointer 1184 * @slot: The memory slot to write protect 1185 * 1186 * Called to start logging dirty pages after memory region 1187 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns 1188 * all present PUD, PMD and PTEs are write protected in the memory region. 1189 * Afterwards read of dirty page log can be called. 1190 * 1191 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, 1192 * serializing operations for VM memory regions. 1193 */ 1194 static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) 1195 { 1196 struct kvm_memslots *slots = kvm_memslots(kvm); 1197 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); 1198 phys_addr_t start, end; 1199 1200 if (WARN_ON_ONCE(!memslot)) 1201 return; 1202 1203 start = memslot->base_gfn << PAGE_SHIFT; 1204 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1205 1206 write_lock(&kvm->mmu_lock); 1207 kvm_stage2_wp_range(&kvm->arch.mmu, start, end); 1208 kvm_nested_s2_wp(kvm); 1209 write_unlock(&kvm->mmu_lock); 1210 kvm_flush_remote_tlbs_memslot(kvm, memslot); 1211 } 1212 1213 /** 1214 * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE 1215 * pages for memory slot 1216 * @kvm: The KVM pointer 1217 * @slot: The memory slot to split 1218 * 1219 * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired, 1220 * serializing operations for VM memory regions. 1221 */ 1222 static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot) 1223 { 1224 struct kvm_memslots *slots; 1225 struct kvm_memory_slot *memslot; 1226 phys_addr_t start, end; 1227 1228 lockdep_assert_held(&kvm->slots_lock); 1229 1230 slots = kvm_memslots(kvm); 1231 memslot = id_to_memslot(slots, slot); 1232 1233 start = memslot->base_gfn << PAGE_SHIFT; 1234 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1235 1236 write_lock(&kvm->mmu_lock); 1237 kvm_mmu_split_huge_pages(kvm, start, end); 1238 write_unlock(&kvm->mmu_lock); 1239 } 1240 1241 /* 1242 * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages. 
1243 * @kvm: The KVM pointer 1244 * @slot: The memory slot associated with mask 1245 * @gfn_offset: The gfn offset in memory slot 1246 * @mask: The mask of pages at offset 'gfn_offset' in this memory 1247 * slot to enable dirty logging on 1248 * 1249 * Writes protect selected pages to enable dirty logging, and then 1250 * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock. 1251 */ 1252 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, 1253 struct kvm_memory_slot *slot, 1254 gfn_t gfn_offset, unsigned long mask) 1255 { 1256 phys_addr_t base_gfn = slot->base_gfn + gfn_offset; 1257 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; 1258 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; 1259 1260 lockdep_assert_held_write(&kvm->mmu_lock); 1261 1262 kvm_stage2_wp_range(&kvm->arch.mmu, start, end); 1263 1264 /* 1265 * Eager-splitting is done when manual-protect is set. We 1266 * also check for initially-all-set because we can avoid 1267 * eager-splitting if initially-all-set is false. 1268 * Initially-all-set equal false implies that huge-pages were 1269 * already split when enabling dirty logging: no need to do it 1270 * again. 1271 */ 1272 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) 1273 kvm_mmu_split_huge_pages(kvm, start, end); 1274 1275 kvm_nested_s2_wp(kvm); 1276 } 1277 1278 static void kvm_send_hwpoison_signal(unsigned long address, short lsb) 1279 { 1280 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); 1281 } 1282 1283 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, 1284 unsigned long hva, 1285 unsigned long map_size) 1286 { 1287 gpa_t gpa_start; 1288 hva_t uaddr_start, uaddr_end; 1289 size_t size; 1290 1291 /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */ 1292 if (map_size == PAGE_SIZE) 1293 return true; 1294 1295 size = memslot->npages * PAGE_SIZE; 1296 1297 gpa_start = memslot->base_gfn << PAGE_SHIFT; 1298 1299 uaddr_start = memslot->userspace_addr; 1300 uaddr_end = uaddr_start + size; 1301 1302 /* 1303 * Pages belonging to memslots that don't have the same alignment 1304 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2 1305 * PMD/PUD entries, because we'll end up mapping the wrong pages. 1306 * 1307 * Consider a layout like the following: 1308 * 1309 * memslot->userspace_addr: 1310 * +-----+--------------------+--------------------+---+ 1311 * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| 1312 * +-----+--------------------+--------------------+---+ 1313 * 1314 * memslot->base_gfn << PAGE_SHIFT: 1315 * +---+--------------------+--------------------+-----+ 1316 * |abc|def Stage-2 block | Stage-2 block |tvxyz| 1317 * +---+--------------------+--------------------+-----+ 1318 * 1319 * If we create those stage-2 blocks, we'll end up with this incorrect 1320 * mapping: 1321 * d -> f 1322 * e -> g 1323 * f -> h 1324 */ 1325 if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) 1326 return false; 1327 1328 /* 1329 * Next, let's make sure we're not trying to map anything not covered 1330 * by the memslot. This means we have to prohibit block size mappings 1331 * for the beginning and end of a non-block aligned and non-block sized 1332 * memory slot (illustrated by the head and tail parts of the 1333 * userspace view above containing pages 'abcde' and 'xyz', 1334 * respectively). 
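 *
 * For example (a sketch assuming 2MiB blocks): for a fault on page 'a',
 * rounding hva down to a 2MiB boundary lands below uaddr_start, so the
 * containment check below fails and the caller falls back to a smaller
 * mapping size.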
1335 * 1336 * Note that it doesn't matter if we do the check using the 1337 * userspace_addr or the base_gfn, as both are equally aligned (per 1338 * the check above) and equally sized. 1339 */ 1340 return (hva & ~(map_size - 1)) >= uaddr_start && 1341 (hva & ~(map_size - 1)) + map_size <= uaddr_end; 1342 } 1343 1344 /* 1345 * Check if the given hva is backed by a transparent huge page (THP) and 1346 * whether it can be mapped using block mapping in stage2. If so, adjust 1347 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently 1348 * supported. This will need to be updated to support other THP sizes. 1349 * 1350 * Returns the size of the mapping. 1351 */ 1352 static long 1353 transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, 1354 unsigned long hva, kvm_pfn_t *pfnp, 1355 phys_addr_t *ipap) 1356 { 1357 kvm_pfn_t pfn = *pfnp; 1358 1359 /* 1360 * Make sure the adjustment is done only for THP pages. Also make 1361 * sure that the HVA and IPA are sufficiently aligned and that the 1362 * block map is contained within the memslot. 1363 */ 1364 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { 1365 int sz = get_user_mapping_size(kvm, hva); 1366 1367 if (sz < 0) 1368 return sz; 1369 1370 if (sz < PMD_SIZE) 1371 return PAGE_SIZE; 1372 1373 *ipap &= PMD_MASK; 1374 pfn &= ~(PTRS_PER_PMD - 1); 1375 *pfnp = pfn; 1376 1377 return PMD_SIZE; 1378 } 1379 1380 /* Use page mapping if we cannot use block mapping. */ 1381 return PAGE_SIZE; 1382 } 1383 1384 static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva) 1385 { 1386 unsigned long pa; 1387 1388 if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP)) 1389 return huge_page_shift(hstate_vma(vma)); 1390 1391 if (!(vma->vm_flags & VM_PFNMAP)) 1392 return PAGE_SHIFT; 1393 1394 VM_BUG_ON(is_vm_hugetlb_page(vma)); 1395 1396 pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start); 1397 1398 #ifndef __PAGETABLE_PMD_FOLDED 1399 if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) && 1400 ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start && 1401 ALIGN(hva, PUD_SIZE) <= vma->vm_end) 1402 return PUD_SHIFT; 1403 #endif 1404 1405 if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) && 1406 ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start && 1407 ALIGN(hva, PMD_SIZE) <= vma->vm_end) 1408 return PMD_SHIFT; 1409 1410 return PAGE_SHIFT; 1411 } 1412 1413 /* 1414 * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be 1415 * able to see the page's tags and therefore they must be initialised first. If 1416 * PG_mte_tagged is set, tags have already been initialised. 
 *
 * The race in the test/set of the PG_mte_tagged flag is handled by:
 * - preventing VM_SHARED mappings in a memslot with MTE, preventing two VMs
 *   racing to sanitise the same page
 * - mmap_lock protects between a VM faulting a page in and the VMM performing
 *   an mprotect() to add VM_MTE
 */
static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
			      unsigned long size)
{
	unsigned long i, nr_pages = size >> PAGE_SHIFT;
	struct page *page = pfn_to_page(pfn);
	struct folio *folio = page_folio(page);

	if (!kvm_has_mte(kvm))
		return;

	if (folio_test_hugetlb(folio)) {
		/* Hugetlb has MTE flags set on head page only */
		if (folio_try_hugetlb_mte_tagging(folio)) {
			for (i = 0; i < nr_pages; i++, page++)
				mte_clear_page_tags(page_address(page));
			folio_set_hugetlb_mte_tagged(folio);
		}
		return;
	}

	for (i = 0; i < nr_pages; i++, page++) {
		if (try_page_mte_tagging(page)) {
			mte_clear_page_tags(page_address(page));
			set_page_mte_tagged(page);
		}
	}
}

static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
{
	return vma->vm_flags & VM_MTE_ALLOWED;
}

static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			  struct kvm_s2_trans *nested,
			  struct kvm_memory_slot *memslot, unsigned long hva,
			  bool fault_is_perm)
{
	int ret = 0;
	bool write_fault, writable, force_pte = false;
	bool exec_fault, mte_allowed;
	bool device = false, vfio_allow_any_uc = false;
	unsigned long mmu_seq;
	phys_addr_t ipa = fault_ipa;
	struct kvm *kvm = vcpu->kvm;
	struct vm_area_struct *vma;
	short vma_shift;
	void *memcache;
	gfn_t gfn;
	kvm_pfn_t pfn;
	bool logging_active = memslot_is_logging(memslot);
	long vma_pagesize, fault_granule;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
	struct kvm_pgtable *pgt;
	struct page *page;
	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;

	if (fault_is_perm)
		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
	write_fault = kvm_is_write_fault(vcpu);
	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
	VM_BUG_ON(write_fault && exec_fault);

	if (fault_is_perm && !write_fault && !exec_fault) {
		kvm_err("Unexpected L2 read permission error\n");
		return -EFAULT;
	}

	/*
	 * Permission faults just need to update the existing leaf entry,
	 * and so normally don't require allocations from the memcache. The
	 * only exception to this is when dirty logging is enabled at runtime
	 * and a write fault needs to collapse a block entry into a table.
	 */
	if (!fault_is_perm || (logging_active && write_fault)) {
		int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);

		if (!is_protected_kvm_enabled()) {
			memcache = &vcpu->arch.mmu_page_cache;
			ret = kvm_mmu_topup_memory_cache(memcache, min_pages);
		} else {
			memcache = &vcpu->arch.pkvm_memcache;
			ret = topup_hyp_memcache(memcache, min_pages);
		}
		if (ret)
			return ret;
	}

	/*
	 * Let's check if we will get back a huge page backed by hugetlbfs, or
	 * get block mapping for device MMIO region.
1515 */ 1516 mmap_read_lock(current->mm); 1517 vma = vma_lookup(current->mm, hva); 1518 if (unlikely(!vma)) { 1519 kvm_err("Failed to find VMA for hva 0x%lx\n", hva); 1520 mmap_read_unlock(current->mm); 1521 return -EFAULT; 1522 } 1523 1524 /* 1525 * logging_active is guaranteed to never be true for VM_PFNMAP 1526 * memslots. 1527 */ 1528 if (logging_active || is_protected_kvm_enabled()) { 1529 force_pte = true; 1530 vma_shift = PAGE_SHIFT; 1531 } else { 1532 vma_shift = get_vma_page_shift(vma, hva); 1533 } 1534 1535 switch (vma_shift) { 1536 #ifndef __PAGETABLE_PMD_FOLDED 1537 case PUD_SHIFT: 1538 if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) 1539 break; 1540 fallthrough; 1541 #endif 1542 case CONT_PMD_SHIFT: 1543 vma_shift = PMD_SHIFT; 1544 fallthrough; 1545 case PMD_SHIFT: 1546 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) 1547 break; 1548 fallthrough; 1549 case CONT_PTE_SHIFT: 1550 vma_shift = PAGE_SHIFT; 1551 force_pte = true; 1552 fallthrough; 1553 case PAGE_SHIFT: 1554 break; 1555 default: 1556 WARN_ONCE(1, "Unknown vma_shift %d", vma_shift); 1557 } 1558 1559 vma_pagesize = 1UL << vma_shift; 1560 1561 if (nested) { 1562 unsigned long max_map_size; 1563 1564 max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE; 1565 1566 ipa = kvm_s2_trans_output(nested); 1567 1568 /* 1569 * If we're about to create a shadow stage 2 entry, then we 1570 * can only create a block mapping if the guest stage 2 page 1571 * table uses at least as big a mapping. 1572 */ 1573 max_map_size = min(kvm_s2_trans_size(nested), max_map_size); 1574 1575 /* 1576 * Be careful that if the mapping size falls between 1577 * two host sizes, take the smallest of the two. 1578 */ 1579 if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE) 1580 max_map_size = PMD_SIZE; 1581 else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE) 1582 max_map_size = PAGE_SIZE; 1583 1584 force_pte = (max_map_size == PAGE_SIZE); 1585 vma_pagesize = min(vma_pagesize, (long)max_map_size); 1586 } 1587 1588 /* 1589 * Both the canonical IPA and fault IPA must be hugepage-aligned to 1590 * ensure we find the right PFN and lay down the mapping in the right 1591 * place. 1592 */ 1593 if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) { 1594 fault_ipa &= ~(vma_pagesize - 1); 1595 ipa &= ~(vma_pagesize - 1); 1596 } 1597 1598 gfn = ipa >> PAGE_SHIFT; 1599 mte_allowed = kvm_vma_mte_allowed(vma); 1600 1601 vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; 1602 1603 /* Don't use the VMA after the unlock -- it may have vanished */ 1604 vma = NULL; 1605 1606 /* 1607 * Read mmu_invalidate_seq so that KVM can detect if the results of 1608 * vma_lookup() or __kvm_faultin_pfn() become stale prior to 1609 * acquiring kvm->mmu_lock. 1610 * 1611 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs 1612 * with the smp_wmb() in kvm_mmu_invalidate_end(). 1613 */ 1614 mmu_seq = vcpu->kvm->mmu_invalidate_seq; 1615 mmap_read_unlock(current->mm); 1616 1617 pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, 1618 &writable, &page); 1619 if (pfn == KVM_PFN_ERR_HWPOISON) { 1620 kvm_send_hwpoison_signal(hva, vma_shift); 1621 return 0; 1622 } 1623 if (is_error_noslot_pfn(pfn)) 1624 return -EFAULT; 1625 1626 if (kvm_is_device_pfn(pfn)) { 1627 /* 1628 * If the page was identified as device early by looking at 1629 * the VMA flags, vma_pagesize is already representing the 1630 * largest quantity we can map. 
If instead it was mapped 1631 * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE 1632 * and must not be upgraded. 1633 * 1634 * In both cases, we don't let transparent_hugepage_adjust() 1635 * change things at the last minute. 1636 */ 1637 device = true; 1638 } else if (logging_active && !write_fault) { 1639 /* 1640 * Only actually map the page as writable if this was a write 1641 * fault. 1642 */ 1643 writable = false; 1644 } 1645 1646 if (exec_fault && device) 1647 return -ENOEXEC; 1648 1649 /* 1650 * Potentially reduce shadow S2 permissions to match the guest's own 1651 * S2. For exec faults, we'd only reach this point if the guest 1652 * actually allowed it (see kvm_s2_handle_perm_fault). 1653 * 1654 * Also encode the level of the original translation in the SW bits 1655 * of the leaf entry as a proxy for the span of that translation. 1656 * This will be retrieved on TLB invalidation from the guest and 1657 * used to limit the invalidation scope if a TTL hint or a range 1658 * isn't provided. 1659 */ 1660 if (nested) { 1661 writable &= kvm_s2_trans_writable(nested); 1662 if (!kvm_s2_trans_readable(nested)) 1663 prot &= ~KVM_PGTABLE_PROT_R; 1664 1665 prot |= kvm_encode_nested_level(nested); 1666 } 1667 1668 kvm_fault_lock(kvm); 1669 pgt = vcpu->arch.hw_mmu->pgt; 1670 if (mmu_invalidate_retry(kvm, mmu_seq)) { 1671 ret = -EAGAIN; 1672 goto out_unlock; 1673 } 1674 1675 /* 1676 * If we are not forced to use page mapping, check if we are 1677 * backed by a THP and thus use block mapping if possible. 1678 */ 1679 if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) { 1680 if (fault_is_perm && fault_granule > PAGE_SIZE) 1681 vma_pagesize = fault_granule; 1682 else 1683 vma_pagesize = transparent_hugepage_adjust(kvm, memslot, 1684 hva, &pfn, 1685 &fault_ipa); 1686 1687 if (vma_pagesize < 0) { 1688 ret = vma_pagesize; 1689 goto out_unlock; 1690 } 1691 } 1692 1693 if (!fault_is_perm && !device && kvm_has_mte(kvm)) { 1694 /* Check the VMM hasn't introduced a new disallowed VMA */ 1695 if (mte_allowed) { 1696 sanitise_mte_tags(kvm, pfn, vma_pagesize); 1697 } else { 1698 ret = -EFAULT; 1699 goto out_unlock; 1700 } 1701 } 1702 1703 if (writable) 1704 prot |= KVM_PGTABLE_PROT_W; 1705 1706 if (exec_fault) 1707 prot |= KVM_PGTABLE_PROT_X; 1708 1709 if (device) { 1710 if (vfio_allow_any_uc) 1711 prot |= KVM_PGTABLE_PROT_NORMAL_NC; 1712 else 1713 prot |= KVM_PGTABLE_PROT_DEVICE; 1714 } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) && 1715 (!nested || kvm_s2_trans_executable(nested))) { 1716 prot |= KVM_PGTABLE_PROT_X; 1717 } 1718 1719 /* 1720 * Under the premise of getting a FSC_PERM fault, we just need to relax 1721 * permissions only if vma_pagesize equals fault_granule. Otherwise, 1722 * kvm_pgtable_stage2_map() should be called to change block size. 1723 */ 1724 if (fault_is_perm && vma_pagesize == fault_granule) { 1725 /* 1726 * Drop the SW bits in favour of those stored in the 1727 * PTE, which will be preserved. 1728 */ 1729 prot &= ~KVM_NV_GUEST_MAP_SZ; 1730 ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags); 1731 } else { 1732 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize, 1733 __pfn_to_phys(pfn), prot, 1734 memcache, flags); 1735 } 1736 1737 out_unlock: 1738 kvm_release_faultin_page(kvm, page, !!ret, writable); 1739 kvm_fault_unlock(kvm); 1740 1741 /* Mark the page dirty only if the fault is handled successfully */ 1742 if (writable && !ret) 1743 mark_page_dirty_in_slot(kvm, memslot, gfn); 1744 1745 return ret != -EAGAIN ? 
ret : 0; 1746 } 1747 1748 /* Resolve the access fault by making the page young again. */ 1749 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) 1750 { 1751 enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED; 1752 struct kvm_s2_mmu *mmu; 1753 1754 trace_kvm_access_fault(fault_ipa); 1755 1756 read_lock(&vcpu->kvm->mmu_lock); 1757 mmu = vcpu->arch.hw_mmu; 1758 KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags); 1759 read_unlock(&vcpu->kvm->mmu_lock); 1760 } 1761 1762 /** 1763 * kvm_handle_guest_abort - handles all 2nd stage aborts 1764 * @vcpu: the VCPU pointer 1765 * 1766 * Any abort that gets to the host is almost guaranteed to be caused by a 1767 * missing second stage translation table entry, which can mean that either the 1768 * guest simply needs more memory and we must allocate an appropriate page or it 1769 * can mean that the guest tried to access I/O memory, which is emulated by user 1770 * space. The distinction is based on the IPA causing the fault and whether this 1771 * memory region has been registered as standard RAM by user space. 1772 */ 1773 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) 1774 { 1775 struct kvm_s2_trans nested_trans, *nested = NULL; 1776 unsigned long esr; 1777 phys_addr_t fault_ipa; /* The address we faulted on */ 1778 phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */ 1779 struct kvm_memory_slot *memslot; 1780 unsigned long hva; 1781 bool is_iabt, write_fault, writable; 1782 gfn_t gfn; 1783 int ret, idx; 1784 1785 esr = kvm_vcpu_get_esr(vcpu); 1786 1787 ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); 1788 is_iabt = kvm_vcpu_trap_is_iabt(vcpu); 1789 1790 if (esr_fsc_is_translation_fault(esr)) { 1791 /* Beyond sanitised PARange (which is the IPA limit) */ 1792 if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) { 1793 kvm_inject_size_fault(vcpu); 1794 return 1; 1795 } 1796 1797 /* Falls between the IPA range and the PARange? */ 1798 if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) { 1799 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); 1800 1801 if (is_iabt) 1802 kvm_inject_pabt(vcpu, fault_ipa); 1803 else 1804 kvm_inject_dabt(vcpu, fault_ipa); 1805 return 1; 1806 } 1807 } 1808 1809 /* Synchronous External Abort? */ 1810 if (kvm_vcpu_abt_issea(vcpu)) { 1811 /* 1812 * For RAS the host kernel may handle this abort. 1813 * There is no need to pass the error into the guest. 1814 */ 1815 if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu))) 1816 kvm_inject_vabt(vcpu); 1817 1818 return 1; 1819 } 1820 1821 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu), 1822 kvm_vcpu_get_hfar(vcpu), fault_ipa); 1823 1824 /* Check the stage-2 fault is trans. fault or write fault */ 1825 if (!esr_fsc_is_translation_fault(esr) && 1826 !esr_fsc_is_permission_fault(esr) && 1827 !esr_fsc_is_access_flag_fault(esr)) { 1828 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", 1829 kvm_vcpu_trap_get_class(vcpu), 1830 (unsigned long)kvm_vcpu_trap_get_fault(vcpu), 1831 (unsigned long)kvm_vcpu_get_esr(vcpu)); 1832 return -EFAULT; 1833 } 1834 1835 idx = srcu_read_lock(&vcpu->kvm->srcu); 1836 1837 /* 1838 * We may have faulted on a shadow stage 2 page table if we are 1839 * running a nested guest. In this case, we have to resolve the L2 1840 * IPA to the L1 IPA first, before knowing what kind of memory should 1841 * back the L1 IPA. 1842 * 1843 * If the shadow stage 2 page table walk faults, then we simply inject 1844 * this to the guest and carry on. 
1845 * 1846 * If there are no shadow S2 PTs because S2 is disabled, there is 1847 * nothing to walk and we treat it as a 1:1 before going through the 1848 * canonical translation. 1849 */ 1850 if (kvm_is_nested_s2_mmu(vcpu->kvm,vcpu->arch.hw_mmu) && 1851 vcpu->arch.hw_mmu->nested_stage2_enabled) { 1852 u32 esr; 1853 1854 ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans); 1855 if (ret) { 1856 esr = kvm_s2_trans_esr(&nested_trans); 1857 kvm_inject_s2_fault(vcpu, esr); 1858 goto out_unlock; 1859 } 1860 1861 ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans); 1862 if (ret) { 1863 esr = kvm_s2_trans_esr(&nested_trans); 1864 kvm_inject_s2_fault(vcpu, esr); 1865 goto out_unlock; 1866 } 1867 1868 ipa = kvm_s2_trans_output(&nested_trans); 1869 nested = &nested_trans; 1870 } 1871 1872 gfn = ipa >> PAGE_SHIFT; 1873 memslot = gfn_to_memslot(vcpu->kvm, gfn); 1874 hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); 1875 write_fault = kvm_is_write_fault(vcpu); 1876 if (kvm_is_error_hva(hva) || (write_fault && !writable)) { 1877 /* 1878 * The guest has put either its instructions or its page-tables 1879 * somewhere it shouldn't have. Userspace won't be able to do 1880 * anything about this (there's no syndrome for a start), so 1881 * re-inject the abort back into the guest. 1882 */ 1883 if (is_iabt) { 1884 ret = -ENOEXEC; 1885 goto out; 1886 } 1887 1888 if (kvm_vcpu_abt_iss1tw(vcpu)) { 1889 kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); 1890 ret = 1; 1891 goto out_unlock; 1892 } 1893 1894 /* 1895 * Check for a cache maintenance operation. Since we 1896 * ended-up here, we know it is outside of any memory 1897 * slot. But we can't find out if that is for a device, 1898 * or if the guest is just being stupid. The only thing 1899 * we know for sure is that this range cannot be cached. 1900 * 1901 * So let's assume that the guest is just being 1902 * cautious, and skip the instruction. 1903 */ 1904 if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) { 1905 kvm_incr_pc(vcpu); 1906 ret = 1; 1907 goto out_unlock; 1908 } 1909 1910 /* 1911 * The IPA is reported as [MAX:12], so we need to 1912 * complement it with the bottom 12 bits from the 1913 * faulting VA. This is always 12 bits, irrespective 1914 * of the page size. 
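		 *
		 * For example (a sketch): a load from a hypothetical MMIO
		 * register at IPA 0x3f20_1018 is reported as 0x3f20_1000;
		 * OR-ing in FAR_EL2[11:0] (0x018) below recovers the exact
		 * address passed to io_mem_abort().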
                 */
                ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
                ret = io_mem_abort(vcpu, ipa);
                goto out_unlock;
        }

        /* Userspace should not be able to register out-of-bounds IPAs */
        VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu));

        if (esr_fsc_is_access_flag_fault(esr)) {
                handle_access_fault(vcpu, fault_ipa);
                ret = 1;
                goto out_unlock;
        }

        ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
                             esr_fsc_is_permission_fault(esr));
        if (ret == 0)
                ret = 1;
out:
        if (ret == -ENOEXEC) {
                kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
                ret = 1;
        }
out_unlock:
        srcu_read_unlock(&vcpu->kvm->srcu, idx);
        return ret;
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
        if (!kvm->arch.mmu.pgt)
                return false;

        __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
                             (range->end - range->start) << PAGE_SHIFT,
                             range->may_block);

        kvm_nested_s2_unmap(kvm, range->may_block);
        return false;
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
        u64 size = (range->end - range->start) << PAGE_SHIFT;

        if (!kvm->arch.mmu.pgt)
                return false;

        return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
                                                               range->start << PAGE_SHIFT,
                                                               size, true);
        /*
         * TODO: Handle nested_mmu structures here using the reverse mapping in
         * a later version of the patch series.
         */
}

bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
        u64 size = (range->end - range->start) << PAGE_SHIFT;

        if (!kvm->arch.mmu.pgt)
                return false;

        return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
                                                               range->start << PAGE_SHIFT,
                                                               size, false);
}

phys_addr_t kvm_mmu_get_httbr(void)
{
        return __pa(hyp_pgtable->pgd);
}

phys_addr_t kvm_get_idmap_vector(void)
{
        return hyp_idmap_vector;
}

static int kvm_map_idmap_text(void)
{
        unsigned long size = hyp_idmap_end - hyp_idmap_start;
        int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
                                        PAGE_HYP_EXEC);
        if (err)
                kvm_err("Failed to idmap %lx-%lx\n",
                        hyp_idmap_start, hyp_idmap_end);

        return err;
}

static void *kvm_hyp_zalloc_page(void *arg)
{
        return (void *)get_zeroed_page(GFP_KERNEL);
}

static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
        .zalloc_page = kvm_hyp_zalloc_page,
        .get_page = kvm_host_get_page,
        .put_page = kvm_host_put_page,
        .phys_to_virt = kvm_host_va,
        .virt_to_phys = kvm_host_pa,
};

int __init kvm_mmu_init(u32 *hyp_va_bits)
{
        int err;
        u32 idmap_bits;
        u32 kernel_bits;

        hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
        hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
        hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
        hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
        hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);

        /*
         * We rely on the linker script to ensure at build time that the HYP
         * init code does not cross a page boundary.
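         * The BUG_ON() below checks exactly that: the page-aligned start of
         * the idmap text and its last byte must fall within the same page.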
         */
        BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);

        /*
         * The ID map is always configured for 48 bits of translation, which
         * may be fewer than the number of VA bits used by the regular kernel
         * stage 1, when VA_BITS=52.
         *
         * At EL2, there is only one TTBR register, and we can't switch between
         * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom
         * line: we need to use the extended range with *both* our translation
         * tables.
         *
         * So use the maximum of the idmap VA bits and the regular kernel stage
         * 1 VA bits to ensure that the hypervisor can both ID map its code page
         * and map any kernel memory.
         */
        idmap_bits = IDMAP_VA_BITS;
        kernel_bits = vabits_actual;
        *hyp_va_bits = max(idmap_bits, kernel_bits);

        kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
        kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
        kvm_debug("HYP VA range: %lx:%lx\n",
                  kern_hyp_va(PAGE_OFFSET),
                  kern_hyp_va((unsigned long)high_memory - 1));

        if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
            hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) &&
            hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
                /*
                 * The idmap page is intersecting with the VA space,
                 * it is not safe to continue further.
                 */
                kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
                err = -EINVAL;
                goto out;
        }

        hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
        if (!hyp_pgtable) {
                kvm_err("Hyp mode page-table not allocated\n");
                err = -ENOMEM;
                goto out;
        }

        err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops);
        if (err)
                goto out_free_pgtable;

        err = kvm_map_idmap_text();
        if (err)
                goto out_destroy_pgtable;

        io_map_base = hyp_idmap_start;
        __hyp_va_bits = *hyp_va_bits;
        return 0;

out_destroy_pgtable:
        kvm_pgtable_hyp_destroy(hyp_pgtable);
out_free_pgtable:
        kfree(hyp_pgtable);
        hyp_pgtable = NULL;
out:
        return err;
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
                                   struct kvm_memory_slot *old,
                                   const struct kvm_memory_slot *new,
                                   enum kvm_mr_change change)
{
        bool log_dirty_pages = new && (new->flags & KVM_MEM_LOG_DIRTY_PAGES);

        /*
         * At this point the memslot has been committed and there is an
         * allocated dirty_bitmap[]; dirty pages will be tracked while the
         * memory slot is write protected.
         */
        if (log_dirty_pages) {

                if (change == KVM_MR_DELETE)
                        return;

                /*
                 * Huge and normal pages are write-protected and split
                 * in either of these two cases:
                 *
                 * 1. with initial-all-set: gradually with CLEAR ioctls,
                 */
                if (kvm_dirty_log_manual_protect_and_init_set(kvm))
                        return;
                /*
                 * or
                 * 2. without initial-all-set: all in one shot when
                 *    enabling dirty logging.
                 */
                kvm_mmu_wp_memory_region(kvm, new->id);
                kvm_mmu_split_memory_region(kvm, new->id);
        } else {
                /*
                 * Free any leftovers from the eager page splitting cache. Do
                 * this when deleting, moving, disabling dirty logging, or
                 * creating the memslot (a nop). Doing it for deletes makes
                 * sure we don't leak memory, and there's no need to keep the
                 * cache around for any of the other cases.
                 */
                kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
        }
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                   const struct kvm_memory_slot *old,
                                   struct kvm_memory_slot *new,
                                   enum kvm_mr_change change)
{
        hva_t hva, reg_end;
        int ret = 0;

        if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
            change != KVM_MR_FLAGS_ONLY)
                return 0;

        /*
         * Prevent userspace from creating a memory region outside of the
         * IPA space addressable by the guest.
         */
        if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
                return -EFAULT;

        hva = new->userspace_addr;
        reg_end = hva + (new->npages << PAGE_SHIFT);

        mmap_read_lock(current->mm);
        /*
         * A memory region could potentially cover multiple VMAs, and any holes
         * between them, so iterate over all of them.
         *
         *     +--------------------------------------------+
         * +---------------+----------------+   +----------------+
         * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
         * +---------------+----------------+   +----------------+
         *     |               memory region                |
         *     +--------------------------------------------+
         */
        do {
                struct vm_area_struct *vma;

                vma = find_vma_intersection(current->mm, hva, reg_end);
                if (!vma)
                        break;

                if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
                        ret = -EINVAL;
                        break;
                }

                if (vma->vm_flags & VM_PFNMAP) {
                        /* IO region dirty page logging not allowed */
                        if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
                                ret = -EINVAL;
                                break;
                        }
                }
                hva = min(reg_end, vma->vm_end);
        } while (hva < reg_end);

        mmap_read_unlock(current->mm);
        return ret;
}

void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
}

void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
                                   struct kvm_memory_slot *slot)
{
        gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
        phys_addr_t size = slot->npages << PAGE_SHIFT;

        write_lock(&kvm->mmu_lock);
        kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true);
        kvm_nested_s2_unmap(kvm, true);
        write_unlock(&kvm->mmu_lock);
}

/*
 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
 *
 * Main problems:
 * - S/W ops are local to a CPU (not broadcast)
 * - We have line migration behind our back (speculation)
 * - System caches don't support S/W at all (damn!)
 *
 * In the face of the above, the best we can do is to try and convert
 * S/W ops to VA ops. Because the guest is not allowed to infer the
 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
 * which is a rather good thing for us.
 *
 * Also, it is only used when turning caches on/off ("The expected
 * usage of the cache maintenance instructions that operate by set/way
 * is associated with the cache maintenance instructions associated
 * with the powerdown and powerup of caches, if this is required by
 * the implementation.").
 *
 * We use the following policy:
 *
 * - If we trap an S/W operation, we enable VM trapping to detect
 *   caches being turned on/off, and do a full clean.
 *
 * - We flush the caches both when the caches are turned on and when
 *   they are turned off.
 *
 * - Once the caches are enabled, we stop trapping VM ops.
 */
void kvm_set_way_flush(struct kvm_vcpu *vcpu)
{
        unsigned long hcr = *vcpu_hcr(vcpu);

        /*
         * If this is the first time we do an S/W operation
         * (i.e. HCR_TVM not set), flush the whole of guest memory and
         * enable VM trapping.
         *
         * Otherwise, rely on the VM trapping to wait for the MMU +
         * caches to be turned off. At that point, we'll be able to
         * clean the caches again.
         */
        if (!(hcr & HCR_TVM)) {
                trace_kvm_set_way_flush(*vcpu_pc(vcpu),
                                        vcpu_has_cache_enabled(vcpu));
                stage2_flush_vm(vcpu->kvm);
                *vcpu_hcr(vcpu) = hcr | HCR_TVM;
        }
}

void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
{
        bool now_enabled = vcpu_has_cache_enabled(vcpu);

        /*
         * If the MMU+caches are being switched on, we need to invalidate
         * the caches. If they are being switched off, we need to clean
         * the caches. Clean + invalidate always does the trick.
         */
        if (now_enabled != was_enabled)
                stage2_flush_vm(vcpu->kvm);

        /* Caches are now on, stop trapping VM ops (until the next S/W op) */
        if (now_enabled)
                *vcpu_hcr(vcpu) &= ~HCR_TVM;

        trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
}