1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2012 - Virtual Open Systems and Columbia University 4 * Author: Christoffer Dall <c.dall@virtualopensystems.com> 5 */ 6 7 #include <linux/mman.h> 8 #include <linux/kvm_host.h> 9 #include <linux/io.h> 10 #include <linux/hugetlb.h> 11 #include <linux/sched/signal.h> 12 #include <trace/events/kvm.h> 13 #include <asm/pgalloc.h> 14 #include <asm/cacheflush.h> 15 #include <asm/kvm_arm.h> 16 #include <asm/kvm_mmu.h> 17 #include <asm/kvm_pgtable.h> 18 #include <asm/kvm_pkvm.h> 19 #include <asm/kvm_ras.h> 20 #include <asm/kvm_asm.h> 21 #include <asm/kvm_emulate.h> 22 #include <asm/virt.h> 23 24 #include "trace.h" 25 26 static struct kvm_pgtable *hyp_pgtable; 27 static DEFINE_MUTEX(kvm_hyp_pgd_mutex); 28 29 static unsigned long __ro_after_init hyp_idmap_start; 30 static unsigned long __ro_after_init hyp_idmap_end; 31 static phys_addr_t __ro_after_init hyp_idmap_vector; 32 33 u32 __ro_after_init __hyp_va_bits; 34 35 static unsigned long __ro_after_init io_map_base; 36 37 #define KVM_PGT_FN(fn) (!is_protected_kvm_enabled() ? fn : p ## fn) 38 39 static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end, 40 phys_addr_t size) 41 { 42 phys_addr_t boundary = ALIGN_DOWN(addr + size, size); 43 44 return (boundary - 1 < end - 1) ? boundary : end; 45 } 46 47 static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) 48 { 49 phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); 50 51 return __stage2_range_addr_end(addr, end, size); 52 } 53 54 /* 55 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise, 56 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK, 57 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too 58 * long will also starve other vCPUs. We have to also make sure that the page 59 * tables are not freed while we released the lock. 60 */ 61 static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, 62 phys_addr_t end, 63 int (*fn)(struct kvm_pgtable *, u64, u64), 64 bool resched) 65 { 66 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 67 int ret; 68 u64 next; 69 70 do { 71 struct kvm_pgtable *pgt = mmu->pgt; 72 if (!pgt) 73 return -EINVAL; 74 75 next = stage2_range_addr_end(addr, end); 76 ret = fn(pgt, addr, next - addr); 77 if (ret) 78 break; 79 80 if (resched && next != end) 81 cond_resched_rwlock_write(&kvm->mmu_lock); 82 } while (addr = next, addr != end); 83 84 return ret; 85 } 86 87 #define stage2_apply_range_resched(mmu, addr, end, fn) \ 88 stage2_apply_range(mmu, addr, end, fn, true) 89 90 /* 91 * Get the maximum number of page-tables pages needed to split a range 92 * of blocks into PAGE_SIZE PTEs. It assumes the range is already 93 * mapped at level 2, or at level 1 if allowed. 
94 */ 95 static int kvm_mmu_split_nr_page_tables(u64 range) 96 { 97 int n = 0; 98 99 if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2) 100 n += DIV_ROUND_UP(range, PUD_SIZE); 101 n += DIV_ROUND_UP(range, PMD_SIZE); 102 return n; 103 } 104 105 static bool need_split_memcache_topup_or_resched(struct kvm *kvm) 106 { 107 struct kvm_mmu_memory_cache *cache; 108 u64 chunk_size, min; 109 110 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) 111 return true; 112 113 chunk_size = kvm->arch.mmu.split_page_chunk_size; 114 min = kvm_mmu_split_nr_page_tables(chunk_size); 115 cache = &kvm->arch.mmu.split_page_cache; 116 return kvm_mmu_memory_cache_nr_free_objects(cache) < min; 117 } 118 119 static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr, 120 phys_addr_t end) 121 { 122 struct kvm_mmu_memory_cache *cache; 123 struct kvm_pgtable *pgt; 124 int ret, cache_capacity; 125 u64 next, chunk_size; 126 127 lockdep_assert_held_write(&kvm->mmu_lock); 128 129 chunk_size = kvm->arch.mmu.split_page_chunk_size; 130 cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size); 131 132 if (chunk_size == 0) 133 return 0; 134 135 cache = &kvm->arch.mmu.split_page_cache; 136 137 do { 138 if (need_split_memcache_topup_or_resched(kvm)) { 139 write_unlock(&kvm->mmu_lock); 140 cond_resched(); 141 /* Eager page splitting is best-effort. */ 142 ret = __kvm_mmu_topup_memory_cache(cache, 143 cache_capacity, 144 cache_capacity); 145 write_lock(&kvm->mmu_lock); 146 if (ret) 147 break; 148 } 149 150 pgt = kvm->arch.mmu.pgt; 151 if (!pgt) 152 return -EINVAL; 153 154 next = __stage2_range_addr_end(addr, end, chunk_size); 155 ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache); 156 if (ret) 157 break; 158 } while (addr = next, addr != end); 159 160 return ret; 161 } 162 163 static bool memslot_is_logging(struct kvm_memory_slot *memslot) 164 { 165 return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); 166 } 167 168 /** 169 * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8 170 * @kvm: pointer to kvm structure. 
171 * 172 * Interface to HYP function to flush all VM TLB entries 173 */ 174 int kvm_arch_flush_remote_tlbs(struct kvm *kvm) 175 { 176 if (is_protected_kvm_enabled()) 177 kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); 178 else 179 kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); 180 return 0; 181 } 182 183 int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, 184 gfn_t gfn, u64 nr_pages) 185 { 186 u64 size = nr_pages << PAGE_SHIFT; 187 u64 addr = gfn << PAGE_SHIFT; 188 189 if (is_protected_kvm_enabled()) 190 kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); 191 else 192 kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size); 193 return 0; 194 } 195 196 static void *stage2_memcache_zalloc_page(void *arg) 197 { 198 struct kvm_mmu_memory_cache *mc = arg; 199 void *virt; 200 201 /* Allocated with __GFP_ZERO, so no need to zero */ 202 virt = kvm_mmu_memory_cache_alloc(mc); 203 if (virt) 204 kvm_account_pgtable_pages(virt, 1); 205 return virt; 206 } 207 208 static void *kvm_host_zalloc_pages_exact(size_t size) 209 { 210 return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); 211 } 212 213 static void *kvm_s2_zalloc_pages_exact(size_t size) 214 { 215 void *virt = kvm_host_zalloc_pages_exact(size); 216 217 if (virt) 218 kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT)); 219 return virt; 220 } 221 222 static void kvm_s2_free_pages_exact(void *virt, size_t size) 223 { 224 kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT)); 225 free_pages_exact(virt, size); 226 } 227 228 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops; 229 230 static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head) 231 { 232 struct page *page = container_of(head, struct page, rcu_head); 233 void *pgtable = page_to_virt(page); 234 s8 level = page_private(page); 235 236 KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level); 237 } 238 239 static void stage2_free_unlinked_table(void *addr, s8 level) 240 { 241 struct page *page = virt_to_page(addr); 242 243 set_page_private(page, (unsigned long)level); 244 call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb); 245 } 246 247 static void kvm_host_get_page(void *addr) 248 { 249 get_page(virt_to_page(addr)); 250 } 251 252 static void kvm_host_put_page(void *addr) 253 { 254 put_page(virt_to_page(addr)); 255 } 256 257 static void kvm_s2_put_page(void *addr) 258 { 259 struct page *p = virt_to_page(addr); 260 /* Dropping last refcount, the page will be freed */ 261 if (page_count(p) == 1) 262 kvm_account_pgtable_pages(addr, -1); 263 put_page(p); 264 } 265 266 static int kvm_host_page_count(void *addr) 267 { 268 return page_count(virt_to_page(addr)); 269 } 270 271 static phys_addr_t kvm_host_pa(void *addr) 272 { 273 return __pa(addr); 274 } 275 276 static void *kvm_host_va(phys_addr_t phys) 277 { 278 return __va(phys); 279 } 280 281 static void clean_dcache_guest_page(void *va, size_t size) 282 { 283 __clean_dcache_guest_page(va, size); 284 } 285 286 static void invalidate_icache_guest_page(void *va, size_t size) 287 { 288 __invalidate_icache_guest_page(va, size); 289 } 290 291 /* 292 * Unmapping vs dcache management: 293 * 294 * If a guest maps certain memory pages as uncached, all writes will 295 * bypass the data cache and go directly to RAM. However, the CPUs 296 * can still speculate reads (not writes) and fill cache lines with 297 * data. 
298 * 299 * Those cache lines will be *clean* cache lines though, so a 300 * clean+invalidate operation is equivalent to an invalidate 301 * operation, because no cache lines are marked dirty. 302 * 303 * Those clean cache lines could be filled prior to an uncached write 304 * by the guest, and the cache coherent IO subsystem would therefore 305 * end up writing old data to disk. 306 * 307 * This is why right after unmapping a page/section and invalidating 308 * the corresponding TLBs, we flush to make sure the IO subsystem will 309 * never hit in the cache. 310 * 311 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as 312 * we then fully enforce cacheability of RAM, no matter what the guest 313 * does. 314 */ 315 /** 316 * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range 317 * @mmu: The KVM stage-2 MMU pointer 318 * @start: The intermediate physical base address of the range to unmap 319 * @size: The size of the area to unmap 320 * @may_block: Whether or not we are permitted to block 321 * 322 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must 323 * be called while holding mmu_lock (unless for freeing the stage2 pgd before 324 * destroying the VM), otherwise another faulting VCPU may come in and mess 325 * with things behind our backs. 326 */ 327 static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size, 328 bool may_block) 329 { 330 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 331 phys_addr_t end = start + size; 332 333 lockdep_assert_held_write(&kvm->mmu_lock); 334 WARN_ON(size & ~PAGE_MASK); 335 WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap), 336 may_block)); 337 } 338 339 void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, 340 u64 size, bool may_block) 341 { 342 __unmap_stage2_range(mmu, start, size, may_block); 343 } 344 345 void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) 346 { 347 stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush)); 348 } 349 350 static void stage2_flush_memslot(struct kvm *kvm, 351 struct kvm_memory_slot *memslot) 352 { 353 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; 354 phys_addr_t end = addr + PAGE_SIZE * memslot->npages; 355 356 kvm_stage2_flush_range(&kvm->arch.mmu, addr, end); 357 } 358 359 /** 360 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2 361 * @kvm: The struct kvm pointer 362 * 363 * Go through the stage 2 page tables and invalidate any cache lines 364 * backing memory already mapped to the VM. 
365 */ 366 static void stage2_flush_vm(struct kvm *kvm) 367 { 368 struct kvm_memslots *slots; 369 struct kvm_memory_slot *memslot; 370 int idx, bkt; 371 372 idx = srcu_read_lock(&kvm->srcu); 373 write_lock(&kvm->mmu_lock); 374 375 slots = kvm_memslots(kvm); 376 kvm_for_each_memslot(memslot, bkt, slots) 377 stage2_flush_memslot(kvm, memslot); 378 379 kvm_nested_s2_flush(kvm); 380 381 write_unlock(&kvm->mmu_lock); 382 srcu_read_unlock(&kvm->srcu, idx); 383 } 384 385 /** 386 * free_hyp_pgds - free Hyp-mode page tables 387 */ 388 void __init free_hyp_pgds(void) 389 { 390 mutex_lock(&kvm_hyp_pgd_mutex); 391 if (hyp_pgtable) { 392 kvm_pgtable_hyp_destroy(hyp_pgtable); 393 kfree(hyp_pgtable); 394 hyp_pgtable = NULL; 395 } 396 mutex_unlock(&kvm_hyp_pgd_mutex); 397 } 398 399 static bool kvm_host_owns_hyp_mappings(void) 400 { 401 if (is_kernel_in_hyp_mode()) 402 return false; 403 404 if (static_branch_likely(&kvm_protected_mode_initialized)) 405 return false; 406 407 /* 408 * This can happen at boot time when __create_hyp_mappings() is called 409 * after the hyp protection has been enabled, but the static key has 410 * not been flipped yet. 411 */ 412 if (!hyp_pgtable && is_protected_kvm_enabled()) 413 return false; 414 415 WARN_ON(!hyp_pgtable); 416 417 return true; 418 } 419 420 int __create_hyp_mappings(unsigned long start, unsigned long size, 421 unsigned long phys, enum kvm_pgtable_prot prot) 422 { 423 int err; 424 425 if (WARN_ON(!kvm_host_owns_hyp_mappings())) 426 return -EINVAL; 427 428 mutex_lock(&kvm_hyp_pgd_mutex); 429 err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot); 430 mutex_unlock(&kvm_hyp_pgd_mutex); 431 432 return err; 433 } 434 435 static phys_addr_t kvm_kaddr_to_phys(void *kaddr) 436 { 437 if (!is_vmalloc_addr(kaddr)) { 438 BUG_ON(!virt_addr_valid(kaddr)); 439 return __pa(kaddr); 440 } else { 441 return page_to_phys(vmalloc_to_page(kaddr)) + 442 offset_in_page(kaddr); 443 } 444 } 445 446 struct hyp_shared_pfn { 447 u64 pfn; 448 int count; 449 struct rb_node node; 450 }; 451 452 static DEFINE_MUTEX(hyp_shared_pfns_lock); 453 static struct rb_root hyp_shared_pfns = RB_ROOT; 454 455 static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node, 456 struct rb_node **parent) 457 { 458 struct hyp_shared_pfn *this; 459 460 *node = &hyp_shared_pfns.rb_node; 461 *parent = NULL; 462 while (**node) { 463 this = container_of(**node, struct hyp_shared_pfn, node); 464 *parent = **node; 465 if (this->pfn < pfn) 466 *node = &((**node)->rb_left); 467 else if (this->pfn > pfn) 468 *node = &((**node)->rb_right); 469 else 470 return this; 471 } 472 473 return NULL; 474 } 475 476 static int share_pfn_hyp(u64 pfn) 477 { 478 struct rb_node **node, *parent; 479 struct hyp_shared_pfn *this; 480 int ret = 0; 481 482 mutex_lock(&hyp_shared_pfns_lock); 483 this = find_shared_pfn(pfn, &node, &parent); 484 if (this) { 485 this->count++; 486 goto unlock; 487 } 488 489 this = kzalloc(sizeof(*this), GFP_KERNEL); 490 if (!this) { 491 ret = -ENOMEM; 492 goto unlock; 493 } 494 495 this->pfn = pfn; 496 this->count = 1; 497 rb_link_node(&this->node, parent, node); 498 rb_insert_color(&this->node, &hyp_shared_pfns); 499 ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn, 1); 500 unlock: 501 mutex_unlock(&hyp_shared_pfns_lock); 502 503 return ret; 504 } 505 506 static int unshare_pfn_hyp(u64 pfn) 507 { 508 struct rb_node **node, *parent; 509 struct hyp_shared_pfn *this; 510 int ret = 0; 511 512 mutex_lock(&hyp_shared_pfns_lock); 513 this = find_shared_pfn(pfn, &node, &parent); 514 if 
(WARN_ON(!this)) { 515 ret = -ENOENT; 516 goto unlock; 517 } 518 519 this->count--; 520 if (this->count) 521 goto unlock; 522 523 rb_erase(&this->node, &hyp_shared_pfns); 524 kfree(this); 525 ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn, 1); 526 unlock: 527 mutex_unlock(&hyp_shared_pfns_lock); 528 529 return ret; 530 } 531 532 int kvm_share_hyp(void *from, void *to) 533 { 534 phys_addr_t start, end, cur; 535 u64 pfn; 536 int ret; 537 538 if (is_kernel_in_hyp_mode()) 539 return 0; 540 541 /* 542 * The share hcall maps things in the 'fixed-offset' region of the hyp 543 * VA space, so we can only share physically contiguous data-structures 544 * for now. 545 */ 546 if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to)) 547 return -EINVAL; 548 549 if (kvm_host_owns_hyp_mappings()) 550 return create_hyp_mappings(from, to, PAGE_HYP); 551 552 start = ALIGN_DOWN(__pa(from), PAGE_SIZE); 553 end = PAGE_ALIGN(__pa(to)); 554 for (cur = start; cur < end; cur += PAGE_SIZE) { 555 pfn = __phys_to_pfn(cur); 556 ret = share_pfn_hyp(pfn); 557 if (ret) 558 return ret; 559 } 560 561 return 0; 562 } 563 564 void kvm_unshare_hyp(void *from, void *to) 565 { 566 phys_addr_t start, end, cur; 567 u64 pfn; 568 569 if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from) 570 return; 571 572 start = ALIGN_DOWN(__pa(from), PAGE_SIZE); 573 end = PAGE_ALIGN(__pa(to)); 574 for (cur = start; cur < end; cur += PAGE_SIZE) { 575 pfn = __phys_to_pfn(cur); 576 WARN_ON(unshare_pfn_hyp(pfn)); 577 } 578 } 579 580 /** 581 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode 582 * @from: The virtual kernel start address of the range 583 * @to: The virtual kernel end address of the range (exclusive) 584 * @prot: The protection to be applied to this range 585 * 586 * The same virtual address as the kernel virtual address is also used 587 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying 588 * physical pages. 589 */ 590 int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) 591 { 592 phys_addr_t phys_addr; 593 unsigned long virt_addr; 594 unsigned long start = kern_hyp_va((unsigned long)from); 595 unsigned long end = kern_hyp_va((unsigned long)to); 596 597 if (is_kernel_in_hyp_mode()) 598 return 0; 599 600 if (!kvm_host_owns_hyp_mappings()) 601 return -EPERM; 602 603 start = start & PAGE_MASK; 604 end = PAGE_ALIGN(end); 605 606 for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { 607 int err; 608 609 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); 610 err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr, 611 prot); 612 if (err) 613 return err; 614 } 615 616 return 0; 617 } 618 619 static int __hyp_alloc_private_va_range(unsigned long base) 620 { 621 lockdep_assert_held(&kvm_hyp_pgd_mutex); 622 623 if (!PAGE_ALIGNED(base)) 624 return -EINVAL; 625 626 /* 627 * Verify that BIT(VA_BITS - 1) hasn't been flipped by 628 * allocating the new area, as it would indicate we've 629 * overflowed the idmap/IO address range. 630 */ 631 if ((base ^ io_map_base) & BIT(VA_BITS - 1)) 632 return -ENOMEM; 633 634 io_map_base = base; 635 636 return 0; 637 } 638 639 /** 640 * hyp_alloc_private_va_range - Allocates a private VA range. 641 * @size: The size of the VA range to reserve. 642 * @haddr: The hypervisor virtual start address of the allocation. 643 * 644 * The private virtual address (VA) range is allocated below io_map_base 645 * and aligned based on the order of @size. 
646 * 647 * Return: 0 on success or negative error code on failure. 648 */ 649 int hyp_alloc_private_va_range(size_t size, unsigned long *haddr) 650 { 651 unsigned long base; 652 int ret = 0; 653 654 mutex_lock(&kvm_hyp_pgd_mutex); 655 656 /* 657 * This assumes that we have enough space below the idmap 658 * page to allocate our VAs. If not, the check in 659 * __hyp_alloc_private_va_range() will kick. A potential 660 * alternative would be to detect that overflow and switch 661 * to an allocation above the idmap. 662 * 663 * The allocated size is always a multiple of PAGE_SIZE. 664 */ 665 size = PAGE_ALIGN(size); 666 base = io_map_base - size; 667 ret = __hyp_alloc_private_va_range(base); 668 669 mutex_unlock(&kvm_hyp_pgd_mutex); 670 671 if (!ret) 672 *haddr = base; 673 674 return ret; 675 } 676 677 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, 678 unsigned long *haddr, 679 enum kvm_pgtable_prot prot) 680 { 681 unsigned long addr; 682 int ret = 0; 683 684 if (!kvm_host_owns_hyp_mappings()) { 685 addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping, 686 phys_addr, size, prot); 687 if (IS_ERR_VALUE(addr)) 688 return addr; 689 *haddr = addr; 690 691 return 0; 692 } 693 694 size = PAGE_ALIGN(size + offset_in_page(phys_addr)); 695 ret = hyp_alloc_private_va_range(size, &addr); 696 if (ret) 697 return ret; 698 699 ret = __create_hyp_mappings(addr, size, phys_addr, prot); 700 if (ret) 701 return ret; 702 703 *haddr = addr + offset_in_page(phys_addr); 704 return ret; 705 } 706 707 int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr) 708 { 709 unsigned long base; 710 size_t size; 711 int ret; 712 713 mutex_lock(&kvm_hyp_pgd_mutex); 714 /* 715 * Efficient stack verification using the NVHE_STACK_SHIFT bit implies 716 * an alignment of our allocation on the order of the size. 717 */ 718 size = NVHE_STACK_SIZE * 2; 719 base = ALIGN_DOWN(io_map_base - size, size); 720 721 ret = __hyp_alloc_private_va_range(base); 722 723 mutex_unlock(&kvm_hyp_pgd_mutex); 724 725 if (ret) { 726 kvm_err("Cannot allocate hyp stack guard page\n"); 727 return ret; 728 } 729 730 /* 731 * Since the stack grows downwards, map the stack to the page 732 * at the higher address and leave the lower guard page 733 * unbacked. 734 * 735 * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1 736 * and addresses corresponding to the guard page have the 737 * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection. 
738 */ 739 ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE, 740 phys_addr, PAGE_HYP); 741 if (ret) 742 kvm_err("Cannot map hyp stack\n"); 743 744 *haddr = base + size; 745 746 return ret; 747 } 748 749 /** 750 * create_hyp_io_mappings - Map IO into both kernel and HYP 751 * @phys_addr: The physical start address which gets mapped 752 * @size: Size of the region being mapped 753 * @kaddr: Kernel VA for this mapping 754 * @haddr: HYP VA for this mapping 755 */ 756 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, 757 void __iomem **kaddr, 758 void __iomem **haddr) 759 { 760 unsigned long addr; 761 int ret; 762 763 if (is_protected_kvm_enabled()) 764 return -EPERM; 765 766 *kaddr = ioremap(phys_addr, size); 767 if (!*kaddr) 768 return -ENOMEM; 769 770 if (is_kernel_in_hyp_mode()) { 771 *haddr = *kaddr; 772 return 0; 773 } 774 775 ret = __create_hyp_private_mapping(phys_addr, size, 776 &addr, PAGE_HYP_DEVICE); 777 if (ret) { 778 iounmap(*kaddr); 779 *kaddr = NULL; 780 *haddr = NULL; 781 return ret; 782 } 783 784 *haddr = (void __iomem *)addr; 785 return 0; 786 } 787 788 /** 789 * create_hyp_exec_mappings - Map an executable range into HYP 790 * @phys_addr: The physical start address which gets mapped 791 * @size: Size of the region being mapped 792 * @haddr: HYP VA for this mapping 793 */ 794 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, 795 void **haddr) 796 { 797 unsigned long addr; 798 int ret; 799 800 BUG_ON(is_kernel_in_hyp_mode()); 801 802 ret = __create_hyp_private_mapping(phys_addr, size, 803 &addr, PAGE_HYP_EXEC); 804 if (ret) { 805 *haddr = NULL; 806 return ret; 807 } 808 809 *haddr = (void *)addr; 810 return 0; 811 } 812 813 static struct kvm_pgtable_mm_ops kvm_user_mm_ops = { 814 /* We shouldn't need any other callback to walk the PT */ 815 .phys_to_virt = kvm_host_va, 816 }; 817 818 static int get_user_mapping_size(struct kvm *kvm, u64 addr) 819 { 820 struct kvm_pgtable pgt = { 821 .pgd = (kvm_pteref_t)kvm->mm->pgd, 822 .ia_bits = vabits_actual, 823 .start_level = (KVM_PGTABLE_LAST_LEVEL - 824 ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1), 825 .mm_ops = &kvm_user_mm_ops, 826 }; 827 unsigned long flags; 828 kvm_pte_t pte = 0; /* Keep GCC quiet... */ 829 s8 level = S8_MAX; 830 int ret; 831 832 /* 833 * Disable IRQs so that we hazard against a concurrent 834 * teardown of the userspace page tables (which relies on 835 * IPI-ing threads). 836 */ 837 local_irq_save(flags); 838 ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); 839 local_irq_restore(flags); 840 841 if (ret) 842 return ret; 843 844 /* 845 * Not seeing an error, but not updating level? Something went 846 * deeply wrong... 847 */ 848 if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL)) 849 return -EFAULT; 850 if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL)) 851 return -EFAULT; 852 853 /* Oops, the userspace PTs are gone... 
Replay the fault */ 854 if (!kvm_pte_valid(pte)) 855 return -EAGAIN; 856 857 return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); 858 } 859 860 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { 861 .zalloc_page = stage2_memcache_zalloc_page, 862 .zalloc_pages_exact = kvm_s2_zalloc_pages_exact, 863 .free_pages_exact = kvm_s2_free_pages_exact, 864 .free_unlinked_table = stage2_free_unlinked_table, 865 .get_page = kvm_host_get_page, 866 .put_page = kvm_s2_put_page, 867 .page_count = kvm_host_page_count, 868 .phys_to_virt = kvm_host_va, 869 .virt_to_phys = kvm_host_pa, 870 .dcache_clean_inval_poc = clean_dcache_guest_page, 871 .icache_inval_pou = invalidate_icache_guest_page, 872 }; 873 874 static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type) 875 { 876 u32 kvm_ipa_limit = get_kvm_ipa_limit(); 877 u64 mmfr0, mmfr1; 878 u32 phys_shift; 879 880 if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) 881 return -EINVAL; 882 883 phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); 884 if (is_protected_kvm_enabled()) { 885 phys_shift = kvm_ipa_limit; 886 } else if (phys_shift) { 887 if (phys_shift > kvm_ipa_limit || 888 phys_shift < ARM64_MIN_PARANGE_BITS) 889 return -EINVAL; 890 } else { 891 phys_shift = KVM_PHYS_SHIFT; 892 if (phys_shift > kvm_ipa_limit) { 893 pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", 894 current->comm); 895 return -EINVAL; 896 } 897 } 898 899 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); 900 mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); 901 mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); 902 903 return 0; 904 } 905 906 /** 907 * kvm_init_stage2_mmu - Initialise a S2 MMU structure 908 * @kvm: The pointer to the KVM structure 909 * @mmu: The pointer to the s2 MMU structure 910 * @type: The machine type of the virtual machine 911 * 912 * Allocates only the stage-2 HW PGD level table(s). 913 * Note we don't need locking here as this is only called in two cases: 914 * 915 * - when the VM is created, which can't race against anything 916 * 917 * - when secondary kvm_s2_mmu structures are initialised for NV 918 * guests, and the caller must hold kvm->lock as this is called on a 919 * per-vcpu basis. 920 */ 921 int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) 922 { 923 int cpu, err; 924 struct kvm_pgtable *pgt; 925 926 /* 927 * If we already have our page tables in place, and that the 928 * MMU context is the canonical one, we have a bug somewhere, 929 * as this is only supposed to ever happen once per VM. 930 * 931 * Otherwise, we're building nested page tables, and that's 932 * probably because userspace called KVM_ARM_VCPU_INIT more 933 * than once on the same vcpu. Since that's actually legal, 934 * don't kick a fuss and leave gracefully. 
935 */ 936 if (mmu->pgt != NULL) { 937 if (kvm_is_nested_s2_mmu(kvm, mmu)) 938 return 0; 939 940 kvm_err("kvm_arch already initialized?\n"); 941 return -EINVAL; 942 } 943 944 err = kvm_init_ipa_range(mmu, type); 945 if (err) 946 return err; 947 948 pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT); 949 if (!pgt) 950 return -ENOMEM; 951 952 mmu->arch = &kvm->arch; 953 err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops); 954 if (err) 955 goto out_free_pgtable; 956 957 mmu->pgt = pgt; 958 if (is_protected_kvm_enabled()) 959 return 0; 960 961 mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran)); 962 if (!mmu->last_vcpu_ran) { 963 err = -ENOMEM; 964 goto out_destroy_pgtable; 965 } 966 967 for_each_possible_cpu(cpu) 968 *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1; 969 970 /* The eager page splitting is disabled by default */ 971 mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; 972 mmu->split_page_cache.gfp_zero = __GFP_ZERO; 973 974 mmu->pgd_phys = __pa(pgt->pgd); 975 976 if (kvm_is_nested_s2_mmu(kvm, mmu)) 977 kvm_init_nested_s2_mmu(mmu); 978 979 return 0; 980 981 out_destroy_pgtable: 982 KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); 983 out_free_pgtable: 984 kfree(pgt); 985 return err; 986 } 987 988 void kvm_uninit_stage2_mmu(struct kvm *kvm) 989 { 990 kvm_free_stage2_pgd(&kvm->arch.mmu); 991 kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); 992 } 993 994 static void stage2_unmap_memslot(struct kvm *kvm, 995 struct kvm_memory_slot *memslot) 996 { 997 hva_t hva = memslot->userspace_addr; 998 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; 999 phys_addr_t size = PAGE_SIZE * memslot->npages; 1000 hva_t reg_end = hva + size; 1001 1002 /* 1003 * A memory region could potentially cover multiple VMAs, and any holes 1004 * between them, so iterate over all of them to find out if we should 1005 * unmap any of them. 1006 * 1007 * +--------------------------------------------+ 1008 * +---------------+----------------+ +----------------+ 1009 * | : VMA 1 | VMA 2 | | VMA 3 : | 1010 * +---------------+----------------+ +----------------+ 1011 * | memory region | 1012 * +--------------------------------------------+ 1013 */ 1014 do { 1015 struct vm_area_struct *vma; 1016 hva_t vm_start, vm_end; 1017 1018 vma = find_vma_intersection(current->mm, hva, reg_end); 1019 if (!vma) 1020 break; 1021 1022 /* 1023 * Take the intersection of this VMA with the memory region 1024 */ 1025 vm_start = max(hva, vma->vm_start); 1026 vm_end = min(reg_end, vma->vm_end); 1027 1028 if (!(vma->vm_flags & VM_PFNMAP)) { 1029 gpa_t gpa = addr + (vm_start - memslot->userspace_addr); 1030 kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true); 1031 } 1032 hva = vm_end; 1033 } while (hva < reg_end); 1034 } 1035 1036 /** 1037 * stage2_unmap_vm - Unmap Stage-2 RAM mappings 1038 * @kvm: The struct kvm pointer 1039 * 1040 * Go through the memregions and unmap any regular RAM 1041 * backing memory already mapped to the VM. 
1042 */ 1043 void stage2_unmap_vm(struct kvm *kvm) 1044 { 1045 struct kvm_memslots *slots; 1046 struct kvm_memory_slot *memslot; 1047 int idx, bkt; 1048 1049 idx = srcu_read_lock(&kvm->srcu); 1050 mmap_read_lock(current->mm); 1051 write_lock(&kvm->mmu_lock); 1052 1053 slots = kvm_memslots(kvm); 1054 kvm_for_each_memslot(memslot, bkt, slots) 1055 stage2_unmap_memslot(kvm, memslot); 1056 1057 kvm_nested_s2_unmap(kvm, true); 1058 1059 write_unlock(&kvm->mmu_lock); 1060 mmap_read_unlock(current->mm); 1061 srcu_read_unlock(&kvm->srcu, idx); 1062 } 1063 1064 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) 1065 { 1066 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 1067 struct kvm_pgtable *pgt = NULL; 1068 1069 write_lock(&kvm->mmu_lock); 1070 pgt = mmu->pgt; 1071 if (pgt) { 1072 mmu->pgd_phys = 0; 1073 mmu->pgt = NULL; 1074 free_percpu(mmu->last_vcpu_ran); 1075 } 1076 write_unlock(&kvm->mmu_lock); 1077 1078 if (pgt) { 1079 KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); 1080 kfree(pgt); 1081 } 1082 } 1083 1084 static void hyp_mc_free_fn(void *addr, void *mc) 1085 { 1086 struct kvm_hyp_memcache *memcache = mc; 1087 1088 if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) 1089 kvm_account_pgtable_pages(addr, -1); 1090 1091 free_page((unsigned long)addr); 1092 } 1093 1094 static void *hyp_mc_alloc_fn(void *mc) 1095 { 1096 struct kvm_hyp_memcache *memcache = mc; 1097 void *addr; 1098 1099 addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); 1100 if (addr && memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) 1101 kvm_account_pgtable_pages(addr, 1); 1102 1103 return addr; 1104 } 1105 1106 void free_hyp_memcache(struct kvm_hyp_memcache *mc) 1107 { 1108 if (!is_protected_kvm_enabled()) 1109 return; 1110 1111 kfree(mc->mapping); 1112 __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc); 1113 } 1114 1115 int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) 1116 { 1117 if (!is_protected_kvm_enabled()) 1118 return 0; 1119 1120 if (!mc->mapping) { 1121 mc->mapping = kzalloc(sizeof(struct pkvm_mapping), GFP_KERNEL_ACCOUNT); 1122 if (!mc->mapping) 1123 return -ENOMEM; 1124 } 1125 1126 return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn, 1127 kvm_host_pa, mc); 1128 } 1129 1130 /** 1131 * kvm_phys_addr_ioremap - map a device range to guest IPA 1132 * 1133 * @kvm: The KVM pointer 1134 * @guest_ipa: The IPA at which to insert the mapping 1135 * @pa: The physical address of the device 1136 * @size: The size of the mapping 1137 * @writable: Whether or not to create a writable mapping 1138 */ 1139 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, 1140 phys_addr_t pa, unsigned long size, bool writable) 1141 { 1142 phys_addr_t addr; 1143 int ret = 0; 1144 struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO }; 1145 struct kvm_s2_mmu *mmu = &kvm->arch.mmu; 1146 struct kvm_pgtable *pgt = mmu->pgt; 1147 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE | 1148 KVM_PGTABLE_PROT_R | 1149 (writable ? 
KVM_PGTABLE_PROT_W : 0); 1150 1151 if (is_protected_kvm_enabled()) 1152 return -EPERM; 1153 1154 size += offset_in_page(guest_ipa); 1155 guest_ipa &= PAGE_MASK; 1156 1157 for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) { 1158 ret = kvm_mmu_topup_memory_cache(&cache, 1159 kvm_mmu_cache_min_pages(mmu)); 1160 if (ret) 1161 break; 1162 1163 write_lock(&kvm->mmu_lock); 1164 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE, 1165 pa, prot, &cache, 0); 1166 write_unlock(&kvm->mmu_lock); 1167 if (ret) 1168 break; 1169 1170 pa += PAGE_SIZE; 1171 } 1172 1173 kvm_mmu_free_memory_cache(&cache); 1174 return ret; 1175 } 1176 1177 /** 1178 * kvm_stage2_wp_range() - write protect stage2 memory region range 1179 * @mmu: The KVM stage-2 MMU pointer 1180 * @addr: Start address of range 1181 * @end: End address of range 1182 */ 1183 void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) 1184 { 1185 stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect)); 1186 } 1187 1188 /** 1189 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot 1190 * @kvm: The KVM pointer 1191 * @slot: The memory slot to write protect 1192 * 1193 * Called to start logging dirty pages after memory region 1194 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns 1195 * all present PUD, PMD and PTEs are write protected in the memory region. 1196 * Afterwards read of dirty page log can be called. 1197 * 1198 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, 1199 * serializing operations for VM memory regions. 1200 */ 1201 static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) 1202 { 1203 struct kvm_memslots *slots = kvm_memslots(kvm); 1204 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); 1205 phys_addr_t start, end; 1206 1207 if (WARN_ON_ONCE(!memslot)) 1208 return; 1209 1210 start = memslot->base_gfn << PAGE_SHIFT; 1211 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1212 1213 write_lock(&kvm->mmu_lock); 1214 kvm_stage2_wp_range(&kvm->arch.mmu, start, end); 1215 kvm_nested_s2_wp(kvm); 1216 write_unlock(&kvm->mmu_lock); 1217 kvm_flush_remote_tlbs_memslot(kvm, memslot); 1218 } 1219 1220 /** 1221 * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE 1222 * pages for memory slot 1223 * @kvm: The KVM pointer 1224 * @slot: The memory slot to split 1225 * 1226 * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired, 1227 * serializing operations for VM memory regions. 1228 */ 1229 static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot) 1230 { 1231 struct kvm_memslots *slots; 1232 struct kvm_memory_slot *memslot; 1233 phys_addr_t start, end; 1234 1235 lockdep_assert_held(&kvm->slots_lock); 1236 1237 slots = kvm_memslots(kvm); 1238 memslot = id_to_memslot(slots, slot); 1239 1240 start = memslot->base_gfn << PAGE_SHIFT; 1241 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1242 1243 write_lock(&kvm->mmu_lock); 1244 kvm_mmu_split_huge_pages(kvm, start, end); 1245 write_unlock(&kvm->mmu_lock); 1246 } 1247 1248 /* 1249 * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages. 
1250 * @kvm: The KVM pointer 1251 * @slot: The memory slot associated with mask 1252 * @gfn_offset: The gfn offset in memory slot 1253 * @mask: The mask of pages at offset 'gfn_offset' in this memory 1254 * slot to enable dirty logging on 1255 * 1256 * Writes protect selected pages to enable dirty logging, and then 1257 * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock. 1258 */ 1259 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, 1260 struct kvm_memory_slot *slot, 1261 gfn_t gfn_offset, unsigned long mask) 1262 { 1263 phys_addr_t base_gfn = slot->base_gfn + gfn_offset; 1264 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; 1265 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; 1266 1267 lockdep_assert_held_write(&kvm->mmu_lock); 1268 1269 kvm_stage2_wp_range(&kvm->arch.mmu, start, end); 1270 1271 /* 1272 * Eager-splitting is done when manual-protect is set. We 1273 * also check for initially-all-set because we can avoid 1274 * eager-splitting if initially-all-set is false. 1275 * Initially-all-set equal false implies that huge-pages were 1276 * already split when enabling dirty logging: no need to do it 1277 * again. 1278 */ 1279 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) 1280 kvm_mmu_split_huge_pages(kvm, start, end); 1281 1282 kvm_nested_s2_wp(kvm); 1283 } 1284 1285 static void kvm_send_hwpoison_signal(unsigned long address, short lsb) 1286 { 1287 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); 1288 } 1289 1290 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, 1291 unsigned long hva, 1292 unsigned long map_size) 1293 { 1294 gpa_t gpa_start; 1295 hva_t uaddr_start, uaddr_end; 1296 size_t size; 1297 1298 /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */ 1299 if (map_size == PAGE_SIZE) 1300 return true; 1301 1302 /* pKVM only supports PMD_SIZE huge-mappings */ 1303 if (is_protected_kvm_enabled() && map_size != PMD_SIZE) 1304 return false; 1305 1306 size = memslot->npages * PAGE_SIZE; 1307 1308 gpa_start = memslot->base_gfn << PAGE_SHIFT; 1309 1310 uaddr_start = memslot->userspace_addr; 1311 uaddr_end = uaddr_start + size; 1312 1313 /* 1314 * Pages belonging to memslots that don't have the same alignment 1315 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2 1316 * PMD/PUD entries, because we'll end up mapping the wrong pages. 1317 * 1318 * Consider a layout like the following: 1319 * 1320 * memslot->userspace_addr: 1321 * +-----+--------------------+--------------------+---+ 1322 * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| 1323 * +-----+--------------------+--------------------+---+ 1324 * 1325 * memslot->base_gfn << PAGE_SHIFT: 1326 * +---+--------------------+--------------------+-----+ 1327 * |abc|def Stage-2 block | Stage-2 block |tvxyz| 1328 * +---+--------------------+--------------------+-----+ 1329 * 1330 * If we create those stage-2 blocks, we'll end up with this incorrect 1331 * mapping: 1332 * d -> f 1333 * e -> g 1334 * f -> h 1335 */ 1336 if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) 1337 return false; 1338 1339 /* 1340 * Next, let's make sure we're not trying to map anything not covered 1341 * by the memslot. This means we have to prohibit block size mappings 1342 * for the beginning and end of a non-block aligned and non-block sized 1343 * memory slot (illustrated by the head and tail parts of the 1344 * userspace view above containing pages 'abcde' and 'xyz', 1345 * respectively). 
1346 * 1347 * Note that it doesn't matter if we do the check using the 1348 * userspace_addr or the base_gfn, as both are equally aligned (per 1349 * the check above) and equally sized. 1350 */ 1351 return (hva & ~(map_size - 1)) >= uaddr_start && 1352 (hva & ~(map_size - 1)) + map_size <= uaddr_end; 1353 } 1354 1355 /* 1356 * Check if the given hva is backed by a transparent huge page (THP) and 1357 * whether it can be mapped using block mapping in stage2. If so, adjust 1358 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently 1359 * supported. This will need to be updated to support other THP sizes. 1360 * 1361 * Returns the size of the mapping. 1362 */ 1363 static long 1364 transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, 1365 unsigned long hva, kvm_pfn_t *pfnp, 1366 phys_addr_t *ipap) 1367 { 1368 kvm_pfn_t pfn = *pfnp; 1369 1370 /* 1371 * Make sure the adjustment is done only for THP pages. Also make 1372 * sure that the HVA and IPA are sufficiently aligned and that the 1373 * block map is contained within the memslot. 1374 */ 1375 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { 1376 int sz = get_user_mapping_size(kvm, hva); 1377 1378 if (sz < 0) 1379 return sz; 1380 1381 if (sz < PMD_SIZE) 1382 return PAGE_SIZE; 1383 1384 *ipap &= PMD_MASK; 1385 pfn &= ~(PTRS_PER_PMD - 1); 1386 *pfnp = pfn; 1387 1388 return PMD_SIZE; 1389 } 1390 1391 /* Use page mapping if we cannot use block mapping. */ 1392 return PAGE_SIZE; 1393 } 1394 1395 static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva) 1396 { 1397 unsigned long pa; 1398 1399 if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP)) 1400 return huge_page_shift(hstate_vma(vma)); 1401 1402 if (!(vma->vm_flags & VM_PFNMAP)) 1403 return PAGE_SHIFT; 1404 1405 VM_BUG_ON(is_vm_hugetlb_page(vma)); 1406 1407 pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start); 1408 1409 #ifndef __PAGETABLE_PMD_FOLDED 1410 if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) && 1411 ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start && 1412 ALIGN(hva, PUD_SIZE) <= vma->vm_end) 1413 return PUD_SHIFT; 1414 #endif 1415 1416 if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) && 1417 ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start && 1418 ALIGN(hva, PMD_SIZE) <= vma->vm_end) 1419 return PMD_SHIFT; 1420 1421 return PAGE_SHIFT; 1422 } 1423 1424 /* 1425 * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be 1426 * able to see the page's tags and therefore they must be initialised first. If 1427 * PG_mte_tagged is set, tags have already been initialised. 
1428 * 1429 * The race in the test/set of the PG_mte_tagged flag is handled by: 1430 * - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs 1431 * racing to santise the same page 1432 * - mmap_lock protects between a VM faulting a page in and the VMM performing 1433 * an mprotect() to add VM_MTE 1434 */ 1435 static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, 1436 unsigned long size) 1437 { 1438 unsigned long i, nr_pages = size >> PAGE_SHIFT; 1439 struct page *page = pfn_to_page(pfn); 1440 struct folio *folio = page_folio(page); 1441 1442 if (!kvm_has_mte(kvm)) 1443 return; 1444 1445 if (folio_test_hugetlb(folio)) { 1446 /* Hugetlb has MTE flags set on head page only */ 1447 if (folio_try_hugetlb_mte_tagging(folio)) { 1448 for (i = 0; i < nr_pages; i++, page++) 1449 mte_clear_page_tags(page_address(page)); 1450 folio_set_hugetlb_mte_tagged(folio); 1451 } 1452 return; 1453 } 1454 1455 for (i = 0; i < nr_pages; i++, page++) { 1456 if (try_page_mte_tagging(page)) { 1457 mte_clear_page_tags(page_address(page)); 1458 set_page_mte_tagged(page); 1459 } 1460 } 1461 } 1462 1463 static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) 1464 { 1465 return vma->vm_flags & VM_MTE_ALLOWED; 1466 } 1467 1468 static bool kvm_vma_is_cacheable(struct vm_area_struct *vma) 1469 { 1470 switch (FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(vma->vm_page_prot))) { 1471 case MT_NORMAL_NC: 1472 case MT_DEVICE_nGnRnE: 1473 case MT_DEVICE_nGnRE: 1474 return false; 1475 default: 1476 return true; 1477 } 1478 } 1479 1480 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, 1481 struct kvm_s2_trans *nested, 1482 struct kvm_memory_slot *memslot, unsigned long hva, 1483 bool fault_is_perm) 1484 { 1485 int ret = 0; 1486 bool write_fault, writable, force_pte = false; 1487 bool exec_fault, mte_allowed, is_vma_cacheable; 1488 bool s2_force_noncacheable = false, vfio_allow_any_uc = false; 1489 unsigned long mmu_seq; 1490 phys_addr_t ipa = fault_ipa; 1491 struct kvm *kvm = vcpu->kvm; 1492 struct vm_area_struct *vma; 1493 short vma_shift; 1494 void *memcache; 1495 gfn_t gfn; 1496 kvm_pfn_t pfn; 1497 bool logging_active = memslot_is_logging(memslot); 1498 long vma_pagesize, fault_granule; 1499 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; 1500 struct kvm_pgtable *pgt; 1501 struct page *page; 1502 vm_flags_t vm_flags; 1503 enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED; 1504 1505 if (fault_is_perm) 1506 fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); 1507 write_fault = kvm_is_write_fault(vcpu); 1508 exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); 1509 VM_BUG_ON(write_fault && exec_fault); 1510 1511 if (fault_is_perm && !write_fault && !exec_fault) { 1512 kvm_err("Unexpected L2 read permission error\n"); 1513 return -EFAULT; 1514 } 1515 1516 if (!is_protected_kvm_enabled()) 1517 memcache = &vcpu->arch.mmu_page_cache; 1518 else 1519 memcache = &vcpu->arch.pkvm_memcache; 1520 1521 /* 1522 * Permission faults just need to update the existing leaf entry, 1523 * and so normally don't require allocations from the memcache. The 1524 * only exception to this is when dirty logging is enabled at runtime 1525 * and a write fault needs to collapse a block entry into a table. 
1526 */ 1527 if (!fault_is_perm || (logging_active && write_fault)) { 1528 int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); 1529 1530 if (!is_protected_kvm_enabled()) 1531 ret = kvm_mmu_topup_memory_cache(memcache, min_pages); 1532 else 1533 ret = topup_hyp_memcache(memcache, min_pages); 1534 1535 if (ret) 1536 return ret; 1537 } 1538 1539 /* 1540 * Let's check if we will get back a huge page backed by hugetlbfs, or 1541 * get block mapping for device MMIO region. 1542 */ 1543 mmap_read_lock(current->mm); 1544 vma = vma_lookup(current->mm, hva); 1545 if (unlikely(!vma)) { 1546 kvm_err("Failed to find VMA for hva 0x%lx\n", hva); 1547 mmap_read_unlock(current->mm); 1548 return -EFAULT; 1549 } 1550 1551 /* 1552 * logging_active is guaranteed to never be true for VM_PFNMAP 1553 * memslots. 1554 */ 1555 if (logging_active) { 1556 force_pte = true; 1557 vma_shift = PAGE_SHIFT; 1558 } else { 1559 vma_shift = get_vma_page_shift(vma, hva); 1560 } 1561 1562 switch (vma_shift) { 1563 #ifndef __PAGETABLE_PMD_FOLDED 1564 case PUD_SHIFT: 1565 if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) 1566 break; 1567 fallthrough; 1568 #endif 1569 case CONT_PMD_SHIFT: 1570 vma_shift = PMD_SHIFT; 1571 fallthrough; 1572 case PMD_SHIFT: 1573 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) 1574 break; 1575 fallthrough; 1576 case CONT_PTE_SHIFT: 1577 vma_shift = PAGE_SHIFT; 1578 force_pte = true; 1579 fallthrough; 1580 case PAGE_SHIFT: 1581 break; 1582 default: 1583 WARN_ONCE(1, "Unknown vma_shift %d", vma_shift); 1584 } 1585 1586 vma_pagesize = 1UL << vma_shift; 1587 1588 if (nested) { 1589 unsigned long max_map_size; 1590 1591 max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE; 1592 1593 ipa = kvm_s2_trans_output(nested); 1594 1595 /* 1596 * If we're about to create a shadow stage 2 entry, then we 1597 * can only create a block mapping if the guest stage 2 page 1598 * table uses at least as big a mapping. 1599 */ 1600 max_map_size = min(kvm_s2_trans_size(nested), max_map_size); 1601 1602 /* 1603 * Be careful that if the mapping size falls between 1604 * two host sizes, take the smallest of the two. 1605 */ 1606 if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE) 1607 max_map_size = PMD_SIZE; 1608 else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE) 1609 max_map_size = PAGE_SIZE; 1610 1611 force_pte = (max_map_size == PAGE_SIZE); 1612 vma_pagesize = min(vma_pagesize, (long)max_map_size); 1613 } 1614 1615 /* 1616 * Both the canonical IPA and fault IPA must be hugepage-aligned to 1617 * ensure we find the right PFN and lay down the mapping in the right 1618 * place. 1619 */ 1620 if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) { 1621 fault_ipa &= ~(vma_pagesize - 1); 1622 ipa &= ~(vma_pagesize - 1); 1623 } 1624 1625 gfn = ipa >> PAGE_SHIFT; 1626 mte_allowed = kvm_vma_mte_allowed(vma); 1627 1628 vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; 1629 1630 vm_flags = vma->vm_flags; 1631 1632 is_vma_cacheable = kvm_vma_is_cacheable(vma); 1633 1634 /* Don't use the VMA after the unlock -- it may have vanished */ 1635 vma = NULL; 1636 1637 /* 1638 * Read mmu_invalidate_seq so that KVM can detect if the results of 1639 * vma_lookup() or __kvm_faultin_pfn() become stale prior to 1640 * acquiring kvm->mmu_lock. 1641 * 1642 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs 1643 * with the smp_wmb() in kvm_mmu_invalidate_end(). 
1644 */ 1645 mmu_seq = vcpu->kvm->mmu_invalidate_seq; 1646 mmap_read_unlock(current->mm); 1647 1648 pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, 1649 &writable, &page); 1650 if (pfn == KVM_PFN_ERR_HWPOISON) { 1651 kvm_send_hwpoison_signal(hva, vma_shift); 1652 return 0; 1653 } 1654 if (is_error_noslot_pfn(pfn)) 1655 return -EFAULT; 1656 1657 /* 1658 * Check if this is non-struct page memory PFN, and cannot support 1659 * CMOs. It could potentially be unsafe to access as cachable. 1660 */ 1661 if (vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(pfn)) { 1662 if (is_vma_cacheable) { 1663 /* 1664 * Whilst the VMA owner expects cacheable mapping to this 1665 * PFN, hardware also has to support the FWB and CACHE DIC 1666 * features. 1667 * 1668 * ARM64 KVM relies on kernel VA mapping to the PFN to 1669 * perform cache maintenance as the CMO instructions work on 1670 * virtual addresses. VM_PFNMAP region are not necessarily 1671 * mapped to a KVA and hence the presence of hardware features 1672 * S2FWB and CACHE DIC are mandatory to avoid the need for 1673 * cache maintenance. 1674 */ 1675 if (!kvm_supports_cacheable_pfnmap()) 1676 return -EFAULT; 1677 } else { 1678 /* 1679 * If the page was identified as device early by looking at 1680 * the VMA flags, vma_pagesize is already representing the 1681 * largest quantity we can map. If instead it was mapped 1682 * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE 1683 * and must not be upgraded. 1684 * 1685 * In both cases, we don't let transparent_hugepage_adjust() 1686 * change things at the last minute. 1687 */ 1688 s2_force_noncacheable = true; 1689 } 1690 } else if (logging_active && !write_fault) { 1691 /* 1692 * Only actually map the page as writable if this was a write 1693 * fault. 1694 */ 1695 writable = false; 1696 } 1697 1698 if (exec_fault && s2_force_noncacheable) 1699 return -ENOEXEC; 1700 1701 /* 1702 * Potentially reduce shadow S2 permissions to match the guest's own 1703 * S2. For exec faults, we'd only reach this point if the guest 1704 * actually allowed it (see kvm_s2_handle_perm_fault). 1705 * 1706 * Also encode the level of the original translation in the SW bits 1707 * of the leaf entry as a proxy for the span of that translation. 1708 * This will be retrieved on TLB invalidation from the guest and 1709 * used to limit the invalidation scope if a TTL hint or a range 1710 * isn't provided. 1711 */ 1712 if (nested) { 1713 writable &= kvm_s2_trans_writable(nested); 1714 if (!kvm_s2_trans_readable(nested)) 1715 prot &= ~KVM_PGTABLE_PROT_R; 1716 1717 prot |= kvm_encode_nested_level(nested); 1718 } 1719 1720 kvm_fault_lock(kvm); 1721 pgt = vcpu->arch.hw_mmu->pgt; 1722 if (mmu_invalidate_retry(kvm, mmu_seq)) { 1723 ret = -EAGAIN; 1724 goto out_unlock; 1725 } 1726 1727 /* 1728 * If we are not forced to use page mapping, check if we are 1729 * backed by a THP and thus use block mapping if possible. 
1730 */ 1731 if (vma_pagesize == PAGE_SIZE && !(force_pte || s2_force_noncacheable)) { 1732 if (fault_is_perm && fault_granule > PAGE_SIZE) 1733 vma_pagesize = fault_granule; 1734 else 1735 vma_pagesize = transparent_hugepage_adjust(kvm, memslot, 1736 hva, &pfn, 1737 &fault_ipa); 1738 1739 if (vma_pagesize < 0) { 1740 ret = vma_pagesize; 1741 goto out_unlock; 1742 } 1743 } 1744 1745 if (!fault_is_perm && !s2_force_noncacheable && kvm_has_mte(kvm)) { 1746 /* Check the VMM hasn't introduced a new disallowed VMA */ 1747 if (mte_allowed) { 1748 sanitise_mte_tags(kvm, pfn, vma_pagesize); 1749 } else { 1750 ret = -EFAULT; 1751 goto out_unlock; 1752 } 1753 } 1754 1755 if (writable) 1756 prot |= KVM_PGTABLE_PROT_W; 1757 1758 if (exec_fault) 1759 prot |= KVM_PGTABLE_PROT_X; 1760 1761 if (s2_force_noncacheable) { 1762 if (vfio_allow_any_uc) 1763 prot |= KVM_PGTABLE_PROT_NORMAL_NC; 1764 else 1765 prot |= KVM_PGTABLE_PROT_DEVICE; 1766 } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) && 1767 (!nested || kvm_s2_trans_executable(nested))) { 1768 prot |= KVM_PGTABLE_PROT_X; 1769 } 1770 1771 /* 1772 * Under the premise of getting a FSC_PERM fault, we just need to relax 1773 * permissions only if vma_pagesize equals fault_granule. Otherwise, 1774 * kvm_pgtable_stage2_map() should be called to change block size. 1775 */ 1776 if (fault_is_perm && vma_pagesize == fault_granule) { 1777 /* 1778 * Drop the SW bits in favour of those stored in the 1779 * PTE, which will be preserved. 1780 */ 1781 prot &= ~KVM_NV_GUEST_MAP_SZ; 1782 ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags); 1783 } else { 1784 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize, 1785 __pfn_to_phys(pfn), prot, 1786 memcache, flags); 1787 } 1788 1789 out_unlock: 1790 kvm_release_faultin_page(kvm, page, !!ret, writable); 1791 kvm_fault_unlock(kvm); 1792 1793 /* Mark the page dirty only if the fault is handled successfully */ 1794 if (writable && !ret) 1795 mark_page_dirty_in_slot(kvm, memslot, gfn); 1796 1797 return ret != -EAGAIN ? ret : 0; 1798 } 1799 1800 /* Resolve the access fault by making the page young again. */ 1801 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) 1802 { 1803 enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED; 1804 struct kvm_s2_mmu *mmu; 1805 1806 trace_kvm_access_fault(fault_ipa); 1807 1808 read_lock(&vcpu->kvm->mmu_lock); 1809 mmu = vcpu->arch.hw_mmu; 1810 KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags); 1811 read_unlock(&vcpu->kvm->mmu_lock); 1812 } 1813 1814 /** 1815 * kvm_handle_guest_abort - handles all 2nd stage aborts 1816 * @vcpu: the VCPU pointer 1817 * 1818 * Any abort that gets to the host is almost guaranteed to be caused by a 1819 * missing second stage translation table entry, which can mean that either the 1820 * guest simply needs more memory and we must allocate an appropriate page or it 1821 * can mean that the guest tried to access I/O memory, which is emulated by user 1822 * space. The distinction is based on the IPA causing the fault and whether this 1823 * memory region has been registered as standard RAM by user space. 
1824 */ 1825 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) 1826 { 1827 struct kvm_s2_trans nested_trans, *nested = NULL; 1828 unsigned long esr; 1829 phys_addr_t fault_ipa; /* The address we faulted on */ 1830 phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */ 1831 struct kvm_memory_slot *memslot; 1832 unsigned long hva; 1833 bool is_iabt, write_fault, writable; 1834 gfn_t gfn; 1835 int ret, idx; 1836 1837 /* Synchronous External Abort? */ 1838 if (kvm_vcpu_abt_issea(vcpu)) { 1839 /* 1840 * For RAS the host kernel may handle this abort. 1841 * There is no need to pass the error into the guest. 1842 */ 1843 if (kvm_handle_guest_sea()) 1844 return kvm_inject_serror(vcpu); 1845 1846 return 1; 1847 } 1848 1849 esr = kvm_vcpu_get_esr(vcpu); 1850 1851 /* 1852 * The fault IPA should be reliable at this point as we're not dealing 1853 * with an SEA. 1854 */ 1855 ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); 1856 if (KVM_BUG_ON(ipa == INVALID_GPA, vcpu->kvm)) 1857 return -EFAULT; 1858 1859 is_iabt = kvm_vcpu_trap_is_iabt(vcpu); 1860 1861 if (esr_fsc_is_translation_fault(esr)) { 1862 /* Beyond sanitised PARange (which is the IPA limit) */ 1863 if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) { 1864 kvm_inject_size_fault(vcpu); 1865 return 1; 1866 } 1867 1868 /* Falls between the IPA range and the PARange? */ 1869 if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) { 1870 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); 1871 1872 return kvm_inject_sea(vcpu, is_iabt, fault_ipa); 1873 } 1874 } 1875 1876 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu), 1877 kvm_vcpu_get_hfar(vcpu), fault_ipa); 1878 1879 /* Check the stage-2 fault is trans. fault or write fault */ 1880 if (!esr_fsc_is_translation_fault(esr) && 1881 !esr_fsc_is_permission_fault(esr) && 1882 !esr_fsc_is_access_flag_fault(esr)) { 1883 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", 1884 kvm_vcpu_trap_get_class(vcpu), 1885 (unsigned long)kvm_vcpu_trap_get_fault(vcpu), 1886 (unsigned long)kvm_vcpu_get_esr(vcpu)); 1887 return -EFAULT; 1888 } 1889 1890 idx = srcu_read_lock(&vcpu->kvm->srcu); 1891 1892 /* 1893 * We may have faulted on a shadow stage 2 page table if we are 1894 * running a nested guest. In this case, we have to resolve the L2 1895 * IPA to the L1 IPA first, before knowing what kind of memory should 1896 * back the L1 IPA. 1897 * 1898 * If the shadow stage 2 page table walk faults, then we simply inject 1899 * this to the guest and carry on. 1900 * 1901 * If there are no shadow S2 PTs because S2 is disabled, there is 1902 * nothing to walk and we treat it as a 1:1 before going through the 1903 * canonical translation. 
	 */
	if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu) &&
	    vcpu->arch.hw_mmu->nested_stage2_enabled) {
		u32 esr;

		ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
		if (ret) {
			esr = kvm_s2_trans_esr(&nested_trans);
			kvm_inject_s2_fault(vcpu, esr);
			goto out_unlock;
		}

		ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans);
		if (ret) {
			esr = kvm_s2_trans_esr(&nested_trans);
			kvm_inject_s2_fault(vcpu, esr);
			goto out_unlock;
		}

		ipa = kvm_s2_trans_output(&nested_trans);
		nested = &nested_trans;
	}

	gfn = ipa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(vcpu->kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
	write_fault = kvm_is_write_fault(vcpu);
	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
		/*
		 * The guest has put either its instructions or its page-tables
		 * somewhere it shouldn't have. Userspace won't be able to do
		 * anything about this (there's no syndrome for a start), so
		 * re-inject the abort back into the guest.
		 */
		if (is_iabt) {
			ret = -ENOEXEC;
			goto out;
		}

		if (kvm_vcpu_abt_iss1tw(vcpu)) {
			ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
			goto out_unlock;
		}

		/*
		 * Check for a cache maintenance operation. Since we
		 * ended up here, we know it is outside of any memory
		 * slot. But we can't find out if that is for a device,
		 * or if the guest is just being stupid. The only thing
		 * we know for sure is that this range cannot be cached.
		 *
		 * So let's assume that the guest is just being
		 * cautious, and skip the instruction.
		 */
		if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
			kvm_incr_pc(vcpu);
			ret = 1;
			goto out_unlock;
		}

		/*
		 * The IPA is reported as [MAX:12], so we need to
		 * complement it with the bottom 12 bits from the
		 * faulting VA. This is always 12 bits, irrespective
		 * of the page size.
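		 *
		 * For example, an IPA page reported as 0x80000000 combined
		 * with a faulting VA whose low 12 bits are 0x234 yields an
		 * ipa of 0x80000234.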
		 */
		ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
		ret = io_mem_abort(vcpu, ipa);
		goto out_unlock;
	}

	/* Userspace should not be able to register out-of-bounds IPAs */
	VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu));

	if (esr_fsc_is_access_flag_fault(esr)) {
		handle_access_fault(vcpu, fault_ipa);
		ret = 1;
		goto out_unlock;
	}

	ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
			     esr_fsc_is_permission_fault(esr));
	if (ret == 0)
		ret = 1;
out:
	if (ret == -ENOEXEC)
		ret = kvm_inject_sea_iabt(vcpu, kvm_vcpu_get_hfar(vcpu));
out_unlock:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	return ret;
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	if (!kvm->arch.mmu.pgt)
		return false;

	__unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
			     (range->end - range->start) << PAGE_SHIFT,
			     range->may_block);

	kvm_nested_s2_unmap(kvm, range->may_block);
	return false;
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	u64 size = (range->end - range->start) << PAGE_SHIFT;

	if (!kvm->arch.mmu.pgt)
		return false;

	return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
							       range->start << PAGE_SHIFT,
							       size, true);
	/*
	 * TODO: Handle nested_mmu structures here using the reverse mapping in
	 * a later version of the patch series.
	 */
}

bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	u64 size = (range->end - range->start) << PAGE_SHIFT;

	if (!kvm->arch.mmu.pgt)
		return false;

	return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
							       range->start << PAGE_SHIFT,
							       size, false);
}

phys_addr_t kvm_mmu_get_httbr(void)
{
	return __pa(hyp_pgtable->pgd);
}

phys_addr_t kvm_get_idmap_vector(void)
{
	return hyp_idmap_vector;
}

static int kvm_map_idmap_text(void)
{
	unsigned long size = hyp_idmap_end - hyp_idmap_start;
	int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
					PAGE_HYP_EXEC);
	if (err)
		kvm_err("Failed to idmap %lx-%lx\n",
			hyp_idmap_start, hyp_idmap_end);

	return err;
}

static void *kvm_hyp_zalloc_page(void *arg)
{
	return (void *)get_zeroed_page(GFP_KERNEL);
}

static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
	.zalloc_page		= kvm_hyp_zalloc_page,
	.get_page		= kvm_host_get_page,
	.put_page		= kvm_host_put_page,
	.phys_to_virt		= kvm_host_va,
	.virt_to_phys		= kvm_host_pa,
};

int __init kvm_mmu_init(u32 *hyp_va_bits)
{
	int err;
	u32 idmap_bits;
	u32 kernel_bits;

	hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
	hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
	hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);

	/*
	 * We rely on the linker script to ensure at build time that the HYP
	 * init code does not cross a page boundary.
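	 *
	 * If that holds, hyp_idmap_start and (hyp_idmap_end - 1) live on the
	 * same page, so XORing them leaves no bits set above the page offset
	 * and the check below passes.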
	 */
	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);

	/*
	 * The ID map is always configured for 48 bits of translation, which
	 * may be fewer than the number of VA bits used by the regular kernel
	 * stage 1, when VA_BITS=52.
	 *
	 * At EL2, there is only one TTBR register, and we can't switch between
	 * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom
	 * line: we need to use the extended range with *both* our translation
	 * tables.
	 *
	 * So use the maximum of the idmap VA bits and the regular kernel stage
	 * 1 VA bits to ensure that the hypervisor can both ID map its code page
	 * and map any kernel memory.
	 */
	idmap_bits = IDMAP_VA_BITS;
	kernel_bits = vabits_actual;
	*hyp_va_bits = max(idmap_bits, kernel_bits);

	kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
	kvm_debug("HYP VA range: %lx:%lx\n",
		  kern_hyp_va(PAGE_OFFSET),
		  kern_hyp_va((unsigned long)high_memory - 1));

	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
	    hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) &&
	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
		/*
		 * The idmap page is intersecting with the VA space;
		 * it is not safe to continue further.
		 */
		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
		err = -EINVAL;
		goto out;
	}

	hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
	if (!hyp_pgtable) {
		kvm_err("Hyp mode page-table not allocated\n");
		err = -ENOMEM;
		goto out;
	}

	err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops);
	if (err)
		goto out_free_pgtable;

	err = kvm_map_idmap_text();
	if (err)
		goto out_destroy_pgtable;

	io_map_base = hyp_idmap_start;
	__hyp_va_bits = *hyp_va_bits;
	return 0;

out_destroy_pgtable:
	kvm_pgtable_hyp_destroy(hyp_pgtable);
out_free_pgtable:
	kfree(hyp_pgtable);
	hyp_pgtable = NULL;
out:
	return err;
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;

	/*
	 * At this point the memslot has been committed and there is an
	 * allocated dirty_bitmap[]; dirty pages will be tracked while the
	 * memory slot is write protected.
	 */
	if (log_dirty_pages) {

		if (change == KVM_MR_DELETE)
			return;

		/*
		 * Huge and normal pages are write-protected and split
		 * on either of these two cases:
		 *
		 * 1. with initial-all-set: gradually with CLEAR ioctls,
		 */
		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
			return;
		/*
		 * or
		 * 2. without initial-all-set: all in one shot when
		 *    enabling dirty logging.
		 */
		kvm_mmu_wp_memory_region(kvm, new->id);
		kvm_mmu_split_memory_region(kvm, new->id);
	} else {
		/*
		 * Free any leftovers from the eager page splitting cache. Do
		 * this when deleting, moving, disabling dirty logging, or
		 * creating the memslot (a nop). Doing it for deletes makes
		 * sure we don't leak memory, and there's no need to keep the
		 * cache around for any of the other cases.
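		 * (Freeing an empty or never-topped-up cache is effectively a
		 * no-op, so this is safe even if eager page splitting was
		 * never used.)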
		 */
		kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
	}
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
				   const struct kvm_memory_slot *old,
				   struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	hva_t hva, reg_end;
	int ret = 0;

	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
	    change != KVM_MR_FLAGS_ONLY)
		return 0;

	/*
	 * Prevent userspace from creating a memory region outside of the IPA
	 * space addressable by the KVM guest.
	 */
	if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
		return -EFAULT;

	hva = new->userspace_addr;
	reg_end = hva + (new->npages << PAGE_SHIFT);

	mmap_read_lock(current->mm);
	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma;

		vma = find_vma_intersection(current->mm, hva, reg_end);
		if (!vma)
			break;

		if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
			ret = -EINVAL;
			break;
		}

		if (vma->vm_flags & VM_PFNMAP) {
			/* IO region dirty page logging not allowed */
			if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
				ret = -EINVAL;
				break;
			}

			/*
			 * Cacheable PFNMAP is allowed only if the hardware
			 * supports it.
			 */
			if (kvm_vma_is_cacheable(vma) && !kvm_supports_cacheable_pfnmap()) {
				ret = -EINVAL;
				break;
			}
		}
		hva = min(reg_end, vma->vm_end);
	} while (hva < reg_end);

	mmap_read_unlock(current->mm);
	return ret;
}

void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
}

void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = slot->npages << PAGE_SHIFT;

	write_lock(&kvm->mmu_lock);
	kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true);
	kvm_nested_s2_unmap(kvm, true);
	write_unlock(&kvm->mmu_lock);
}

/*
 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
 *
 * Main problems:
 * - S/W ops are local to a CPU (not broadcast)
 * - We have line migration behind our back (speculation)
 * - System caches don't support S/W at all (damn!)
 *
 * In the face of the above, the best we can do is to try and convert
 * S/W ops to VA ops. Because the guest is not allowed to infer the
 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
 * which is a rather good thing for us.
 *
 * Also, it is only used when turning caches on/off ("The expected
 * usage of the cache maintenance instructions that operate by set/way
 * is associated with the cache maintenance instructions associated
 * with the powerdown and powerup of caches, if this is required by
 * the implementation.").
 *
 * We use the following policy:
 *
 * - If we trap a S/W operation, we enable VM trapping to detect
 *   caches being turned on/off, and do a full clean.
 *
 * - We flush the caches both when the caches are turned on and when
 *   they are turned off.
 *
 * - Once the caches are enabled, we stop trapping VM ops.
 */
void kvm_set_way_flush(struct kvm_vcpu *vcpu)
{
	unsigned long hcr = *vcpu_hcr(vcpu);

	/*
	 * If this is the first time we do a S/W operation
	 * (i.e. HCR_TVM not set), flush the whole memory and enable
	 * VM trapping.
	 *
	 * Otherwise, rely on the VM trapping to wait for the MMU +
	 * caches to be turned off. At that point, we'll be able to
	 * clean the caches again.
	 */
	if (!(hcr & HCR_TVM)) {
		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
					vcpu_has_cache_enabled(vcpu));
		stage2_flush_vm(vcpu->kvm);
		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
	}
}

void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
{
	bool now_enabled = vcpu_has_cache_enabled(vcpu);

	/*
	 * If switching the MMU+caches on, we need to invalidate the caches.
	 * If switching them off, we need to clean the caches.
	 * Clean + invalidate always does the trick.
	 */
	if (now_enabled != was_enabled)
		stage2_flush_vm(vcpu->kvm);

	/* Caches are now on; stop trapping VM ops (until a S/W op) */
	if (now_enabled)
		*vcpu_hcr(vcpu) &= ~HCR_TVM;

	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
}