1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2012 - Virtual Open Systems and Columbia University 4 * Author: Christoffer Dall <c.dall@virtualopensystems.com> 5 */ 6 7 #include <linux/acpi.h> 8 #include <linux/mman.h> 9 #include <linux/kvm_host.h> 10 #include <linux/io.h> 11 #include <linux/hugetlb.h> 12 #include <linux/sched/signal.h> 13 #include <trace/events/kvm.h> 14 #include <asm/acpi.h> 15 #include <asm/pgalloc.h> 16 #include <asm/cacheflush.h> 17 #include <asm/kvm_arm.h> 18 #include <asm/kvm_mmu.h> 19 #include <asm/kvm_pgtable.h> 20 #include <asm/kvm_pkvm.h> 21 #include <asm/kvm_asm.h> 22 #include <asm/kvm_emulate.h> 23 #include <asm/virt.h> 24 25 #include "trace.h" 26 27 static struct kvm_pgtable *hyp_pgtable; 28 static DEFINE_MUTEX(kvm_hyp_pgd_mutex); 29 30 static unsigned long __ro_after_init hyp_idmap_start; 31 static unsigned long __ro_after_init hyp_idmap_end; 32 static phys_addr_t __ro_after_init hyp_idmap_vector; 33 34 u32 __ro_after_init __hyp_va_bits; 35 36 static unsigned long __ro_after_init io_map_base; 37 38 #define KVM_PGT_FN(fn) (!is_protected_kvm_enabled() ? fn : p ## fn) 39 40 static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end, 41 phys_addr_t size) 42 { 43 phys_addr_t boundary = ALIGN_DOWN(addr + size, size); 44 45 return (boundary - 1 < end - 1) ? boundary : end; 46 } 47 48 static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) 49 { 50 phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); 51 52 return __stage2_range_addr_end(addr, end, size); 53 } 54 55 /* 56 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise, 57 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK, 58 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too 59 * long will also starve other vCPUs. We have to also make sure that the page 60 * tables are not freed while we released the lock. 
61 */ 62 static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, 63 phys_addr_t end, 64 int (*fn)(struct kvm_pgtable *, u64, u64), 65 bool resched) 66 { 67 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 68 int ret; 69 u64 next; 70 71 do { 72 struct kvm_pgtable *pgt = mmu->pgt; 73 if (!pgt) 74 return -EINVAL; 75 76 next = stage2_range_addr_end(addr, end); 77 ret = fn(pgt, addr, next - addr); 78 if (ret) 79 break; 80 81 if (resched && next != end) 82 cond_resched_rwlock_write(&kvm->mmu_lock); 83 } while (addr = next, addr != end); 84 85 return ret; 86 } 87 88 #define stage2_apply_range_resched(mmu, addr, end, fn) \ 89 stage2_apply_range(mmu, addr, end, fn, true) 90 91 /* 92 * Get the maximum number of page-tables pages needed to split a range 93 * of blocks into PAGE_SIZE PTEs. It assumes the range is already 94 * mapped at level 2, or at level 1 if allowed. 95 */ 96 static int kvm_mmu_split_nr_page_tables(u64 range) 97 { 98 int n = 0; 99 100 if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2) 101 n += DIV_ROUND_UP(range, PUD_SIZE); 102 n += DIV_ROUND_UP(range, PMD_SIZE); 103 return n; 104 } 105 106 static bool need_split_memcache_topup_or_resched(struct kvm *kvm) 107 { 108 struct kvm_mmu_memory_cache *cache; 109 u64 chunk_size, min; 110 111 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) 112 return true; 113 114 chunk_size = kvm->arch.mmu.split_page_chunk_size; 115 min = kvm_mmu_split_nr_page_tables(chunk_size); 116 cache = &kvm->arch.mmu.split_page_cache; 117 return kvm_mmu_memory_cache_nr_free_objects(cache) < min; 118 } 119 120 static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr, 121 phys_addr_t end) 122 { 123 struct kvm_mmu_memory_cache *cache; 124 struct kvm_pgtable *pgt; 125 int ret, cache_capacity; 126 u64 next, chunk_size; 127 128 lockdep_assert_held_write(&kvm->mmu_lock); 129 130 chunk_size = kvm->arch.mmu.split_page_chunk_size; 131 cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size); 132 133 if (chunk_size == 0) 134 return 0; 
135 136 cache = &kvm->arch.mmu.split_page_cache; 137 138 do { 139 if (need_split_memcache_topup_or_resched(kvm)) { 140 write_unlock(&kvm->mmu_lock); 141 cond_resched(); 142 /* Eager page splitting is best-effort. */ 143 ret = __kvm_mmu_topup_memory_cache(cache, 144 cache_capacity, 145 cache_capacity); 146 write_lock(&kvm->mmu_lock); 147 if (ret) 148 break; 149 } 150 151 pgt = kvm->arch.mmu.pgt; 152 if (!pgt) 153 return -EINVAL; 154 155 next = __stage2_range_addr_end(addr, end, chunk_size); 156 ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache); 157 if (ret) 158 break; 159 } while (addr = next, addr != end); 160 161 return ret; 162 } 163 164 static bool memslot_is_logging(struct kvm_memory_slot *memslot) 165 { 166 return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); 167 } 168 169 /** 170 * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8 171 * @kvm: pointer to kvm structure. 172 * 173 * Interface to HYP function to flush all VM TLB entries 174 */ 175 int kvm_arch_flush_remote_tlbs(struct kvm *kvm) 176 { 177 if (is_protected_kvm_enabled()) 178 kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); 179 else 180 kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); 181 return 0; 182 } 183 184 int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, 185 gfn_t gfn, u64 nr_pages) 186 { 187 u64 size = nr_pages << PAGE_SHIFT; 188 u64 addr = gfn << PAGE_SHIFT; 189 190 if (is_protected_kvm_enabled()) 191 kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); 192 else 193 kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size); 194 return 0; 195 } 196 197 static void *stage2_memcache_zalloc_page(void *arg) 198 { 199 struct kvm_mmu_memory_cache *mc = arg; 200 void *virt; 201 202 /* Allocated with __GFP_ZERO, so no need to zero */ 203 virt = kvm_mmu_memory_cache_alloc(mc); 204 if (virt) 205 kvm_account_pgtable_pages(virt, 1); 206 return virt; 207 } 208 209 static void *kvm_host_zalloc_pages_exact(size_t size) 
210 { 211 return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); 212 } 213 214 static void *kvm_s2_zalloc_pages_exact(size_t size) 215 { 216 void *virt = kvm_host_zalloc_pages_exact(size); 217 218 if (virt) 219 kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT)); 220 return virt; 221 } 222 223 static void kvm_s2_free_pages_exact(void *virt, size_t size) 224 { 225 kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT)); 226 free_pages_exact(virt, size); 227 } 228 229 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops; 230 231 static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head) 232 { 233 struct page *page = container_of(head, struct page, rcu_head); 234 void *pgtable = page_to_virt(page); 235 s8 level = page_private(page); 236 237 KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level); 238 } 239 240 static void stage2_free_unlinked_table(void *addr, s8 level) 241 { 242 struct page *page = virt_to_page(addr); 243 244 set_page_private(page, (unsigned long)level); 245 call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb); 246 } 247 248 static void kvm_host_get_page(void *addr) 249 { 250 get_page(virt_to_page(addr)); 251 } 252 253 static void kvm_host_put_page(void *addr) 254 { 255 put_page(virt_to_page(addr)); 256 } 257 258 static void kvm_s2_put_page(void *addr) 259 { 260 struct page *p = virt_to_page(addr); 261 /* Dropping last refcount, the page will be freed */ 262 if (page_count(p) == 1) 263 kvm_account_pgtable_pages(addr, -1); 264 put_page(p); 265 } 266 267 static int kvm_host_page_count(void *addr) 268 { 269 return page_count(virt_to_page(addr)); 270 } 271 272 static phys_addr_t kvm_host_pa(void *addr) 273 { 274 return __pa(addr); 275 } 276 277 static void *kvm_host_va(phys_addr_t phys) 278 { 279 return __va(phys); 280 } 281 282 static void clean_dcache_guest_page(void *va, size_t size) 283 { 284 __clean_dcache_guest_page(va, size); 285 } 286 287 static void invalidate_icache_guest_page(void *va, size_t size) 
288 { 289 __invalidate_icache_guest_page(va, size); 290 } 291 292 /* 293 * Unmapping vs dcache management: 294 * 295 * If a guest maps certain memory pages as uncached, all writes will 296 * bypass the data cache and go directly to RAM. However, the CPUs 297 * can still speculate reads (not writes) and fill cache lines with 298 * data. 299 * 300 * Those cache lines will be *clean* cache lines though, so a 301 * clean+invalidate operation is equivalent to an invalidate 302 * operation, because no cache lines are marked dirty. 303 * 304 * Those clean cache lines could be filled prior to an uncached write 305 * by the guest, and the cache coherent IO subsystem would therefore 306 * end up writing old data to disk. 307 * 308 * This is why right after unmapping a page/section and invalidating 309 * the corresponding TLBs, we flush to make sure the IO subsystem will 310 * never hit in the cache. 311 * 312 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as 313 * we then fully enforce cacheability of RAM, no matter what the guest 314 * does. 315 */ 316 /** 317 * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range 318 * @mmu: The KVM stage-2 MMU pointer 319 * @start: The intermediate physical base address of the range to unmap 320 * @size: The size of the area to unmap 321 * @may_block: Whether or not we are permitted to block 322 * 323 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must 324 * be called while holding mmu_lock (unless for freeing the stage2 pgd before 325 * destroying the VM), otherwise another faulting VCPU may come in and mess 326 * with things behind our backs. 
327 */ 328 static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size, 329 bool may_block) 330 { 331 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 332 phys_addr_t end = start + size; 333 334 lockdep_assert_held_write(&kvm->mmu_lock); 335 WARN_ON(size & ~PAGE_MASK); 336 WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap), 337 may_block)); 338 } 339 340 void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, 341 u64 size, bool may_block) 342 { 343 if (kvm_vm_is_protected(kvm_s2_mmu_to_kvm(mmu))) 344 return; 345 346 __unmap_stage2_range(mmu, start, size, may_block); 347 } 348 349 void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) 350 { 351 stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush)); 352 } 353 354 static void stage2_flush_memslot(struct kvm *kvm, 355 struct kvm_memory_slot *memslot) 356 { 357 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; 358 phys_addr_t end = addr + PAGE_SIZE * memslot->npages; 359 360 kvm_stage2_flush_range(&kvm->arch.mmu, addr, end); 361 } 362 363 /** 364 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2 365 * @kvm: The struct kvm pointer 366 * 367 * Go through the stage 2 page tables and invalidate any cache lines 368 * backing memory already mapped to the VM. 
369 */ 370 static void stage2_flush_vm(struct kvm *kvm) 371 { 372 struct kvm_memslots *slots; 373 struct kvm_memory_slot *memslot; 374 int idx, bkt; 375 376 idx = srcu_read_lock(&kvm->srcu); 377 write_lock(&kvm->mmu_lock); 378 379 slots = kvm_memslots(kvm); 380 kvm_for_each_memslot(memslot, bkt, slots) 381 stage2_flush_memslot(kvm, memslot); 382 383 kvm_nested_s2_flush(kvm); 384 385 write_unlock(&kvm->mmu_lock); 386 srcu_read_unlock(&kvm->srcu, idx); 387 } 388 389 /** 390 * free_hyp_pgds - free Hyp-mode page tables 391 */ 392 void __init free_hyp_pgds(void) 393 { 394 mutex_lock(&kvm_hyp_pgd_mutex); 395 if (hyp_pgtable) { 396 kvm_pgtable_hyp_destroy(hyp_pgtable); 397 kfree(hyp_pgtable); 398 hyp_pgtable = NULL; 399 } 400 mutex_unlock(&kvm_hyp_pgd_mutex); 401 } 402 403 static bool kvm_host_owns_hyp_mappings(void) 404 { 405 if (is_kernel_in_hyp_mode()) 406 return false; 407 408 if (static_branch_likely(&kvm_protected_mode_initialized)) 409 return false; 410 411 /* 412 * This can happen at boot time when __create_hyp_mappings() is called 413 * after the hyp protection has been enabled, but the static key has 414 * not been flipped yet. 
415 */ 416 if (!hyp_pgtable && is_protected_kvm_enabled()) 417 return false; 418 419 WARN_ON(!hyp_pgtable); 420 421 return true; 422 } 423 424 int __create_hyp_mappings(unsigned long start, unsigned long size, 425 unsigned long phys, enum kvm_pgtable_prot prot) 426 { 427 int err; 428 429 if (WARN_ON(!kvm_host_owns_hyp_mappings())) 430 return -EINVAL; 431 432 mutex_lock(&kvm_hyp_pgd_mutex); 433 err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot); 434 mutex_unlock(&kvm_hyp_pgd_mutex); 435 436 return err; 437 } 438 439 static phys_addr_t kvm_kaddr_to_phys(void *kaddr) 440 { 441 if (!is_vmalloc_addr(kaddr)) { 442 BUG_ON(!virt_addr_valid(kaddr)); 443 return __pa(kaddr); 444 } else { 445 return page_to_phys(vmalloc_to_page(kaddr)) + 446 offset_in_page(kaddr); 447 } 448 } 449 450 struct hyp_shared_pfn { 451 u64 pfn; 452 int count; 453 struct rb_node node; 454 }; 455 456 static DEFINE_MUTEX(hyp_shared_pfns_lock); 457 static struct rb_root hyp_shared_pfns = RB_ROOT; 458 459 static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node, 460 struct rb_node **parent) 461 { 462 struct hyp_shared_pfn *this; 463 464 *node = &hyp_shared_pfns.rb_node; 465 *parent = NULL; 466 while (**node) { 467 this = container_of(**node, struct hyp_shared_pfn, node); 468 *parent = **node; 469 if (this->pfn < pfn) 470 *node = &((**node)->rb_left); 471 else if (this->pfn > pfn) 472 *node = &((**node)->rb_right); 473 else 474 return this; 475 } 476 477 return NULL; 478 } 479 480 static int share_pfn_hyp(u64 pfn) 481 { 482 struct rb_node **node, *parent; 483 struct hyp_shared_pfn *this; 484 int ret = 0; 485 486 mutex_lock(&hyp_shared_pfns_lock); 487 this = find_shared_pfn(pfn, &node, &parent); 488 if (this) { 489 this->count++; 490 goto unlock; 491 } 492 493 this = kzalloc_obj(*this); 494 if (!this) { 495 ret = -ENOMEM; 496 goto unlock; 497 } 498 499 this->pfn = pfn; 500 this->count = 1; 501 rb_link_node(&this->node, parent, node); 502 rb_insert_color(&this->node, 
&hyp_shared_pfns); 503 ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn); 504 unlock: 505 mutex_unlock(&hyp_shared_pfns_lock); 506 507 return ret; 508 } 509 510 static int unshare_pfn_hyp(u64 pfn) 511 { 512 struct rb_node **node, *parent; 513 struct hyp_shared_pfn *this; 514 int ret = 0; 515 516 mutex_lock(&hyp_shared_pfns_lock); 517 this = find_shared_pfn(pfn, &node, &parent); 518 if (WARN_ON(!this)) { 519 ret = -ENOENT; 520 goto unlock; 521 } 522 523 this->count--; 524 if (this->count) 525 goto unlock; 526 527 rb_erase(&this->node, &hyp_shared_pfns); 528 kfree(this); 529 ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn); 530 unlock: 531 mutex_unlock(&hyp_shared_pfns_lock); 532 533 return ret; 534 } 535 536 int kvm_share_hyp(void *from, void *to) 537 { 538 phys_addr_t start, end, cur; 539 u64 pfn; 540 int ret; 541 542 if (is_kernel_in_hyp_mode()) 543 return 0; 544 545 /* 546 * The share hcall maps things in the 'fixed-offset' region of the hyp 547 * VA space, so we can only share physically contiguous data-structures 548 * for now. 
549 */ 550 if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to)) 551 return -EINVAL; 552 553 if (kvm_host_owns_hyp_mappings()) 554 return create_hyp_mappings(from, to, PAGE_HYP); 555 556 start = ALIGN_DOWN(__pa(from), PAGE_SIZE); 557 end = PAGE_ALIGN(__pa(to)); 558 for (cur = start; cur < end; cur += PAGE_SIZE) { 559 pfn = __phys_to_pfn(cur); 560 ret = share_pfn_hyp(pfn); 561 if (ret) 562 return ret; 563 } 564 565 return 0; 566 } 567 568 void kvm_unshare_hyp(void *from, void *to) 569 { 570 phys_addr_t start, end, cur; 571 u64 pfn; 572 573 if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from) 574 return; 575 576 start = ALIGN_DOWN(__pa(from), PAGE_SIZE); 577 end = PAGE_ALIGN(__pa(to)); 578 for (cur = start; cur < end; cur += PAGE_SIZE) { 579 pfn = __phys_to_pfn(cur); 580 WARN_ON(unshare_pfn_hyp(pfn)); 581 } 582 } 583 584 /** 585 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode 586 * @from: The virtual kernel start address of the range 587 * @to: The virtual kernel end address of the range (exclusive) 588 * @prot: The protection to be applied to this range 589 * 590 * The same virtual address as the kernel virtual address is also used 591 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying 592 * physical pages. 
593 */ 594 int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) 595 { 596 phys_addr_t phys_addr; 597 unsigned long virt_addr; 598 unsigned long start = kern_hyp_va((unsigned long)from); 599 unsigned long end = kern_hyp_va((unsigned long)to); 600 601 if (is_kernel_in_hyp_mode()) 602 return 0; 603 604 if (!kvm_host_owns_hyp_mappings()) 605 return -EPERM; 606 607 start = start & PAGE_MASK; 608 end = PAGE_ALIGN(end); 609 610 for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { 611 int err; 612 613 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); 614 err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr, 615 prot); 616 if (err) 617 return err; 618 } 619 620 return 0; 621 } 622 623 static int __hyp_alloc_private_va_range(unsigned long base) 624 { 625 lockdep_assert_held(&kvm_hyp_pgd_mutex); 626 627 if (!PAGE_ALIGNED(base)) 628 return -EINVAL; 629 630 /* 631 * Verify that BIT(VA_BITS - 1) hasn't been flipped by 632 * allocating the new area, as it would indicate we've 633 * overflowed the idmap/IO address range. 634 */ 635 if ((base ^ io_map_base) & BIT(VA_BITS - 1)) 636 return -ENOMEM; 637 638 io_map_base = base; 639 640 return 0; 641 } 642 643 /** 644 * hyp_alloc_private_va_range - Allocates a private VA range. 645 * @size: The size of the VA range to reserve. 646 * @haddr: The hypervisor virtual start address of the allocation. 647 * 648 * The private virtual address (VA) range is allocated below io_map_base 649 * and aligned based on the order of @size. 650 * 651 * Return: 0 on success or negative error code on failure. 652 */ 653 int hyp_alloc_private_va_range(size_t size, unsigned long *haddr) 654 { 655 unsigned long base; 656 int ret = 0; 657 658 mutex_lock(&kvm_hyp_pgd_mutex); 659 660 /* 661 * This assumes that we have enough space below the idmap 662 * page to allocate our VAs. If not, the check in 663 * __hyp_alloc_private_va_range() will kick. 
A potential 664 * alternative would be to detect that overflow and switch 665 * to an allocation above the idmap. 666 * 667 * The allocated size is always a multiple of PAGE_SIZE. 668 */ 669 size = PAGE_ALIGN(size); 670 base = io_map_base - size; 671 ret = __hyp_alloc_private_va_range(base); 672 673 mutex_unlock(&kvm_hyp_pgd_mutex); 674 675 if (!ret) 676 *haddr = base; 677 678 return ret; 679 } 680 681 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, 682 unsigned long *haddr, 683 enum kvm_pgtable_prot prot) 684 { 685 unsigned long addr; 686 int ret = 0; 687 688 if (!kvm_host_owns_hyp_mappings()) { 689 addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping, 690 phys_addr, size, prot); 691 if (IS_ERR_VALUE(addr)) 692 return addr; 693 *haddr = addr; 694 695 return 0; 696 } 697 698 size = PAGE_ALIGN(size + offset_in_page(phys_addr)); 699 ret = hyp_alloc_private_va_range(size, &addr); 700 if (ret) 701 return ret; 702 703 ret = __create_hyp_mappings(addr, size, phys_addr, prot); 704 if (ret) 705 return ret; 706 707 *haddr = addr + offset_in_page(phys_addr); 708 return ret; 709 } 710 711 int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr) 712 { 713 unsigned long base; 714 size_t size; 715 int ret; 716 717 mutex_lock(&kvm_hyp_pgd_mutex); 718 /* 719 * Efficient stack verification using the NVHE_STACK_SHIFT bit implies 720 * an alignment of our allocation on the order of the size. 721 */ 722 size = NVHE_STACK_SIZE * 2; 723 base = ALIGN_DOWN(io_map_base - size, size); 724 725 ret = __hyp_alloc_private_va_range(base); 726 727 mutex_unlock(&kvm_hyp_pgd_mutex); 728 729 if (ret) { 730 kvm_err("Cannot allocate hyp stack guard page\n"); 731 return ret; 732 } 733 734 /* 735 * Since the stack grows downwards, map the stack to the page 736 * at the higher address and leave the lower guard page 737 * unbacked. 
738 * 739 * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1 740 * and addresses corresponding to the guard page have the 741 * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection. 742 */ 743 ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE, 744 phys_addr, PAGE_HYP); 745 if (ret) 746 kvm_err("Cannot map hyp stack\n"); 747 748 *haddr = base + size; 749 750 return ret; 751 } 752 753 /** 754 * create_hyp_io_mappings - Map IO into both kernel and HYP 755 * @phys_addr: The physical start address which gets mapped 756 * @size: Size of the region being mapped 757 * @kaddr: Kernel VA for this mapping 758 * @haddr: HYP VA for this mapping 759 */ 760 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, 761 void __iomem **kaddr, 762 void __iomem **haddr) 763 { 764 unsigned long addr; 765 int ret; 766 767 if (is_protected_kvm_enabled()) 768 return -EPERM; 769 770 *kaddr = ioremap(phys_addr, size); 771 if (!*kaddr) 772 return -ENOMEM; 773 774 if (is_kernel_in_hyp_mode()) { 775 *haddr = *kaddr; 776 return 0; 777 } 778 779 ret = __create_hyp_private_mapping(phys_addr, size, 780 &addr, PAGE_HYP_DEVICE); 781 if (ret) { 782 iounmap(*kaddr); 783 *kaddr = NULL; 784 *haddr = NULL; 785 return ret; 786 } 787 788 *haddr = (void __iomem *)addr; 789 return 0; 790 } 791 792 /** 793 * create_hyp_exec_mappings - Map an executable range into HYP 794 * @phys_addr: The physical start address which gets mapped 795 * @size: Size of the region being mapped 796 * @haddr: HYP VA for this mapping 797 */ 798 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, 799 void **haddr) 800 { 801 unsigned long addr; 802 int ret; 803 804 BUG_ON(is_kernel_in_hyp_mode()); 805 806 ret = __create_hyp_private_mapping(phys_addr, size, 807 &addr, PAGE_HYP_EXEC); 808 if (ret) { 809 *haddr = NULL; 810 return ret; 811 } 812 813 *haddr = (void *)addr; 814 return 0; 815 } 816 817 static struct kvm_pgtable_mm_ops kvm_user_mm_ops = { 818 /* We shouldn't need any 
other callback to walk the PT */ 819 .phys_to_virt = kvm_host_va, 820 }; 821 822 static int get_user_mapping_size(struct kvm *kvm, u64 addr) 823 { 824 struct kvm_pgtable pgt = { 825 .pgd = (kvm_pteref_t)kvm->mm->pgd, 826 .ia_bits = vabits_actual, 827 .start_level = (KVM_PGTABLE_LAST_LEVEL - 828 ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1), 829 .mm_ops = &kvm_user_mm_ops, 830 }; 831 unsigned long flags; 832 kvm_pte_t pte = 0; /* Keep GCC quiet... */ 833 s8 level = S8_MAX; 834 int ret; 835 836 /* 837 * Disable IRQs so that we hazard against a concurrent 838 * teardown of the userspace page tables (which relies on 839 * IPI-ing threads). 840 */ 841 local_irq_save(flags); 842 ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); 843 local_irq_restore(flags); 844 845 if (ret) 846 return ret; 847 848 /* 849 * Not seeing an error, but not updating level? Something went 850 * deeply wrong... 851 */ 852 if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL)) 853 return -EFAULT; 854 if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL)) 855 return -EFAULT; 856 857 /* Oops, the userspace PTs are gone... 
Replay the fault */ 858 if (!kvm_pte_valid(pte)) 859 return -EAGAIN; 860 861 return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); 862 } 863 864 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { 865 .zalloc_page = stage2_memcache_zalloc_page, 866 .zalloc_pages_exact = kvm_s2_zalloc_pages_exact, 867 .free_pages_exact = kvm_s2_free_pages_exact, 868 .free_unlinked_table = stage2_free_unlinked_table, 869 .get_page = kvm_host_get_page, 870 .put_page = kvm_s2_put_page, 871 .page_count = kvm_host_page_count, 872 .phys_to_virt = kvm_host_va, 873 .virt_to_phys = kvm_host_pa, 874 .dcache_clean_inval_poc = clean_dcache_guest_page, 875 .icache_inval_pou = invalidate_icache_guest_page, 876 }; 877 878 static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type) 879 { 880 u32 kvm_ipa_limit = get_kvm_ipa_limit(); 881 u64 mmfr0, mmfr1; 882 u32 phys_shift; 883 884 phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); 885 if (is_protected_kvm_enabled()) { 886 phys_shift = kvm_ipa_limit; 887 } else if (phys_shift) { 888 if (phys_shift > kvm_ipa_limit || 889 phys_shift < ARM64_MIN_PARANGE_BITS) 890 return -EINVAL; 891 } else { 892 phys_shift = KVM_PHYS_SHIFT; 893 if (phys_shift > kvm_ipa_limit) { 894 pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", 895 current->comm); 896 return -EINVAL; 897 } 898 } 899 900 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); 901 mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); 902 mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); 903 904 return 0; 905 } 906 907 /* 908 * Assume that @pgt is valid and unlinked from the KVM MMU to free the 909 * page-table without taking the kvm_mmu_lock and without performing any 910 * TLB invalidations. 911 * 912 * Also, the range of addresses can be large enough to cause need_resched 913 * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke 914 * cond_resched() periodically to prevent hogging the CPU for a long time 915 * and schedule something else, if required. 
916 */ 917 static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr, 918 phys_addr_t end) 919 { 920 u64 next; 921 922 do { 923 next = stage2_range_addr_end(addr, end); 924 KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr, 925 next - addr); 926 if (next != end) 927 cond_resched(); 928 } while (addr = next, addr != end); 929 } 930 931 static void kvm_stage2_destroy(struct kvm_pgtable *pgt) 932 { 933 unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr); 934 935 stage2_destroy_range(pgt, 0, BIT(ia_bits)); 936 KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt); 937 } 938 939 /** 940 * kvm_init_stage2_mmu - Initialise a S2 MMU structure 941 * @kvm: The pointer to the KVM structure 942 * @mmu: The pointer to the s2 MMU structure 943 * @type: The machine type of the virtual machine 944 * 945 * Allocates only the stage-2 HW PGD level table(s). 946 * Note we don't need locking here as this is only called in two cases: 947 * 948 * - when the VM is created, which can't race against anything 949 * 950 * - when secondary kvm_s2_mmu structures are initialised for NV 951 * guests, and the caller must hold kvm->lock as this is called on a 952 * per-vcpu basis. 953 */ 954 int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) 955 { 956 int cpu, err; 957 struct kvm_pgtable *pgt; 958 959 /* 960 * If we already have our page tables in place, and that the 961 * MMU context is the canonical one, we have a bug somewhere, 962 * as this is only supposed to ever happen once per VM. 963 * 964 * Otherwise, we're building nested page tables, and that's 965 * probably because userspace called KVM_ARM_VCPU_INIT more 966 * than once on the same vcpu. Since that's actually legal, 967 * don't kick a fuss and leave gracefully. 
968 */ 969 if (mmu->pgt != NULL) { 970 if (kvm_is_nested_s2_mmu(kvm, mmu)) 971 return 0; 972 973 kvm_err("kvm_arch already initialized?\n"); 974 return -EINVAL; 975 } 976 977 err = kvm_init_ipa_range(mmu, type); 978 if (err) 979 return err; 980 981 pgt = kzalloc_obj(*pgt, GFP_KERNEL_ACCOUNT); 982 if (!pgt) 983 return -ENOMEM; 984 985 mmu->arch = &kvm->arch; 986 err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops); 987 if (err) 988 goto out_free_pgtable; 989 990 mmu->pgt = pgt; 991 if (is_protected_kvm_enabled()) 992 return 0; 993 994 mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran)); 995 if (!mmu->last_vcpu_ran) { 996 err = -ENOMEM; 997 goto out_destroy_pgtable; 998 } 999 1000 for_each_possible_cpu(cpu) 1001 *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1; 1002 1003 /* The eager page splitting is disabled by default */ 1004 mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; 1005 mmu->split_page_cache.gfp_zero = __GFP_ZERO; 1006 1007 mmu->pgd_phys = __pa(pgt->pgd); 1008 1009 if (kvm_is_nested_s2_mmu(kvm, mmu)) 1010 kvm_init_nested_s2_mmu(mmu); 1011 1012 return 0; 1013 1014 out_destroy_pgtable: 1015 kvm_stage2_destroy(pgt); 1016 mmu->pgt = NULL; 1017 out_free_pgtable: 1018 kfree(pgt); 1019 return err; 1020 } 1021 1022 void kvm_uninit_stage2_mmu(struct kvm *kvm) 1023 { 1024 kvm_free_stage2_pgd(&kvm->arch.mmu); 1025 kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); 1026 } 1027 1028 static void stage2_unmap_memslot(struct kvm *kvm, 1029 struct kvm_memory_slot *memslot) 1030 { 1031 hva_t hva = memslot->userspace_addr; 1032 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; 1033 phys_addr_t size = PAGE_SIZE * memslot->npages; 1034 hva_t reg_end = hva + size; 1035 1036 /* 1037 * A memory region could potentially cover multiple VMAs, and any holes 1038 * between them, so iterate over all of them to find out if we should 1039 * unmap any of them. 
1040 * 1041 * +--------------------------------------------+ 1042 * +---------------+----------------+ +----------------+ 1043 * | : VMA 1 | VMA 2 | | VMA 3 : | 1044 * +---------------+----------------+ +----------------+ 1045 * | memory region | 1046 * +--------------------------------------------+ 1047 */ 1048 do { 1049 struct vm_area_struct *vma; 1050 hva_t vm_start, vm_end; 1051 1052 vma = find_vma_intersection(current->mm, hva, reg_end); 1053 if (!vma) 1054 break; 1055 1056 /* 1057 * Take the intersection of this VMA with the memory region 1058 */ 1059 vm_start = max(hva, vma->vm_start); 1060 vm_end = min(reg_end, vma->vm_end); 1061 1062 if (!(vma->vm_flags & VM_PFNMAP)) { 1063 gpa_t gpa = addr + (vm_start - memslot->userspace_addr); 1064 kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true); 1065 } 1066 hva = vm_end; 1067 } while (hva < reg_end); 1068 } 1069 1070 /** 1071 * stage2_unmap_vm - Unmap Stage-2 RAM mappings 1072 * @kvm: The struct kvm pointer 1073 * 1074 * Go through the memregions and unmap any regular RAM 1075 * backing memory already mapped to the VM. 
1076 */ 1077 void stage2_unmap_vm(struct kvm *kvm) 1078 { 1079 struct kvm_memslots *slots; 1080 struct kvm_memory_slot *memslot; 1081 int idx, bkt; 1082 1083 idx = srcu_read_lock(&kvm->srcu); 1084 mmap_read_lock(current->mm); 1085 write_lock(&kvm->mmu_lock); 1086 1087 slots = kvm_memslots(kvm); 1088 kvm_for_each_memslot(memslot, bkt, slots) 1089 stage2_unmap_memslot(kvm, memslot); 1090 1091 kvm_nested_s2_unmap(kvm, true); 1092 1093 write_unlock(&kvm->mmu_lock); 1094 mmap_read_unlock(current->mm); 1095 srcu_read_unlock(&kvm->srcu, idx); 1096 } 1097 1098 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) 1099 { 1100 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 1101 struct kvm_pgtable *pgt = NULL; 1102 1103 write_lock(&kvm->mmu_lock); 1104 pgt = mmu->pgt; 1105 if (pgt) { 1106 mmu->pgd_phys = 0; 1107 mmu->pgt = NULL; 1108 free_percpu(mmu->last_vcpu_ran); 1109 } 1110 1111 if (kvm_is_nested_s2_mmu(kvm, mmu)) 1112 kvm_init_nested_s2_mmu(mmu); 1113 1114 write_unlock(&kvm->mmu_lock); 1115 1116 if (pgt) { 1117 kvm_stage2_destroy(pgt); 1118 kfree(pgt); 1119 } 1120 } 1121 1122 static void hyp_mc_free_fn(void *addr, void *mc) 1123 { 1124 struct kvm_hyp_memcache *memcache = mc; 1125 1126 if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) 1127 kvm_account_pgtable_pages(addr, -1); 1128 1129 free_page((unsigned long)addr); 1130 } 1131 1132 static void *hyp_mc_alloc_fn(void *mc) 1133 { 1134 struct kvm_hyp_memcache *memcache = mc; 1135 void *addr; 1136 1137 addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); 1138 if (addr && memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) 1139 kvm_account_pgtable_pages(addr, 1); 1140 1141 return addr; 1142 } 1143 1144 void free_hyp_memcache(struct kvm_hyp_memcache *mc) 1145 { 1146 if (!is_protected_kvm_enabled()) 1147 return; 1148 1149 kfree(mc->mapping); 1150 __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc); 1151 } 1152 1153 int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) 1154 { 1155 if (!is_protected_kvm_enabled()) 
1156 return 0; 1157 1158 if (!mc->mapping) { 1159 mc->mapping = kzalloc_obj(struct pkvm_mapping, 1160 GFP_KERNEL_ACCOUNT); 1161 if (!mc->mapping) 1162 return -ENOMEM; 1163 } 1164 1165 return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn, 1166 kvm_host_pa, mc); 1167 } 1168 1169 /** 1170 * kvm_phys_addr_ioremap - map a device range to guest IPA 1171 * 1172 * @kvm: The KVM pointer 1173 * @guest_ipa: The IPA at which to insert the mapping 1174 * @pa: The physical address of the device 1175 * @size: The size of the mapping 1176 * @writable: Whether or not to create a writable mapping 1177 */ 1178 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, 1179 phys_addr_t pa, unsigned long size, bool writable) 1180 { 1181 phys_addr_t addr; 1182 int ret = 0; 1183 struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO }; 1184 struct kvm_s2_mmu *mmu = &kvm->arch.mmu; 1185 struct kvm_pgtable *pgt = mmu->pgt; 1186 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE | 1187 KVM_PGTABLE_PROT_R | 1188 (writable ? 
KVM_PGTABLE_PROT_W : 0); 1189 1190 if (is_protected_kvm_enabled()) 1191 return -EPERM; 1192 1193 size += offset_in_page(guest_ipa); 1194 guest_ipa &= PAGE_MASK; 1195 1196 for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) { 1197 ret = kvm_mmu_topup_memory_cache(&cache, 1198 kvm_mmu_cache_min_pages(mmu)); 1199 if (ret) 1200 break; 1201 1202 write_lock(&kvm->mmu_lock); 1203 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE, 1204 pa, prot, &cache, 0); 1205 write_unlock(&kvm->mmu_lock); 1206 if (ret) 1207 break; 1208 1209 pa += PAGE_SIZE; 1210 } 1211 1212 kvm_mmu_free_memory_cache(&cache); 1213 return ret; 1214 } 1215 1216 /** 1217 * kvm_stage2_wp_range() - write protect stage2 memory region range 1218 * @mmu: The KVM stage-2 MMU pointer 1219 * @addr: Start address of range 1220 * @end: End address of range 1221 */ 1222 void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) 1223 { 1224 stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect)); 1225 } 1226 1227 /** 1228 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot 1229 * @kvm: The KVM pointer 1230 * @slot: The memory slot to write protect 1231 * 1232 * Called to start logging dirty pages after memory region 1233 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns 1234 * all present PUD, PMD and PTEs are write protected in the memory region. 1235 * Afterwards read of dirty page log can be called. 1236 * 1237 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, 1238 * serializing operations for VM memory regions. 
1239 */ 1240 static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) 1241 { 1242 struct kvm_memslots *slots = kvm_memslots(kvm); 1243 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); 1244 phys_addr_t start, end; 1245 1246 if (WARN_ON_ONCE(!memslot)) 1247 return; 1248 1249 start = memslot->base_gfn << PAGE_SHIFT; 1250 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1251 1252 write_lock(&kvm->mmu_lock); 1253 kvm_stage2_wp_range(&kvm->arch.mmu, start, end); 1254 kvm_nested_s2_wp(kvm); 1255 write_unlock(&kvm->mmu_lock); 1256 kvm_flush_remote_tlbs_memslot(kvm, memslot); 1257 } 1258 1259 /** 1260 * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE 1261 * pages for memory slot 1262 * @kvm: The KVM pointer 1263 * @slot: The memory slot to split 1264 * 1265 * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired, 1266 * serializing operations for VM memory regions. 1267 */ 1268 static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot) 1269 { 1270 struct kvm_memslots *slots; 1271 struct kvm_memory_slot *memslot; 1272 phys_addr_t start, end; 1273 1274 lockdep_assert_held(&kvm->slots_lock); 1275 1276 slots = kvm_memslots(kvm); 1277 memslot = id_to_memslot(slots, slot); 1278 1279 start = memslot->base_gfn << PAGE_SHIFT; 1280 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1281 1282 write_lock(&kvm->mmu_lock); 1283 kvm_mmu_split_huge_pages(kvm, start, end); 1284 write_unlock(&kvm->mmu_lock); 1285 } 1286 1287 /* 1288 * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages. 1289 * @kvm: The KVM pointer 1290 * @slot: The memory slot associated with mask 1291 * @gfn_offset: The gfn offset in memory slot 1292 * @mask: The mask of pages at offset 'gfn_offset' in this memory 1293 * slot to enable dirty logging on 1294 * 1295 * Writes protect selected pages to enable dirty logging, and then 1296 * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock. 
1297 */ 1298 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, 1299 struct kvm_memory_slot *slot, 1300 gfn_t gfn_offset, unsigned long mask) 1301 { 1302 phys_addr_t base_gfn = slot->base_gfn + gfn_offset; 1303 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; 1304 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; 1305 1306 lockdep_assert_held_write(&kvm->mmu_lock); 1307 1308 kvm_stage2_wp_range(&kvm->arch.mmu, start, end); 1309 1310 /* 1311 * Eager-splitting is done when manual-protect is set. We 1312 * also check for initially-all-set because we can avoid 1313 * eager-splitting if initially-all-set is false. 1314 * Initially-all-set equal false implies that huge-pages were 1315 * already split when enabling dirty logging: no need to do it 1316 * again. 1317 */ 1318 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) 1319 kvm_mmu_split_huge_pages(kvm, start, end); 1320 1321 kvm_nested_s2_wp(kvm); 1322 } 1323 1324 static void kvm_send_hwpoison_signal(unsigned long address, short lsb) 1325 { 1326 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); 1327 } 1328 1329 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, 1330 unsigned long hva, 1331 unsigned long map_size) 1332 { 1333 gpa_t gpa_start; 1334 hva_t uaddr_start, uaddr_end; 1335 size_t size; 1336 1337 /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */ 1338 if (map_size == PAGE_SIZE) 1339 return true; 1340 1341 /* pKVM only supports PMD_SIZE huge-mappings */ 1342 if (is_protected_kvm_enabled() && map_size != PMD_SIZE) 1343 return false; 1344 1345 size = memslot->npages * PAGE_SIZE; 1346 1347 gpa_start = memslot->base_gfn << PAGE_SHIFT; 1348 1349 uaddr_start = memslot->userspace_addr; 1350 uaddr_end = uaddr_start + size; 1351 1352 /* 1353 * Pages belonging to memslots that don't have the same alignment 1354 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2 1355 * PMD/PUD entries, because we'll 
end up mapping the wrong pages. 1356 * 1357 * Consider a layout like the following: 1358 * 1359 * memslot->userspace_addr: 1360 * +-----+--------------------+--------------------+---+ 1361 * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| 1362 * +-----+--------------------+--------------------+---+ 1363 * 1364 * memslot->base_gfn << PAGE_SHIFT: 1365 * +---+--------------------+--------------------+-----+ 1366 * |abc|def Stage-2 block | Stage-2 block |tvxyz| 1367 * +---+--------------------+--------------------+-----+ 1368 * 1369 * If we create those stage-2 blocks, we'll end up with this incorrect 1370 * mapping: 1371 * d -> f 1372 * e -> g 1373 * f -> h 1374 */ 1375 if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) 1376 return false; 1377 1378 /* 1379 * Next, let's make sure we're not trying to map anything not covered 1380 * by the memslot. This means we have to prohibit block size mappings 1381 * for the beginning and end of a non-block aligned and non-block sized 1382 * memory slot (illustrated by the head and tail parts of the 1383 * userspace view above containing pages 'abcde' and 'xyz', 1384 * respectively). 1385 * 1386 * Note that it doesn't matter if we do the check using the 1387 * userspace_addr or the base_gfn, as both are equally aligned (per 1388 * the check above) and equally sized. 1389 */ 1390 return (hva & ~(map_size - 1)) >= uaddr_start && 1391 (hva & ~(map_size - 1)) + map_size <= uaddr_end; 1392 } 1393 1394 /* 1395 * Check if the given hva is backed by a transparent huge page (THP) and 1396 * whether it can be mapped using block mapping in stage2. If so, adjust 1397 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently 1398 * supported. This will need to be updated to support other THP sizes. 1399 * 1400 * Returns the size of the mapping. 
1401 */ 1402 static long 1403 transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, 1404 unsigned long hva, kvm_pfn_t *pfnp, gfn_t *gfnp) 1405 { 1406 kvm_pfn_t pfn = *pfnp; 1407 gfn_t gfn = *gfnp; 1408 1409 /* 1410 * Make sure the adjustment is done only for THP pages. Also make 1411 * sure that the HVA and IPA are sufficiently aligned and that the 1412 * block map is contained within the memslot. 1413 */ 1414 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { 1415 int sz = get_user_mapping_size(kvm, hva); 1416 1417 if (sz < 0) 1418 return sz; 1419 1420 if (sz < PMD_SIZE) 1421 return PAGE_SIZE; 1422 1423 gfn &= ~(PTRS_PER_PMD - 1); 1424 *gfnp = gfn; 1425 pfn &= ~(PTRS_PER_PMD - 1); 1426 *pfnp = pfn; 1427 1428 return PMD_SIZE; 1429 } 1430 1431 /* Use page mapping if we cannot use block mapping. */ 1432 return PAGE_SIZE; 1433 } 1434 1435 static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva) 1436 { 1437 unsigned long pa; 1438 1439 if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP)) 1440 return huge_page_shift(hstate_vma(vma)); 1441 1442 if (!(vma->vm_flags & VM_PFNMAP)) 1443 return PAGE_SHIFT; 1444 1445 VM_BUG_ON(is_vm_hugetlb_page(vma)); 1446 1447 pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start); 1448 1449 #ifndef __PAGETABLE_PMD_FOLDED 1450 if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) && 1451 ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start && 1452 ALIGN(hva, PUD_SIZE) <= vma->vm_end) 1453 return PUD_SHIFT; 1454 #endif 1455 1456 if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) && 1457 ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start && 1458 ALIGN(hva, PMD_SIZE) <= vma->vm_end) 1459 return PMD_SHIFT; 1460 1461 return PAGE_SHIFT; 1462 } 1463 1464 /* 1465 * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be 1466 * able to see the page's tags and therefore they must be initialised first. If 1467 * PG_mte_tagged is set, tags have already been initialised. 
1468 * 1469 * Must be called with kvm->mmu_lock held to ensure the memory remains mapped 1470 * while the tags are zeroed. 1471 */ 1472 static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, 1473 unsigned long size) 1474 { 1475 unsigned long i, nr_pages = size >> PAGE_SHIFT; 1476 struct page *page = pfn_to_page(pfn); 1477 struct folio *folio = page_folio(page); 1478 1479 if (!kvm_has_mte(kvm)) 1480 return; 1481 1482 if (folio_test_hugetlb(folio)) { 1483 /* Hugetlb has MTE flags set on head page only */ 1484 if (folio_try_hugetlb_mte_tagging(folio)) { 1485 for (i = 0; i < nr_pages; i++, page++) 1486 mte_clear_page_tags(page_address(page)); 1487 folio_set_hugetlb_mte_tagged(folio); 1488 } 1489 return; 1490 } 1491 1492 for (i = 0; i < nr_pages; i++, page++) { 1493 if (try_page_mte_tagging(page)) { 1494 mte_clear_page_tags(page_address(page)); 1495 set_page_mte_tagged(page); 1496 } 1497 } 1498 } 1499 1500 static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) 1501 { 1502 return vma->vm_flags & VM_MTE_ALLOWED; 1503 } 1504 1505 static bool kvm_vma_is_cacheable(struct vm_area_struct *vma) 1506 { 1507 switch (FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(vma->vm_page_prot))) { 1508 case MT_NORMAL_NC: 1509 case MT_DEVICE_nGnRnE: 1510 case MT_DEVICE_nGnRE: 1511 return false; 1512 default: 1513 return true; 1514 } 1515 } 1516 1517 static void *get_mmu_memcache(struct kvm_vcpu *vcpu) 1518 { 1519 if (!is_protected_kvm_enabled()) 1520 return &vcpu->arch.mmu_page_cache; 1521 else 1522 return &vcpu->arch.pkvm_memcache; 1523 } 1524 1525 static int topup_mmu_memcache(struct kvm_vcpu *vcpu, void *memcache) 1526 { 1527 int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); 1528 1529 if (!is_protected_kvm_enabled()) 1530 return kvm_mmu_topup_memory_cache(memcache, min_pages); 1531 1532 return topup_hyp_memcache(memcache, min_pages); 1533 } 1534 1535 /* 1536 * Potentially reduce shadow S2 permissions to match the guest's own S2. 
* For exec faults, we'd only reach this point if the guest actually allowed
 * it (see kvm_s2_handle_perm_fault).
 *
 * Also encode the level of the original translation in the SW bits of the leaf
 * entry as a proxy for the span of that translation. This will be retrieved on
 * TLB invalidation from the guest and used to limit the invalidation scope if a
 * TTL hint or a range isn't provided.
 */
static enum kvm_pgtable_prot adjust_nested_fault_perms(struct kvm_s2_trans *nested,
						       enum kvm_pgtable_prot prot)
{
	if (!kvm_s2_trans_writable(nested))
		prot &= ~KVM_PGTABLE_PROT_W;
	if (!kvm_s2_trans_readable(nested))
		prot &= ~KVM_PGTABLE_PROT_R;

	return prot | kvm_encode_nested_level(nested);
}

/*
 * Drop the EL0 (UX) and/or EL1 (PX) execute permission from @prot when the
 * guest's own stage-2 translation does not grant it.
 */
static enum kvm_pgtable_prot adjust_nested_exec_perms(struct kvm *kvm,
						      struct kvm_s2_trans *nested,
						      enum kvm_pgtable_prot prot)
{
	if (!kvm_s2_trans_exec_el0(kvm, nested))
		prot &= ~KVM_PGTABLE_PROT_UX;
	if (!kvm_s2_trans_exec_el1(kvm, nested))
		prot &= ~KVM_PGTABLE_PROT_PX;

	return prot;
}

/*
 * Read-only description of a stage-2 fault, shared by the abort handlers
 * below.
 */
struct kvm_s2_fault_desc {
	struct kvm_vcpu		*vcpu;		/* faulting vCPU */
	phys_addr_t		fault_ipa;	/* IPA the mapping is installed at */
	struct kvm_s2_trans	*nested;	/* guest S2 translation, or NULL */
	struct kvm_memory_slot	*memslot;	/* memslot backing the fault */
	unsigned long		hva;		/* host VA backing the fault */
};

/*
 * Handle a stage-2 fault whose PFN is obtained through kvm_gmem_get_pfn().
 *
 * A permission fault only relaxes the permissions of the existing entry;
 * any other fault installs a PAGE_SIZE mapping. Returns 0 on success or
 * when the fault simply has to be retried (-EAGAIN is not propagated), and
 * a negative error code otherwise.
 */
static int gmem_abort(const struct kvm_s2_fault_desc *s2fd)
{
	bool write_fault, exec_fault;
	bool perm_fault = kvm_vcpu_trap_is_permission_fault(s2fd->vcpu);
	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
	struct kvm_pgtable *pgt = s2fd->vcpu->arch.hw_mmu->pgt;
	unsigned long mmu_seq;
	struct page *page;
	struct kvm *kvm = s2fd->vcpu->kvm;
	void *memcache = NULL;
	kvm_pfn_t pfn;
	gfn_t gfn;
	int ret;

	/* Only a new mapping consumes the memcache; a perm fault leaves it NULL. */
	if (!perm_fault) {
		memcache = get_mmu_memcache(s2fd->vcpu);
		ret = topup_mmu_memcache(s2fd->vcpu, memcache);
		if (ret)
			return ret;
	}

	/* For a nested fault, the gfn comes from the guest's S2 output address. */
	if (s2fd->nested)
		gfn = kvm_s2_trans_output(s2fd->nested) >> PAGE_SHIFT;
	else
		gfn = s2fd->fault_ipa >> PAGE_SHIFT;

	write_fault = kvm_is_write_fault(s2fd->vcpu);
	exec_fault = kvm_vcpu_trap_is_exec_fault(s2fd->vcpu);

	VM_WARN_ON_ONCE(write_fault && exec_fault);

	/* Snapshot the invalidation sequence before looking up the PFN. */
	mmu_seq = kvm->mmu_invalidate_seq;
	/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
	smp_rmb();

	ret = kvm_gmem_get_pfn(kvm, s2fd->memslot, gfn, &pfn, &page, NULL);
	if (ret) {
		kvm_prepare_memory_fault_exit(s2fd->vcpu, s2fd->fault_ipa, PAGE_SIZE,
					      write_fault, exec_fault, false);
		return ret;
	}

	if (!(s2fd->memslot->flags & KVM_MEM_READONLY))
		prot |= KVM_PGTABLE_PROT_W;

	if (s2fd->nested)
		prot = adjust_nested_fault_perms(s2fd->nested, prot);

	if (exec_fault || cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
		prot |= KVM_PGTABLE_PROT_X;

	if (s2fd->nested)
		prot = adjust_nested_exec_perms(kvm, s2fd->nested, prot);

	kvm_fault_lock(kvm);
	/* An invalidation raced with the PFN lookup: back off and retry. */
	if (mmu_invalidate_retry(kvm, mmu_seq)) {
		ret = -EAGAIN;
		goto out_unlock;
	}

	if (perm_fault) {
		/*
		 * Drop the SW bits in favour of those stored in the
		 * PTE, which will be preserved.
		 */
		prot &= ~KVM_NV_GUEST_MAP_SZ;
		ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, s2fd->fault_ipa,
								 prot, flags);
	} else {
		ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, s2fd->fault_ipa, PAGE_SIZE,
							 __pfn_to_phys(pfn), prot,
							 memcache, flags);
	}

out_unlock:
	/* The page is released as unused if anything above failed. */
	kvm_release_faultin_page(kvm, page, !!ret, prot & KVM_PGTABLE_PROT_W);
	kvm_fault_unlock(kvm);

	if ((prot & KVM_PGTABLE_PROT_W) && !ret)
		mark_page_dirty_in_slot(kvm, s2fd->memslot, gfn);

	return ret != -EAGAIN ? ret : 0;
}

/*
 * State gathered while resolving a fault in user_mem_abort(): filled in by
 * kvm_s2_fault_get_vma_info() (including kvm_s2_resolve_vma_size()) and
 * kvm_s2_fault_pin_pfn(), then consumed by kvm_s2_fault_compute_prot() and
 * kvm_s2_fault_map().
 */
struct kvm_s2_fault_vma_info {
	unsigned long	mmu_seq;		/* mmu_invalidate_seq snapshot */
	long		vma_pagesize;		/* mapping size derived from the VMA */
	vm_flags_t	vm_flags;		/* copy of vma->vm_flags */
	unsigned long	max_map_size;		/* upper bound on the mapping size */
	struct page	*page;			/* page returned by __kvm_faultin_pfn() */
	kvm_pfn_t	pfn;			/* PFN returned by __kvm_faultin_pfn() */
	gfn_t		gfn;			/* fault IPA aligned to vma_pagesize */
	bool		device;			/* PFNMAP/MIXEDMAP PFN that isn't map memory */
	bool		mte_allowed;		/* VMA has VM_MTE_ALLOWED */
	bool		is_vma_cacheable;	/* VMA uses a cacheable attribute */
	bool		map_writable;		/* writable, per __kvm_faultin_pfn() */
	bool		map_non_cacheable;	/* must be mapped NC or Device */
};

/*
 * Fault handler used for protected KVM: pin the page backing the faulting
 * HVA long-term, account it as locked memory and map it RWX at PAGE_SIZE.
 *
 * Returns 0 on success, and also when a hwpoison signal was sent or the map
 * call returned -EAGAIN; a negative error code otherwise. On every path that
 * does not establish the mapping the pin (if taken) and the locked-vm
 * accounting are undone.
 */
static int pkvm_mem_abort(const struct kvm_s2_fault_desc *s2fd)
{
	unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
	struct kvm_vcpu *vcpu = s2fd->vcpu;
	struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
	struct mm_struct *mm = current->mm;
	struct kvm *kvm = vcpu->kvm;
	void *hyp_memcache;
	struct page *page;
	int ret;

	hyp_memcache = get_mmu_memcache(vcpu);
	ret = topup_mmu_memcache(vcpu, hyp_memcache);
	if (ret)
		return -ENOMEM;

	ret = account_locked_vm(mm, 1, true);
	if (ret)
		return ret;

	mmap_read_lock(mm);
	ret = pin_user_pages(s2fd->hva, 1, flags, &page);
	mmap_read_unlock(mm);

	if (ret == -EHWPOISON) {
		kvm_send_hwpoison_signal(s2fd->hva, PAGE_SHIFT);
		ret = 0;
		goto dec_account;
	} else if (ret != 1) {
		ret = -EFAULT;
		goto dec_account;
	} else if (!folio_test_swapbacked(page_folio(page))) {
		/*
		 * We really can't deal with page-cache pages returned by GUP
		 * because (a) we may trigger writeback of a page for which we
		 * no longer have access and (b) page_mkclean() won't find the
		 * stage-2 mapping in the rmap so we can get out-of-whack with
		 * the filesystem when marking the page dirty during unpinning
		 * (see cc5095747edf ("ext4: don't BUG if someone dirty pages
		 * without asking ext4 first")).
		 *
		 * Ideally we'd just restrict ourselves to anonymous pages, but
		 * we also want to allow memfd (i.e. shmem) pages, so check for
		 * pages backed by swap in the knowledge that the GUP pin will
		 * prevent try_to_unmap() from succeeding.
		 */
		ret = -EIO;
		goto unpin;
	}

	write_lock(&kvm->mmu_lock);
	ret = pkvm_pgtable_stage2_map(pgt, s2fd->fault_ipa, PAGE_SIZE,
				      page_to_phys(page), KVM_PGTABLE_PROT_RWX,
				      hyp_memcache, 0);
	write_unlock(&kvm->mmu_lock);
	if (ret) {
		/* -EAGAIN is not reported as an error, but the pin is still dropped. */
		if (ret == -EAGAIN)
			ret = 0;
		goto unpin;
	}

	/* Success: the pin and the accounting are kept for the mapping. */
	return 0;
unpin:
	unpin_user_pages(&page, 1);
dec_account:
	account_locked_vm(mm, 1, false);
	return ret;
}

/*
 * Work out the page shift to use for the fault described by @s2fd in @vma,
 * and record the largest mapping size allowed in @s2vi->max_map_size.
 *
 * Dirty logging forces PAGE_SIZE. Otherwise the VMA's shift is downgraded
 * step by step (PUD -> PMD -> PAGE) until the memslot supports a block of
 * that size at this HVA. For a nested fault the result is further capped by
 * the size of the guest's own stage-2 translation.
 */
static short kvm_s2_resolve_vma_size(const struct kvm_s2_fault_desc *s2fd,
				     struct kvm_s2_fault_vma_info *s2vi,
				     struct vm_area_struct *vma)
{
	short vma_shift;

	if (memslot_is_logging(s2fd->memslot)) {
		s2vi->max_map_size = PAGE_SIZE;
		vma_shift = PAGE_SHIFT;
	} else {
		s2vi->max_map_size = PUD_SIZE;
		vma_shift = get_vma_page_shift(vma, s2fd->hva);
	}

	switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
	case PUD_SHIFT:
		if (fault_supports_stage2_huge_mapping(s2fd->memslot, s2fd->hva, PUD_SIZE))
			break;
		fallthrough;
#endif
	case CONT_PMD_SHIFT:
		vma_shift = PMD_SHIFT;
		fallthrough;
	case PMD_SHIFT:
		if (fault_supports_stage2_huge_mapping(s2fd->memslot, s2fd->hva, PMD_SIZE))
			break;
		fallthrough;
	case CONT_PTE_SHIFT:
		vma_shift = PAGE_SHIFT;
		s2vi->max_map_size = PAGE_SIZE;
		fallthrough;
	case PAGE_SHIFT:
		break;
	default:
		WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
	}

	if (s2fd->nested) {
		unsigned long max_map_size;

		max_map_size = min(s2vi->max_map_size, PUD_SIZE);

		/*
		 * If we're about to create a shadow stage 2 entry, then we
		 * can only create a block mapping if the guest stage 2 page
		 * table uses at least as big a mapping.
		 */
		max_map_size = min(kvm_s2_trans_size(s2fd->nested), max_map_size);

		/*
		 * Be careful that if the mapping size falls between
		 * two host sizes, take the smallest of the two.
		 */
		if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE)
			max_map_size = PMD_SIZE;
		else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE)
			max_map_size = PAGE_SIZE;

		s2vi->max_map_size = max_map_size;
		vma_shift = min_t(short, vma_shift, __ffs(max_map_size));
	}

	return vma_shift;
}

/* True when the fault being handled is a permission fault. */
static bool kvm_s2_fault_is_perm(const struct kvm_s2_fault_desc *s2fd)
{
	return kvm_vcpu_trap_is_permission_fault(s2fd->vcpu);
}

/*
 * Look up the VMA backing the faulting HVA under mmap_lock and record in
 * @s2vi everything later stages need from it, so that the VMA itself is not
 * used once the lock is dropped. Also snapshots mmu_invalidate_seq.
 *
 * Returns 0 on success, -EFAULT if no VMA covers the HVA.
 */
static int kvm_s2_fault_get_vma_info(const struct kvm_s2_fault_desc *s2fd,
				     struct kvm_s2_fault_vma_info *s2vi)
{
	struct vm_area_struct *vma;
	struct kvm *kvm = s2fd->vcpu->kvm;

	mmap_read_lock(current->mm);
	vma = vma_lookup(current->mm, s2fd->hva);
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", s2fd->hva);
		mmap_read_unlock(current->mm);
		return -EFAULT;
	}

	s2vi->vma_pagesize = BIT(kvm_s2_resolve_vma_size(s2fd, s2vi, vma));

	/*
	 * Both the canonical IPA and fault IPA must be aligned to the
	 * mapping size to ensure we find the right PFN and lay down the
	 * mapping in the right place.
	 */
	s2vi->gfn = ALIGN_DOWN(s2fd->fault_ipa, s2vi->vma_pagesize) >> PAGE_SHIFT;

	s2vi->mte_allowed = kvm_vma_mte_allowed(vma);

	s2vi->vm_flags = vma->vm_flags;

	s2vi->is_vma_cacheable = kvm_vma_is_cacheable(vma);

	/*
	 * Read mmu_invalidate_seq so that KVM can detect if the results of
	 * vma_lookup() or __kvm_faultin_pfn() become stale prior to
	 * acquiring kvm->mmu_lock.
	 *
	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
	 * with the smp_wmb() in kvm_mmu_invalidate_end().
	 */
	s2vi->mmu_seq = kvm->mmu_invalidate_seq;
	mmap_read_unlock(current->mm);

	return 0;
}

/*
 * Return the gfn to use for memslot operations: @s2vi->gfn for a non-nested
 * fault, otherwise the guest S2 output address aligned down to the mapping
 * size.
 */
static gfn_t get_canonical_gfn(const struct kvm_s2_fault_desc *s2fd,
			       const struct kvm_s2_fault_vma_info *s2vi)
{
	phys_addr_t ipa;

	if (!s2fd->nested)
		return s2vi->gfn;

	ipa = kvm_s2_trans_output(s2fd->nested);
	return ALIGN_DOWN(ipa, s2vi->vma_pagesize) >> PAGE_SHIFT;
}

/*
 * Gather the VMA information and fault in the PFN for the fault.
 *
 * Returns 1 when the PFN was obtained and the caller should carry on, 0 when
 * the fault has been dealt with by sending a hwpoison signal, and a negative
 * error code otherwise.
 */
static int kvm_s2_fault_pin_pfn(const struct kvm_s2_fault_desc *s2fd,
				struct kvm_s2_fault_vma_info *s2vi)
{
	int ret;

	ret = kvm_s2_fault_get_vma_info(s2fd, s2vi);
	if (ret)
		return ret;

	s2vi->pfn = __kvm_faultin_pfn(s2fd->memslot, get_canonical_gfn(s2fd, s2vi),
				      kvm_is_write_fault(s2fd->vcpu) ? FOLL_WRITE : 0,
				      &s2vi->map_writable, &s2vi->page);
	if (unlikely(is_error_noslot_pfn(s2vi->pfn))) {
		if (s2vi->pfn == KVM_PFN_ERR_HWPOISON) {
			kvm_send_hwpoison_signal(s2fd->hva, __ffs(s2vi->vma_pagesize));
			return 0;
		}
		return -EFAULT;
	}

	/*
	 * Check if this is non-struct page memory PFN, and cannot support
	 * CMOs. It could potentially be unsafe to access as cacheable.
	 */
	if (s2vi->vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(s2vi->pfn)) {
		if (s2vi->is_vma_cacheable) {
			/*
			 * Whilst the VMA owner expects cacheable mapping to this
			 * PFN, hardware also has to support the FWB and CACHE DIC
			 * features.
			 *
			 * ARM64 KVM relies on kernel VA mapping to the PFN to
			 * perform cache maintenance as the CMO instructions work on
			 * virtual addresses. VM_PFNMAP region are not necessarily
			 * mapped to a KVA and hence the presence of hardware features
			 * S2FWB and CACHE DIC are mandatory to avoid the need for
			 * cache maintenance.
			 */
			if (!kvm_supports_cacheable_pfnmap()) {
				kvm_release_faultin_page(s2fd->vcpu->kvm, s2vi->page, true, false);
				return -EFAULT;
			}
		} else {
			/*
			 * If the page was identified as device early by looking at
			 * the VMA flags, vma_pagesize is already representing the
			 * largest quantity we can map.  If instead it was mapped
			 * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
			 * and must not be upgraded.
			 *
			 * In both cases, we don't let transparent_hugepage_adjust()
			 * change things at the last minute.
			 */
			s2vi->map_non_cacheable = true;
		}

		s2vi->device = true;
	}

	return 1;
}

/*
 * Compute the stage-2 permissions and memory attributes for the mapping and
 * store them in @prot.
 *
 * Returns 0 on success, 1 when an exclusive/atomic abort has been injected
 * back into the guest instead, -ENOEXEC for an exec fault on a mapping that
 * must be non-cacheable, and -EFAULT when MTE is in use but the VMA does not
 * allow it.
 */
static int kvm_s2_fault_compute_prot(const struct kvm_s2_fault_desc *s2fd,
				     const struct kvm_s2_fault_vma_info *s2vi,
				     enum kvm_pgtable_prot *prot)
{
	struct kvm *kvm = s2fd->vcpu->kvm;

	if (kvm_vcpu_trap_is_exec_fault(s2fd->vcpu) && s2vi->map_non_cacheable)
		return -ENOEXEC;

	/*
	 * Guest performs atomic/exclusive operations on memory with unsupported
	 * attributes (e.g. ld64b/st64b on normal memory when no FEAT_LS64WB)
	 * and trigger the exception here. Since the memslot is valid, inject
	 * the fault back to the guest.
	 */
	if (esr_fsc_is_excl_atomic_fault(kvm_vcpu_get_esr(s2fd->vcpu))) {
		kvm_inject_dabt_excl_atomic(s2fd->vcpu, kvm_vcpu_get_hfar(s2fd->vcpu));
		return 1;
	}

	*prot = KVM_PGTABLE_PROT_R;

	/*
	 * With dirty logging on a non-device mapping, write permission is only
	 * granted on an actual write fault.
	 */
	if (s2vi->map_writable && (s2vi->device ||
				   !memslot_is_logging(s2fd->memslot) ||
				   kvm_is_write_fault(s2fd->vcpu)))
		*prot |= KVM_PGTABLE_PROT_W;

	if (s2fd->nested)
		*prot = adjust_nested_fault_perms(s2fd->nested, *prot);

	if (kvm_vcpu_trap_is_exec_fault(s2fd->vcpu))
		*prot |= KVM_PGTABLE_PROT_X;

	if (s2vi->map_non_cacheable)
		*prot |= (s2vi->vm_flags & VM_ALLOW_ANY_UNCACHED) ?
			 KVM_PGTABLE_PROT_NORMAL_NC : KVM_PGTABLE_PROT_DEVICE;
	else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
		*prot |= KVM_PGTABLE_PROT_X;

	if (s2fd->nested)
		*prot = adjust_nested_exec_perms(kvm, s2fd->nested, *prot);

	if (!kvm_s2_fault_is_perm(s2fd) && !s2vi->map_non_cacheable && kvm_has_mte(kvm)) {
		/* Check the VMM hasn't introduced a new disallowed VMA */
		if (!s2vi->mte_allowed)
			return -EFAULT;
	}

	return 0;
}

/*
 * Install (or relax the permissions of) the stage-2 mapping for the fault,
 * under the fault lock, then release the faulted-in page.
 *
 * Returns 0 on success or when the fault has to be retried (-EAGAIN is not
 * propagated), a negative error code otherwise.
 */
static int kvm_s2_fault_map(const struct kvm_s2_fault_desc *s2fd,
			    const struct kvm_s2_fault_vma_info *s2vi,
			    enum kvm_pgtable_prot prot,
			    void *memcache)
{
	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
	bool writable = prot & KVM_PGTABLE_PROT_W;
	struct kvm *kvm = s2fd->vcpu->kvm;
	struct kvm_pgtable *pgt;
	long perm_fault_granule;
	long mapping_size;
	kvm_pfn_t pfn;
	gfn_t gfn;
	int ret;

	kvm_fault_lock(kvm);
	pgt = s2fd->vcpu->arch.hw_mmu->pgt;
	/* Bail out with -EAGAIN if an invalidation happened since the snapshot. */
	ret = -EAGAIN;
	if (mmu_invalidate_retry(kvm, s2vi->mmu_seq))
		goto out_unlock;

	/* Zero unless this is a permission fault. */
	perm_fault_granule = (kvm_s2_fault_is_perm(s2fd) ?
			      kvm_vcpu_trap_get_perm_fault_granule(s2fd->vcpu) : 0);
	mapping_size = s2vi->vma_pagesize;
	pfn = s2vi->pfn;
	gfn = s2vi->gfn;

	/*
	 * If we are not forced to use page mapping, check if we are
	 * backed by a THP and thus use block mapping if possible.
	 */
	if (mapping_size == PAGE_SIZE &&
	    !(s2vi->max_map_size == PAGE_SIZE || s2vi->map_non_cacheable)) {
		if (perm_fault_granule > PAGE_SIZE) {
			mapping_size = perm_fault_granule;
		} else {
			mapping_size = transparent_hugepage_adjust(kvm, s2fd->memslot,
								   s2fd->hva, &pfn,
								   &gfn);
			if (mapping_size < 0) {
				ret = mapping_size;
				goto out_unlock;
			}
		}
	}

	if (!perm_fault_granule && !s2vi->map_non_cacheable && kvm_has_mte(kvm))
		sanitise_mte_tags(kvm, pfn, mapping_size);

	/*
	 * Under the premise of getting a FSC_PERM fault, we just need to relax
	 * permissions only if mapping_size equals perm_fault_granule. Otherwise,
	 * kvm_pgtable_stage2_map() should be called to change block size.
	 */
	if (mapping_size == perm_fault_granule) {
		/*
		 * Drop the SW bits in favour of those stored in the
		 * PTE, which will be preserved.
		 */
		prot &= ~KVM_NV_GUEST_MAP_SZ;
		ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, gfn_to_gpa(gfn),
								 prot, flags);
	} else {
		ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, gfn_to_gpa(gfn), mapping_size,
							 __pfn_to_phys(pfn), prot,
							 memcache, flags);
	}

out_unlock:
	/* The page is released as unused if anything above failed. */
	kvm_release_faultin_page(kvm, s2vi->page, !!ret, writable);
	kvm_fault_unlock(kvm);

	/*
	 * Mark the page dirty only if the fault is handled successfully,
	 * making sure we adjust the canonical IPA if the mapping size has
	 * been updated (via a THP upgrade, for example).
	 */
	if (writable && !ret) {
		phys_addr_t ipa = gfn_to_gpa(get_canonical_gfn(s2fd, s2vi));
		ipa &= ~(mapping_size - 1);
		mark_page_dirty_in_slot(kvm, s2fd->memslot, gpa_to_gfn(ipa));
	}

	if (ret != -EAGAIN)
		return ret;
	return 0;
}

/*
 * Handle a stage-2 fault on memory backed by a userspace mapping: top up the
 * memcache if needed, fault in the PFN, compute the permissions and install
 * the mapping. Returns whatever the first stage that does not ask to carry
 * on returns.
 */
static int user_mem_abort(const struct kvm_s2_fault_desc *s2fd)
{
	bool perm_fault = kvm_vcpu_trap_is_permission_fault(s2fd->vcpu);
	struct kvm_s2_fault_vma_info s2vi = {};
	enum kvm_pgtable_prot prot;
	void *memcache;
	int ret;

	/*
	 * Permission faults just need to update the existing leaf entry,
	 * and so normally don't require allocations from the memcache. The
	 * only exception to this is when dirty logging is enabled at runtime
	 * and a write fault needs to collapse a block entry into a table.
	 */
	memcache = get_mmu_memcache(s2fd->vcpu);
	if (!perm_fault || (memslot_is_logging(s2fd->memslot) &&
			    kvm_is_write_fault(s2fd->vcpu))) {
		ret = topup_mmu_memcache(s2fd->vcpu, memcache);
		if (ret)
			return ret;
	}

	/*
	 * Let's check if we will get back a huge page backed by hugetlbfs, or
	 * get block mapping for device MMIO region.
	 */
	ret = kvm_s2_fault_pin_pfn(s2fd, &s2vi);
	/* Anything other than 1 means the fault is already resolved or failed. */
	if (ret != 1)
		return ret;

	ret = kvm_s2_fault_compute_prot(s2fd, &s2vi, &prot);
	if (ret) {
		/* The page was faulted in but will not be mapped. */
		kvm_release_page_unused(s2vi.page);
		return ret;
	}

	return kvm_s2_fault_map(s2fd, &s2vi, prot, memcache);
}

/* Resolve the access fault by making the page young again.
*/ 2112 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) 2113 { 2114 enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED; 2115 struct kvm_s2_mmu *mmu; 2116 2117 trace_kvm_access_fault(fault_ipa); 2118 2119 read_lock(&vcpu->kvm->mmu_lock); 2120 mmu = vcpu->arch.hw_mmu; 2121 KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags); 2122 read_unlock(&vcpu->kvm->mmu_lock); 2123 } 2124 2125 /* 2126 * Returns true if the SEA should be handled locally within KVM if the abort 2127 * is caused by a kernel memory allocation (e.g. stage-2 table memory). 2128 */ 2129 static bool host_owns_sea(struct kvm_vcpu *vcpu, u64 esr) 2130 { 2131 /* 2132 * Without FEAT_RAS HCR_EL2.TEA is RES0, meaning any external abort 2133 * taken from a guest EL to EL2 is due to a host-imposed access (e.g. 2134 * stage-2 PTW). 2135 */ 2136 if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) 2137 return true; 2138 2139 /* KVM owns the VNCR when the vCPU isn't in a nested context. */ 2140 if (is_hyp_ctxt(vcpu) && !kvm_vcpu_trap_is_iabt(vcpu) && (esr & ESR_ELx_VNCR)) 2141 return true; 2142 2143 /* 2144 * Determining if an external abort during a table walk happened at 2145 * stage-2 is only possible with S1PTW is set. Otherwise, since KVM 2146 * sets HCR_EL2.TEA, SEAs due to a stage-1 walk (i.e. accessing the 2147 * PA of the stage-1 descriptor) can reach here and are reported 2148 * with a TTW ESR value. 2149 */ 2150 return (esr_fsc_is_sea_ttw(esr) && (esr & ESR_ELx_S1PTW)); 2151 } 2152 2153 int kvm_handle_guest_sea(struct kvm_vcpu *vcpu) 2154 { 2155 struct kvm *kvm = vcpu->kvm; 2156 struct kvm_run *run = vcpu->run; 2157 u64 esr = kvm_vcpu_get_esr(vcpu); 2158 u64 esr_mask = ESR_ELx_EC_MASK | 2159 ESR_ELx_IL | 2160 ESR_ELx_FnV | 2161 ESR_ELx_EA | 2162 ESR_ELx_CM | 2163 ESR_ELx_WNR | 2164 ESR_ELx_FSC; 2165 u64 ipa; 2166 2167 /* 2168 * Give APEI the opportunity to claim the abort before handling it 2169 * within KVM. 
apei_claim_sea() expects to be called with IRQs enabled. 2170 */ 2171 lockdep_assert_irqs_enabled(); 2172 if (apei_claim_sea(NULL) == 0) 2173 return 1; 2174 2175 if (host_owns_sea(vcpu, esr) || 2176 !test_bit(KVM_ARCH_FLAG_EXIT_SEA, &vcpu->kvm->arch.flags)) 2177 return kvm_inject_serror(vcpu); 2178 2179 /* ESR_ELx.SET is RES0 when FEAT_RAS isn't implemented. */ 2180 if (kvm_has_ras(kvm)) 2181 esr_mask |= ESR_ELx_SET_MASK; 2182 2183 /* 2184 * Exit to userspace, and provide faulting guest virtual and physical 2185 * addresses in case userspace wants to emulate SEA to guest by 2186 * writing to FAR_ELx and HPFAR_ELx registers. 2187 */ 2188 memset(&run->arm_sea, 0, sizeof(run->arm_sea)); 2189 run->exit_reason = KVM_EXIT_ARM_SEA; 2190 run->arm_sea.esr = esr & esr_mask; 2191 2192 if (!(esr & ESR_ELx_FnV)) 2193 run->arm_sea.gva = kvm_vcpu_get_hfar(vcpu); 2194 2195 ipa = kvm_vcpu_get_fault_ipa(vcpu); 2196 if (ipa != INVALID_GPA) { 2197 run->arm_sea.flags |= KVM_EXIT_ARM_SEA_FLAG_GPA_VALID; 2198 run->arm_sea.gpa = ipa; 2199 } 2200 2201 return 0; 2202 } 2203 2204 /** 2205 * kvm_handle_guest_abort - handles all 2nd stage aborts 2206 * @vcpu: the VCPU pointer 2207 * 2208 * Any abort that gets to the host is almost guaranteed to be caused by a 2209 * missing second stage translation table entry, which can mean that either the 2210 * guest simply needs more memory and we must allocate an appropriate page or it 2211 * can mean that the guest tried to access I/O memory, which is emulated by user 2212 * space. The distinction is based on the IPA causing the fault and whether this 2213 * memory region has been registered as standard RAM by user space. 
2214 */ 2215 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) 2216 { 2217 struct kvm_s2_trans nested_trans, *nested = NULL; 2218 unsigned long esr; 2219 phys_addr_t fault_ipa; /* The address we faulted on */ 2220 phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */ 2221 struct kvm_memory_slot *memslot; 2222 unsigned long hva; 2223 bool is_iabt, write_fault, writable; 2224 gfn_t gfn; 2225 int ret, idx; 2226 2227 if (kvm_vcpu_abt_issea(vcpu)) 2228 return kvm_handle_guest_sea(vcpu); 2229 2230 esr = kvm_vcpu_get_esr(vcpu); 2231 2232 /* 2233 * The fault IPA should be reliable at this point as we're not dealing 2234 * with an SEA. 2235 */ 2236 ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); 2237 if (KVM_BUG_ON(ipa == INVALID_GPA, vcpu->kvm)) 2238 return -EFAULT; 2239 2240 is_iabt = kvm_vcpu_trap_is_iabt(vcpu); 2241 2242 if (esr_fsc_is_translation_fault(esr)) { 2243 /* Beyond sanitised PARange (which is the IPA limit) */ 2244 if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) { 2245 kvm_inject_size_fault(vcpu); 2246 return 1; 2247 } 2248 2249 /* Falls between the IPA range and the PARange? */ 2250 if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) { 2251 fault_ipa |= FAR_TO_FIPA_OFFSET(kvm_vcpu_get_hfar(vcpu)); 2252 2253 return kvm_inject_sea(vcpu, is_iabt, fault_ipa); 2254 } 2255 } 2256 2257 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu), 2258 kvm_vcpu_get_hfar(vcpu), fault_ipa); 2259 2260 /* Check the stage-2 fault is trans. 
fault or write fault */ 2261 if (!esr_fsc_is_translation_fault(esr) && 2262 !esr_fsc_is_permission_fault(esr) && 2263 !esr_fsc_is_access_flag_fault(esr) && 2264 !esr_fsc_is_excl_atomic_fault(esr)) { 2265 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", 2266 kvm_vcpu_trap_get_class(vcpu), 2267 (unsigned long)kvm_vcpu_trap_get_fault(vcpu), 2268 (unsigned long)kvm_vcpu_get_esr(vcpu)); 2269 return -EFAULT; 2270 } 2271 2272 idx = srcu_read_lock(&vcpu->kvm->srcu); 2273 2274 /* 2275 * We may have faulted on a shadow stage 2 page table if we are 2276 * running a nested guest. In this case, we have to resolve the L2 2277 * IPA to the L1 IPA first, before knowing what kind of memory should 2278 * back the L1 IPA. 2279 * 2280 * If the shadow stage 2 page table walk faults, then we simply inject 2281 * this to the guest and carry on. 2282 * 2283 * If there are no shadow S2 PTs because S2 is disabled, there is 2284 * nothing to walk and we treat it as a 1:1 before going through the 2285 * canonical translation. 
2286 */ 2287 if (kvm_is_nested_s2_mmu(vcpu->kvm,vcpu->arch.hw_mmu) && 2288 vcpu->arch.hw_mmu->nested_stage2_enabled) { 2289 u32 esr; 2290 2291 ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans); 2292 if (ret == -EAGAIN) { 2293 ret = 1; 2294 goto out_unlock; 2295 } 2296 2297 if (ret) { 2298 esr = kvm_s2_trans_esr(&nested_trans); 2299 kvm_inject_s2_fault(vcpu, esr); 2300 goto out_unlock; 2301 } 2302 2303 ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans); 2304 if (ret) { 2305 esr = kvm_s2_trans_esr(&nested_trans); 2306 kvm_inject_s2_fault(vcpu, esr); 2307 goto out_unlock; 2308 } 2309 2310 ipa = kvm_s2_trans_output(&nested_trans); 2311 nested = &nested_trans; 2312 } 2313 2314 gfn = ipa >> PAGE_SHIFT; 2315 memslot = gfn_to_memslot(vcpu->kvm, gfn); 2316 hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); 2317 write_fault = kvm_is_write_fault(vcpu); 2318 if (kvm_is_error_hva(hva) || (write_fault && !writable)) { 2319 /* 2320 * The guest has put either its instructions or its page-tables 2321 * somewhere it shouldn't have. Userspace won't be able to do 2322 * anything about this (there's no syndrome for a start), so 2323 * re-inject the abort back into the guest. 2324 */ 2325 if (is_iabt) { 2326 ret = -ENOEXEC; 2327 goto out; 2328 } 2329 2330 if (kvm_vcpu_abt_iss1tw(vcpu)) { 2331 ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); 2332 goto out_unlock; 2333 } 2334 2335 /* 2336 * Check for a cache maintenance operation. Since we 2337 * ended-up here, we know it is outside of any memory 2338 * slot. But we can't find out if that is for a device, 2339 * or if the guest is just being stupid. The only thing 2340 * we know for sure is that this range cannot be cached. 2341 * 2342 * So let's assume that the guest is just being 2343 * cautious, and skip the instruction. 
2344 */ 2345 if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) { 2346 kvm_incr_pc(vcpu); 2347 ret = 1; 2348 goto out_unlock; 2349 } 2350 2351 /* 2352 * The IPA is reported as [MAX:12], so we need to 2353 * complement it with the bottom 12 bits from the 2354 * faulting VA. This is always 12 bits, irrespective 2355 * of the page size. 2356 */ 2357 ipa |= FAR_TO_FIPA_OFFSET(kvm_vcpu_get_hfar(vcpu)); 2358 ret = io_mem_abort(vcpu, ipa); 2359 goto out_unlock; 2360 } 2361 2362 /* Userspace should not be able to register out-of-bounds IPAs */ 2363 VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu)); 2364 2365 if (esr_fsc_is_access_flag_fault(esr)) { 2366 handle_access_fault(vcpu, fault_ipa); 2367 ret = 1; 2368 goto out_unlock; 2369 } 2370 2371 const struct kvm_s2_fault_desc s2fd = { 2372 .vcpu = vcpu, 2373 .fault_ipa = fault_ipa, 2374 .nested = nested, 2375 .memslot = memslot, 2376 .hva = hva, 2377 }; 2378 2379 if (kvm_vm_is_protected(vcpu->kvm)) { 2380 ret = pkvm_mem_abort(&s2fd); 2381 } else { 2382 VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) && 2383 !write_fault && 2384 !kvm_vcpu_trap_is_exec_fault(vcpu)); 2385 2386 if (kvm_slot_has_gmem(memslot)) 2387 ret = gmem_abort(&s2fd); 2388 else 2389 ret = user_mem_abort(&s2fd); 2390 } 2391 2392 if (ret == 0) 2393 ret = 1; 2394 out: 2395 if (ret == -ENOEXEC) 2396 ret = kvm_inject_sea_iabt(vcpu, kvm_vcpu_get_hfar(vcpu)); 2397 out_unlock: 2398 srcu_read_unlock(&vcpu->kvm->srcu, idx); 2399 return ret; 2400 } 2401 2402 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) 2403 { 2404 if (!kvm->arch.mmu.pgt || kvm_vm_is_protected(kvm)) 2405 return false; 2406 2407 __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT, 2408 (range->end - range->start) << PAGE_SHIFT, 2409 range->may_block); 2410 2411 kvm_nested_s2_unmap(kvm, range->may_block); 2412 return false; 2413 } 2414 2415 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 2416 { 2417 u64 size = (range->end - range->start) << 
PAGE_SHIFT; 2418 2419 if (!kvm->arch.mmu.pgt || kvm_vm_is_protected(kvm)) 2420 return false; 2421 2422 return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, 2423 range->start << PAGE_SHIFT, 2424 size, true); 2425 /* 2426 * TODO: Handle nested_mmu structures here using the reverse mapping in 2427 * a later version of patch series. 2428 */ 2429 } 2430 2431 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 2432 { 2433 u64 size = (range->end - range->start) << PAGE_SHIFT; 2434 2435 if (!kvm->arch.mmu.pgt || kvm_vm_is_protected(kvm)) 2436 return false; 2437 2438 return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, 2439 range->start << PAGE_SHIFT, 2440 size, false); 2441 } 2442 2443 phys_addr_t kvm_mmu_get_httbr(void) 2444 { 2445 return __pa(hyp_pgtable->pgd); 2446 } 2447 2448 phys_addr_t kvm_get_idmap_vector(void) 2449 { 2450 return hyp_idmap_vector; 2451 } 2452 2453 static int kvm_map_idmap_text(void) 2454 { 2455 unsigned long size = hyp_idmap_end - hyp_idmap_start; 2456 int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start, 2457 PAGE_HYP_EXEC); 2458 if (err) 2459 kvm_err("Failed to idmap %lx-%lx\n", 2460 hyp_idmap_start, hyp_idmap_end); 2461 2462 return err; 2463 } 2464 2465 static void *kvm_hyp_zalloc_page(void *arg) 2466 { 2467 return (void *)get_zeroed_page(GFP_KERNEL); 2468 } 2469 2470 static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = { 2471 .zalloc_page = kvm_hyp_zalloc_page, 2472 .get_page = kvm_host_get_page, 2473 .put_page = kvm_host_put_page, 2474 .phys_to_virt = kvm_host_va, 2475 .virt_to_phys = kvm_host_pa, 2476 }; 2477 2478 int __init kvm_mmu_init(u32 hyp_va_bits) 2479 { 2480 int err; 2481 2482 hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start); 2483 hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); 2484 hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end); 2485 hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE); 2486 hyp_idmap_vector = __pa_symbol(__kvm_hyp_init); 2487 2488 
/* 2489 * We rely on the linker script to ensure at build time that the HYP 2490 * init code does not cross a page boundary. 2491 */ 2492 BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); 2493 2494 kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits); 2495 kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); 2496 kvm_debug("HYP VA range: %lx:%lx\n", 2497 kern_hyp_va(PAGE_OFFSET), 2498 kern_hyp_va((unsigned long)high_memory - 1)); 2499 2500 if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) && 2501 hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) && 2502 hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) { 2503 /* 2504 * The idmap page is intersecting with the VA space, 2505 * it is not safe to continue further. 2506 */ 2507 kvm_err("IDMAP intersecting with HYP VA, unable to continue\n"); 2508 err = -EINVAL; 2509 goto out; 2510 } 2511 2512 hyp_pgtable = kzalloc_obj(*hyp_pgtable); 2513 if (!hyp_pgtable) { 2514 kvm_err("Hyp mode page-table not allocated\n"); 2515 err = -ENOMEM; 2516 goto out; 2517 } 2518 2519 err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits, &kvm_hyp_mm_ops); 2520 if (err) 2521 goto out_free_pgtable; 2522 2523 err = kvm_map_idmap_text(); 2524 if (err) 2525 goto out_destroy_pgtable; 2526 2527 io_map_base = hyp_idmap_start; 2528 __hyp_va_bits = hyp_va_bits; 2529 return 0; 2530 2531 out_destroy_pgtable: 2532 kvm_pgtable_hyp_destroy(hyp_pgtable); 2533 out_free_pgtable: 2534 kfree(hyp_pgtable); 2535 hyp_pgtable = NULL; 2536 out: 2537 return err; 2538 } 2539 2540 void kvm_arch_commit_memory_region(struct kvm *kvm, 2541 struct kvm_memory_slot *old, 2542 const struct kvm_memory_slot *new, 2543 enum kvm_mr_change change) 2544 { 2545 bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES; 2546 2547 /* 2548 * At this point memslot has been committed and there is an 2549 * allocated dirty_bitmap[], dirty pages will be tracked while the 2550 * memory slot is write protected. 
2551 */ 2552 if (log_dirty_pages) { 2553 2554 if (change == KVM_MR_DELETE) 2555 return; 2556 2557 /* 2558 * Huge and normal pages are write-protected and split 2559 * on either of these two cases: 2560 * 2561 * 1. with initial-all-set: gradually with CLEAR ioctls, 2562 */ 2563 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) 2564 return; 2565 /* 2566 * or 2567 * 2. without initial-all-set: all in one shot when 2568 * enabling dirty logging. 2569 */ 2570 kvm_mmu_wp_memory_region(kvm, new->id); 2571 kvm_mmu_split_memory_region(kvm, new->id); 2572 } else { 2573 /* 2574 * Free any leftovers from the eager page splitting cache. Do 2575 * this when deleting, moving, disabling dirty logging, or 2576 * creating the memslot (a nop). Doing it for deletes makes 2577 * sure we don't leak memory, and there's no need to keep the 2578 * cache around for any of the other cases. 2579 */ 2580 kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); 2581 } 2582 } 2583 2584 int kvm_arch_prepare_memory_region(struct kvm *kvm, 2585 const struct kvm_memory_slot *old, 2586 struct kvm_memory_slot *new, 2587 enum kvm_mr_change change) 2588 { 2589 hva_t hva, reg_end; 2590 int ret = 0; 2591 2592 if (kvm_vm_is_protected(kvm)) { 2593 /* Cannot modify memslots once a pVM has run. */ 2594 if (pkvm_hyp_vm_is_created(kvm) && 2595 (change == KVM_MR_DELETE || change == KVM_MR_MOVE)) { 2596 return -EPERM; 2597 } 2598 2599 if (new && 2600 new->flags & (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)) { 2601 return -EPERM; 2602 } 2603 } 2604 2605 if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && 2606 change != KVM_MR_FLAGS_ONLY) 2607 return 0; 2608 2609 /* 2610 * Prevent userspace from creating a memory region outside of the IPA 2611 * space addressable by the KVM guest IPA space. 
2612 */ 2613 if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT)) 2614 return -EFAULT; 2615 2616 /* 2617 * Only support guest_memfd backed memslots with mappable memory, since 2618 * there aren't any CoCo VMs that support only private memory on arm64. 2619 */ 2620 if (kvm_slot_has_gmem(new) && !kvm_memslot_is_gmem_only(new)) 2621 return -EINVAL; 2622 2623 hva = new->userspace_addr; 2624 reg_end = hva + (new->npages << PAGE_SHIFT); 2625 2626 mmap_read_lock(current->mm); 2627 /* 2628 * A memory region could potentially cover multiple VMAs, and any holes 2629 * between them, so iterate over all of them. 2630 * 2631 * +--------------------------------------------+ 2632 * +---------------+----------------+ +----------------+ 2633 * | : VMA 1 | VMA 2 | | VMA 3 : | 2634 * +---------------+----------------+ +----------------+ 2635 * | memory region | 2636 * +--------------------------------------------+ 2637 */ 2638 do { 2639 struct vm_area_struct *vma; 2640 2641 vma = find_vma_intersection(current->mm, hva, reg_end); 2642 if (!vma) 2643 break; 2644 2645 if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) { 2646 ret = -EINVAL; 2647 break; 2648 } 2649 2650 if (vma->vm_flags & VM_PFNMAP) { 2651 /* IO region dirty page logging not allowed */ 2652 if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { 2653 ret = -EINVAL; 2654 break; 2655 } 2656 2657 /* 2658 * Cacheable PFNMAP is allowed only if the hardware 2659 * supports it. 
2660 */ 2661 if (kvm_vma_is_cacheable(vma) && !kvm_supports_cacheable_pfnmap()) { 2662 ret = -EINVAL; 2663 break; 2664 } 2665 } 2666 hva = min(reg_end, vma->vm_end); 2667 } while (hva < reg_end); 2668 2669 mmap_read_unlock(current->mm); 2670 return ret; 2671 } 2672 2673 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) 2674 { 2675 } 2676 2677 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) 2678 { 2679 } 2680 2681 void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 2682 struct kvm_memory_slot *slot) 2683 { 2684 gpa_t gpa = slot->base_gfn << PAGE_SHIFT; 2685 phys_addr_t size = slot->npages << PAGE_SHIFT; 2686 2687 write_lock(&kvm->mmu_lock); 2688 kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true); 2689 kvm_nested_s2_unmap(kvm, true); 2690 write_unlock(&kvm->mmu_lock); 2691 } 2692 2693 /* 2694 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). 2695 * 2696 * Main problems: 2697 * - S/W ops are local to a CPU (not broadcast) 2698 * - We have line migration behind our back (speculation) 2699 * - System caches don't support S/W at all (damn!) 2700 * 2701 * In the face of the above, the best we can do is to try and convert 2702 * S/W ops to VA ops. Because the guest is not allowed to infer the 2703 * S/W to PA mapping, it can only use S/W to nuke the whole cache, 2704 * which is a rather good thing for us. 2705 * 2706 * Also, it is only used when turning caches on/off ("The expected 2707 * usage of the cache maintenance instructions that operate by set/way 2708 * is associated with the cache maintenance instructions associated 2709 * with the powerdown and powerup of caches, if this is required by 2710 * the implementation."). 2711 * 2712 * We use the following policy: 2713 * 2714 * - If we trap a S/W operation, we enable VM trapping to detect 2715 * caches being turned on/off, and do a full clean. 2716 * 2717 * - We flush the caches on both caches being turned on and off. 
2718 * 2719 * - Once the caches are enabled, we stop trapping VM ops. 2720 */ 2721 void kvm_set_way_flush(struct kvm_vcpu *vcpu) 2722 { 2723 unsigned long hcr = *vcpu_hcr(vcpu); 2724 2725 /* 2726 * If this is the first time we do a S/W operation 2727 * (i.e. HCR_TVM not set) flush the whole memory, and set the 2728 * VM trapping. 2729 * 2730 * Otherwise, rely on the VM trapping to wait for the MMU + 2731 * Caches to be turned off. At that point, we'll be able to 2732 * clean the caches again. 2733 */ 2734 if (!(hcr & HCR_TVM)) { 2735 trace_kvm_set_way_flush(*vcpu_pc(vcpu), 2736 vcpu_has_cache_enabled(vcpu)); 2737 stage2_flush_vm(vcpu->kvm); 2738 *vcpu_hcr(vcpu) = hcr | HCR_TVM; 2739 } 2740 } 2741 2742 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled) 2743 { 2744 bool now_enabled = vcpu_has_cache_enabled(vcpu); 2745 2746 /* 2747 * If switching the MMU+caches on, need to invalidate the caches. 2748 * If switching it off, need to clean the caches. 2749 * Clean + invalidate does the trick always. 2750 */ 2751 if (now_enabled != was_enabled) 2752 stage2_flush_vm(vcpu->kvm); 2753 2754 /* Caches are now on, stop trapping VM ops (until a S/W op) */ 2755 if (now_enabled) 2756 *vcpu_hcr(vcpu) &= ~HCR_TVM; 2757 2758 trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled); 2759 } 2760