// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/acpi.h>
#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
#include <asm/acpi.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/kvm_pkvm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>

#include "trace.h"

static struct kvm_pgtable *hyp_pgtable;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

static unsigned long __ro_after_init hyp_idmap_start;
static unsigned long __ro_after_init hyp_idmap_end;
static phys_addr_t __ro_after_init hyp_idmap_vector;

u32 __ro_after_init __hyp_va_bits;

static unsigned long __ro_after_init io_map_base;

#define KVM_PGT_FN(fn)	(!is_protected_kvm_enabled() ? fn : p ## fn)

static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
					   phys_addr_t size)
{
	phys_addr_t boundary = ALIGN_DOWN(addr + size, size);

	return (boundary - 1 < end - 1) ? boundary : end;
}

static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);

	return __stage2_range_addr_end(addr, end, size);
}

/*
 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
 * long will also starve other vCPUs. We have to also make sure that the page
 * tables are not freed while we released the lock.
 */
static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
			      phys_addr_t end,
			      int (*fn)(struct kvm_pgtable *, u64, u64),
			      bool resched)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	int ret;
	u64 next;

	do {
		struct kvm_pgtable *pgt = mmu->pgt;
		if (!pgt)
			return -EINVAL;

		next = stage2_range_addr_end(addr, end);
		ret = fn(pgt, addr, next - addr);
		if (ret)
			break;

		if (resched && next != end)
			cond_resched_rwlock_write(&kvm->mmu_lock);
	} while (addr = next, addr != end);

	return ret;
}

#define stage2_apply_range_resched(mmu, addr, end, fn)			\
	stage2_apply_range(mmu, addr, end, fn, true)

/*
 * Get the maximum number of page-tables pages needed to split a range
 * of blocks into PAGE_SIZE PTEs. It assumes the range is already
 * mapped at level 2, or at level 1 if allowed.
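 *
 * As a purely illustrative example: for a 1GiB range with a 4KiB granule,
 * the PMD term alone contributes DIV_ROUND_UP(1GiB, 2MiB) = 512 table pages,
 * and a configuration that also permits level-1 (PUD) blocks adds one more,
 * for 513 in total.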
 */
static int kvm_mmu_split_nr_page_tables(u64 range)
{
	int n = 0;

	if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2)
		n += DIV_ROUND_UP(range, PUD_SIZE);
	n += DIV_ROUND_UP(range, PMD_SIZE);
	return n;
}

static bool need_split_memcache_topup_or_resched(struct kvm *kvm)
{
	struct kvm_mmu_memory_cache *cache;
	u64 chunk_size, min;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
		return true;

	chunk_size = kvm->arch.mmu.split_page_chunk_size;
	min = kvm_mmu_split_nr_page_tables(chunk_size);
	cache = &kvm->arch.mmu.split_page_cache;
	return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
}

static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
				    phys_addr_t end)
{
	struct kvm_mmu_memory_cache *cache;
	struct kvm_pgtable *pgt;
	int ret, cache_capacity;
	u64 next, chunk_size;

	lockdep_assert_held_write(&kvm->mmu_lock);

	chunk_size = kvm->arch.mmu.split_page_chunk_size;
	cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size);

	if (chunk_size == 0)
		return 0;

	cache = &kvm->arch.mmu.split_page_cache;

	do {
		if (need_split_memcache_topup_or_resched(kvm)) {
			write_unlock(&kvm->mmu_lock);
			cond_resched();
			/* Eager page splitting is best-effort. */
			ret = __kvm_mmu_topup_memory_cache(cache,
							   cache_capacity,
							   cache_capacity);
			write_lock(&kvm->mmu_lock);
			if (ret)
				break;
		}

		pgt = kvm->arch.mmu.pgt;
		if (!pgt)
			return -EINVAL;

		next = __stage2_range_addr_end(addr, end, chunk_size);
		ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache);
		if (ret)
			break;
	} while (addr = next, addr != end);

	return ret;
}

static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
}

/**
 * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8
 * @kvm:	pointer to kvm structure.
 *
 * Interface to HYP function to flush all VM TLB entries
 */
int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
	if (is_protected_kvm_enabled())
		kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
	else
		kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
	return 0;
}

int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
				     gfn_t gfn, u64 nr_pages)
{
	u64 size = nr_pages << PAGE_SHIFT;
	u64 addr = gfn << PAGE_SHIFT;

	if (is_protected_kvm_enabled())
		kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
	else
		kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size);
	return 0;
}

static void *stage2_memcache_zalloc_page(void *arg)
{
	struct kvm_mmu_memory_cache *mc = arg;
	void *virt;

	/* Allocated with __GFP_ZERO, so no need to zero */
	virt = kvm_mmu_memory_cache_alloc(mc);
	if (virt)
		kvm_account_pgtable_pages(virt, 1);
	return virt;
}

static void *kvm_host_zalloc_pages_exact(size_t size)
{
	return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
}

static void *kvm_s2_zalloc_pages_exact(size_t size)
{
	void *virt = kvm_host_zalloc_pages_exact(size);

	if (virt)
		kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT));
	return virt;
}

static void kvm_s2_free_pages_exact(void *virt, size_t size)
{
	kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT));
	free_pages_exact(virt, size);
}

static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;

static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head)
{
	struct page *page = container_of(head, struct page, rcu_head);
	void *pgtable = page_to_virt(page);
	s8 level = page_private(page);

	KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level);
}

static void stage2_free_unlinked_table(void *addr, s8 level)
{
	struct page *page = virt_to_page(addr);

	set_page_private(page, (unsigned long)level);
	call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb);
}

static void kvm_host_get_page(void *addr)
{
	get_page(virt_to_page(addr));
}

static void kvm_host_put_page(void *addr)
{
	put_page(virt_to_page(addr));
}

static void kvm_s2_put_page(void *addr)
{
	struct page *p = virt_to_page(addr);
	/* Dropping last refcount, the page will be freed */
	if (page_count(p) == 1)
		kvm_account_pgtable_pages(addr, -1);
	put_page(p);
}

static int kvm_host_page_count(void *addr)
{
	return page_count(virt_to_page(addr));
}

static phys_addr_t kvm_host_pa(void *addr)
{
	return __pa(addr);
}

static void *kvm_host_va(phys_addr_t phys)
{
	return __va(phys);
}

static void clean_dcache_guest_page(void *va, size_t size)
{
	__clean_dcache_guest_page(va, size);
}

static void invalidate_icache_guest_page(void *va, size_t size)
{
	__invalidate_icache_guest_page(va, size);
}

/*
 * Unmapping vs dcache management:
 *
 * If a guest maps certain memory pages as uncached, all writes will
 * bypass the data cache and go directly to RAM.  However, the CPUs
 * can still speculate reads (not writes) and fill cache lines with
 * data.
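 * (Such lines can be allocated, for instance, by hardware prefetching or by
 * speculative loads through any cacheable alias of the page.)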
 *
 * Those cache lines will be *clean* cache lines though, so a
 * clean+invalidate operation is equivalent to an invalidate
 * operation, because no cache lines are marked dirty.
 *
 * Those clean cache lines could be filled prior to an uncached write
 * by the guest, and the cache coherent IO subsystem would therefore
 * end up writing old data to disk.
 *
 * This is why right after unmapping a page/section and invalidating
 * the corresponding TLBs, we flush to make sure the IO subsystem will
 * never hit in the cache.
 *
 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
 * we then fully enforce cacheability of RAM, no matter what the guest
 * does.
 */
/**
 * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 * @mmu:   The KVM stage-2 MMU pointer
 * @start: The intermediate physical base address of the range to unmap
 * @size:  The size of the area to unmap
 * @may_block: Whether or not we are permitted to block
 *
 * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 * destroying the VM), otherwise another faulting VCPU may come in and mess
 * with things behind our backs.
 */
static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
				 bool may_block)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	phys_addr_t end = start + size;

	lockdep_assert_held_write(&kvm->mmu_lock);
	WARN_ON(size & ~PAGE_MASK);
	WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap),
				   may_block));
}

void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start,
			    u64 size, bool may_block)
{
	if (kvm_vm_is_protected(kvm_s2_mmu_to_kvm(mmu)))
		return;

	__unmap_stage2_range(mmu, start, size, may_block);
}

void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
	stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush));
}

static void stage2_flush_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;

	kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
}

/**
 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 * @kvm: The struct kvm pointer
 *
 * Go through the stage 2 page tables and invalidate any cache lines
 * backing memory already mapped to the VM.
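 *
 * Note that this walks every memslot with the srcu read lock held and the
 * mmu_lock taken as a writer, so it can take a while for guests with a large
 * amount of memory.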
369 */ 370 static void stage2_flush_vm(struct kvm *kvm) 371 { 372 struct kvm_memslots *slots; 373 struct kvm_memory_slot *memslot; 374 int idx, bkt; 375 376 idx = srcu_read_lock(&kvm->srcu); 377 write_lock(&kvm->mmu_lock); 378 379 slots = kvm_memslots(kvm); 380 kvm_for_each_memslot(memslot, bkt, slots) 381 stage2_flush_memslot(kvm, memslot); 382 383 kvm_nested_s2_flush(kvm); 384 385 write_unlock(&kvm->mmu_lock); 386 srcu_read_unlock(&kvm->srcu, idx); 387 } 388 389 /** 390 * free_hyp_pgds - free Hyp-mode page tables 391 */ 392 void __init free_hyp_pgds(void) 393 { 394 mutex_lock(&kvm_hyp_pgd_mutex); 395 if (hyp_pgtable) { 396 kvm_pgtable_hyp_destroy(hyp_pgtable); 397 kfree(hyp_pgtable); 398 hyp_pgtable = NULL; 399 } 400 mutex_unlock(&kvm_hyp_pgd_mutex); 401 } 402 403 static bool kvm_host_owns_hyp_mappings(void) 404 { 405 if (is_kernel_in_hyp_mode()) 406 return false; 407 408 if (static_branch_likely(&kvm_protected_mode_initialized)) 409 return false; 410 411 /* 412 * This can happen at boot time when __create_hyp_mappings() is called 413 * after the hyp protection has been enabled, but the static key has 414 * not been flipped yet. 415 */ 416 if (!hyp_pgtable && is_protected_kvm_enabled()) 417 return false; 418 419 WARN_ON(!hyp_pgtable); 420 421 return true; 422 } 423 424 int __create_hyp_mappings(unsigned long start, unsigned long size, 425 unsigned long phys, enum kvm_pgtable_prot prot) 426 { 427 int err; 428 429 if (WARN_ON(!kvm_host_owns_hyp_mappings())) 430 return -EINVAL; 431 432 mutex_lock(&kvm_hyp_pgd_mutex); 433 err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot); 434 mutex_unlock(&kvm_hyp_pgd_mutex); 435 436 return err; 437 } 438 439 static phys_addr_t kvm_kaddr_to_phys(void *kaddr) 440 { 441 if (!is_vmalloc_addr(kaddr)) { 442 BUG_ON(!virt_addr_valid(kaddr)); 443 return __pa(kaddr); 444 } else { 445 return page_to_phys(vmalloc_to_page(kaddr)) + 446 offset_in_page(kaddr); 447 } 448 } 449 450 struct hyp_shared_pfn { 451 u64 pfn; 452 int count; 453 struct rb_node node; 454 }; 455 456 static DEFINE_MUTEX(hyp_shared_pfns_lock); 457 static struct rb_root hyp_shared_pfns = RB_ROOT; 458 459 static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node, 460 struct rb_node **parent) 461 { 462 struct hyp_shared_pfn *this; 463 464 *node = &hyp_shared_pfns.rb_node; 465 *parent = NULL; 466 while (**node) { 467 this = container_of(**node, struct hyp_shared_pfn, node); 468 *parent = **node; 469 if (this->pfn < pfn) 470 *node = &((**node)->rb_left); 471 else if (this->pfn > pfn) 472 *node = &((**node)->rb_right); 473 else 474 return this; 475 } 476 477 return NULL; 478 } 479 480 static int share_pfn_hyp(u64 pfn) 481 { 482 struct rb_node **node, *parent; 483 struct hyp_shared_pfn *this; 484 int ret = 0; 485 486 mutex_lock(&hyp_shared_pfns_lock); 487 this = find_shared_pfn(pfn, &node, &parent); 488 if (this) { 489 this->count++; 490 goto unlock; 491 } 492 493 this = kzalloc_obj(*this); 494 if (!this) { 495 ret = -ENOMEM; 496 goto unlock; 497 } 498 499 this->pfn = pfn; 500 this->count = 1; 501 rb_link_node(&this->node, parent, node); 502 rb_insert_color(&this->node, &hyp_shared_pfns); 503 ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn); 504 unlock: 505 mutex_unlock(&hyp_shared_pfns_lock); 506 507 return ret; 508 } 509 510 static int unshare_pfn_hyp(u64 pfn) 511 { 512 struct rb_node **node, *parent; 513 struct hyp_shared_pfn *this; 514 int ret = 0; 515 516 mutex_lock(&hyp_shared_pfns_lock); 517 this = find_shared_pfn(pfn, &node, &parent); 518 if (WARN_ON(!this)) { 519 
ret = -ENOENT; 520 goto unlock; 521 } 522 523 this->count--; 524 if (this->count) 525 goto unlock; 526 527 rb_erase(&this->node, &hyp_shared_pfns); 528 kfree(this); 529 ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn); 530 unlock: 531 mutex_unlock(&hyp_shared_pfns_lock); 532 533 return ret; 534 } 535 536 int kvm_share_hyp(void *from, void *to) 537 { 538 phys_addr_t start, end, cur; 539 u64 pfn; 540 int ret; 541 542 if (is_kernel_in_hyp_mode()) 543 return 0; 544 545 /* 546 * The share hcall maps things in the 'fixed-offset' region of the hyp 547 * VA space, so we can only share physically contiguous data-structures 548 * for now. 549 */ 550 if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to)) 551 return -EINVAL; 552 553 if (kvm_host_owns_hyp_mappings()) 554 return create_hyp_mappings(from, to, PAGE_HYP); 555 556 start = ALIGN_DOWN(__pa(from), PAGE_SIZE); 557 end = PAGE_ALIGN(__pa(to)); 558 for (cur = start; cur < end; cur += PAGE_SIZE) { 559 pfn = __phys_to_pfn(cur); 560 ret = share_pfn_hyp(pfn); 561 if (ret) 562 return ret; 563 } 564 565 return 0; 566 } 567 568 void kvm_unshare_hyp(void *from, void *to) 569 { 570 phys_addr_t start, end, cur; 571 u64 pfn; 572 573 if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from) 574 return; 575 576 start = ALIGN_DOWN(__pa(from), PAGE_SIZE); 577 end = PAGE_ALIGN(__pa(to)); 578 for (cur = start; cur < end; cur += PAGE_SIZE) { 579 pfn = __phys_to_pfn(cur); 580 WARN_ON(unshare_pfn_hyp(pfn)); 581 } 582 } 583 584 /** 585 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode 586 * @from: The virtual kernel start address of the range 587 * @to: The virtual kernel end address of the range (exclusive) 588 * @prot: The protection to be applied to this range 589 * 590 * The same virtual address as the kernel virtual address is also used 591 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying 592 * physical pages. 593 */ 594 int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) 595 { 596 phys_addr_t phys_addr; 597 unsigned long virt_addr; 598 unsigned long start = kern_hyp_va((unsigned long)from); 599 unsigned long end = kern_hyp_va((unsigned long)to); 600 601 if (is_kernel_in_hyp_mode()) 602 return 0; 603 604 if (!kvm_host_owns_hyp_mappings()) 605 return -EPERM; 606 607 start = start & PAGE_MASK; 608 end = PAGE_ALIGN(end); 609 610 for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { 611 int err; 612 613 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); 614 err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr, 615 prot); 616 if (err) 617 return err; 618 } 619 620 return 0; 621 } 622 623 static int __hyp_alloc_private_va_range(unsigned long base) 624 { 625 lockdep_assert_held(&kvm_hyp_pgd_mutex); 626 627 if (!PAGE_ALIGNED(base)) 628 return -EINVAL; 629 630 /* 631 * Verify that BIT(VA_BITS - 1) hasn't been flipped by 632 * allocating the new area, as it would indicate we've 633 * overflowed the idmap/IO address range. 634 */ 635 if ((base ^ io_map_base) & BIT(VA_BITS - 1)) 636 return -ENOMEM; 637 638 io_map_base = base; 639 640 return 0; 641 } 642 643 /** 644 * hyp_alloc_private_va_range - Allocates a private VA range. 645 * @size: The size of the VA range to reserve. 646 * @haddr: The hypervisor virtual start address of the allocation. 647 * 648 * The private virtual address (VA) range is allocated below io_map_base 649 * and aligned based on the order of @size. 650 * 651 * Return: 0 on success or negative error code on failure. 
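 *
 * As an illustration (not a guarantee of the exact layout): a first call
 * asking for three pages returns io_map_base - 3 * PAGE_SIZE, and a
 * subsequent single-page allocation then returns io_map_base - 4 * PAGE_SIZE,
 * io_map_base being updated to the newly allocated base each time.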
 */
int hyp_alloc_private_va_range(size_t size, unsigned long *haddr)
{
	unsigned long base;
	int ret = 0;

	mutex_lock(&kvm_hyp_pgd_mutex);

	/*
	 * This assumes that we have enough space below the idmap
	 * page to allocate our VAs. If not, the check in
	 * __hyp_alloc_private_va_range() will kick in. A potential
	 * alternative would be to detect that overflow and switch
	 * to an allocation above the idmap.
	 *
	 * The allocated size is always a multiple of PAGE_SIZE.
	 */
	size = PAGE_ALIGN(size);
	base = io_map_base - size;
	ret = __hyp_alloc_private_va_range(base);

	mutex_unlock(&kvm_hyp_pgd_mutex);

	if (!ret)
		*haddr = base;

	return ret;
}

static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
					unsigned long *haddr,
					enum kvm_pgtable_prot prot)
{
	unsigned long addr;
	int ret = 0;

	if (!kvm_host_owns_hyp_mappings()) {
		addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
					 phys_addr, size, prot);
		if (IS_ERR_VALUE(addr))
			return addr;
		*haddr = addr;

		return 0;
	}

	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
	ret = hyp_alloc_private_va_range(size, &addr);
	if (ret)
		return ret;

	ret = __create_hyp_mappings(addr, size, phys_addr, prot);
	if (ret)
		return ret;

	*haddr = addr + offset_in_page(phys_addr);
	return ret;
}

int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr)
{
	unsigned long base;
	size_t size;
	int ret;

	mutex_lock(&kvm_hyp_pgd_mutex);
	/*
	 * Efficient stack verification using the NVHE_STACK_SHIFT bit implies
	 * an alignment of our allocation on the order of the size.
	 */
	size = NVHE_STACK_SIZE * 2;
	base = ALIGN_DOWN(io_map_base - size, size);

	ret = __hyp_alloc_private_va_range(base);

	mutex_unlock(&kvm_hyp_pgd_mutex);

	if (ret) {
		kvm_err("Cannot allocate hyp stack guard page\n");
		return ret;
	}

	/*
	 * Since the stack grows downwards, map the stack to the page
	 * at the higher address and leave the lower guard page
	 * unbacked.
	 *
	 * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1
	 * and addresses corresponding to the guard page have the
	 * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection.
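	 *
	 * As a sketch: because @base is aligned down to 2 * NVHE_STACK_SIZE,
	 * the guard page occupies [base, base + NVHE_STACK_SIZE) with the
	 * NVHE_STACK_SHIFT bit clear, while the mapped stack occupies
	 * [base + NVHE_STACK_SIZE, base + 2 * NVHE_STACK_SIZE) with that bit
	 * set.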
742 */ 743 ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE, 744 phys_addr, PAGE_HYP); 745 if (ret) 746 kvm_err("Cannot map hyp stack\n"); 747 748 *haddr = base + size; 749 750 return ret; 751 } 752 753 /** 754 * create_hyp_io_mappings - Map IO into both kernel and HYP 755 * @phys_addr: The physical start address which gets mapped 756 * @size: Size of the region being mapped 757 * @kaddr: Kernel VA for this mapping 758 * @haddr: HYP VA for this mapping 759 */ 760 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, 761 void __iomem **kaddr, 762 void __iomem **haddr) 763 { 764 unsigned long addr; 765 int ret; 766 767 if (is_protected_kvm_enabled()) 768 return -EPERM; 769 770 *kaddr = ioremap(phys_addr, size); 771 if (!*kaddr) 772 return -ENOMEM; 773 774 if (is_kernel_in_hyp_mode()) { 775 *haddr = *kaddr; 776 return 0; 777 } 778 779 ret = __create_hyp_private_mapping(phys_addr, size, 780 &addr, PAGE_HYP_DEVICE); 781 if (ret) { 782 iounmap(*kaddr); 783 *kaddr = NULL; 784 *haddr = NULL; 785 return ret; 786 } 787 788 *haddr = (void __iomem *)addr; 789 return 0; 790 } 791 792 /** 793 * create_hyp_exec_mappings - Map an executable range into HYP 794 * @phys_addr: The physical start address which gets mapped 795 * @size: Size of the region being mapped 796 * @haddr: HYP VA for this mapping 797 */ 798 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, 799 void **haddr) 800 { 801 unsigned long addr; 802 int ret; 803 804 BUG_ON(is_kernel_in_hyp_mode()); 805 806 ret = __create_hyp_private_mapping(phys_addr, size, 807 &addr, PAGE_HYP_EXEC); 808 if (ret) { 809 *haddr = NULL; 810 return ret; 811 } 812 813 *haddr = (void *)addr; 814 return 0; 815 } 816 817 static struct kvm_pgtable_mm_ops kvm_user_mm_ops = { 818 /* We shouldn't need any other callback to walk the PT */ 819 .phys_to_virt = kvm_host_va, 820 }; 821 822 static int get_user_mapping_size(struct kvm *kvm, u64 addr) 823 { 824 struct kvm_pgtable pgt = { 825 .pgd = (kvm_pteref_t)kvm->mm->pgd, 826 .ia_bits = vabits_actual, 827 .start_level = (KVM_PGTABLE_LAST_LEVEL - 828 ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1), 829 .mm_ops = &kvm_user_mm_ops, 830 }; 831 unsigned long flags; 832 kvm_pte_t pte = 0; /* Keep GCC quiet... */ 833 s8 level = S8_MAX; 834 int ret; 835 836 /* 837 * Disable IRQs so that we hazard against a concurrent 838 * teardown of the userspace page tables (which relies on 839 * IPI-ing threads). 840 */ 841 local_irq_save(flags); 842 ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); 843 local_irq_restore(flags); 844 845 if (ret) 846 return ret; 847 848 /* 849 * Not seeing an error, but not updating level? Something went 850 * deeply wrong... 851 */ 852 if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL)) 853 return -EFAULT; 854 if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL)) 855 return -EFAULT; 856 857 /* Oops, the userspace PTs are gone... 
Replay the fault */ 858 if (!kvm_pte_valid(pte)) 859 return -EAGAIN; 860 861 return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); 862 } 863 864 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { 865 .zalloc_page = stage2_memcache_zalloc_page, 866 .zalloc_pages_exact = kvm_s2_zalloc_pages_exact, 867 .free_pages_exact = kvm_s2_free_pages_exact, 868 .free_unlinked_table = stage2_free_unlinked_table, 869 .get_page = kvm_host_get_page, 870 .put_page = kvm_s2_put_page, 871 .page_count = kvm_host_page_count, 872 .phys_to_virt = kvm_host_va, 873 .virt_to_phys = kvm_host_pa, 874 .dcache_clean_inval_poc = clean_dcache_guest_page, 875 .icache_inval_pou = invalidate_icache_guest_page, 876 }; 877 878 static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type) 879 { 880 u32 kvm_ipa_limit = get_kvm_ipa_limit(); 881 u64 mmfr0, mmfr1; 882 u32 phys_shift; 883 884 phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); 885 if (is_protected_kvm_enabled()) { 886 phys_shift = kvm_ipa_limit; 887 } else if (phys_shift) { 888 if (phys_shift > kvm_ipa_limit || 889 phys_shift < ARM64_MIN_PARANGE_BITS) 890 return -EINVAL; 891 } else { 892 phys_shift = KVM_PHYS_SHIFT; 893 if (phys_shift > kvm_ipa_limit) { 894 pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", 895 current->comm); 896 return -EINVAL; 897 } 898 } 899 900 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); 901 mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); 902 mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); 903 904 return 0; 905 } 906 907 /* 908 * Assume that @pgt is valid and unlinked from the KVM MMU to free the 909 * page-table without taking the kvm_mmu_lock and without performing any 910 * TLB invalidations. 911 * 912 * Also, the range of addresses can be large enough to cause need_resched 913 * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke 914 * cond_resched() periodically to prevent hogging the CPU for a long time 915 * and schedule something else, if required. 916 */ 917 static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr, 918 phys_addr_t end) 919 { 920 u64 next; 921 922 do { 923 next = stage2_range_addr_end(addr, end); 924 KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr, 925 next - addr); 926 if (next != end) 927 cond_resched(); 928 } while (addr = next, addr != end); 929 } 930 931 static void kvm_stage2_destroy(struct kvm_pgtable *pgt) 932 { 933 unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr); 934 935 stage2_destroy_range(pgt, 0, BIT(ia_bits)); 936 KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt); 937 } 938 939 /** 940 * kvm_init_stage2_mmu - Initialise a S2 MMU structure 941 * @kvm: The pointer to the KVM structure 942 * @mmu: The pointer to the s2 MMU structure 943 * @type: The machine type of the virtual machine 944 * 945 * Allocates only the stage-2 HW PGD level table(s). 946 * Note we don't need locking here as this is only called in two cases: 947 * 948 * - when the VM is created, which can't race against anything 949 * 950 * - when secondary kvm_s2_mmu structures are initialised for NV 951 * guests, and the caller must hold kvm->lock as this is called on a 952 * per-vcpu basis. 953 */ 954 int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) 955 { 956 int cpu, err; 957 struct kvm_pgtable *pgt; 958 959 /* 960 * If we already have our page tables in place, and that the 961 * MMU context is the canonical one, we have a bug somewhere, 962 * as this is only supposed to ever happen once per VM. 
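	 * (The canonical context here is kvm->arch.mmu, which is set up
	 * exactly once when the VM is created.)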
	 *
	 * Otherwise, we're building nested page tables, and that's
	 * probably because userspace called KVM_ARM_VCPU_INIT more
	 * than once on the same vcpu. Since that's actually legal,
	 * don't kick up a fuss and leave gracefully.
	 */
	if (mmu->pgt != NULL) {
		if (kvm_is_nested_s2_mmu(kvm, mmu))
			return 0;

		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

	err = kvm_init_ipa_range(mmu, type);
	if (err)
		return err;

	pgt = kzalloc_obj(*pgt, GFP_KERNEL_ACCOUNT);
	if (!pgt)
		return -ENOMEM;

	mmu->arch = &kvm->arch;
	err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops);
	if (err)
		goto out_free_pgtable;

	mmu->pgt = pgt;
	if (is_protected_kvm_enabled())
		return 0;

	mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
	if (!mmu->last_vcpu_ran) {
		err = -ENOMEM;
		goto out_destroy_pgtable;
	}

	for_each_possible_cpu(cpu)
		*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;

	/* The eager page splitting is disabled by default */
	mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
	mmu->split_page_cache.gfp_zero = __GFP_ZERO;

	mmu->pgd_phys = __pa(pgt->pgd);

	if (kvm_is_nested_s2_mmu(kvm, mmu))
		kvm_init_nested_s2_mmu(mmu);

	return 0;

out_destroy_pgtable:
	kvm_stage2_destroy(pgt);
	mmu->pgt = NULL;
out_free_pgtable:
	kfree(pgt);
	return err;
}

void kvm_uninit_stage2_mmu(struct kvm *kvm)
{
	kvm_free_stage2_pgd(&kvm->arch.mmu);
	kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
}

static void stage2_unmap_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	hva_t hva = memslot->userspace_addr;
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = PAGE_SIZE * memslot->npages;
	hva_t reg_end = hva + size;

	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we should
	 * unmap any of them.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma;
		hva_t vm_start, vm_end;

		vma = find_vma_intersection(current->mm, hva, reg_end);
		if (!vma)
			break;

		/*
		 * Take the intersection of this VMA with the memory region
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (!(vma->vm_flags & VM_PFNMAP)) {
			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
			kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true);
		}
		hva = vm_end;
	} while (hva < reg_end);
}

/**
 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 * @kvm: The struct kvm pointer
 *
 * Go through the memregions and unmap any regular RAM
 * backing memory already mapped to the VM.
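 *
 * VM_PFNMAP regions are deliberately skipped (see stage2_unmap_memslot()
 * above); everything else is faulted back in lazily the next time the guest
 * touches it.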
1076 */ 1077 void stage2_unmap_vm(struct kvm *kvm) 1078 { 1079 struct kvm_memslots *slots; 1080 struct kvm_memory_slot *memslot; 1081 int idx, bkt; 1082 1083 idx = srcu_read_lock(&kvm->srcu); 1084 mmap_read_lock(current->mm); 1085 write_lock(&kvm->mmu_lock); 1086 1087 slots = kvm_memslots(kvm); 1088 kvm_for_each_memslot(memslot, bkt, slots) 1089 stage2_unmap_memslot(kvm, memslot); 1090 1091 kvm_nested_s2_unmap(kvm, true); 1092 1093 write_unlock(&kvm->mmu_lock); 1094 mmap_read_unlock(current->mm); 1095 srcu_read_unlock(&kvm->srcu, idx); 1096 } 1097 1098 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) 1099 { 1100 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 1101 struct kvm_pgtable *pgt = NULL; 1102 1103 write_lock(&kvm->mmu_lock); 1104 pgt = mmu->pgt; 1105 if (pgt) { 1106 mmu->pgd_phys = 0; 1107 mmu->pgt = NULL; 1108 free_percpu(mmu->last_vcpu_ran); 1109 } 1110 1111 if (kvm_is_nested_s2_mmu(kvm, mmu)) 1112 kvm_init_nested_s2_mmu(mmu); 1113 1114 write_unlock(&kvm->mmu_lock); 1115 1116 if (pgt) { 1117 kvm_stage2_destroy(pgt); 1118 kfree(pgt); 1119 } 1120 } 1121 1122 static void hyp_mc_free_fn(void *addr, void *mc) 1123 { 1124 struct kvm_hyp_memcache *memcache = mc; 1125 1126 if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) 1127 kvm_account_pgtable_pages(addr, -1); 1128 1129 free_page((unsigned long)addr); 1130 } 1131 1132 static void *hyp_mc_alloc_fn(void *mc) 1133 { 1134 struct kvm_hyp_memcache *memcache = mc; 1135 void *addr; 1136 1137 addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); 1138 if (addr && memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) 1139 kvm_account_pgtable_pages(addr, 1); 1140 1141 return addr; 1142 } 1143 1144 void free_hyp_memcache(struct kvm_hyp_memcache *mc) 1145 { 1146 if (!is_protected_kvm_enabled()) 1147 return; 1148 1149 kfree(mc->mapping); 1150 __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc); 1151 } 1152 1153 int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) 1154 { 1155 if (!is_protected_kvm_enabled()) 1156 return 0; 1157 1158 if (!mc->mapping) { 1159 mc->mapping = kzalloc_obj(struct pkvm_mapping, 1160 GFP_KERNEL_ACCOUNT); 1161 if (!mc->mapping) 1162 return -ENOMEM; 1163 } 1164 1165 return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn, 1166 kvm_host_pa, mc); 1167 } 1168 1169 /** 1170 * kvm_phys_addr_ioremap - map a device range to guest IPA 1171 * 1172 * @kvm: The KVM pointer 1173 * @guest_ipa: The IPA at which to insert the mapping 1174 * @pa: The physical address of the device 1175 * @size: The size of the mapping 1176 * @writable: Whether or not to create a writable mapping 1177 */ 1178 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, 1179 phys_addr_t pa, unsigned long size, bool writable) 1180 { 1181 phys_addr_t addr; 1182 int ret = 0; 1183 struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO }; 1184 struct kvm_s2_mmu *mmu = &kvm->arch.mmu; 1185 struct kvm_pgtable *pgt = mmu->pgt; 1186 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE | 1187 KVM_PGTABLE_PROT_R | 1188 (writable ? 
KVM_PGTABLE_PROT_W : 0); 1189 1190 if (is_protected_kvm_enabled()) 1191 return -EPERM; 1192 1193 size += offset_in_page(guest_ipa); 1194 guest_ipa &= PAGE_MASK; 1195 1196 for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) { 1197 ret = kvm_mmu_topup_memory_cache(&cache, 1198 kvm_mmu_cache_min_pages(mmu)); 1199 if (ret) 1200 break; 1201 1202 write_lock(&kvm->mmu_lock); 1203 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE, 1204 pa, prot, &cache, 0); 1205 write_unlock(&kvm->mmu_lock); 1206 if (ret) 1207 break; 1208 1209 pa += PAGE_SIZE; 1210 } 1211 1212 kvm_mmu_free_memory_cache(&cache); 1213 return ret; 1214 } 1215 1216 /** 1217 * kvm_stage2_wp_range() - write protect stage2 memory region range 1218 * @mmu: The KVM stage-2 MMU pointer 1219 * @addr: Start address of range 1220 * @end: End address of range 1221 */ 1222 void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) 1223 { 1224 stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect)); 1225 } 1226 1227 /** 1228 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot 1229 * @kvm: The KVM pointer 1230 * @slot: The memory slot to write protect 1231 * 1232 * Called to start logging dirty pages after memory region 1233 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns 1234 * all present PUD, PMD and PTEs are write protected in the memory region. 1235 * Afterwards read of dirty page log can be called. 1236 * 1237 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, 1238 * serializing operations for VM memory regions. 1239 */ 1240 static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) 1241 { 1242 struct kvm_memslots *slots = kvm_memslots(kvm); 1243 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); 1244 phys_addr_t start, end; 1245 1246 if (WARN_ON_ONCE(!memslot)) 1247 return; 1248 1249 start = memslot->base_gfn << PAGE_SHIFT; 1250 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1251 1252 write_lock(&kvm->mmu_lock); 1253 kvm_stage2_wp_range(&kvm->arch.mmu, start, end); 1254 kvm_nested_s2_wp(kvm); 1255 write_unlock(&kvm->mmu_lock); 1256 kvm_flush_remote_tlbs_memslot(kvm, memslot); 1257 } 1258 1259 /** 1260 * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE 1261 * pages for memory slot 1262 * @kvm: The KVM pointer 1263 * @slot: The memory slot to split 1264 * 1265 * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired, 1266 * serializing operations for VM memory regions. 1267 */ 1268 static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot) 1269 { 1270 struct kvm_memslots *slots; 1271 struct kvm_memory_slot *memslot; 1272 phys_addr_t start, end; 1273 1274 lockdep_assert_held(&kvm->slots_lock); 1275 1276 slots = kvm_memslots(kvm); 1277 memslot = id_to_memslot(slots, slot); 1278 1279 start = memslot->base_gfn << PAGE_SHIFT; 1280 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1281 1282 write_lock(&kvm->mmu_lock); 1283 kvm_mmu_split_huge_pages(kvm, start, end); 1284 write_unlock(&kvm->mmu_lock); 1285 } 1286 1287 /* 1288 * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages. 
1289 * @kvm: The KVM pointer 1290 * @slot: The memory slot associated with mask 1291 * @gfn_offset: The gfn offset in memory slot 1292 * @mask: The mask of pages at offset 'gfn_offset' in this memory 1293 * slot to enable dirty logging on 1294 * 1295 * Writes protect selected pages to enable dirty logging, and then 1296 * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock. 1297 */ 1298 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, 1299 struct kvm_memory_slot *slot, 1300 gfn_t gfn_offset, unsigned long mask) 1301 { 1302 phys_addr_t base_gfn = slot->base_gfn + gfn_offset; 1303 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; 1304 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; 1305 1306 lockdep_assert_held_write(&kvm->mmu_lock); 1307 1308 kvm_stage2_wp_range(&kvm->arch.mmu, start, end); 1309 1310 /* 1311 * Eager-splitting is done when manual-protect is set. We 1312 * also check for initially-all-set because we can avoid 1313 * eager-splitting if initially-all-set is false. 1314 * Initially-all-set equal false implies that huge-pages were 1315 * already split when enabling dirty logging: no need to do it 1316 * again. 1317 */ 1318 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) 1319 kvm_mmu_split_huge_pages(kvm, start, end); 1320 1321 kvm_nested_s2_wp(kvm); 1322 } 1323 1324 static void kvm_send_hwpoison_signal(unsigned long address, short lsb) 1325 { 1326 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); 1327 } 1328 1329 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, 1330 unsigned long hva, 1331 unsigned long map_size) 1332 { 1333 gpa_t gpa_start; 1334 hva_t uaddr_start, uaddr_end; 1335 size_t size; 1336 1337 /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */ 1338 if (map_size == PAGE_SIZE) 1339 return true; 1340 1341 /* pKVM only supports PMD_SIZE huge-mappings */ 1342 if (is_protected_kvm_enabled() && map_size != PMD_SIZE) 1343 return false; 1344 1345 size = memslot->npages * PAGE_SIZE; 1346 1347 gpa_start = memslot->base_gfn << PAGE_SHIFT; 1348 1349 uaddr_start = memslot->userspace_addr; 1350 uaddr_end = uaddr_start + size; 1351 1352 /* 1353 * Pages belonging to memslots that don't have the same alignment 1354 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2 1355 * PMD/PUD entries, because we'll end up mapping the wrong pages. 1356 * 1357 * Consider a layout like the following: 1358 * 1359 * memslot->userspace_addr: 1360 * +-----+--------------------+--------------------+---+ 1361 * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| 1362 * +-----+--------------------+--------------------+---+ 1363 * 1364 * memslot->base_gfn << PAGE_SHIFT: 1365 * +---+--------------------+--------------------+-----+ 1366 * |abc|def Stage-2 block | Stage-2 block |tvxyz| 1367 * +---+--------------------+--------------------+-----+ 1368 * 1369 * If we create those stage-2 blocks, we'll end up with this incorrect 1370 * mapping: 1371 * d -> f 1372 * e -> g 1373 * f -> h 1374 */ 1375 if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) 1376 return false; 1377 1378 /* 1379 * Next, let's make sure we're not trying to map anything not covered 1380 * by the memslot. This means we have to prohibit block size mappings 1381 * for the beginning and end of a non-block aligned and non-block sized 1382 * memory slot (illustrated by the head and tail parts of the 1383 * userspace view above containing pages 'abcde' and 'xyz', 1384 * respectively). 
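	 * Put differently, the check at the end of this function requires the
	 * whole block-aligned window [hva & ~(map_size - 1),
	 * (hva & ~(map_size - 1)) + map_size) to sit inside
	 * [uaddr_start, uaddr_end).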
1385 * 1386 * Note that it doesn't matter if we do the check using the 1387 * userspace_addr or the base_gfn, as both are equally aligned (per 1388 * the check above) and equally sized. 1389 */ 1390 return (hva & ~(map_size - 1)) >= uaddr_start && 1391 (hva & ~(map_size - 1)) + map_size <= uaddr_end; 1392 } 1393 1394 /* 1395 * Check if the given hva is backed by a transparent huge page (THP) and 1396 * whether it can be mapped using block mapping in stage2. If so, adjust 1397 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently 1398 * supported. This will need to be updated to support other THP sizes. 1399 * 1400 * Returns the size of the mapping. 1401 */ 1402 static long 1403 transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, 1404 unsigned long hva, kvm_pfn_t *pfnp, gfn_t *gfnp) 1405 { 1406 kvm_pfn_t pfn = *pfnp; 1407 gfn_t gfn = *gfnp; 1408 1409 /* 1410 * Make sure the adjustment is done only for THP pages. Also make 1411 * sure that the HVA and IPA are sufficiently aligned and that the 1412 * block map is contained within the memslot. 1413 */ 1414 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { 1415 int sz = get_user_mapping_size(kvm, hva); 1416 1417 if (sz < 0) 1418 return sz; 1419 1420 if (sz < PMD_SIZE) 1421 return PAGE_SIZE; 1422 1423 gfn &= ~(PTRS_PER_PMD - 1); 1424 *gfnp = gfn; 1425 pfn &= ~(PTRS_PER_PMD - 1); 1426 *pfnp = pfn; 1427 1428 return PMD_SIZE; 1429 } 1430 1431 /* Use page mapping if we cannot use block mapping. */ 1432 return PAGE_SIZE; 1433 } 1434 1435 static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva) 1436 { 1437 unsigned long pa; 1438 1439 if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP)) 1440 return huge_page_shift(hstate_vma(vma)); 1441 1442 if (!(vma->vm_flags & VM_PFNMAP)) 1443 return PAGE_SHIFT; 1444 1445 VM_BUG_ON(is_vm_hugetlb_page(vma)); 1446 1447 pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start); 1448 1449 #ifndef __PAGETABLE_PMD_FOLDED 1450 if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) && 1451 ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start && 1452 ALIGN(hva, PUD_SIZE) <= vma->vm_end) 1453 return PUD_SHIFT; 1454 #endif 1455 1456 if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) && 1457 ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start && 1458 ALIGN(hva, PMD_SIZE) <= vma->vm_end) 1459 return PMD_SHIFT; 1460 1461 return PAGE_SHIFT; 1462 } 1463 1464 /* 1465 * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be 1466 * able to see the page's tags and therefore they must be initialised first. If 1467 * PG_mte_tagged is set, tags have already been initialised. 1468 * 1469 * Must be called with kvm->mmu_lock held to ensure the memory remains mapped 1470 * while the tags are zeroed. 
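 *
 * Callers pass the size of the stage-2 mapping being installed, so every
 * PAGE_SIZE page backing a block mapping has its tags initialised in a
 * single pass.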
1471 */ 1472 static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, 1473 unsigned long size) 1474 { 1475 unsigned long i, nr_pages = size >> PAGE_SHIFT; 1476 struct page *page = pfn_to_page(pfn); 1477 struct folio *folio = page_folio(page); 1478 1479 if (!kvm_has_mte(kvm)) 1480 return; 1481 1482 if (folio_test_hugetlb(folio)) { 1483 /* Hugetlb has MTE flags set on head page only */ 1484 if (folio_try_hugetlb_mte_tagging(folio)) { 1485 for (i = 0; i < nr_pages; i++, page++) 1486 mte_clear_page_tags(page_address(page)); 1487 folio_set_hugetlb_mte_tagged(folio); 1488 } 1489 return; 1490 } 1491 1492 for (i = 0; i < nr_pages; i++, page++) { 1493 if (try_page_mte_tagging(page)) { 1494 mte_clear_page_tags(page_address(page)); 1495 set_page_mte_tagged(page); 1496 } 1497 } 1498 } 1499 1500 static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) 1501 { 1502 return vma->vm_flags & VM_MTE_ALLOWED; 1503 } 1504 1505 static bool kvm_vma_is_cacheable(struct vm_area_struct *vma) 1506 { 1507 switch (FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(vma->vm_page_prot))) { 1508 case MT_NORMAL_NC: 1509 case MT_DEVICE_nGnRnE: 1510 case MT_DEVICE_nGnRE: 1511 return false; 1512 default: 1513 return true; 1514 } 1515 } 1516 1517 static void *get_mmu_memcache(struct kvm_vcpu *vcpu) 1518 { 1519 if (!is_protected_kvm_enabled()) 1520 return &vcpu->arch.mmu_page_cache; 1521 else 1522 return &vcpu->arch.pkvm_memcache; 1523 } 1524 1525 static int topup_mmu_memcache(struct kvm_vcpu *vcpu, void *memcache) 1526 { 1527 int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); 1528 1529 if (!is_protected_kvm_enabled()) 1530 return kvm_mmu_topup_memory_cache(memcache, min_pages); 1531 1532 return topup_hyp_memcache(memcache, min_pages); 1533 } 1534 1535 /* 1536 * Potentially reduce shadow S2 permissions to match the guest's own S2. For 1537 * exec faults, we'd only reach this point if the guest actually allowed it (see 1538 * kvm_s2_handle_perm_fault). 1539 * 1540 * Also encode the level of the original translation in the SW bits of the leaf 1541 * entry as a proxy for the span of that translation. This will be retrieved on 1542 * TLB invalidation from the guest and used to limit the invalidation scope if a 1543 * TTL hint or a range isn't provided. 
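 *
 * Note that these SW bits are stripped again (KVM_NV_GUEST_MAP_SZ) before a
 * permission-only update in kvm_s2_fault_map(), so the level recorded at map
 * time stays in the PTE.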
1544 */ 1545 static enum kvm_pgtable_prot adjust_nested_fault_perms(struct kvm_s2_trans *nested, 1546 enum kvm_pgtable_prot prot) 1547 { 1548 if (!kvm_s2_trans_writable(nested)) 1549 prot &= ~KVM_PGTABLE_PROT_W; 1550 if (!kvm_s2_trans_readable(nested)) 1551 prot &= ~KVM_PGTABLE_PROT_R; 1552 1553 return prot | kvm_encode_nested_level(nested); 1554 } 1555 1556 static enum kvm_pgtable_prot adjust_nested_exec_perms(struct kvm *kvm, 1557 struct kvm_s2_trans *nested, 1558 enum kvm_pgtable_prot prot) 1559 { 1560 if (!kvm_s2_trans_exec_el0(kvm, nested)) 1561 prot &= ~KVM_PGTABLE_PROT_UX; 1562 if (!kvm_s2_trans_exec_el1(kvm, nested)) 1563 prot &= ~KVM_PGTABLE_PROT_PX; 1564 1565 return prot; 1566 } 1567 1568 struct kvm_s2_fault_desc { 1569 struct kvm_vcpu *vcpu; 1570 phys_addr_t fault_ipa; 1571 struct kvm_s2_trans *nested; 1572 struct kvm_memory_slot *memslot; 1573 unsigned long hva; 1574 }; 1575 1576 static int gmem_abort(const struct kvm_s2_fault_desc *s2fd) 1577 { 1578 bool write_fault, exec_fault; 1579 enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED; 1580 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; 1581 struct kvm_pgtable *pgt = s2fd->vcpu->arch.hw_mmu->pgt; 1582 unsigned long mmu_seq; 1583 struct page *page; 1584 struct kvm *kvm = s2fd->vcpu->kvm; 1585 void *memcache; 1586 kvm_pfn_t pfn; 1587 gfn_t gfn; 1588 int ret; 1589 1590 memcache = get_mmu_memcache(s2fd->vcpu); 1591 ret = topup_mmu_memcache(s2fd->vcpu, memcache); 1592 if (ret) 1593 return ret; 1594 1595 if (s2fd->nested) 1596 gfn = kvm_s2_trans_output(s2fd->nested) >> PAGE_SHIFT; 1597 else 1598 gfn = s2fd->fault_ipa >> PAGE_SHIFT; 1599 1600 write_fault = kvm_is_write_fault(s2fd->vcpu); 1601 exec_fault = kvm_vcpu_trap_is_exec_fault(s2fd->vcpu); 1602 1603 VM_WARN_ON_ONCE(write_fault && exec_fault); 1604 1605 mmu_seq = kvm->mmu_invalidate_seq; 1606 /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ 1607 smp_rmb(); 1608 1609 ret = kvm_gmem_get_pfn(kvm, s2fd->memslot, gfn, &pfn, &page, NULL); 1610 if (ret) { 1611 kvm_prepare_memory_fault_exit(s2fd->vcpu, s2fd->fault_ipa, PAGE_SIZE, 1612 write_fault, exec_fault, false); 1613 return ret; 1614 } 1615 1616 if (!(s2fd->memslot->flags & KVM_MEM_READONLY)) 1617 prot |= KVM_PGTABLE_PROT_W; 1618 1619 if (s2fd->nested) 1620 prot = adjust_nested_fault_perms(s2fd->nested, prot); 1621 1622 if (exec_fault || cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) 1623 prot |= KVM_PGTABLE_PROT_X; 1624 1625 if (s2fd->nested) 1626 prot = adjust_nested_exec_perms(kvm, s2fd->nested, prot); 1627 1628 kvm_fault_lock(kvm); 1629 if (mmu_invalidate_retry(kvm, mmu_seq)) { 1630 ret = -EAGAIN; 1631 goto out_unlock; 1632 } 1633 1634 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, s2fd->fault_ipa, PAGE_SIZE, 1635 __pfn_to_phys(pfn), prot, 1636 memcache, flags); 1637 1638 out_unlock: 1639 kvm_release_faultin_page(kvm, page, !!ret, prot & KVM_PGTABLE_PROT_W); 1640 kvm_fault_unlock(kvm); 1641 1642 if ((prot & KVM_PGTABLE_PROT_W) && !ret) 1643 mark_page_dirty_in_slot(kvm, s2fd->memslot, gfn); 1644 1645 return ret != -EAGAIN ? 
ret : 0; 1646 } 1647 1648 struct kvm_s2_fault_vma_info { 1649 unsigned long mmu_seq; 1650 long vma_pagesize; 1651 vm_flags_t vm_flags; 1652 unsigned long max_map_size; 1653 struct page *page; 1654 kvm_pfn_t pfn; 1655 gfn_t gfn; 1656 bool device; 1657 bool mte_allowed; 1658 bool is_vma_cacheable; 1659 bool map_writable; 1660 bool map_non_cacheable; 1661 }; 1662 1663 static int pkvm_mem_abort(const struct kvm_s2_fault_desc *s2fd) 1664 { 1665 unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE; 1666 struct kvm_vcpu *vcpu = s2fd->vcpu; 1667 struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt; 1668 struct mm_struct *mm = current->mm; 1669 struct kvm *kvm = vcpu->kvm; 1670 void *hyp_memcache; 1671 struct page *page; 1672 int ret; 1673 1674 hyp_memcache = get_mmu_memcache(vcpu); 1675 ret = topup_mmu_memcache(vcpu, hyp_memcache); 1676 if (ret) 1677 return -ENOMEM; 1678 1679 ret = account_locked_vm(mm, 1, true); 1680 if (ret) 1681 return ret; 1682 1683 mmap_read_lock(mm); 1684 ret = pin_user_pages(s2fd->hva, 1, flags, &page); 1685 mmap_read_unlock(mm); 1686 1687 if (ret == -EHWPOISON) { 1688 kvm_send_hwpoison_signal(s2fd->hva, PAGE_SHIFT); 1689 ret = 0; 1690 goto dec_account; 1691 } else if (ret != 1) { 1692 ret = -EFAULT; 1693 goto dec_account; 1694 } else if (!folio_test_swapbacked(page_folio(page))) { 1695 /* 1696 * We really can't deal with page-cache pages returned by GUP 1697 * because (a) we may trigger writeback of a page for which we 1698 * no longer have access and (b) page_mkclean() won't find the 1699 * stage-2 mapping in the rmap so we can get out-of-whack with 1700 * the filesystem when marking the page dirty during unpinning 1701 * (see cc5095747edf ("ext4: don't BUG if someone dirty pages 1702 * without asking ext4 first")). 1703 * 1704 * Ideally we'd just restrict ourselves to anonymous pages, but 1705 * we also want to allow memfd (i.e. shmem) pages, so check for 1706 * pages backed by swap in the knowledge that the GUP pin will 1707 * prevent try_to_unmap() from succeeding. 
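		 *
		 * folio_test_swapbacked() covers both anonymous and
		 * shmem/memfd folios, which is the set this code is prepared
		 * to keep pinned long-term.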
1708 */ 1709 ret = -EIO; 1710 goto unpin; 1711 } 1712 1713 write_lock(&kvm->mmu_lock); 1714 ret = pkvm_pgtable_stage2_map(pgt, s2fd->fault_ipa, PAGE_SIZE, 1715 page_to_phys(page), KVM_PGTABLE_PROT_RWX, 1716 hyp_memcache, 0); 1717 write_unlock(&kvm->mmu_lock); 1718 if (ret) { 1719 if (ret == -EAGAIN) 1720 ret = 0; 1721 goto unpin; 1722 } 1723 1724 return 0; 1725 unpin: 1726 unpin_user_pages(&page, 1); 1727 dec_account: 1728 account_locked_vm(mm, 1, false); 1729 return ret; 1730 } 1731 1732 static short kvm_s2_resolve_vma_size(const struct kvm_s2_fault_desc *s2fd, 1733 struct kvm_s2_fault_vma_info *s2vi, 1734 struct vm_area_struct *vma) 1735 { 1736 short vma_shift; 1737 1738 if (memslot_is_logging(s2fd->memslot)) { 1739 s2vi->max_map_size = PAGE_SIZE; 1740 vma_shift = PAGE_SHIFT; 1741 } else { 1742 s2vi->max_map_size = PUD_SIZE; 1743 vma_shift = get_vma_page_shift(vma, s2fd->hva); 1744 } 1745 1746 switch (vma_shift) { 1747 #ifndef __PAGETABLE_PMD_FOLDED 1748 case PUD_SHIFT: 1749 if (fault_supports_stage2_huge_mapping(s2fd->memslot, s2fd->hva, PUD_SIZE)) 1750 break; 1751 fallthrough; 1752 #endif 1753 case CONT_PMD_SHIFT: 1754 vma_shift = PMD_SHIFT; 1755 fallthrough; 1756 case PMD_SHIFT: 1757 if (fault_supports_stage2_huge_mapping(s2fd->memslot, s2fd->hva, PMD_SIZE)) 1758 break; 1759 fallthrough; 1760 case CONT_PTE_SHIFT: 1761 vma_shift = PAGE_SHIFT; 1762 s2vi->max_map_size = PAGE_SIZE; 1763 fallthrough; 1764 case PAGE_SHIFT: 1765 break; 1766 default: 1767 WARN_ONCE(1, "Unknown vma_shift %d", vma_shift); 1768 } 1769 1770 if (s2fd->nested) { 1771 unsigned long max_map_size; 1772 1773 max_map_size = min(s2vi->max_map_size, PUD_SIZE); 1774 1775 /* 1776 * If we're about to create a shadow stage 2 entry, then we 1777 * can only create a block mapping if the guest stage 2 page 1778 * table uses at least as big a mapping. 1779 */ 1780 max_map_size = min(kvm_s2_trans_size(s2fd->nested), max_map_size); 1781 1782 /* 1783 * Be careful that if the mapping size falls between 1784 * two host sizes, take the smallest of the two. 1785 */ 1786 if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE) 1787 max_map_size = PMD_SIZE; 1788 else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE) 1789 max_map_size = PAGE_SIZE; 1790 1791 s2vi->max_map_size = max_map_size; 1792 vma_shift = min_t(short, vma_shift, __ffs(max_map_size)); 1793 } 1794 1795 return vma_shift; 1796 } 1797 1798 static bool kvm_s2_fault_is_perm(const struct kvm_s2_fault_desc *s2fd) 1799 { 1800 return kvm_vcpu_trap_is_permission_fault(s2fd->vcpu); 1801 } 1802 1803 static int kvm_s2_fault_get_vma_info(const struct kvm_s2_fault_desc *s2fd, 1804 struct kvm_s2_fault_vma_info *s2vi) 1805 { 1806 struct vm_area_struct *vma; 1807 struct kvm *kvm = s2fd->vcpu->kvm; 1808 1809 mmap_read_lock(current->mm); 1810 vma = vma_lookup(current->mm, s2fd->hva); 1811 if (unlikely(!vma)) { 1812 kvm_err("Failed to find VMA for hva 0x%lx\n", s2fd->hva); 1813 mmap_read_unlock(current->mm); 1814 return -EFAULT; 1815 } 1816 1817 s2vi->vma_pagesize = BIT(kvm_s2_resolve_vma_size(s2fd, s2vi, vma)); 1818 1819 /* 1820 * Both the canonical IPA and fault IPA must be aligned to the 1821 * mapping size to ensure we find the right PFN and lay down the 1822 * mapping in the right place. 
1823 */ 1824 s2vi->gfn = ALIGN_DOWN(s2fd->fault_ipa, s2vi->vma_pagesize) >> PAGE_SHIFT; 1825 1826 s2vi->mte_allowed = kvm_vma_mte_allowed(vma); 1827 1828 s2vi->vm_flags = vma->vm_flags; 1829 1830 s2vi->is_vma_cacheable = kvm_vma_is_cacheable(vma); 1831 1832 /* 1833 * Read mmu_invalidate_seq so that KVM can detect if the results of 1834 * vma_lookup() or __kvm_faultin_pfn() become stale prior to 1835 * acquiring kvm->mmu_lock. 1836 * 1837 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs 1838 * with the smp_wmb() in kvm_mmu_invalidate_end(). 1839 */ 1840 s2vi->mmu_seq = kvm->mmu_invalidate_seq; 1841 mmap_read_unlock(current->mm); 1842 1843 return 0; 1844 } 1845 1846 static gfn_t get_canonical_gfn(const struct kvm_s2_fault_desc *s2fd, 1847 const struct kvm_s2_fault_vma_info *s2vi) 1848 { 1849 phys_addr_t ipa; 1850 1851 if (!s2fd->nested) 1852 return s2vi->gfn; 1853 1854 ipa = kvm_s2_trans_output(s2fd->nested); 1855 return ALIGN_DOWN(ipa, s2vi->vma_pagesize) >> PAGE_SHIFT; 1856 } 1857 1858 static int kvm_s2_fault_pin_pfn(const struct kvm_s2_fault_desc *s2fd, 1859 struct kvm_s2_fault_vma_info *s2vi) 1860 { 1861 int ret; 1862 1863 ret = kvm_s2_fault_get_vma_info(s2fd, s2vi); 1864 if (ret) 1865 return ret; 1866 1867 s2vi->pfn = __kvm_faultin_pfn(s2fd->memslot, get_canonical_gfn(s2fd, s2vi), 1868 kvm_is_write_fault(s2fd->vcpu) ? FOLL_WRITE : 0, 1869 &s2vi->map_writable, &s2vi->page); 1870 if (unlikely(is_error_noslot_pfn(s2vi->pfn))) { 1871 if (s2vi->pfn == KVM_PFN_ERR_HWPOISON) { 1872 kvm_send_hwpoison_signal(s2fd->hva, __ffs(s2vi->vma_pagesize)); 1873 return 0; 1874 } 1875 return -EFAULT; 1876 } 1877 1878 /* 1879 * Check if this is non-struct page memory PFN, and cannot support 1880 * CMOs. It could potentially be unsafe to access as cacheable. 1881 */ 1882 if (s2vi->vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(s2vi->pfn)) { 1883 if (s2vi->is_vma_cacheable) { 1884 /* 1885 * Whilst the VMA owner expects cacheable mapping to this 1886 * PFN, hardware also has to support the FWB and CACHE DIC 1887 * features. 1888 * 1889 * ARM64 KVM relies on kernel VA mapping to the PFN to 1890 * perform cache maintenance as the CMO instructions work on 1891 * virtual addresses. VM_PFNMAP region are not necessarily 1892 * mapped to a KVA and hence the presence of hardware features 1893 * S2FWB and CACHE DIC are mandatory to avoid the need for 1894 * cache maintenance. 1895 */ 1896 if (!kvm_supports_cacheable_pfnmap()) { 1897 kvm_release_faultin_page(s2fd->vcpu->kvm, s2vi->page, true, false); 1898 return -EFAULT; 1899 } 1900 } else { 1901 /* 1902 * If the page was identified as device early by looking at 1903 * the VMA flags, vma_pagesize is already representing the 1904 * largest quantity we can map. If instead it was mapped 1905 * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE 1906 * and must not be upgraded. 1907 * 1908 * In both cases, we don't let transparent_hugepage_adjust() 1909 * change things at the last minute. 1910 */ 1911 s2vi->map_non_cacheable = true; 1912 } 1913 1914 s2vi->device = true; 1915 } 1916 1917 return 1; 1918 } 1919 1920 static int kvm_s2_fault_compute_prot(const struct kvm_s2_fault_desc *s2fd, 1921 const struct kvm_s2_fault_vma_info *s2vi, 1922 enum kvm_pgtable_prot *prot) 1923 { 1924 struct kvm *kvm = s2fd->vcpu->kvm; 1925 1926 if (kvm_vcpu_trap_is_exec_fault(s2fd->vcpu) && s2vi->map_non_cacheable) 1927 return -ENOEXEC; 1928 1929 /* 1930 * Guest performs atomic/exclusive operations on memory with unsupported 1931 * attributes (e.g. 
ld64b/st64b on normal memory when no FEAT_LS64WB) 1932 * and trigger the exception here. Since the memslot is valid, inject 1933 * the fault back to the guest. 1934 */ 1935 if (esr_fsc_is_excl_atomic_fault(kvm_vcpu_get_esr(s2fd->vcpu))) { 1936 kvm_inject_dabt_excl_atomic(s2fd->vcpu, kvm_vcpu_get_hfar(s2fd->vcpu)); 1937 return 1; 1938 } 1939 1940 *prot = KVM_PGTABLE_PROT_R; 1941 1942 if (s2vi->map_writable && (s2vi->device || 1943 !memslot_is_logging(s2fd->memslot) || 1944 kvm_is_write_fault(s2fd->vcpu))) 1945 *prot |= KVM_PGTABLE_PROT_W; 1946 1947 if (s2fd->nested) 1948 *prot = adjust_nested_fault_perms(s2fd->nested, *prot); 1949 1950 if (kvm_vcpu_trap_is_exec_fault(s2fd->vcpu)) 1951 *prot |= KVM_PGTABLE_PROT_X; 1952 1953 if (s2vi->map_non_cacheable) 1954 *prot |= (s2vi->vm_flags & VM_ALLOW_ANY_UNCACHED) ? 1955 KVM_PGTABLE_PROT_NORMAL_NC : KVM_PGTABLE_PROT_DEVICE; 1956 else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) 1957 *prot |= KVM_PGTABLE_PROT_X; 1958 1959 if (s2fd->nested) 1960 *prot = adjust_nested_exec_perms(kvm, s2fd->nested, *prot); 1961 1962 if (!kvm_s2_fault_is_perm(s2fd) && !s2vi->map_non_cacheable && kvm_has_mte(kvm)) { 1963 /* Check the VMM hasn't introduced a new disallowed VMA */ 1964 if (!s2vi->mte_allowed) 1965 return -EFAULT; 1966 } 1967 1968 return 0; 1969 } 1970 1971 static int kvm_s2_fault_map(const struct kvm_s2_fault_desc *s2fd, 1972 const struct kvm_s2_fault_vma_info *s2vi, 1973 enum kvm_pgtable_prot prot, 1974 void *memcache) 1975 { 1976 enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED; 1977 bool writable = prot & KVM_PGTABLE_PROT_W; 1978 struct kvm *kvm = s2fd->vcpu->kvm; 1979 struct kvm_pgtable *pgt; 1980 long perm_fault_granule; 1981 long mapping_size; 1982 kvm_pfn_t pfn; 1983 gfn_t gfn; 1984 int ret; 1985 1986 kvm_fault_lock(kvm); 1987 pgt = s2fd->vcpu->arch.hw_mmu->pgt; 1988 ret = -EAGAIN; 1989 if (mmu_invalidate_retry(kvm, s2vi->mmu_seq)) 1990 goto out_unlock; 1991 1992 perm_fault_granule = (kvm_s2_fault_is_perm(s2fd) ? 1993 kvm_vcpu_trap_get_perm_fault_granule(s2fd->vcpu) : 0); 1994 mapping_size = s2vi->vma_pagesize; 1995 pfn = s2vi->pfn; 1996 gfn = s2vi->gfn; 1997 1998 /* 1999 * If we are not forced to use page mapping, check if we are 2000 * backed by a THP and thus use block mapping if possible. 2001 */ 2002 if (mapping_size == PAGE_SIZE && 2003 !(s2vi->max_map_size == PAGE_SIZE || s2vi->map_non_cacheable)) { 2004 if (perm_fault_granule > PAGE_SIZE) { 2005 mapping_size = perm_fault_granule; 2006 } else { 2007 mapping_size = transparent_hugepage_adjust(kvm, s2fd->memslot, 2008 s2fd->hva, &pfn, 2009 &gfn); 2010 if (mapping_size < 0) { 2011 ret = mapping_size; 2012 goto out_unlock; 2013 } 2014 } 2015 } 2016 2017 if (!perm_fault_granule && !s2vi->map_non_cacheable && kvm_has_mte(kvm)) 2018 sanitise_mte_tags(kvm, pfn, mapping_size); 2019 2020 /* 2021 * Under the premise of getting a FSC_PERM fault, we just need to relax 2022 * permissions only if mapping_size equals perm_fault_granule. Otherwise, 2023 * kvm_pgtable_stage2_map() should be called to change block size. 2024 */ 2025 if (mapping_size == perm_fault_granule) { 2026 /* 2027 * Drop the SW bits in favour of those stored in the 2028 * PTE, which will be preserved. 
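		 * (These are the nested-level bits added by
		 * adjust_nested_fault_perms(); clearing them here keeps the
		 * level originally recorded in the existing PTE.)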
static int kvm_s2_fault_map(const struct kvm_s2_fault_desc *s2fd,
			    const struct kvm_s2_fault_vma_info *s2vi,
			    enum kvm_pgtable_prot prot,
			    void *memcache)
{
	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
	bool writable = prot & KVM_PGTABLE_PROT_W;
	struct kvm *kvm = s2fd->vcpu->kvm;
	struct kvm_pgtable *pgt;
	long perm_fault_granule;
	long mapping_size;
	kvm_pfn_t pfn;
	gfn_t gfn;
	int ret;

	kvm_fault_lock(kvm);
	pgt = s2fd->vcpu->arch.hw_mmu->pgt;
	ret = -EAGAIN;
	if (mmu_invalidate_retry(kvm, s2vi->mmu_seq))
		goto out_unlock;

	perm_fault_granule = (kvm_s2_fault_is_perm(s2fd) ?
			      kvm_vcpu_trap_get_perm_fault_granule(s2fd->vcpu) : 0);
	mapping_size = s2vi->vma_pagesize;
	pfn = s2vi->pfn;
	gfn = s2vi->gfn;

	/*
	 * If we are not forced to use a page mapping, check if the fault is
	 * backed by a THP and use a block mapping if possible.
	 */
	if (mapping_size == PAGE_SIZE &&
	    !(s2vi->max_map_size == PAGE_SIZE || s2vi->map_non_cacheable)) {
		if (perm_fault_granule > PAGE_SIZE) {
			mapping_size = perm_fault_granule;
		} else {
			mapping_size = transparent_hugepage_adjust(kvm, s2fd->memslot,
								   s2fd->hva, &pfn,
								   &gfn);
			if (mapping_size < 0) {
				ret = mapping_size;
				goto out_unlock;
			}
		}
	}

	if (!perm_fault_granule && !s2vi->map_non_cacheable && kvm_has_mte(kvm))
		sanitise_mte_tags(kvm, pfn, mapping_size);

	/*
	 * For a FSC_PERM fault, we only need to relax permissions if
	 * mapping_size equals perm_fault_granule. Otherwise,
	 * kvm_pgtable_stage2_map() must be called to change the block size.
	 */
	if (mapping_size == perm_fault_granule) {
		/*
		 * Drop the SW bits in favour of those stored in the
		 * PTE, which will be preserved.
		 */
		prot &= ~KVM_NV_GUEST_MAP_SZ;
		ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, gfn_to_gpa(gfn),
								 prot, flags);
	} else {
		ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, gfn_to_gpa(gfn), mapping_size,
							 __pfn_to_phys(pfn), prot,
							 memcache, flags);
	}

out_unlock:
	kvm_release_faultin_page(kvm, s2vi->page, !!ret, writable);
	kvm_fault_unlock(kvm);

	/*
	 * Mark the page dirty only if the fault is handled successfully,
	 * making sure we adjust the canonical IPA if the mapping size has
	 * been updated (via a THP upgrade, for example).
	 */
	if (writable && !ret) {
		phys_addr_t ipa = gfn_to_gpa(get_canonical_gfn(s2fd, s2vi));

		ipa &= ~(mapping_size - 1);
		mark_page_dirty_in_slot(kvm, s2fd->memslot, gpa_to_gfn(ipa));
	}

	if (ret != -EAGAIN)
		return ret;
	return 0;
}
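
/*
 * Handle a stage-2 abort on memory backed by a userspace mapping: top up
 * the MMU memcache if required, pin the backing page, compute the mapping
 * protection and install the mapping.
 */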
static int user_mem_abort(const struct kvm_s2_fault_desc *s2fd)
{
	bool perm_fault = kvm_vcpu_trap_is_permission_fault(s2fd->vcpu);
	struct kvm_s2_fault_vma_info s2vi = {};
	enum kvm_pgtable_prot prot;
	void *memcache;
	int ret;

	/*
	 * Permission faults just need to update the existing leaf entry,
	 * and so normally don't require allocations from the memcache. The
	 * only exception to this is when dirty logging is enabled at runtime
	 * and a write fault needs to collapse a block entry into a table.
	 */
	memcache = get_mmu_memcache(s2fd->vcpu);
	if (!perm_fault || (memslot_is_logging(s2fd->memslot) &&
			    kvm_is_write_fault(s2fd->vcpu))) {
		ret = topup_mmu_memcache(s2fd->vcpu, memcache);
		if (ret)
			return ret;
	}

	/*
	 * Let's check if we will get back a huge page backed by hugetlbfs,
	 * or a block mapping for a device MMIO region.
	 */
	ret = kvm_s2_fault_pin_pfn(s2fd, &s2vi);
	if (ret != 1)
		return ret;

	ret = kvm_s2_fault_compute_prot(s2fd, &s2vi, &prot);
	if (ret) {
		kvm_release_page_unused(s2vi.page);
		return ret;
	}

	return kvm_s2_fault_map(s2fd, &s2vi, prot, memcache);
}

/* Resolve the access fault by making the page young again. */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
	struct kvm_s2_mmu *mmu;

	trace_kvm_access_fault(fault_ipa);

	read_lock(&vcpu->kvm->mmu_lock);
	mmu = vcpu->arch.hw_mmu;
	KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags);
	read_unlock(&vcpu->kvm->mmu_lock);
}

/*
 * Returns true if the SEA should be handled locally within KVM, i.e. when the
 * abort is caused by an access to kernel-allocated memory (e.g. stage-2 table
 * memory).
 */
static bool host_owns_sea(struct kvm_vcpu *vcpu, u64 esr)
{
	/*
	 * Without FEAT_RAS HCR_EL2.TEA is RES0, meaning any external abort
	 * taken from a guest EL to EL2 is due to a host-imposed access (e.g.
	 * stage-2 PTW).
	 */
	if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
		return true;

	/* KVM owns the VNCR when the vCPU isn't in a nested context. */
	if (is_hyp_ctxt(vcpu) && !kvm_vcpu_trap_is_iabt(vcpu) && (esr & ESR_ELx_VNCR))
		return true;

	/*
	 * Determining if an external abort during a table walk happened at
	 * stage-2 is only possible when S1PTW is set. Otherwise, since KVM
	 * sets HCR_EL2.TEA, SEAs due to a stage-1 walk (i.e. accessing the
	 * PA of the stage-1 descriptor) can reach here and are reported
	 * with a TTW ESR value.
	 */
	return (esr_fsc_is_sea_ttw(esr) && (esr & ESR_ELx_S1PTW));
}
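
/*
 * Handle a synchronous external abort taken during guest execution: give
 * APEI a chance to claim it, inject an SError when the abort is host-owned
 * or the VM has not opted in to SEA exits, and otherwise exit to userspace
 * with KVM_EXIT_ARM_SEA.
 */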
int kvm_handle_guest_sea(struct kvm_vcpu *vcpu)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvm_run *run = vcpu->run;
	u64 esr = kvm_vcpu_get_esr(vcpu);
	u64 esr_mask = ESR_ELx_EC_MASK |
		       ESR_ELx_IL |
		       ESR_ELx_FnV |
		       ESR_ELx_EA |
		       ESR_ELx_CM |
		       ESR_ELx_WNR |
		       ESR_ELx_FSC;
	u64 ipa;

	/*
	 * Give APEI the opportunity to claim the abort before handling it
	 * within KVM. apei_claim_sea() expects to be called with IRQs
	 * enabled.
	 */
	lockdep_assert_irqs_enabled();
	if (apei_claim_sea(NULL) == 0)
		return 1;

	if (host_owns_sea(vcpu, esr) ||
	    !test_bit(KVM_ARCH_FLAG_EXIT_SEA, &vcpu->kvm->arch.flags))
		return kvm_inject_serror(vcpu);

	/* ESR_ELx.SET is RES0 when FEAT_RAS isn't implemented. */
	if (kvm_has_ras(kvm))
		esr_mask |= ESR_ELx_SET_MASK;

	/*
	 * Exit to userspace, and provide the faulting guest virtual and
	 * physical addresses in case userspace wants to emulate the SEA for
	 * the guest by writing to the FAR_ELx and HPFAR_ELx registers.
	 */
	memset(&run->arm_sea, 0, sizeof(run->arm_sea));
	run->exit_reason = KVM_EXIT_ARM_SEA;
	run->arm_sea.esr = esr & esr_mask;

	if (!(esr & ESR_ELx_FnV))
		run->arm_sea.gva = kvm_vcpu_get_hfar(vcpu);

	ipa = kvm_vcpu_get_fault_ipa(vcpu);
	if (ipa != INVALID_GPA) {
		run->arm_sea.flags |= KVM_EXIT_ARM_SEA_FLAG_GPA_VALID;
		run->arm_sea.gpa = ipa;
	}

	return 0;
}

/**
 * kvm_handle_guest_abort - handles all 2nd stage aborts
 * @vcpu: the VCPU pointer
 *
 * Any abort that gets to the host is almost guaranteed to be caused by a
 * missing second stage translation table entry, which can mean either that the
 * guest simply needs more memory and we must allocate an appropriate page, or
 * that the guest tried to access I/O memory, which is emulated by user space.
 * The distinction is based on the IPA causing the fault and whether this
 * memory region has been registered as standard RAM by user space.
 */
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
{
	struct kvm_s2_trans nested_trans, *nested = NULL;
	unsigned long esr;
	phys_addr_t fault_ipa; /* The address we faulted on */
	phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */
	struct kvm_memory_slot *memslot;
	unsigned long hva;
	bool is_iabt, write_fault, writable;
	gfn_t gfn;
	int ret, idx;

	if (kvm_vcpu_abt_issea(vcpu))
		return kvm_handle_guest_sea(vcpu);

	esr = kvm_vcpu_get_esr(vcpu);

	/*
	 * The fault IPA should be reliable at this point as we're not dealing
	 * with an SEA.
	 */
	ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
	if (KVM_BUG_ON(ipa == INVALID_GPA, vcpu->kvm))
		return -EFAULT;

	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);

	if (esr_fsc_is_translation_fault(esr)) {
		/* Beyond sanitised PARange (which is the IPA limit) */
		if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
			kvm_inject_size_fault(vcpu);
			return 1;
		}

		/* Falls between the IPA range and the PARange? */
		if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) {
			fault_ipa |= FAR_TO_FIPA_OFFSET(kvm_vcpu_get_hfar(vcpu));

			return kvm_inject_sea(vcpu, is_iabt, fault_ipa);
		}
	}

	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
			      kvm_vcpu_get_hfar(vcpu), fault_ipa);

	/*
	 * Check that the stage-2 fault is a translation, permission,
	 * access flag or exclusive/atomic fault.
	 */
	if (!esr_fsc_is_translation_fault(esr) &&
	    !esr_fsc_is_permission_fault(esr) &&
	    !esr_fsc_is_access_flag_fault(esr) &&
	    !esr_fsc_is_excl_atomic_fault(esr)) {
		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
			kvm_vcpu_trap_get_class(vcpu),
			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
			(unsigned long)kvm_vcpu_get_esr(vcpu));
		return -EFAULT;
	}

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	/*
	 * We may have faulted on a shadow stage 2 page table if we are
	 * running a nested guest. In this case, we have to resolve the L2
	 * IPA to the L1 IPA first, before knowing what kind of memory should
	 * back the L1 IPA.
	 *
	 * If the shadow stage 2 page table walk faults, then we simply inject
	 * this to the guest and carry on.
	 *
	 * If there are no shadow S2 PTs because S2 is disabled, there is
	 * nothing to walk and we treat it as a 1:1 before going through the
	 * canonical translation.
	 */
	if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu) &&
	    vcpu->arch.hw_mmu->nested_stage2_enabled) {
		u32 esr;

		ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
		if (ret == -EAGAIN) {
			ret = 1;
			goto out_unlock;
		}

		if (ret) {
			esr = kvm_s2_trans_esr(&nested_trans);
			kvm_inject_s2_fault(vcpu, esr);
			goto out_unlock;
		}

		ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans);
		if (ret) {
			esr = kvm_s2_trans_esr(&nested_trans);
			kvm_inject_s2_fault(vcpu, esr);
			goto out_unlock;
		}

		ipa = kvm_s2_trans_output(&nested_trans);
		nested = &nested_trans;
	}

	gfn = ipa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(vcpu->kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
	write_fault = kvm_is_write_fault(vcpu);
	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
		/*
		 * The guest has put either its instructions or its page-tables
		 * somewhere it shouldn't have. Userspace won't be able to do
		 * anything about this (there's no syndrome for a start), so
		 * re-inject the abort back into the guest.
		 */
		if (is_iabt) {
			ret = -ENOEXEC;
			goto out;
		}

		if (kvm_vcpu_abt_iss1tw(vcpu)) {
			ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
			goto out_unlock;
		}

		/*
		 * Check for a cache maintenance operation. Since we
		 * ended up here, we know it is outside of any memory
		 * slot. But we can't find out if that is for a device,
		 * or if the guest is just being stupid. The only thing
		 * we know for sure is that this range cannot be cached.
		 *
		 * So let's assume that the guest is just being
		 * cautious, and skip the instruction.
		 */
		if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
			kvm_incr_pc(vcpu);
			ret = 1;
			goto out_unlock;
		}

		/*
		 * The IPA is reported as [MAX:12], so we need to
		 * complement it with the bottom 12 bits from the
		 * faulting VA. This is always 12 bits, irrespective
		 * of the page size.
2343 */ 2344 ipa |= FAR_TO_FIPA_OFFSET(kvm_vcpu_get_hfar(vcpu)); 2345 ret = io_mem_abort(vcpu, ipa); 2346 goto out_unlock; 2347 } 2348 2349 /* Userspace should not be able to register out-of-bounds IPAs */ 2350 VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu)); 2351 2352 if (esr_fsc_is_access_flag_fault(esr)) { 2353 handle_access_fault(vcpu, fault_ipa); 2354 ret = 1; 2355 goto out_unlock; 2356 } 2357 2358 const struct kvm_s2_fault_desc s2fd = { 2359 .vcpu = vcpu, 2360 .fault_ipa = fault_ipa, 2361 .nested = nested, 2362 .memslot = memslot, 2363 .hva = hva, 2364 }; 2365 2366 if (kvm_vm_is_protected(vcpu->kvm)) { 2367 ret = pkvm_mem_abort(&s2fd); 2368 } else { 2369 VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) && 2370 !write_fault && 2371 !kvm_vcpu_trap_is_exec_fault(vcpu)); 2372 2373 if (kvm_slot_has_gmem(memslot)) 2374 ret = gmem_abort(&s2fd); 2375 else 2376 ret = user_mem_abort(&s2fd); 2377 } 2378 2379 if (ret == 0) 2380 ret = 1; 2381 out: 2382 if (ret == -ENOEXEC) 2383 ret = kvm_inject_sea_iabt(vcpu, kvm_vcpu_get_hfar(vcpu)); 2384 out_unlock: 2385 srcu_read_unlock(&vcpu->kvm->srcu, idx); 2386 return ret; 2387 } 2388 2389 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) 2390 { 2391 if (!kvm->arch.mmu.pgt || kvm_vm_is_protected(kvm)) 2392 return false; 2393 2394 __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT, 2395 (range->end - range->start) << PAGE_SHIFT, 2396 range->may_block); 2397 2398 kvm_nested_s2_unmap(kvm, range->may_block); 2399 return false; 2400 } 2401 2402 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 2403 { 2404 u64 size = (range->end - range->start) << PAGE_SHIFT; 2405 2406 if (!kvm->arch.mmu.pgt || kvm_vm_is_protected(kvm)) 2407 return false; 2408 2409 return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, 2410 range->start << PAGE_SHIFT, 2411 size, true); 2412 /* 2413 * TODO: Handle nested_mmu structures here using the reverse mapping in 2414 * a later version of patch series. 
2415 */ 2416 } 2417 2418 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 2419 { 2420 u64 size = (range->end - range->start) << PAGE_SHIFT; 2421 2422 if (!kvm->arch.mmu.pgt || kvm_vm_is_protected(kvm)) 2423 return false; 2424 2425 return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, 2426 range->start << PAGE_SHIFT, 2427 size, false); 2428 } 2429 2430 phys_addr_t kvm_mmu_get_httbr(void) 2431 { 2432 return __pa(hyp_pgtable->pgd); 2433 } 2434 2435 phys_addr_t kvm_get_idmap_vector(void) 2436 { 2437 return hyp_idmap_vector; 2438 } 2439 2440 static int kvm_map_idmap_text(void) 2441 { 2442 unsigned long size = hyp_idmap_end - hyp_idmap_start; 2443 int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start, 2444 PAGE_HYP_EXEC); 2445 if (err) 2446 kvm_err("Failed to idmap %lx-%lx\n", 2447 hyp_idmap_start, hyp_idmap_end); 2448 2449 return err; 2450 } 2451 2452 static void *kvm_hyp_zalloc_page(void *arg) 2453 { 2454 return (void *)get_zeroed_page(GFP_KERNEL); 2455 } 2456 2457 static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = { 2458 .zalloc_page = kvm_hyp_zalloc_page, 2459 .get_page = kvm_host_get_page, 2460 .put_page = kvm_host_put_page, 2461 .phys_to_virt = kvm_host_va, 2462 .virt_to_phys = kvm_host_pa, 2463 }; 2464 2465 int __init kvm_mmu_init(u32 hyp_va_bits) 2466 { 2467 int err; 2468 2469 hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start); 2470 hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); 2471 hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end); 2472 hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE); 2473 hyp_idmap_vector = __pa_symbol(__kvm_hyp_init); 2474 2475 /* 2476 * We rely on the linker script to ensure at build time that the HYP 2477 * init code does not cross a page boundary. 2478 */ 2479 BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); 2480 2481 kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits); 2482 kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); 2483 kvm_debug("HYP VA range: %lx:%lx\n", 2484 kern_hyp_va(PAGE_OFFSET), 2485 kern_hyp_va((unsigned long)high_memory - 1)); 2486 2487 if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) && 2488 hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) && 2489 hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) { 2490 /* 2491 * The idmap page is intersecting with the VA space, 2492 * it is not safe to continue further. 
		 */
		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
		err = -EINVAL;
		goto out;
	}

	hyp_pgtable = kzalloc_obj(*hyp_pgtable);
	if (!hyp_pgtable) {
		kvm_err("Hyp mode page-table not allocated\n");
		err = -ENOMEM;
		goto out;
	}

	err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits, &kvm_hyp_mm_ops);
	if (err)
		goto out_free_pgtable;

	err = kvm_map_idmap_text();
	if (err)
		goto out_destroy_pgtable;

	io_map_base = hyp_idmap_start;
	__hyp_va_bits = hyp_va_bits;
	return 0;

out_destroy_pgtable:
	kvm_pgtable_hyp_destroy(hyp_pgtable);
out_free_pgtable:
	kfree(hyp_pgtable);
	hyp_pgtable = NULL;
out:
	return err;
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;

	/*
	 * At this point the memslot has been committed and there is an
	 * allocated dirty_bitmap[], so dirty pages will be tracked while the
	 * memory slot is write protected.
	 */
	if (log_dirty_pages) {

		if (change == KVM_MR_DELETE)
			return;

		/*
		 * Huge and normal pages are write-protected and split
		 * in either of these two cases:
		 *
		 * 1. with initial-all-set: gradually with CLEAR ioctls,
		 */
		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
			return;
		/*
		 * or
		 * 2. without initial-all-set: all in one shot when
		 *    enabling dirty logging.
		 */
		kvm_mmu_wp_memory_region(kvm, new->id);
		kvm_mmu_split_memory_region(kvm, new->id);
	} else {
		/*
		 * Free any leftovers from the eager page splitting cache. Do
		 * this when deleting, moving, disabling dirty logging, or
		 * creating the memslot (a nop). Doing it for deletes makes
		 * sure we don't leak memory, and there's no need to keep the
		 * cache around for any of the other cases.
		 */
		kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
	}
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
				   const struct kvm_memory_slot *old,
				   struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	hva_t hva, reg_end;
	int ret = 0;

	if (kvm_vm_is_protected(kvm)) {
		/* Cannot modify memslots once a pVM has run. */
		if (pkvm_hyp_vm_is_created(kvm) &&
		    (change == KVM_MR_DELETE || change == KVM_MR_MOVE)) {
			return -EPERM;
		}

		if (new &&
		    new->flags & (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)) {
			return -EPERM;
		}
	}

	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
	    change != KVM_MR_FLAGS_ONLY)
		return 0;

	/*
	 * Prevent userspace from creating a memory region outside of the IPA
	 * space addressable by the guest.
	 */
	if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
		return -EFAULT;

	/*
	 * Only support guest_memfd backed memslots with mappable memory, since
	 * there aren't any CoCo VMs that support only private memory on arm64.
2606 */ 2607 if (kvm_slot_has_gmem(new) && !kvm_memslot_is_gmem_only(new)) 2608 return -EINVAL; 2609 2610 hva = new->userspace_addr; 2611 reg_end = hva + (new->npages << PAGE_SHIFT); 2612 2613 mmap_read_lock(current->mm); 2614 /* 2615 * A memory region could potentially cover multiple VMAs, and any holes 2616 * between them, so iterate over all of them. 2617 * 2618 * +--------------------------------------------+ 2619 * +---------------+----------------+ +----------------+ 2620 * | : VMA 1 | VMA 2 | | VMA 3 : | 2621 * +---------------+----------------+ +----------------+ 2622 * | memory region | 2623 * +--------------------------------------------+ 2624 */ 2625 do { 2626 struct vm_area_struct *vma; 2627 2628 vma = find_vma_intersection(current->mm, hva, reg_end); 2629 if (!vma) 2630 break; 2631 2632 if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) { 2633 ret = -EINVAL; 2634 break; 2635 } 2636 2637 if (vma->vm_flags & VM_PFNMAP) { 2638 /* IO region dirty page logging not allowed */ 2639 if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { 2640 ret = -EINVAL; 2641 break; 2642 } 2643 2644 /* 2645 * Cacheable PFNMAP is allowed only if the hardware 2646 * supports it. 2647 */ 2648 if (kvm_vma_is_cacheable(vma) && !kvm_supports_cacheable_pfnmap()) { 2649 ret = -EINVAL; 2650 break; 2651 } 2652 } 2653 hva = min(reg_end, vma->vm_end); 2654 } while (hva < reg_end); 2655 2656 mmap_read_unlock(current->mm); 2657 return ret; 2658 } 2659 2660 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) 2661 { 2662 } 2663 2664 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) 2665 { 2666 } 2667 2668 void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 2669 struct kvm_memory_slot *slot) 2670 { 2671 gpa_t gpa = slot->base_gfn << PAGE_SHIFT; 2672 phys_addr_t size = slot->npages << PAGE_SHIFT; 2673 2674 write_lock(&kvm->mmu_lock); 2675 kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true); 2676 kvm_nested_s2_unmap(kvm, true); 2677 write_unlock(&kvm->mmu_lock); 2678 } 2679 2680 /* 2681 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). 2682 * 2683 * Main problems: 2684 * - S/W ops are local to a CPU (not broadcast) 2685 * - We have line migration behind our back (speculation) 2686 * - System caches don't support S/W at all (damn!) 2687 * 2688 * In the face of the above, the best we can do is to try and convert 2689 * S/W ops to VA ops. Because the guest is not allowed to infer the 2690 * S/W to PA mapping, it can only use S/W to nuke the whole cache, 2691 * which is a rather good thing for us. 2692 * 2693 * Also, it is only used when turning caches on/off ("The expected 2694 * usage of the cache maintenance instructions that operate by set/way 2695 * is associated with the cache maintenance instructions associated 2696 * with the powerdown and powerup of caches, if this is required by 2697 * the implementation."). 2698 * 2699 * We use the following policy: 2700 * 2701 * - If we trap a S/W operation, we enable VM trapping to detect 2702 * caches being turned on/off, and do a full clean. 2703 * 2704 * - We flush the caches on both caches being turned on and off. 2705 * 2706 * - Once the caches are enabled, we stop trapping VM ops. 2707 */ 2708 void kvm_set_way_flush(struct kvm_vcpu *vcpu) 2709 { 2710 unsigned long hcr = *vcpu_hcr(vcpu); 2711 2712 /* 2713 * If this is the first time we do a S/W operation 2714 * (i.e. HCR_TVM not set) flush the whole memory, and set the 2715 * VM trapping. 
2716 * 2717 * Otherwise, rely on the VM trapping to wait for the MMU + 2718 * Caches to be turned off. At that point, we'll be able to 2719 * clean the caches again. 2720 */ 2721 if (!(hcr & HCR_TVM)) { 2722 trace_kvm_set_way_flush(*vcpu_pc(vcpu), 2723 vcpu_has_cache_enabled(vcpu)); 2724 stage2_flush_vm(vcpu->kvm); 2725 *vcpu_hcr(vcpu) = hcr | HCR_TVM; 2726 } 2727 } 2728 2729 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled) 2730 { 2731 bool now_enabled = vcpu_has_cache_enabled(vcpu); 2732 2733 /* 2734 * If switching the MMU+caches on, need to invalidate the caches. 2735 * If switching it off, need to clean the caches. 2736 * Clean + invalidate does the trick always. 2737 */ 2738 if (now_enabled != was_enabled) 2739 stage2_flush_vm(vcpu->kvm); 2740 2741 /* Caches are now on, stop trapping VM ops (until a S/W op) */ 2742 if (now_enabled) 2743 *vcpu_hcr(vcpu) &= ~HCR_TVM; 2744 2745 trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled); 2746 } 2747