1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2012 - Virtual Open Systems and Columbia University 4 * Author: Christoffer Dall <c.dall@virtualopensystems.com> 5 */ 6 7 #include <linux/acpi.h> 8 #include <linux/mman.h> 9 #include <linux/kvm_host.h> 10 #include <linux/io.h> 11 #include <linux/hugetlb.h> 12 #include <linux/sched/signal.h> 13 #include <trace/events/kvm.h> 14 #include <asm/acpi.h> 15 #include <asm/pgalloc.h> 16 #include <asm/cacheflush.h> 17 #include <asm/kvm_arm.h> 18 #include <asm/kvm_mmu.h> 19 #include <asm/kvm_pgtable.h> 20 #include <asm/kvm_pkvm.h> 21 #include <asm/kvm_asm.h> 22 #include <asm/kvm_emulate.h> 23 #include <asm/virt.h> 24 25 #include "trace.h" 26 27 static struct kvm_pgtable *hyp_pgtable; 28 static DEFINE_MUTEX(kvm_hyp_pgd_mutex); 29 30 static unsigned long __ro_after_init hyp_idmap_start; 31 static unsigned long __ro_after_init hyp_idmap_end; 32 static phys_addr_t __ro_after_init hyp_idmap_vector; 33 34 u32 __ro_after_init __hyp_va_bits; 35 36 static unsigned long __ro_after_init io_map_base; 37 38 #define KVM_PGT_FN(fn) (!is_protected_kvm_enabled() ? fn : p ## fn) 39 40 static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end, 41 phys_addr_t size) 42 { 43 phys_addr_t boundary = ALIGN_DOWN(addr + size, size); 44 45 return (boundary - 1 < end - 1) ? boundary : end; 46 } 47 48 static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) 49 { 50 phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); 51 52 return __stage2_range_addr_end(addr, end, size); 53 } 54 55 /* 56 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise, 57 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK, 58 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too 59 * long will also starve other vCPUs. We have to also make sure that the page 60 * tables are not freed while we released the lock. 61 */ 62 static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, 63 phys_addr_t end, 64 int (*fn)(struct kvm_pgtable *, u64, u64), 65 bool resched) 66 { 67 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 68 int ret; 69 u64 next; 70 71 do { 72 struct kvm_pgtable *pgt = mmu->pgt; 73 if (!pgt) 74 return -EINVAL; 75 76 next = stage2_range_addr_end(addr, end); 77 ret = fn(pgt, addr, next - addr); 78 if (ret) 79 break; 80 81 if (resched && next != end) 82 cond_resched_rwlock_write(&kvm->mmu_lock); 83 } while (addr = next, addr != end); 84 85 return ret; 86 } 87 88 #define stage2_apply_range_resched(mmu, addr, end, fn) \ 89 stage2_apply_range(mmu, addr, end, fn, true) 90 91 /* 92 * Get the maximum number of page-tables pages needed to split a range 93 * of blocks into PAGE_SIZE PTEs. It assumes the range is already 94 * mapped at level 2, or at level 1 if allowed. 95 */ 96 static int kvm_mmu_split_nr_page_tables(u64 range) 97 { 98 int n = 0; 99 100 if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2) 101 n += DIV_ROUND_UP(range, PUD_SIZE); 102 n += DIV_ROUND_UP(range, PMD_SIZE); 103 return n; 104 } 105 106 static bool need_split_memcache_topup_or_resched(struct kvm *kvm) 107 { 108 struct kvm_mmu_memory_cache *cache; 109 u64 chunk_size, min; 110 111 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) 112 return true; 113 114 chunk_size = kvm->arch.mmu.split_page_chunk_size; 115 min = kvm_mmu_split_nr_page_tables(chunk_size); 116 cache = &kvm->arch.mmu.split_page_cache; 117 return kvm_mmu_memory_cache_nr_free_objects(cache) < min; 118 } 119 120 static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr, 121 phys_addr_t end) 122 { 123 struct kvm_mmu_memory_cache *cache; 124 struct kvm_pgtable *pgt; 125 int ret, cache_capacity; 126 u64 next, chunk_size; 127 128 lockdep_assert_held_write(&kvm->mmu_lock); 129 130 chunk_size = kvm->arch.mmu.split_page_chunk_size; 131 cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size); 132 133 if (chunk_size == 0) 134 return 0; 135 136 cache = &kvm->arch.mmu.split_page_cache; 137 138 do { 139 if (need_split_memcache_topup_or_resched(kvm)) { 140 write_unlock(&kvm->mmu_lock); 141 cond_resched(); 142 /* Eager page splitting is best-effort. */ 143 ret = __kvm_mmu_topup_memory_cache(cache, 144 cache_capacity, 145 cache_capacity); 146 write_lock(&kvm->mmu_lock); 147 if (ret) 148 break; 149 } 150 151 pgt = kvm->arch.mmu.pgt; 152 if (!pgt) 153 return -EINVAL; 154 155 next = __stage2_range_addr_end(addr, end, chunk_size); 156 ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache); 157 if (ret) 158 break; 159 } while (addr = next, addr != end); 160 161 return ret; 162 } 163 164 static bool memslot_is_logging(struct kvm_memory_slot *memslot) 165 { 166 return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); 167 } 168 169 /** 170 * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8 171 * @kvm: pointer to kvm structure. 172 * 173 * Interface to HYP function to flush all VM TLB entries 174 */ 175 int kvm_arch_flush_remote_tlbs(struct kvm *kvm) 176 { 177 if (is_protected_kvm_enabled()) 178 kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); 179 else 180 kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); 181 return 0; 182 } 183 184 int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, 185 gfn_t gfn, u64 nr_pages) 186 { 187 u64 size = nr_pages << PAGE_SHIFT; 188 u64 addr = gfn << PAGE_SHIFT; 189 190 if (is_protected_kvm_enabled()) 191 kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); 192 else 193 kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size); 194 return 0; 195 } 196 197 static void *stage2_memcache_zalloc_page(void *arg) 198 { 199 struct kvm_mmu_memory_cache *mc = arg; 200 void *virt; 201 202 /* Allocated with __GFP_ZERO, so no need to zero */ 203 virt = kvm_mmu_memory_cache_alloc(mc); 204 if (virt) 205 kvm_account_pgtable_pages(virt, 1); 206 return virt; 207 } 208 209 static void *kvm_host_zalloc_pages_exact(size_t size) 210 { 211 return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); 212 } 213 214 static void *kvm_s2_zalloc_pages_exact(size_t size) 215 { 216 void *virt = kvm_host_zalloc_pages_exact(size); 217 218 if (virt) 219 kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT)); 220 return virt; 221 } 222 223 static void kvm_s2_free_pages_exact(void *virt, size_t size) 224 { 225 kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT)); 226 free_pages_exact(virt, size); 227 } 228 229 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops; 230 231 static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head) 232 { 233 struct page *page = container_of(head, struct page, rcu_head); 234 void *pgtable = page_to_virt(page); 235 s8 level = page_private(page); 236 237 KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level); 238 } 239 240 static void stage2_free_unlinked_table(void *addr, s8 level) 241 { 242 struct page *page = virt_to_page(addr); 243 244 set_page_private(page, (unsigned long)level); 245 call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb); 246 } 247 248 static void kvm_host_get_page(void *addr) 249 { 250 get_page(virt_to_page(addr)); 251 } 252 253 static void kvm_host_put_page(void *addr) 254 { 255 put_page(virt_to_page(addr)); 256 } 257 258 static void kvm_s2_put_page(void *addr) 259 { 260 struct page *p = virt_to_page(addr); 261 /* Dropping last refcount, the page will be freed */ 262 if (page_count(p) == 1) 263 kvm_account_pgtable_pages(addr, -1); 264 put_page(p); 265 } 266 267 static int kvm_host_page_count(void *addr) 268 { 269 return page_count(virt_to_page(addr)); 270 } 271 272 static phys_addr_t kvm_host_pa(void *addr) 273 { 274 return __pa(addr); 275 } 276 277 static void *kvm_host_va(phys_addr_t phys) 278 { 279 return __va(phys); 280 } 281 282 static void clean_dcache_guest_page(void *va, size_t size) 283 { 284 __clean_dcache_guest_page(va, size); 285 } 286 287 static void invalidate_icache_guest_page(void *va, size_t size) 288 { 289 __invalidate_icache_guest_page(va, size); 290 } 291 292 /* 293 * Unmapping vs dcache management: 294 * 295 * If a guest maps certain memory pages as uncached, all writes will 296 * bypass the data cache and go directly to RAM. However, the CPUs 297 * can still speculate reads (not writes) and fill cache lines with 298 * data. 299 * 300 * Those cache lines will be *clean* cache lines though, so a 301 * clean+invalidate operation is equivalent to an invalidate 302 * operation, because no cache lines are marked dirty. 303 * 304 * Those clean cache lines could be filled prior to an uncached write 305 * by the guest, and the cache coherent IO subsystem would therefore 306 * end up writing old data to disk. 307 * 308 * This is why right after unmapping a page/section and invalidating 309 * the corresponding TLBs, we flush to make sure the IO subsystem will 310 * never hit in the cache. 311 * 312 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as 313 * we then fully enforce cacheability of RAM, no matter what the guest 314 * does. 315 */ 316 /** 317 * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range 318 * @mmu: The KVM stage-2 MMU pointer 319 * @start: The intermediate physical base address of the range to unmap 320 * @size: The size of the area to unmap 321 * @may_block: Whether or not we are permitted to block 322 * 323 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must 324 * be called while holding mmu_lock (unless for freeing the stage2 pgd before 325 * destroying the VM), otherwise another faulting VCPU may come in and mess 326 * with things behind our backs. 327 */ 328 static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size, 329 bool may_block) 330 { 331 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 332 phys_addr_t end = start + size; 333 334 lockdep_assert_held_write(&kvm->mmu_lock); 335 WARN_ON(size & ~PAGE_MASK); 336 WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap), 337 may_block)); 338 } 339 340 void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, 341 u64 size, bool may_block) 342 { 343 if (kvm_vm_is_protected(kvm_s2_mmu_to_kvm(mmu))) 344 return; 345 346 __unmap_stage2_range(mmu, start, size, may_block); 347 } 348 349 void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) 350 { 351 stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush)); 352 } 353 354 static void stage2_flush_memslot(struct kvm *kvm, 355 struct kvm_memory_slot *memslot) 356 { 357 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; 358 phys_addr_t end = addr + PAGE_SIZE * memslot->npages; 359 360 kvm_stage2_flush_range(&kvm->arch.mmu, addr, end); 361 } 362 363 /** 364 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2 365 * @kvm: The struct kvm pointer 366 * 367 * Go through the stage 2 page tables and invalidate any cache lines 368 * backing memory already mapped to the VM. 369 */ 370 static void stage2_flush_vm(struct kvm *kvm) 371 { 372 struct kvm_memslots *slots; 373 struct kvm_memory_slot *memslot; 374 int idx, bkt; 375 376 idx = srcu_read_lock(&kvm->srcu); 377 write_lock(&kvm->mmu_lock); 378 379 slots = kvm_memslots(kvm); 380 kvm_for_each_memslot(memslot, bkt, slots) 381 stage2_flush_memslot(kvm, memslot); 382 383 kvm_nested_s2_flush(kvm); 384 385 write_unlock(&kvm->mmu_lock); 386 srcu_read_unlock(&kvm->srcu, idx); 387 } 388 389 /** 390 * free_hyp_pgds - free Hyp-mode page tables 391 */ 392 void __init free_hyp_pgds(void) 393 { 394 mutex_lock(&kvm_hyp_pgd_mutex); 395 if (hyp_pgtable) { 396 kvm_pgtable_hyp_destroy(hyp_pgtable); 397 kfree(hyp_pgtable); 398 hyp_pgtable = NULL; 399 } 400 mutex_unlock(&kvm_hyp_pgd_mutex); 401 } 402 403 static bool kvm_host_owns_hyp_mappings(void) 404 { 405 if (is_kernel_in_hyp_mode()) 406 return false; 407 408 if (static_branch_likely(&kvm_protected_mode_initialized)) 409 return false; 410 411 /* 412 * This can happen at boot time when __create_hyp_mappings() is called 413 * after the hyp protection has been enabled, but the static key has 414 * not been flipped yet. 415 */ 416 if (!hyp_pgtable && is_protected_kvm_enabled()) 417 return false; 418 419 WARN_ON(!hyp_pgtable); 420 421 return true; 422 } 423 424 int __create_hyp_mappings(unsigned long start, unsigned long size, 425 unsigned long phys, enum kvm_pgtable_prot prot) 426 { 427 int err; 428 429 if (WARN_ON(!kvm_host_owns_hyp_mappings())) 430 return -EINVAL; 431 432 mutex_lock(&kvm_hyp_pgd_mutex); 433 err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot); 434 mutex_unlock(&kvm_hyp_pgd_mutex); 435 436 return err; 437 } 438 439 static phys_addr_t kvm_kaddr_to_phys(void *kaddr) 440 { 441 if (!is_vmalloc_addr(kaddr)) { 442 BUG_ON(!virt_addr_valid(kaddr)); 443 return __pa(kaddr); 444 } else { 445 return page_to_phys(vmalloc_to_page(kaddr)) + 446 offset_in_page(kaddr); 447 } 448 } 449 450 struct hyp_shared_pfn { 451 u64 pfn; 452 int count; 453 struct rb_node node; 454 }; 455 456 static DEFINE_MUTEX(hyp_shared_pfns_lock); 457 static struct rb_root hyp_shared_pfns = RB_ROOT; 458 459 static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node, 460 struct rb_node **parent) 461 { 462 struct hyp_shared_pfn *this; 463 464 *node = &hyp_shared_pfns.rb_node; 465 *parent = NULL; 466 while (**node) { 467 this = container_of(**node, struct hyp_shared_pfn, node); 468 *parent = **node; 469 if (this->pfn < pfn) 470 *node = &((**node)->rb_left); 471 else if (this->pfn > pfn) 472 *node = &((**node)->rb_right); 473 else 474 return this; 475 } 476 477 return NULL; 478 } 479 480 static int share_pfn_hyp(u64 pfn) 481 { 482 struct rb_node **node, *parent; 483 struct hyp_shared_pfn *this; 484 int ret = 0; 485 486 mutex_lock(&hyp_shared_pfns_lock); 487 this = find_shared_pfn(pfn, &node, &parent); 488 if (this) { 489 this->count++; 490 goto unlock; 491 } 492 493 this = kzalloc_obj(*this); 494 if (!this) { 495 ret = -ENOMEM; 496 goto unlock; 497 } 498 499 this->pfn = pfn; 500 this->count = 1; 501 rb_link_node(&this->node, parent, node); 502 rb_insert_color(&this->node, &hyp_shared_pfns); 503 ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn); 504 unlock: 505 mutex_unlock(&hyp_shared_pfns_lock); 506 507 return ret; 508 } 509 510 static int unshare_pfn_hyp(u64 pfn) 511 { 512 struct rb_node **node, *parent; 513 struct hyp_shared_pfn *this; 514 int ret = 0; 515 516 mutex_lock(&hyp_shared_pfns_lock); 517 this = find_shared_pfn(pfn, &node, &parent); 518 if (WARN_ON(!this)) { 519 ret = -ENOENT; 520 goto unlock; 521 } 522 523 this->count--; 524 if (this->count) 525 goto unlock; 526 527 rb_erase(&this->node, &hyp_shared_pfns); 528 kfree(this); 529 ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn); 530 unlock: 531 mutex_unlock(&hyp_shared_pfns_lock); 532 533 return ret; 534 } 535 536 int kvm_share_hyp(void *from, void *to) 537 { 538 phys_addr_t start, end, cur; 539 u64 pfn; 540 int ret; 541 542 if (is_kernel_in_hyp_mode()) 543 return 0; 544 545 /* 546 * The share hcall maps things in the 'fixed-offset' region of the hyp 547 * VA space, so we can only share physically contiguous data-structures 548 * for now. 549 */ 550 if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to)) 551 return -EINVAL; 552 553 if (kvm_host_owns_hyp_mappings()) 554 return create_hyp_mappings(from, to, PAGE_HYP); 555 556 start = ALIGN_DOWN(__pa(from), PAGE_SIZE); 557 end = PAGE_ALIGN(__pa(to)); 558 for (cur = start; cur < end; cur += PAGE_SIZE) { 559 pfn = __phys_to_pfn(cur); 560 ret = share_pfn_hyp(pfn); 561 if (ret) 562 return ret; 563 } 564 565 return 0; 566 } 567 568 void kvm_unshare_hyp(void *from, void *to) 569 { 570 phys_addr_t start, end, cur; 571 u64 pfn; 572 573 if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from) 574 return; 575 576 start = ALIGN_DOWN(__pa(from), PAGE_SIZE); 577 end = PAGE_ALIGN(__pa(to)); 578 for (cur = start; cur < end; cur += PAGE_SIZE) { 579 pfn = __phys_to_pfn(cur); 580 WARN_ON(unshare_pfn_hyp(pfn)); 581 } 582 } 583 584 /** 585 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode 586 * @from: The virtual kernel start address of the range 587 * @to: The virtual kernel end address of the range (exclusive) 588 * @prot: The protection to be applied to this range 589 * 590 * The same virtual address as the kernel virtual address is also used 591 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying 592 * physical pages. 593 */ 594 int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) 595 { 596 phys_addr_t phys_addr; 597 unsigned long virt_addr; 598 unsigned long start = kern_hyp_va((unsigned long)from); 599 unsigned long end = kern_hyp_va((unsigned long)to); 600 601 if (is_kernel_in_hyp_mode()) 602 return 0; 603 604 if (!kvm_host_owns_hyp_mappings()) 605 return -EPERM; 606 607 start = start & PAGE_MASK; 608 end = PAGE_ALIGN(end); 609 610 for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { 611 int err; 612 613 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); 614 err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr, 615 prot); 616 if (err) 617 return err; 618 } 619 620 return 0; 621 } 622 623 static int __hyp_alloc_private_va_range(unsigned long base) 624 { 625 lockdep_assert_held(&kvm_hyp_pgd_mutex); 626 627 if (!PAGE_ALIGNED(base)) 628 return -EINVAL; 629 630 /* 631 * Verify that BIT(VA_BITS - 1) hasn't been flipped by 632 * allocating the new area, as it would indicate we've 633 * overflowed the idmap/IO address range. 634 */ 635 if ((base ^ io_map_base) & BIT(VA_BITS - 1)) 636 return -ENOMEM; 637 638 io_map_base = base; 639 640 return 0; 641 } 642 643 /** 644 * hyp_alloc_private_va_range - Allocates a private VA range. 645 * @size: The size of the VA range to reserve. 646 * @haddr: The hypervisor virtual start address of the allocation. 647 * 648 * The private virtual address (VA) range is allocated below io_map_base 649 * and aligned based on the order of @size. 650 * 651 * Return: 0 on success or negative error code on failure. 652 */ 653 int hyp_alloc_private_va_range(size_t size, unsigned long *haddr) 654 { 655 unsigned long base; 656 int ret = 0; 657 658 mutex_lock(&kvm_hyp_pgd_mutex); 659 660 /* 661 * This assumes that we have enough space below the idmap 662 * page to allocate our VAs. If not, the check in 663 * __hyp_alloc_private_va_range() will kick. A potential 664 * alternative would be to detect that overflow and switch 665 * to an allocation above the idmap. 666 * 667 * The allocated size is always a multiple of PAGE_SIZE. 668 */ 669 size = PAGE_ALIGN(size); 670 base = io_map_base - size; 671 ret = __hyp_alloc_private_va_range(base); 672 673 mutex_unlock(&kvm_hyp_pgd_mutex); 674 675 if (!ret) 676 *haddr = base; 677 678 return ret; 679 } 680 681 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, 682 unsigned long *haddr, 683 enum kvm_pgtable_prot prot) 684 { 685 unsigned long addr; 686 int ret = 0; 687 688 if (!kvm_host_owns_hyp_mappings()) { 689 addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping, 690 phys_addr, size, prot); 691 if (IS_ERR_VALUE(addr)) 692 return addr; 693 *haddr = addr; 694 695 return 0; 696 } 697 698 size = PAGE_ALIGN(size + offset_in_page(phys_addr)); 699 ret = hyp_alloc_private_va_range(size, &addr); 700 if (ret) 701 return ret; 702 703 ret = __create_hyp_mappings(addr, size, phys_addr, prot); 704 if (ret) 705 return ret; 706 707 *haddr = addr + offset_in_page(phys_addr); 708 return ret; 709 } 710 711 int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr) 712 { 713 unsigned long base; 714 size_t size; 715 int ret; 716 717 mutex_lock(&kvm_hyp_pgd_mutex); 718 /* 719 * Efficient stack verification using the NVHE_STACK_SHIFT bit implies 720 * an alignment of our allocation on the order of the size. 721 */ 722 size = NVHE_STACK_SIZE * 2; 723 base = ALIGN_DOWN(io_map_base - size, size); 724 725 ret = __hyp_alloc_private_va_range(base); 726 727 mutex_unlock(&kvm_hyp_pgd_mutex); 728 729 if (ret) { 730 kvm_err("Cannot allocate hyp stack guard page\n"); 731 return ret; 732 } 733 734 /* 735 * Since the stack grows downwards, map the stack to the page 736 * at the higher address and leave the lower guard page 737 * unbacked. 738 * 739 * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1 740 * and addresses corresponding to the guard page have the 741 * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection. 742 */ 743 ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE, 744 phys_addr, PAGE_HYP); 745 if (ret) 746 kvm_err("Cannot map hyp stack\n"); 747 748 *haddr = base + size; 749 750 return ret; 751 } 752 753 /** 754 * create_hyp_io_mappings - Map IO into both kernel and HYP 755 * @phys_addr: The physical start address which gets mapped 756 * @size: Size of the region being mapped 757 * @kaddr: Kernel VA for this mapping 758 * @haddr: HYP VA for this mapping 759 */ 760 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, 761 void __iomem **kaddr, 762 void __iomem **haddr) 763 { 764 unsigned long addr; 765 int ret; 766 767 if (is_protected_kvm_enabled()) 768 return -EPERM; 769 770 *kaddr = ioremap(phys_addr, size); 771 if (!*kaddr) 772 return -ENOMEM; 773 774 if (is_kernel_in_hyp_mode()) { 775 *haddr = *kaddr; 776 return 0; 777 } 778 779 ret = __create_hyp_private_mapping(phys_addr, size, 780 &addr, PAGE_HYP_DEVICE); 781 if (ret) { 782 iounmap(*kaddr); 783 *kaddr = NULL; 784 *haddr = NULL; 785 return ret; 786 } 787 788 *haddr = (void __iomem *)addr; 789 return 0; 790 } 791 792 /** 793 * create_hyp_exec_mappings - Map an executable range into HYP 794 * @phys_addr: The physical start address which gets mapped 795 * @size: Size of the region being mapped 796 * @haddr: HYP VA for this mapping 797 */ 798 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, 799 void **haddr) 800 { 801 unsigned long addr; 802 int ret; 803 804 BUG_ON(is_kernel_in_hyp_mode()); 805 806 ret = __create_hyp_private_mapping(phys_addr, size, 807 &addr, PAGE_HYP_EXEC); 808 if (ret) { 809 *haddr = NULL; 810 return ret; 811 } 812 813 *haddr = (void *)addr; 814 return 0; 815 } 816 817 static struct kvm_pgtable_mm_ops kvm_user_mm_ops = { 818 /* We shouldn't need any other callback to walk the PT */ 819 .phys_to_virt = kvm_host_va, 820 }; 821 822 static int get_user_mapping_size(struct kvm *kvm, u64 addr) 823 { 824 struct kvm_pgtable pgt = { 825 .pgd = (kvm_pteref_t)kvm->mm->pgd, 826 .ia_bits = vabits_actual, 827 .start_level = (KVM_PGTABLE_LAST_LEVEL - 828 ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1), 829 .mm_ops = &kvm_user_mm_ops, 830 }; 831 unsigned long flags; 832 kvm_pte_t pte = 0; /* Keep GCC quiet... */ 833 s8 level = S8_MAX; 834 int ret; 835 836 /* 837 * Disable IRQs so that we hazard against a concurrent 838 * teardown of the userspace page tables (which relies on 839 * IPI-ing threads). 840 */ 841 local_irq_save(flags); 842 ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); 843 local_irq_restore(flags); 844 845 if (ret) 846 return ret; 847 848 /* 849 * Not seeing an error, but not updating level? Something went 850 * deeply wrong... 851 */ 852 if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL)) 853 return -EFAULT; 854 if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL)) 855 return -EFAULT; 856 857 /* Oops, the userspace PTs are gone... Replay the fault */ 858 if (!kvm_pte_valid(pte)) 859 return -EAGAIN; 860 861 return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); 862 } 863 864 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { 865 .zalloc_page = stage2_memcache_zalloc_page, 866 .zalloc_pages_exact = kvm_s2_zalloc_pages_exact, 867 .free_pages_exact = kvm_s2_free_pages_exact, 868 .free_unlinked_table = stage2_free_unlinked_table, 869 .get_page = kvm_host_get_page, 870 .put_page = kvm_s2_put_page, 871 .page_count = kvm_host_page_count, 872 .phys_to_virt = kvm_host_va, 873 .virt_to_phys = kvm_host_pa, 874 .dcache_clean_inval_poc = clean_dcache_guest_page, 875 .icache_inval_pou = invalidate_icache_guest_page, 876 }; 877 878 static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type) 879 { 880 u32 kvm_ipa_limit = get_kvm_ipa_limit(); 881 u64 mmfr0, mmfr1; 882 u32 phys_shift; 883 884 phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); 885 if (is_protected_kvm_enabled()) { 886 phys_shift = kvm_ipa_limit; 887 } else if (phys_shift) { 888 if (phys_shift > kvm_ipa_limit || 889 phys_shift < ARM64_MIN_PARANGE_BITS) 890 return -EINVAL; 891 } else { 892 phys_shift = KVM_PHYS_SHIFT; 893 if (phys_shift > kvm_ipa_limit) { 894 pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", 895 current->comm); 896 return -EINVAL; 897 } 898 } 899 900 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); 901 mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); 902 mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); 903 904 return 0; 905 } 906 907 /* 908 * Assume that @pgt is valid and unlinked from the KVM MMU to free the 909 * page-table without taking the kvm_mmu_lock and without performing any 910 * TLB invalidations. 911 * 912 * Also, the range of addresses can be large enough to cause need_resched 913 * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke 914 * cond_resched() periodically to prevent hogging the CPU for a long time 915 * and schedule something else, if required. 916 */ 917 static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr, 918 phys_addr_t end) 919 { 920 u64 next; 921 922 do { 923 next = stage2_range_addr_end(addr, end); 924 KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr, 925 next - addr); 926 if (next != end) 927 cond_resched(); 928 } while (addr = next, addr != end); 929 } 930 931 static void kvm_stage2_destroy(struct kvm_pgtable *pgt) 932 { 933 unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr); 934 935 stage2_destroy_range(pgt, 0, BIT(ia_bits)); 936 KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt); 937 } 938 939 /** 940 * kvm_init_stage2_mmu - Initialise a S2 MMU structure 941 * @kvm: The pointer to the KVM structure 942 * @mmu: The pointer to the s2 MMU structure 943 * @type: The machine type of the virtual machine 944 * 945 * Allocates only the stage-2 HW PGD level table(s). 946 * Note we don't need locking here as this is only called in two cases: 947 * 948 * - when the VM is created, which can't race against anything 949 * 950 * - when secondary kvm_s2_mmu structures are initialised for NV 951 * guests, and the caller must hold kvm->lock as this is called on a 952 * per-vcpu basis. 953 */ 954 int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) 955 { 956 int cpu, err; 957 struct kvm_pgtable *pgt; 958 959 /* 960 * If we already have our page tables in place, and that the 961 * MMU context is the canonical one, we have a bug somewhere, 962 * as this is only supposed to ever happen once per VM. 963 * 964 * Otherwise, we're building nested page tables, and that's 965 * probably because userspace called KVM_ARM_VCPU_INIT more 966 * than once on the same vcpu. Since that's actually legal, 967 * don't kick a fuss and leave gracefully. 968 */ 969 if (mmu->pgt != NULL) { 970 if (kvm_is_nested_s2_mmu(kvm, mmu)) 971 return 0; 972 973 kvm_err("kvm_arch already initialized?\n"); 974 return -EINVAL; 975 } 976 977 err = kvm_init_ipa_range(mmu, type); 978 if (err) 979 return err; 980 981 pgt = kzalloc_obj(*pgt, GFP_KERNEL_ACCOUNT); 982 if (!pgt) 983 return -ENOMEM; 984 985 mmu->arch = &kvm->arch; 986 err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops); 987 if (err) 988 goto out_free_pgtable; 989 990 mmu->pgt = pgt; 991 if (is_protected_kvm_enabled()) 992 return 0; 993 994 mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran)); 995 if (!mmu->last_vcpu_ran) { 996 err = -ENOMEM; 997 goto out_destroy_pgtable; 998 } 999 1000 for_each_possible_cpu(cpu) 1001 *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1; 1002 1003 /* The eager page splitting is disabled by default */ 1004 mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; 1005 mmu->split_page_cache.gfp_zero = __GFP_ZERO; 1006 1007 mmu->pgd_phys = __pa(pgt->pgd); 1008 1009 if (kvm_is_nested_s2_mmu(kvm, mmu)) 1010 kvm_init_nested_s2_mmu(mmu); 1011 1012 return 0; 1013 1014 out_destroy_pgtable: 1015 kvm_stage2_destroy(pgt); 1016 mmu->pgt = NULL; 1017 out_free_pgtable: 1018 kfree(pgt); 1019 return err; 1020 } 1021 1022 void kvm_uninit_stage2_mmu(struct kvm *kvm) 1023 { 1024 kvm_free_stage2_pgd(&kvm->arch.mmu); 1025 kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); 1026 } 1027 1028 static void stage2_unmap_memslot(struct kvm *kvm, 1029 struct kvm_memory_slot *memslot) 1030 { 1031 hva_t hva = memslot->userspace_addr; 1032 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; 1033 phys_addr_t size = PAGE_SIZE * memslot->npages; 1034 hva_t reg_end = hva + size; 1035 1036 /* 1037 * A memory region could potentially cover multiple VMAs, and any holes 1038 * between them, so iterate over all of them to find out if we should 1039 * unmap any of them. 1040 * 1041 * +--------------------------------------------+ 1042 * +---------------+----------------+ +----------------+ 1043 * | : VMA 1 | VMA 2 | | VMA 3 : | 1044 * +---------------+----------------+ +----------------+ 1045 * | memory region | 1046 * +--------------------------------------------+ 1047 */ 1048 do { 1049 struct vm_area_struct *vma; 1050 hva_t vm_start, vm_end; 1051 1052 vma = find_vma_intersection(current->mm, hva, reg_end); 1053 if (!vma) 1054 break; 1055 1056 /* 1057 * Take the intersection of this VMA with the memory region 1058 */ 1059 vm_start = max(hva, vma->vm_start); 1060 vm_end = min(reg_end, vma->vm_end); 1061 1062 if (!(vma->vm_flags & VM_PFNMAP)) { 1063 gpa_t gpa = addr + (vm_start - memslot->userspace_addr); 1064 kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true); 1065 } 1066 hva = vm_end; 1067 } while (hva < reg_end); 1068 } 1069 1070 /** 1071 * stage2_unmap_vm - Unmap Stage-2 RAM mappings 1072 * @kvm: The struct kvm pointer 1073 * 1074 * Go through the memregions and unmap any regular RAM 1075 * backing memory already mapped to the VM. 1076 */ 1077 void stage2_unmap_vm(struct kvm *kvm) 1078 { 1079 struct kvm_memslots *slots; 1080 struct kvm_memory_slot *memslot; 1081 int idx, bkt; 1082 1083 idx = srcu_read_lock(&kvm->srcu); 1084 mmap_read_lock(current->mm); 1085 write_lock(&kvm->mmu_lock); 1086 1087 slots = kvm_memslots(kvm); 1088 kvm_for_each_memslot(memslot, bkt, slots) 1089 stage2_unmap_memslot(kvm, memslot); 1090 1091 kvm_nested_s2_unmap(kvm, true); 1092 1093 write_unlock(&kvm->mmu_lock); 1094 mmap_read_unlock(current->mm); 1095 srcu_read_unlock(&kvm->srcu, idx); 1096 } 1097 1098 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) 1099 { 1100 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 1101 struct kvm_pgtable *pgt = NULL; 1102 1103 write_lock(&kvm->mmu_lock); 1104 pgt = mmu->pgt; 1105 if (pgt) { 1106 mmu->pgd_phys = 0; 1107 mmu->pgt = NULL; 1108 free_percpu(mmu->last_vcpu_ran); 1109 } 1110 1111 if (kvm_is_nested_s2_mmu(kvm, mmu)) 1112 kvm_init_nested_s2_mmu(mmu); 1113 1114 write_unlock(&kvm->mmu_lock); 1115 1116 if (pgt) { 1117 kvm_stage2_destroy(pgt); 1118 kfree(pgt); 1119 } 1120 } 1121 1122 static void hyp_mc_free_fn(void *addr, void *mc) 1123 { 1124 struct kvm_hyp_memcache *memcache = mc; 1125 1126 if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) 1127 kvm_account_pgtable_pages(addr, -1); 1128 1129 free_page((unsigned long)addr); 1130 } 1131 1132 static void *hyp_mc_alloc_fn(void *mc) 1133 { 1134 struct kvm_hyp_memcache *memcache = mc; 1135 void *addr; 1136 1137 addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); 1138 if (addr && memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) 1139 kvm_account_pgtable_pages(addr, 1); 1140 1141 return addr; 1142 } 1143 1144 void free_hyp_memcache(struct kvm_hyp_memcache *mc) 1145 { 1146 if (!is_protected_kvm_enabled()) 1147 return; 1148 1149 kfree(mc->mapping); 1150 __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc); 1151 } 1152 1153 int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) 1154 { 1155 if (!is_protected_kvm_enabled()) 1156 return 0; 1157 1158 if (!mc->mapping) { 1159 mc->mapping = kzalloc_obj(struct pkvm_mapping, 1160 GFP_KERNEL_ACCOUNT); 1161 if (!mc->mapping) 1162 return -ENOMEM; 1163 } 1164 1165 return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn, 1166 kvm_host_pa, mc); 1167 } 1168 1169 /** 1170 * kvm_phys_addr_ioremap - map a device range to guest IPA 1171 * 1172 * @kvm: The KVM pointer 1173 * @guest_ipa: The IPA at which to insert the mapping 1174 * @pa: The physical address of the device 1175 * @size: The size of the mapping 1176 * @writable: Whether or not to create a writable mapping 1177 */ 1178 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, 1179 phys_addr_t pa, unsigned long size, bool writable) 1180 { 1181 phys_addr_t addr; 1182 int ret = 0; 1183 struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO }; 1184 struct kvm_s2_mmu *mmu = &kvm->arch.mmu; 1185 struct kvm_pgtable *pgt = mmu->pgt; 1186 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE | 1187 KVM_PGTABLE_PROT_R | 1188 (writable ? KVM_PGTABLE_PROT_W : 0); 1189 1190 if (is_protected_kvm_enabled()) 1191 return -EPERM; 1192 1193 size += offset_in_page(guest_ipa); 1194 guest_ipa &= PAGE_MASK; 1195 1196 for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) { 1197 ret = kvm_mmu_topup_memory_cache(&cache, 1198 kvm_mmu_cache_min_pages(mmu)); 1199 if (ret) 1200 break; 1201 1202 write_lock(&kvm->mmu_lock); 1203 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE, 1204 pa, prot, &cache, 0); 1205 write_unlock(&kvm->mmu_lock); 1206 if (ret) 1207 break; 1208 1209 pa += PAGE_SIZE; 1210 } 1211 1212 kvm_mmu_free_memory_cache(&cache); 1213 return ret; 1214 } 1215 1216 /** 1217 * kvm_stage2_wp_range() - write protect stage2 memory region range 1218 * @mmu: The KVM stage-2 MMU pointer 1219 * @addr: Start address of range 1220 * @end: End address of range 1221 */ 1222 void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) 1223 { 1224 stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect)); 1225 } 1226 1227 /** 1228 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot 1229 * @kvm: The KVM pointer 1230 * @slot: The memory slot to write protect 1231 * 1232 * Called to start logging dirty pages after memory region 1233 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns 1234 * all present PUD, PMD and PTEs are write protected in the memory region. 1235 * Afterwards read of dirty page log can be called. 1236 * 1237 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, 1238 * serializing operations for VM memory regions. 1239 */ 1240 static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) 1241 { 1242 struct kvm_memslots *slots = kvm_memslots(kvm); 1243 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); 1244 phys_addr_t start, end; 1245 1246 if (WARN_ON_ONCE(!memslot)) 1247 return; 1248 1249 start = memslot->base_gfn << PAGE_SHIFT; 1250 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1251 1252 write_lock(&kvm->mmu_lock); 1253 kvm_stage2_wp_range(&kvm->arch.mmu, start, end); 1254 kvm_nested_s2_wp(kvm); 1255 write_unlock(&kvm->mmu_lock); 1256 kvm_flush_remote_tlbs_memslot(kvm, memslot); 1257 } 1258 1259 /** 1260 * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE 1261 * pages for memory slot 1262 * @kvm: The KVM pointer 1263 * @slot: The memory slot to split 1264 * 1265 * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired, 1266 * serializing operations for VM memory regions. 1267 */ 1268 static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot) 1269 { 1270 struct kvm_memslots *slots; 1271 struct kvm_memory_slot *memslot; 1272 phys_addr_t start, end; 1273 1274 lockdep_assert_held(&kvm->slots_lock); 1275 1276 slots = kvm_memslots(kvm); 1277 memslot = id_to_memslot(slots, slot); 1278 1279 start = memslot->base_gfn << PAGE_SHIFT; 1280 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1281 1282 write_lock(&kvm->mmu_lock); 1283 kvm_mmu_split_huge_pages(kvm, start, end); 1284 write_unlock(&kvm->mmu_lock); 1285 } 1286 1287 /* 1288 * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages. 1289 * @kvm: The KVM pointer 1290 * @slot: The memory slot associated with mask 1291 * @gfn_offset: The gfn offset in memory slot 1292 * @mask: The mask of pages at offset 'gfn_offset' in this memory 1293 * slot to enable dirty logging on 1294 * 1295 * Writes protect selected pages to enable dirty logging, and then 1296 * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock. 1297 */ 1298 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, 1299 struct kvm_memory_slot *slot, 1300 gfn_t gfn_offset, unsigned long mask) 1301 { 1302 phys_addr_t base_gfn = slot->base_gfn + gfn_offset; 1303 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; 1304 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; 1305 1306 lockdep_assert_held_write(&kvm->mmu_lock); 1307 1308 kvm_stage2_wp_range(&kvm->arch.mmu, start, end); 1309 1310 /* 1311 * Eager-splitting is done when manual-protect is set. We 1312 * also check for initially-all-set because we can avoid 1313 * eager-splitting if initially-all-set is false. 1314 * Initially-all-set equal false implies that huge-pages were 1315 * already split when enabling dirty logging: no need to do it 1316 * again. 1317 */ 1318 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) 1319 kvm_mmu_split_huge_pages(kvm, start, end); 1320 1321 kvm_nested_s2_wp(kvm); 1322 } 1323 1324 static void kvm_send_hwpoison_signal(unsigned long address, short lsb) 1325 { 1326 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); 1327 } 1328 1329 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, 1330 unsigned long hva, 1331 unsigned long map_size) 1332 { 1333 gpa_t gpa_start; 1334 hva_t uaddr_start, uaddr_end; 1335 size_t size; 1336 1337 /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */ 1338 if (map_size == PAGE_SIZE) 1339 return true; 1340 1341 /* pKVM only supports PMD_SIZE huge-mappings */ 1342 if (is_protected_kvm_enabled() && map_size != PMD_SIZE) 1343 return false; 1344 1345 size = memslot->npages * PAGE_SIZE; 1346 1347 gpa_start = memslot->base_gfn << PAGE_SHIFT; 1348 1349 uaddr_start = memslot->userspace_addr; 1350 uaddr_end = uaddr_start + size; 1351 1352 /* 1353 * Pages belonging to memslots that don't have the same alignment 1354 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2 1355 * PMD/PUD entries, because we'll end up mapping the wrong pages. 1356 * 1357 * Consider a layout like the following: 1358 * 1359 * memslot->userspace_addr: 1360 * +-----+--------------------+--------------------+---+ 1361 * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| 1362 * +-----+--------------------+--------------------+---+ 1363 * 1364 * memslot->base_gfn << PAGE_SHIFT: 1365 * +---+--------------------+--------------------+-----+ 1366 * |abc|def Stage-2 block | Stage-2 block |tvxyz| 1367 * +---+--------------------+--------------------+-----+ 1368 * 1369 * If we create those stage-2 blocks, we'll end up with this incorrect 1370 * mapping: 1371 * d -> f 1372 * e -> g 1373 * f -> h 1374 */ 1375 if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) 1376 return false; 1377 1378 /* 1379 * Next, let's make sure we're not trying to map anything not covered 1380 * by the memslot. This means we have to prohibit block size mappings 1381 * for the beginning and end of a non-block aligned and non-block sized 1382 * memory slot (illustrated by the head and tail parts of the 1383 * userspace view above containing pages 'abcde' and 'xyz', 1384 * respectively). 1385 * 1386 * Note that it doesn't matter if we do the check using the 1387 * userspace_addr or the base_gfn, as both are equally aligned (per 1388 * the check above) and equally sized. 1389 */ 1390 return (hva & ~(map_size - 1)) >= uaddr_start && 1391 (hva & ~(map_size - 1)) + map_size <= uaddr_end; 1392 } 1393 1394 /* 1395 * Check if the given hva is backed by a transparent huge page (THP) and 1396 * whether it can be mapped using block mapping in stage2. If so, adjust 1397 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently 1398 * supported. This will need to be updated to support other THP sizes. 1399 * 1400 * Returns the size of the mapping. 1401 */ 1402 static long 1403 transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, 1404 unsigned long hva, kvm_pfn_t *pfnp, gfn_t *gfnp) 1405 { 1406 kvm_pfn_t pfn = *pfnp; 1407 gfn_t gfn = *gfnp; 1408 1409 /* 1410 * Make sure the adjustment is done only for THP pages. Also make 1411 * sure that the HVA and IPA are sufficiently aligned and that the 1412 * block map is contained within the memslot. 1413 */ 1414 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { 1415 int sz = get_user_mapping_size(kvm, hva); 1416 1417 if (sz < 0) 1418 return sz; 1419 1420 if (sz < PMD_SIZE) 1421 return PAGE_SIZE; 1422 1423 gfn &= ~(PTRS_PER_PMD - 1); 1424 *gfnp = gfn; 1425 pfn &= ~(PTRS_PER_PMD - 1); 1426 *pfnp = pfn; 1427 1428 return PMD_SIZE; 1429 } 1430 1431 /* Use page mapping if we cannot use block mapping. */ 1432 return PAGE_SIZE; 1433 } 1434 1435 static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva) 1436 { 1437 unsigned long pa; 1438 1439 if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP)) 1440 return huge_page_shift(hstate_vma(vma)); 1441 1442 if (!(vma->vm_flags & VM_PFNMAP)) 1443 return PAGE_SHIFT; 1444 1445 VM_BUG_ON(is_vm_hugetlb_page(vma)); 1446 1447 pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start); 1448 1449 #ifndef __PAGETABLE_PMD_FOLDED 1450 if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) && 1451 ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start && 1452 ALIGN(hva, PUD_SIZE) <= vma->vm_end) 1453 return PUD_SHIFT; 1454 #endif 1455 1456 if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) && 1457 ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start && 1458 ALIGN(hva, PMD_SIZE) <= vma->vm_end) 1459 return PMD_SHIFT; 1460 1461 return PAGE_SHIFT; 1462 } 1463 1464 /* 1465 * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be 1466 * able to see the page's tags and therefore they must be initialised first. If 1467 * PG_mte_tagged is set, tags have already been initialised. 1468 * 1469 * Must be called with kvm->mmu_lock held to ensure the memory remains mapped 1470 * while the tags are zeroed. 1471 */ 1472 static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, 1473 unsigned long size) 1474 { 1475 unsigned long i, nr_pages = size >> PAGE_SHIFT; 1476 struct page *page = pfn_to_page(pfn); 1477 struct folio *folio = page_folio(page); 1478 1479 if (!kvm_has_mte(kvm)) 1480 return; 1481 1482 if (is_zero_pfn(pfn)) { 1483 WARN_ON_ONCE(nr_pages != 1); 1484 return; 1485 } 1486 1487 if (folio_test_hugetlb(folio)) { 1488 /* Hugetlb has MTE flags set on head page only */ 1489 if (folio_try_hugetlb_mte_tagging(folio)) { 1490 for (i = 0; i < nr_pages; i++, page++) 1491 mte_clear_page_tags(page_address(page)); 1492 folio_set_hugetlb_mte_tagged(folio); 1493 } 1494 return; 1495 } 1496 1497 for (i = 0; i < nr_pages; i++, page++) { 1498 if (try_page_mte_tagging(page)) { 1499 mte_clear_page_tags(page_address(page)); 1500 set_page_mte_tagged(page); 1501 } 1502 } 1503 } 1504 1505 static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) 1506 { 1507 return vma->vm_flags & VM_MTE_ALLOWED; 1508 } 1509 1510 static bool kvm_vma_is_cacheable(struct vm_area_struct *vma) 1511 { 1512 switch (FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(vma->vm_page_prot))) { 1513 case MT_NORMAL_NC: 1514 case MT_DEVICE_nGnRnE: 1515 case MT_DEVICE_nGnRE: 1516 return false; 1517 default: 1518 return true; 1519 } 1520 } 1521 1522 static void *get_mmu_memcache(struct kvm_vcpu *vcpu) 1523 { 1524 if (!is_protected_kvm_enabled()) 1525 return &vcpu->arch.mmu_page_cache; 1526 else 1527 return &vcpu->arch.pkvm_memcache; 1528 } 1529 1530 static int topup_mmu_memcache(struct kvm_vcpu *vcpu, void *memcache) 1531 { 1532 int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); 1533 1534 if (!is_protected_kvm_enabled()) 1535 return kvm_mmu_topup_memory_cache(memcache, min_pages); 1536 1537 return topup_hyp_memcache(memcache, min_pages); 1538 } 1539 1540 /* 1541 * Potentially reduce shadow S2 permissions to match the guest's own S2. For 1542 * exec faults, we'd only reach this point if the guest actually allowed it (see 1543 * kvm_s2_handle_perm_fault). 1544 * 1545 * Also encode the level of the original translation in the SW bits of the leaf 1546 * entry as a proxy for the span of that translation. This will be retrieved on 1547 * TLB invalidation from the guest and used to limit the invalidation scope if a 1548 * TTL hint or a range isn't provided. 1549 */ 1550 static enum kvm_pgtable_prot adjust_nested_fault_perms(struct kvm_s2_trans *nested, 1551 enum kvm_pgtable_prot prot) 1552 { 1553 if (!kvm_s2_trans_writable(nested)) 1554 prot &= ~KVM_PGTABLE_PROT_W; 1555 if (!kvm_s2_trans_readable(nested)) 1556 prot &= ~KVM_PGTABLE_PROT_R; 1557 1558 return prot | kvm_encode_nested_level(nested); 1559 } 1560 1561 static enum kvm_pgtable_prot adjust_nested_exec_perms(struct kvm *kvm, 1562 struct kvm_s2_trans *nested, 1563 enum kvm_pgtable_prot prot) 1564 { 1565 if (!kvm_s2_trans_exec_el0(kvm, nested)) 1566 prot &= ~KVM_PGTABLE_PROT_UX; 1567 if (!kvm_s2_trans_exec_el1(kvm, nested)) 1568 prot &= ~KVM_PGTABLE_PROT_PX; 1569 1570 return prot; 1571 } 1572 1573 struct kvm_s2_fault_desc { 1574 struct kvm_vcpu *vcpu; 1575 phys_addr_t fault_ipa; 1576 struct kvm_s2_trans *nested; 1577 struct kvm_memory_slot *memslot; 1578 unsigned long hva; 1579 }; 1580 1581 static int gmem_abort(const struct kvm_s2_fault_desc *s2fd) 1582 { 1583 bool write_fault, exec_fault; 1584 bool perm_fault = kvm_vcpu_trap_is_permission_fault(s2fd->vcpu); 1585 enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED; 1586 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; 1587 struct kvm_pgtable *pgt = s2fd->vcpu->arch.hw_mmu->pgt; 1588 unsigned long mmu_seq; 1589 struct page *page; 1590 struct kvm *kvm = s2fd->vcpu->kvm; 1591 void *memcache = NULL; 1592 kvm_pfn_t pfn; 1593 gfn_t gfn; 1594 int ret; 1595 1596 if (!perm_fault) { 1597 memcache = get_mmu_memcache(s2fd->vcpu); 1598 ret = topup_mmu_memcache(s2fd->vcpu, memcache); 1599 if (ret) 1600 return ret; 1601 } 1602 1603 if (s2fd->nested) 1604 gfn = kvm_s2_trans_output(s2fd->nested) >> PAGE_SHIFT; 1605 else 1606 gfn = s2fd->fault_ipa >> PAGE_SHIFT; 1607 1608 write_fault = kvm_is_write_fault(s2fd->vcpu); 1609 exec_fault = kvm_vcpu_trap_is_exec_fault(s2fd->vcpu); 1610 1611 VM_WARN_ON_ONCE(write_fault && exec_fault); 1612 1613 mmu_seq = kvm->mmu_invalidate_seq; 1614 /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ 1615 smp_rmb(); 1616 1617 ret = kvm_gmem_get_pfn(kvm, s2fd->memslot, gfn, &pfn, &page, NULL); 1618 if (ret) { 1619 kvm_prepare_memory_fault_exit(s2fd->vcpu, s2fd->fault_ipa, PAGE_SIZE, 1620 write_fault, exec_fault, false); 1621 return ret; 1622 } 1623 1624 if (!(s2fd->memslot->flags & KVM_MEM_READONLY)) 1625 prot |= KVM_PGTABLE_PROT_W; 1626 1627 if (s2fd->nested) 1628 prot = adjust_nested_fault_perms(s2fd->nested, prot); 1629 1630 if (exec_fault || cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) 1631 prot |= KVM_PGTABLE_PROT_X; 1632 1633 if (s2fd->nested) 1634 prot = adjust_nested_exec_perms(kvm, s2fd->nested, prot); 1635 1636 kvm_fault_lock(kvm); 1637 if (mmu_invalidate_retry(kvm, mmu_seq)) { 1638 ret = -EAGAIN; 1639 goto out_unlock; 1640 } 1641 1642 if (perm_fault) { 1643 /* 1644 * Drop the SW bits in favour of those stored in the 1645 * PTE, which will be preserved. 1646 */ 1647 prot &= ~KVM_NV_GUEST_MAP_SZ; 1648 ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, s2fd->fault_ipa, 1649 prot, flags); 1650 } else { 1651 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, s2fd->fault_ipa, PAGE_SIZE, 1652 __pfn_to_phys(pfn), prot, 1653 memcache, flags); 1654 } 1655 1656 out_unlock: 1657 kvm_release_faultin_page(kvm, page, !!ret, prot & KVM_PGTABLE_PROT_W); 1658 kvm_fault_unlock(kvm); 1659 1660 if ((prot & KVM_PGTABLE_PROT_W) && !ret) 1661 mark_page_dirty_in_slot(kvm, s2fd->memslot, gfn); 1662 1663 return ret != -EAGAIN ? ret : 0; 1664 } 1665 1666 struct kvm_s2_fault_vma_info { 1667 unsigned long mmu_seq; 1668 long vma_pagesize; 1669 vm_flags_t vm_flags; 1670 unsigned long max_map_size; 1671 struct page *page; 1672 kvm_pfn_t pfn; 1673 gfn_t gfn; 1674 bool device; 1675 bool mte_allowed; 1676 bool is_vma_cacheable; 1677 bool map_writable; 1678 bool map_non_cacheable; 1679 }; 1680 1681 static int pkvm_mem_abort(const struct kvm_s2_fault_desc *s2fd) 1682 { 1683 unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE; 1684 struct kvm_vcpu *vcpu = s2fd->vcpu; 1685 struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt; 1686 struct mm_struct *mm = current->mm; 1687 struct kvm *kvm = vcpu->kvm; 1688 void *hyp_memcache; 1689 struct page *page; 1690 int ret; 1691 1692 hyp_memcache = get_mmu_memcache(vcpu); 1693 ret = topup_mmu_memcache(vcpu, hyp_memcache); 1694 if (ret) 1695 return -ENOMEM; 1696 1697 ret = account_locked_vm(mm, 1, true); 1698 if (ret) 1699 return ret; 1700 1701 mmap_read_lock(mm); 1702 ret = pin_user_pages(s2fd->hva, 1, flags, &page); 1703 mmap_read_unlock(mm); 1704 1705 if (ret == -EHWPOISON) { 1706 kvm_send_hwpoison_signal(s2fd->hva, PAGE_SHIFT); 1707 ret = 0; 1708 goto dec_account; 1709 } else if (ret != 1) { 1710 ret = -EFAULT; 1711 goto dec_account; 1712 } else if (!folio_test_swapbacked(page_folio(page))) { 1713 /* 1714 * We really can't deal with page-cache pages returned by GUP 1715 * because (a) we may trigger writeback of a page for which we 1716 * no longer have access and (b) page_mkclean() won't find the 1717 * stage-2 mapping in the rmap so we can get out-of-whack with 1718 * the filesystem when marking the page dirty during unpinning 1719 * (see cc5095747edf ("ext4: don't BUG if someone dirty pages 1720 * without asking ext4 first")). 1721 * 1722 * Ideally we'd just restrict ourselves to anonymous pages, but 1723 * we also want to allow memfd (i.e. shmem) pages, so check for 1724 * pages backed by swap in the knowledge that the GUP pin will 1725 * prevent try_to_unmap() from succeeding. 1726 */ 1727 ret = -EIO; 1728 goto unpin; 1729 } 1730 1731 write_lock(&kvm->mmu_lock); 1732 ret = pkvm_pgtable_stage2_map(pgt, s2fd->fault_ipa, PAGE_SIZE, 1733 page_to_phys(page), KVM_PGTABLE_PROT_RWX, 1734 hyp_memcache, 0); 1735 write_unlock(&kvm->mmu_lock); 1736 if (ret) { 1737 if (ret == -EAGAIN) 1738 ret = 0; 1739 goto unpin; 1740 } 1741 1742 return 0; 1743 unpin: 1744 unpin_user_pages(&page, 1); 1745 dec_account: 1746 account_locked_vm(mm, 1, false); 1747 return ret; 1748 } 1749 1750 static short kvm_s2_resolve_vma_size(const struct kvm_s2_fault_desc *s2fd, 1751 struct kvm_s2_fault_vma_info *s2vi, 1752 struct vm_area_struct *vma) 1753 { 1754 short vma_shift; 1755 1756 if (memslot_is_logging(s2fd->memslot)) { 1757 s2vi->max_map_size = PAGE_SIZE; 1758 vma_shift = PAGE_SHIFT; 1759 } else { 1760 s2vi->max_map_size = PUD_SIZE; 1761 vma_shift = get_vma_page_shift(vma, s2fd->hva); 1762 } 1763 1764 switch (vma_shift) { 1765 #ifndef __PAGETABLE_PMD_FOLDED 1766 case PUD_SHIFT: 1767 if (fault_supports_stage2_huge_mapping(s2fd->memslot, s2fd->hva, PUD_SIZE)) 1768 break; 1769 fallthrough; 1770 #endif 1771 case CONT_PMD_SHIFT: 1772 vma_shift = PMD_SHIFT; 1773 fallthrough; 1774 case PMD_SHIFT: 1775 if (fault_supports_stage2_huge_mapping(s2fd->memslot, s2fd->hva, PMD_SIZE)) 1776 break; 1777 fallthrough; 1778 case CONT_PTE_SHIFT: 1779 vma_shift = PAGE_SHIFT; 1780 s2vi->max_map_size = PAGE_SIZE; 1781 fallthrough; 1782 case PAGE_SHIFT: 1783 break; 1784 default: 1785 WARN_ONCE(1, "Unknown vma_shift %d", vma_shift); 1786 } 1787 1788 if (s2fd->nested) { 1789 unsigned long max_map_size; 1790 1791 max_map_size = min(s2vi->max_map_size, PUD_SIZE); 1792 1793 /* 1794 * If we're about to create a shadow stage 2 entry, then we 1795 * can only create a block mapping if the guest stage 2 page 1796 * table uses at least as big a mapping. 1797 */ 1798 max_map_size = min(kvm_s2_trans_size(s2fd->nested), max_map_size); 1799 1800 /* 1801 * Be careful that if the mapping size falls between 1802 * two host sizes, take the smallest of the two. 1803 */ 1804 if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE) 1805 max_map_size = PMD_SIZE; 1806 else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE) 1807 max_map_size = PAGE_SIZE; 1808 1809 s2vi->max_map_size = max_map_size; 1810 vma_shift = min_t(short, vma_shift, __ffs(max_map_size)); 1811 } 1812 1813 return vma_shift; 1814 } 1815 1816 static bool kvm_s2_fault_is_perm(const struct kvm_s2_fault_desc *s2fd) 1817 { 1818 return kvm_vcpu_trap_is_permission_fault(s2fd->vcpu); 1819 } 1820 1821 static int kvm_s2_fault_get_vma_info(const struct kvm_s2_fault_desc *s2fd, 1822 struct kvm_s2_fault_vma_info *s2vi) 1823 { 1824 struct vm_area_struct *vma; 1825 struct kvm *kvm = s2fd->vcpu->kvm; 1826 1827 mmap_read_lock(current->mm); 1828 vma = vma_lookup(current->mm, s2fd->hva); 1829 if (unlikely(!vma)) { 1830 kvm_err("Failed to find VMA for hva 0x%lx\n", s2fd->hva); 1831 mmap_read_unlock(current->mm); 1832 return -EFAULT; 1833 } 1834 1835 s2vi->vma_pagesize = BIT(kvm_s2_resolve_vma_size(s2fd, s2vi, vma)); 1836 1837 /* 1838 * Both the canonical IPA and fault IPA must be aligned to the 1839 * mapping size to ensure we find the right PFN and lay down the 1840 * mapping in the right place. 1841 */ 1842 s2vi->gfn = ALIGN_DOWN(s2fd->fault_ipa, s2vi->vma_pagesize) >> PAGE_SHIFT; 1843 1844 s2vi->mte_allowed = kvm_vma_mte_allowed(vma); 1845 1846 s2vi->vm_flags = vma->vm_flags; 1847 1848 s2vi->is_vma_cacheable = kvm_vma_is_cacheable(vma); 1849 1850 /* 1851 * Read mmu_invalidate_seq so that KVM can detect if the results of 1852 * vma_lookup() or __kvm_faultin_pfn() become stale prior to 1853 * acquiring kvm->mmu_lock. 1854 * 1855 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs 1856 * with the smp_wmb() in kvm_mmu_invalidate_end(). 1857 */ 1858 s2vi->mmu_seq = kvm->mmu_invalidate_seq; 1859 mmap_read_unlock(current->mm); 1860 1861 return 0; 1862 } 1863 1864 static gfn_t get_canonical_gfn(const struct kvm_s2_fault_desc *s2fd, 1865 const struct kvm_s2_fault_vma_info *s2vi) 1866 { 1867 phys_addr_t ipa; 1868 1869 if (!s2fd->nested) 1870 return s2vi->gfn; 1871 1872 ipa = kvm_s2_trans_output(s2fd->nested); 1873 return ALIGN_DOWN(ipa, s2vi->vma_pagesize) >> PAGE_SHIFT; 1874 } 1875 1876 static int kvm_s2_fault_pin_pfn(const struct kvm_s2_fault_desc *s2fd, 1877 struct kvm_s2_fault_vma_info *s2vi) 1878 { 1879 int ret; 1880 1881 ret = kvm_s2_fault_get_vma_info(s2fd, s2vi); 1882 if (ret) 1883 return ret; 1884 1885 s2vi->pfn = __kvm_faultin_pfn(s2fd->memslot, get_canonical_gfn(s2fd, s2vi), 1886 kvm_is_write_fault(s2fd->vcpu) ? FOLL_WRITE : 0, 1887 &s2vi->map_writable, &s2vi->page); 1888 if (unlikely(is_error_noslot_pfn(s2vi->pfn))) { 1889 if (s2vi->pfn == KVM_PFN_ERR_HWPOISON) { 1890 kvm_send_hwpoison_signal(s2fd->hva, __ffs(s2vi->vma_pagesize)); 1891 return 0; 1892 } 1893 return -EFAULT; 1894 } 1895 1896 /* 1897 * Check if this is non-struct page memory PFN, and cannot support 1898 * CMOs. It could potentially be unsafe to access as cacheable. 1899 */ 1900 if (s2vi->vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(s2vi->pfn)) { 1901 if (s2vi->is_vma_cacheable) { 1902 /* 1903 * Whilst the VMA owner expects cacheable mapping to this 1904 * PFN, hardware also has to support the FWB and CACHE DIC 1905 * features. 1906 * 1907 * ARM64 KVM relies on kernel VA mapping to the PFN to 1908 * perform cache maintenance as the CMO instructions work on 1909 * virtual addresses. VM_PFNMAP region are not necessarily 1910 * mapped to a KVA and hence the presence of hardware features 1911 * S2FWB and CACHE DIC are mandatory to avoid the need for 1912 * cache maintenance. 1913 */ 1914 if (!kvm_supports_cacheable_pfnmap()) { 1915 kvm_release_faultin_page(s2fd->vcpu->kvm, s2vi->page, true, false); 1916 return -EFAULT; 1917 } 1918 } else { 1919 /* 1920 * If the page was identified as device early by looking at 1921 * the VMA flags, vma_pagesize is already representing the 1922 * largest quantity we can map. If instead it was mapped 1923 * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE 1924 * and must not be upgraded. 1925 * 1926 * In both cases, we don't let transparent_hugepage_adjust() 1927 * change things at the last minute. 1928 */ 1929 s2vi->map_non_cacheable = true; 1930 } 1931 1932 s2vi->device = true; 1933 } 1934 1935 return 1; 1936 } 1937 1938 static int kvm_s2_fault_compute_prot(const struct kvm_s2_fault_desc *s2fd, 1939 const struct kvm_s2_fault_vma_info *s2vi, 1940 enum kvm_pgtable_prot *prot) 1941 { 1942 struct kvm *kvm = s2fd->vcpu->kvm; 1943 1944 if (kvm_vcpu_trap_is_exec_fault(s2fd->vcpu) && s2vi->map_non_cacheable) 1945 return -ENOEXEC; 1946 1947 /* 1948 * Guest performs atomic/exclusive operations on memory with unsupported 1949 * attributes (e.g. ld64b/st64b on normal memory when no FEAT_LS64WB) 1950 * and trigger the exception here. Since the memslot is valid, inject 1951 * the fault back to the guest. 1952 */ 1953 if (esr_fsc_is_excl_atomic_fault(kvm_vcpu_get_esr(s2fd->vcpu))) { 1954 kvm_inject_dabt_excl_atomic(s2fd->vcpu, kvm_vcpu_get_hfar(s2fd->vcpu)); 1955 return 1; 1956 } 1957 1958 *prot = KVM_PGTABLE_PROT_R; 1959 1960 if (s2vi->map_writable && (s2vi->device || 1961 !memslot_is_logging(s2fd->memslot) || 1962 kvm_is_write_fault(s2fd->vcpu))) 1963 *prot |= KVM_PGTABLE_PROT_W; 1964 1965 if (s2fd->nested) 1966 *prot = adjust_nested_fault_perms(s2fd->nested, *prot); 1967 1968 if (kvm_vcpu_trap_is_exec_fault(s2fd->vcpu)) 1969 *prot |= KVM_PGTABLE_PROT_X; 1970 1971 if (s2vi->map_non_cacheable) 1972 *prot |= (s2vi->vm_flags & VM_ALLOW_ANY_UNCACHED) ? 1973 KVM_PGTABLE_PROT_NORMAL_NC : KVM_PGTABLE_PROT_DEVICE; 1974 else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) 1975 *prot |= KVM_PGTABLE_PROT_X; 1976 1977 if (s2fd->nested) 1978 *prot = adjust_nested_exec_perms(kvm, s2fd->nested, *prot); 1979 1980 if (!kvm_s2_fault_is_perm(s2fd) && !s2vi->map_non_cacheable && kvm_has_mte(kvm)) { 1981 /* Check the VMM hasn't introduced a new disallowed VMA */ 1982 if (!s2vi->mte_allowed) 1983 return -EFAULT; 1984 } 1985 1986 return 0; 1987 } 1988 1989 static int kvm_s2_fault_map(const struct kvm_s2_fault_desc *s2fd, 1990 const struct kvm_s2_fault_vma_info *s2vi, 1991 enum kvm_pgtable_prot prot, 1992 void *memcache) 1993 { 1994 enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED; 1995 bool writable = prot & KVM_PGTABLE_PROT_W; 1996 struct kvm *kvm = s2fd->vcpu->kvm; 1997 struct kvm_pgtable *pgt; 1998 long perm_fault_granule; 1999 long mapping_size; 2000 kvm_pfn_t pfn; 2001 gfn_t gfn; 2002 int ret; 2003 2004 kvm_fault_lock(kvm); 2005 pgt = s2fd->vcpu->arch.hw_mmu->pgt; 2006 ret = -EAGAIN; 2007 if (mmu_invalidate_retry(kvm, s2vi->mmu_seq)) 2008 goto out_unlock; 2009 2010 perm_fault_granule = (kvm_s2_fault_is_perm(s2fd) ? 2011 kvm_vcpu_trap_get_perm_fault_granule(s2fd->vcpu) : 0); 2012 mapping_size = s2vi->vma_pagesize; 2013 pfn = s2vi->pfn; 2014 gfn = s2vi->gfn; 2015 2016 /* 2017 * If we are not forced to use page mapping, check if we are 2018 * backed by a THP and thus use block mapping if possible. 2019 */ 2020 if (mapping_size == PAGE_SIZE && 2021 !(s2vi->max_map_size == PAGE_SIZE || s2vi->map_non_cacheable)) { 2022 if (perm_fault_granule > PAGE_SIZE) { 2023 mapping_size = perm_fault_granule; 2024 } else { 2025 mapping_size = transparent_hugepage_adjust(kvm, s2fd->memslot, 2026 s2fd->hva, &pfn, 2027 &gfn); 2028 if (mapping_size < 0) { 2029 ret = mapping_size; 2030 goto out_unlock; 2031 } 2032 } 2033 } 2034 2035 if (!perm_fault_granule && !s2vi->map_non_cacheable && kvm_has_mte(kvm)) 2036 sanitise_mte_tags(kvm, pfn, mapping_size); 2037 2038 /* 2039 * Under the premise of getting a FSC_PERM fault, we just need to relax 2040 * permissions only if mapping_size equals perm_fault_granule. Otherwise, 2041 * kvm_pgtable_stage2_map() should be called to change block size. 2042 */ 2043 if (mapping_size == perm_fault_granule) { 2044 /* 2045 * Drop the SW bits in favour of those stored in the 2046 * PTE, which will be preserved. 2047 */ 2048 prot &= ~KVM_NV_GUEST_MAP_SZ; 2049 ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, gfn_to_gpa(gfn), 2050 prot, flags); 2051 } else { 2052 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, gfn_to_gpa(gfn), mapping_size, 2053 __pfn_to_phys(pfn), prot, 2054 memcache, flags); 2055 } 2056 2057 out_unlock: 2058 kvm_release_faultin_page(kvm, s2vi->page, !!ret, writable); 2059 kvm_fault_unlock(kvm); 2060 2061 /* 2062 * Mark the page dirty only if the fault is handled successfully, 2063 * making sure we adjust the canonical IPA if the mapping size has 2064 * been updated (via a THP upgrade, for example). 2065 */ 2066 if (writable && !ret) { 2067 phys_addr_t ipa = gfn_to_gpa(get_canonical_gfn(s2fd, s2vi)); 2068 ipa &= ~(mapping_size - 1); 2069 mark_page_dirty_in_slot(kvm, s2fd->memslot, gpa_to_gfn(ipa)); 2070 } 2071 2072 if (ret != -EAGAIN) 2073 return ret; 2074 return 0; 2075 } 2076 2077 static int user_mem_abort(const struct kvm_s2_fault_desc *s2fd) 2078 { 2079 bool perm_fault = kvm_vcpu_trap_is_permission_fault(s2fd->vcpu); 2080 struct kvm_s2_fault_vma_info s2vi = {}; 2081 enum kvm_pgtable_prot prot; 2082 void *memcache; 2083 int ret; 2084 2085 /* 2086 * Permission faults just need to update the existing leaf entry, 2087 * and so normally don't require allocations from the memcache. The 2088 * only exception to this is when dirty logging is enabled at runtime 2089 * and a write fault needs to collapse a block entry into a table. 2090 */ 2091 memcache = get_mmu_memcache(s2fd->vcpu); 2092 if (!perm_fault || (memslot_is_logging(s2fd->memslot) && 2093 kvm_is_write_fault(s2fd->vcpu))) { 2094 ret = topup_mmu_memcache(s2fd->vcpu, memcache); 2095 if (ret) 2096 return ret; 2097 } 2098 2099 /* 2100 * Let's check if we will get back a huge page backed by hugetlbfs, or 2101 * get block mapping for device MMIO region. 2102 */ 2103 ret = kvm_s2_fault_pin_pfn(s2fd, &s2vi); 2104 if (ret != 1) 2105 return ret; 2106 2107 ret = kvm_s2_fault_compute_prot(s2fd, &s2vi, &prot); 2108 if (ret) { 2109 kvm_release_page_unused(s2vi.page); 2110 return ret; 2111 } 2112 2113 return kvm_s2_fault_map(s2fd, &s2vi, prot, memcache); 2114 } 2115 2116 /* Resolve the access fault by making the page young again. */ 2117 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) 2118 { 2119 enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED; 2120 struct kvm_s2_mmu *mmu; 2121 2122 trace_kvm_access_fault(fault_ipa); 2123 2124 read_lock(&vcpu->kvm->mmu_lock); 2125 mmu = vcpu->arch.hw_mmu; 2126 KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags); 2127 read_unlock(&vcpu->kvm->mmu_lock); 2128 } 2129 2130 /* 2131 * Returns true if the SEA should be handled locally within KVM if the abort 2132 * is caused by a kernel memory allocation (e.g. stage-2 table memory). 2133 */ 2134 static bool host_owns_sea(struct kvm_vcpu *vcpu, u64 esr) 2135 { 2136 /* 2137 * Without FEAT_RAS HCR_EL2.TEA is RES0, meaning any external abort 2138 * taken from a guest EL to EL2 is due to a host-imposed access (e.g. 2139 * stage-2 PTW). 2140 */ 2141 if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) 2142 return true; 2143 2144 /* KVM owns the VNCR when the vCPU isn't in a nested context. */ 2145 if (is_hyp_ctxt(vcpu) && !kvm_vcpu_trap_is_iabt(vcpu) && (esr & ESR_ELx_VNCR)) 2146 return true; 2147 2148 /* 2149 * Determining if an external abort during a table walk happened at 2150 * stage-2 is only possible with S1PTW is set. Otherwise, since KVM 2151 * sets HCR_EL2.TEA, SEAs due to a stage-1 walk (i.e. accessing the 2152 * PA of the stage-1 descriptor) can reach here and are reported 2153 * with a TTW ESR value. 2154 */ 2155 return (esr_fsc_is_sea_ttw(esr) && (esr & ESR_ELx_S1PTW)); 2156 } 2157 2158 int kvm_handle_guest_sea(struct kvm_vcpu *vcpu) 2159 { 2160 struct kvm *kvm = vcpu->kvm; 2161 struct kvm_run *run = vcpu->run; 2162 u64 esr = kvm_vcpu_get_esr(vcpu); 2163 u64 esr_mask = ESR_ELx_EC_MASK | 2164 ESR_ELx_IL | 2165 ESR_ELx_FnV | 2166 ESR_ELx_EA | 2167 ESR_ELx_CM | 2168 ESR_ELx_WNR | 2169 ESR_ELx_FSC; 2170 u64 ipa; 2171 2172 /* 2173 * Give APEI the opportunity to claim the abort before handling it 2174 * within KVM. apei_claim_sea() expects to be called with IRQs enabled. 2175 */ 2176 lockdep_assert_irqs_enabled(); 2177 if (apei_claim_sea(NULL) == 0) 2178 return 1; 2179 2180 if (host_owns_sea(vcpu, esr) || 2181 !test_bit(KVM_ARCH_FLAG_EXIT_SEA, &vcpu->kvm->arch.flags)) 2182 return kvm_inject_serror(vcpu); 2183 2184 /* ESR_ELx.SET is RES0 when FEAT_RAS isn't implemented. */ 2185 if (kvm_has_ras(kvm)) 2186 esr_mask |= ESR_ELx_SET_MASK; 2187 2188 /* 2189 * Exit to userspace, and provide faulting guest virtual and physical 2190 * addresses in case userspace wants to emulate SEA to guest by 2191 * writing to FAR_ELx and HPFAR_ELx registers. 2192 */ 2193 memset(&run->arm_sea, 0, sizeof(run->arm_sea)); 2194 run->exit_reason = KVM_EXIT_ARM_SEA; 2195 run->arm_sea.esr = esr & esr_mask; 2196 2197 if (!(esr & ESR_ELx_FnV)) 2198 run->arm_sea.gva = kvm_vcpu_get_hfar(vcpu); 2199 2200 ipa = kvm_vcpu_get_fault_ipa(vcpu); 2201 if (ipa != INVALID_GPA) { 2202 run->arm_sea.flags |= KVM_EXIT_ARM_SEA_FLAG_GPA_VALID; 2203 run->arm_sea.gpa = ipa; 2204 } 2205 2206 return 0; 2207 } 2208 2209 /** 2210 * kvm_handle_guest_abort - handles all 2nd stage aborts 2211 * @vcpu: the VCPU pointer 2212 * 2213 * Any abort that gets to the host is almost guaranteed to be caused by a 2214 * missing second stage translation table entry, which can mean that either the 2215 * guest simply needs more memory and we must allocate an appropriate page or it 2216 * can mean that the guest tried to access I/O memory, which is emulated by user 2217 * space. The distinction is based on the IPA causing the fault and whether this 2218 * memory region has been registered as standard RAM by user space. 2219 */ 2220 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) 2221 { 2222 struct kvm_s2_trans nested_trans, *nested = NULL; 2223 unsigned long esr; 2224 phys_addr_t fault_ipa; /* The address we faulted on */ 2225 phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */ 2226 struct kvm_memory_slot *memslot; 2227 unsigned long hva; 2228 bool is_iabt, write_fault, writable; 2229 gfn_t gfn; 2230 int ret, idx; 2231 2232 if (kvm_vcpu_abt_issea(vcpu)) 2233 return kvm_handle_guest_sea(vcpu); 2234 2235 esr = kvm_vcpu_get_esr(vcpu); 2236 2237 /* 2238 * The fault IPA should be reliable at this point as we're not dealing 2239 * with an SEA. 2240 */ 2241 ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); 2242 if (KVM_BUG_ON(ipa == INVALID_GPA, vcpu->kvm)) 2243 return -EFAULT; 2244 2245 is_iabt = kvm_vcpu_trap_is_iabt(vcpu); 2246 2247 if (esr_fsc_is_translation_fault(esr)) { 2248 /* Beyond sanitised PARange (which is the IPA limit) */ 2249 if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) { 2250 kvm_inject_size_fault(vcpu); 2251 return 1; 2252 } 2253 2254 /* Falls between the IPA range and the PARange? */ 2255 if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) { 2256 fault_ipa |= FAR_TO_FIPA_OFFSET(kvm_vcpu_get_hfar(vcpu)); 2257 2258 return kvm_inject_sea(vcpu, is_iabt, fault_ipa); 2259 } 2260 } 2261 2262 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu), 2263 kvm_vcpu_get_hfar(vcpu), fault_ipa); 2264 2265 /* Check the stage-2 fault is trans. fault or write fault */ 2266 if (!esr_fsc_is_translation_fault(esr) && 2267 !esr_fsc_is_permission_fault(esr) && 2268 !esr_fsc_is_access_flag_fault(esr) && 2269 !esr_fsc_is_excl_atomic_fault(esr)) { 2270 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", 2271 kvm_vcpu_trap_get_class(vcpu), 2272 (unsigned long)kvm_vcpu_trap_get_fault(vcpu), 2273 (unsigned long)kvm_vcpu_get_esr(vcpu)); 2274 return -EFAULT; 2275 } 2276 2277 idx = srcu_read_lock(&vcpu->kvm->srcu); 2278 2279 /* 2280 * We may have faulted on a shadow stage 2 page table if we are 2281 * running a nested guest. In this case, we have to resolve the L2 2282 * IPA to the L1 IPA first, before knowing what kind of memory should 2283 * back the L1 IPA. 2284 * 2285 * If the shadow stage 2 page table walk faults, then we simply inject 2286 * this to the guest and carry on. 2287 * 2288 * If there are no shadow S2 PTs because S2 is disabled, there is 2289 * nothing to walk and we treat it as a 1:1 before going through the 2290 * canonical translation. 2291 */ 2292 if (kvm_is_nested_s2_mmu(vcpu->kvm,vcpu->arch.hw_mmu) && 2293 vcpu->arch.hw_mmu->nested_stage2_enabled) { 2294 u32 esr; 2295 2296 ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans); 2297 if (ret == -EAGAIN) { 2298 ret = 1; 2299 goto out_unlock; 2300 } 2301 2302 if (ret) { 2303 esr = kvm_s2_trans_esr(&nested_trans); 2304 kvm_inject_s2_fault(vcpu, esr); 2305 goto out_unlock; 2306 } 2307 2308 ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans); 2309 if (ret) { 2310 esr = kvm_s2_trans_esr(&nested_trans); 2311 kvm_inject_s2_fault(vcpu, esr); 2312 goto out_unlock; 2313 } 2314 2315 ipa = kvm_s2_trans_output(&nested_trans); 2316 nested = &nested_trans; 2317 } 2318 2319 gfn = ipa >> PAGE_SHIFT; 2320 memslot = gfn_to_memslot(vcpu->kvm, gfn); 2321 hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); 2322 write_fault = kvm_is_write_fault(vcpu); 2323 if (kvm_is_error_hva(hva) || (write_fault && !writable)) { 2324 /* 2325 * The guest has put either its instructions or its page-tables 2326 * somewhere it shouldn't have. Userspace won't be able to do 2327 * anything about this (there's no syndrome for a start), so 2328 * re-inject the abort back into the guest. 2329 */ 2330 if (is_iabt) { 2331 ret = -ENOEXEC; 2332 goto out; 2333 } 2334 2335 if (kvm_vcpu_abt_iss1tw(vcpu)) { 2336 ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); 2337 goto out_unlock; 2338 } 2339 2340 /* 2341 * Check for a cache maintenance operation. Since we 2342 * ended-up here, we know it is outside of any memory 2343 * slot. But we can't find out if that is for a device, 2344 * or if the guest is just being stupid. The only thing 2345 * we know for sure is that this range cannot be cached. 2346 * 2347 * So let's assume that the guest is just being 2348 * cautious, and skip the instruction. 2349 */ 2350 if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) { 2351 kvm_incr_pc(vcpu); 2352 ret = 1; 2353 goto out_unlock; 2354 } 2355 2356 /* 2357 * The IPA is reported as [MAX:12], so we need to 2358 * complement it with the bottom 12 bits from the 2359 * faulting VA. This is always 12 bits, irrespective 2360 * of the page size. 2361 */ 2362 ipa |= FAR_TO_FIPA_OFFSET(kvm_vcpu_get_hfar(vcpu)); 2363 ret = io_mem_abort(vcpu, ipa); 2364 goto out_unlock; 2365 } 2366 2367 /* Userspace should not be able to register out-of-bounds IPAs */ 2368 VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu)); 2369 2370 if (esr_fsc_is_access_flag_fault(esr)) { 2371 handle_access_fault(vcpu, fault_ipa); 2372 ret = 1; 2373 goto out_unlock; 2374 } 2375 2376 const struct kvm_s2_fault_desc s2fd = { 2377 .vcpu = vcpu, 2378 .fault_ipa = fault_ipa, 2379 .nested = nested, 2380 .memslot = memslot, 2381 .hva = hva, 2382 }; 2383 2384 if (kvm_vm_is_protected(vcpu->kvm)) { 2385 ret = pkvm_mem_abort(&s2fd); 2386 } else { 2387 VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) && 2388 !write_fault && 2389 !kvm_vcpu_trap_is_exec_fault(vcpu)); 2390 2391 if (kvm_slot_has_gmem(memslot)) 2392 ret = gmem_abort(&s2fd); 2393 else 2394 ret = user_mem_abort(&s2fd); 2395 } 2396 2397 if (ret == 0) 2398 ret = 1; 2399 out: 2400 if (ret == -ENOEXEC) 2401 ret = kvm_inject_sea_iabt(vcpu, kvm_vcpu_get_hfar(vcpu)); 2402 out_unlock: 2403 srcu_read_unlock(&vcpu->kvm->srcu, idx); 2404 return ret; 2405 } 2406 2407 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) 2408 { 2409 if (!kvm->arch.mmu.pgt || kvm_vm_is_protected(kvm)) 2410 return false; 2411 2412 __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT, 2413 (range->end - range->start) << PAGE_SHIFT, 2414 range->may_block); 2415 2416 kvm_nested_s2_unmap(kvm, range->may_block); 2417 return false; 2418 } 2419 2420 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 2421 { 2422 u64 size = (range->end - range->start) << PAGE_SHIFT; 2423 2424 if (!kvm->arch.mmu.pgt || kvm_vm_is_protected(kvm)) 2425 return false; 2426 2427 return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, 2428 range->start << PAGE_SHIFT, 2429 size, true); 2430 /* 2431 * TODO: Handle nested_mmu structures here using the reverse mapping in 2432 * a later version of patch series. 2433 */ 2434 } 2435 2436 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 2437 { 2438 u64 size = (range->end - range->start) << PAGE_SHIFT; 2439 2440 if (!kvm->arch.mmu.pgt || kvm_vm_is_protected(kvm)) 2441 return false; 2442 2443 return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, 2444 range->start << PAGE_SHIFT, 2445 size, false); 2446 } 2447 2448 phys_addr_t kvm_mmu_get_httbr(void) 2449 { 2450 return __pa(hyp_pgtable->pgd); 2451 } 2452 2453 phys_addr_t kvm_get_idmap_vector(void) 2454 { 2455 return hyp_idmap_vector; 2456 } 2457 2458 static int kvm_map_idmap_text(void) 2459 { 2460 unsigned long size = hyp_idmap_end - hyp_idmap_start; 2461 int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start, 2462 PAGE_HYP_EXEC); 2463 if (err) 2464 kvm_err("Failed to idmap %lx-%lx\n", 2465 hyp_idmap_start, hyp_idmap_end); 2466 2467 return err; 2468 } 2469 2470 static void *kvm_hyp_zalloc_page(void *arg) 2471 { 2472 return (void *)get_zeroed_page(GFP_KERNEL); 2473 } 2474 2475 static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = { 2476 .zalloc_page = kvm_hyp_zalloc_page, 2477 .get_page = kvm_host_get_page, 2478 .put_page = kvm_host_put_page, 2479 .phys_to_virt = kvm_host_va, 2480 .virt_to_phys = kvm_host_pa, 2481 }; 2482 2483 int __init kvm_mmu_init(u32 hyp_va_bits) 2484 { 2485 int err; 2486 2487 hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start); 2488 hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); 2489 hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end); 2490 hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE); 2491 hyp_idmap_vector = __pa_symbol(__kvm_hyp_init); 2492 2493 /* 2494 * We rely on the linker script to ensure at build time that the HYP 2495 * init code does not cross a page boundary. 2496 */ 2497 BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); 2498 2499 kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits); 2500 kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); 2501 kvm_debug("HYP VA range: %lx:%lx\n", 2502 kern_hyp_va(PAGE_OFFSET), 2503 kern_hyp_va((unsigned long)high_memory - 1)); 2504 2505 if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) && 2506 hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) && 2507 hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) { 2508 /* 2509 * The idmap page is intersecting with the VA space, 2510 * it is not safe to continue further. 2511 */ 2512 kvm_err("IDMAP intersecting with HYP VA, unable to continue\n"); 2513 err = -EINVAL; 2514 goto out; 2515 } 2516 2517 hyp_pgtable = kzalloc_obj(*hyp_pgtable); 2518 if (!hyp_pgtable) { 2519 kvm_err("Hyp mode page-table not allocated\n"); 2520 err = -ENOMEM; 2521 goto out; 2522 } 2523 2524 err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits, &kvm_hyp_mm_ops); 2525 if (err) 2526 goto out_free_pgtable; 2527 2528 err = kvm_map_idmap_text(); 2529 if (err) 2530 goto out_destroy_pgtable; 2531 2532 io_map_base = hyp_idmap_start; 2533 __hyp_va_bits = hyp_va_bits; 2534 return 0; 2535 2536 out_destroy_pgtable: 2537 kvm_pgtable_hyp_destroy(hyp_pgtable); 2538 out_free_pgtable: 2539 kfree(hyp_pgtable); 2540 hyp_pgtable = NULL; 2541 out: 2542 return err; 2543 } 2544 2545 void kvm_arch_commit_memory_region(struct kvm *kvm, 2546 struct kvm_memory_slot *old, 2547 const struct kvm_memory_slot *new, 2548 enum kvm_mr_change change) 2549 { 2550 bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES; 2551 2552 /* 2553 * At this point memslot has been committed and there is an 2554 * allocated dirty_bitmap[], dirty pages will be tracked while the 2555 * memory slot is write protected. 2556 */ 2557 if (log_dirty_pages) { 2558 2559 if (change == KVM_MR_DELETE) 2560 return; 2561 2562 /* 2563 * Huge and normal pages are write-protected and split 2564 * on either of these two cases: 2565 * 2566 * 1. with initial-all-set: gradually with CLEAR ioctls, 2567 */ 2568 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) 2569 return; 2570 /* 2571 * or 2572 * 2. without initial-all-set: all in one shot when 2573 * enabling dirty logging. 2574 */ 2575 kvm_mmu_wp_memory_region(kvm, new->id); 2576 kvm_mmu_split_memory_region(kvm, new->id); 2577 } else { 2578 /* 2579 * Free any leftovers from the eager page splitting cache. Do 2580 * this when deleting, moving, disabling dirty logging, or 2581 * creating the memslot (a nop). Doing it for deletes makes 2582 * sure we don't leak memory, and there's no need to keep the 2583 * cache around for any of the other cases. 2584 */ 2585 kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); 2586 } 2587 } 2588 2589 int kvm_arch_prepare_memory_region(struct kvm *kvm, 2590 const struct kvm_memory_slot *old, 2591 struct kvm_memory_slot *new, 2592 enum kvm_mr_change change) 2593 { 2594 hva_t hva, reg_end; 2595 int ret = 0; 2596 2597 if (kvm_vm_is_protected(kvm)) { 2598 /* Cannot modify memslots once a pVM has run. */ 2599 if (pkvm_hyp_vm_is_created(kvm) && 2600 (change == KVM_MR_DELETE || change == KVM_MR_MOVE)) { 2601 return -EPERM; 2602 } 2603 2604 if (new && 2605 new->flags & (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)) { 2606 return -EPERM; 2607 } 2608 } 2609 2610 if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && 2611 change != KVM_MR_FLAGS_ONLY) 2612 return 0; 2613 2614 /* 2615 * Prevent userspace from creating a memory region outside of the IPA 2616 * space addressable by the KVM guest IPA space. 2617 */ 2618 if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT)) 2619 return -EFAULT; 2620 2621 /* 2622 * Only support guest_memfd backed memslots with mappable memory, since 2623 * there aren't any CoCo VMs that support only private memory on arm64. 2624 */ 2625 if (kvm_slot_has_gmem(new) && !kvm_memslot_is_gmem_only(new)) 2626 return -EINVAL; 2627 2628 hva = new->userspace_addr; 2629 reg_end = hva + (new->npages << PAGE_SHIFT); 2630 2631 mmap_read_lock(current->mm); 2632 /* 2633 * A memory region could potentially cover multiple VMAs, and any holes 2634 * between them, so iterate over all of them. 2635 * 2636 * +--------------------------------------------+ 2637 * +---------------+----------------+ +----------------+ 2638 * | : VMA 1 | VMA 2 | | VMA 3 : | 2639 * +---------------+----------------+ +----------------+ 2640 * | memory region | 2641 * +--------------------------------------------+ 2642 */ 2643 do { 2644 struct vm_area_struct *vma; 2645 2646 vma = find_vma_intersection(current->mm, hva, reg_end); 2647 if (!vma) 2648 break; 2649 2650 if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) { 2651 ret = -EINVAL; 2652 break; 2653 } 2654 2655 if (vma->vm_flags & VM_PFNMAP) { 2656 /* IO region dirty page logging not allowed */ 2657 if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { 2658 ret = -EINVAL; 2659 break; 2660 } 2661 2662 /* 2663 * Cacheable PFNMAP is allowed only if the hardware 2664 * supports it. 2665 */ 2666 if (kvm_vma_is_cacheable(vma) && !kvm_supports_cacheable_pfnmap()) { 2667 ret = -EINVAL; 2668 break; 2669 } 2670 } 2671 hva = min(reg_end, vma->vm_end); 2672 } while (hva < reg_end); 2673 2674 mmap_read_unlock(current->mm); 2675 return ret; 2676 } 2677 2678 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) 2679 { 2680 } 2681 2682 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) 2683 { 2684 } 2685 2686 void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 2687 struct kvm_memory_slot *slot) 2688 { 2689 gpa_t gpa = slot->base_gfn << PAGE_SHIFT; 2690 phys_addr_t size = slot->npages << PAGE_SHIFT; 2691 2692 write_lock(&kvm->mmu_lock); 2693 kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true); 2694 kvm_nested_s2_unmap(kvm, true); 2695 write_unlock(&kvm->mmu_lock); 2696 } 2697 2698 /* 2699 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). 2700 * 2701 * Main problems: 2702 * - S/W ops are local to a CPU (not broadcast) 2703 * - We have line migration behind our back (speculation) 2704 * - System caches don't support S/W at all (damn!) 2705 * 2706 * In the face of the above, the best we can do is to try and convert 2707 * S/W ops to VA ops. Because the guest is not allowed to infer the 2708 * S/W to PA mapping, it can only use S/W to nuke the whole cache, 2709 * which is a rather good thing for us. 2710 * 2711 * Also, it is only used when turning caches on/off ("The expected 2712 * usage of the cache maintenance instructions that operate by set/way 2713 * is associated with the cache maintenance instructions associated 2714 * with the powerdown and powerup of caches, if this is required by 2715 * the implementation."). 2716 * 2717 * We use the following policy: 2718 * 2719 * - If we trap a S/W operation, we enable VM trapping to detect 2720 * caches being turned on/off, and do a full clean. 2721 * 2722 * - We flush the caches on both caches being turned on and off. 2723 * 2724 * - Once the caches are enabled, we stop trapping VM ops. 2725 */ 2726 void kvm_set_way_flush(struct kvm_vcpu *vcpu) 2727 { 2728 unsigned long hcr = *vcpu_hcr(vcpu); 2729 2730 /* 2731 * If this is the first time we do a S/W operation 2732 * (i.e. HCR_TVM not set) flush the whole memory, and set the 2733 * VM trapping. 2734 * 2735 * Otherwise, rely on the VM trapping to wait for the MMU + 2736 * Caches to be turned off. At that point, we'll be able to 2737 * clean the caches again. 2738 */ 2739 if (!(hcr & HCR_TVM)) { 2740 trace_kvm_set_way_flush(*vcpu_pc(vcpu), 2741 vcpu_has_cache_enabled(vcpu)); 2742 stage2_flush_vm(vcpu->kvm); 2743 *vcpu_hcr(vcpu) = hcr | HCR_TVM; 2744 } 2745 } 2746 2747 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled) 2748 { 2749 bool now_enabled = vcpu_has_cache_enabled(vcpu); 2750 2751 /* 2752 * If switching the MMU+caches on, need to invalidate the caches. 2753 * If switching it off, need to clean the caches. 2754 * Clean + invalidate does the trick always. 2755 */ 2756 if (now_enabled != was_enabled) 2757 stage2_flush_vm(vcpu->kvm); 2758 2759 /* Caches are now on, stop trapping VM ops (until a S/W op) */ 2760 if (now_enabled) 2761 *vcpu_hcr(vcpu) &= ~HCR_TVM; 2762 2763 trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled); 2764 } 2765