1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2012 - Virtual Open Systems and Columbia University 4 * Author: Christoffer Dall <c.dall@virtualopensystems.com> 5 */ 6 7 #include <linux/mman.h> 8 #include <linux/kvm_host.h> 9 #include <linux/io.h> 10 #include <linux/hugetlb.h> 11 #include <linux/sched/signal.h> 12 #include <trace/events/kvm.h> 13 #include <asm/pgalloc.h> 14 #include <asm/cacheflush.h> 15 #include <asm/kvm_arm.h> 16 #include <asm/kvm_mmu.h> 17 #include <asm/kvm_pgtable.h> 18 #include <asm/kvm_pkvm.h> 19 #include <asm/kvm_ras.h> 20 #include <asm/kvm_asm.h> 21 #include <asm/kvm_emulate.h> 22 #include <asm/virt.h> 23 24 #include "trace.h" 25 26 static struct kvm_pgtable *hyp_pgtable; 27 static DEFINE_MUTEX(kvm_hyp_pgd_mutex); 28 29 static unsigned long __ro_after_init hyp_idmap_start; 30 static unsigned long __ro_after_init hyp_idmap_end; 31 static phys_addr_t __ro_after_init hyp_idmap_vector; 32 33 u32 __ro_after_init __hyp_va_bits; 34 35 static unsigned long __ro_after_init io_map_base; 36 37 #define KVM_PGT_FN(fn) (!is_protected_kvm_enabled() ? fn : p ## fn) 38 39 static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end, 40 phys_addr_t size) 41 { 42 phys_addr_t boundary = ALIGN_DOWN(addr + size, size); 43 44 return (boundary - 1 < end - 1) ? boundary : end; 45 } 46 47 static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) 48 { 49 phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); 50 51 return __stage2_range_addr_end(addr, end, size); 52 } 53 54 /* 55 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise, 56 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK, 57 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too 58 * long will also starve other vCPUs. We have to also make sure that the page 59 * tables are not freed while we released the lock. 60 */ 61 static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, 62 phys_addr_t end, 63 int (*fn)(struct kvm_pgtable *, u64, u64), 64 bool resched) 65 { 66 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 67 int ret; 68 u64 next; 69 70 do { 71 struct kvm_pgtable *pgt = mmu->pgt; 72 if (!pgt) 73 return -EINVAL; 74 75 next = stage2_range_addr_end(addr, end); 76 ret = fn(pgt, addr, next - addr); 77 if (ret) 78 break; 79 80 if (resched && next != end) 81 cond_resched_rwlock_write(&kvm->mmu_lock); 82 } while (addr = next, addr != end); 83 84 return ret; 85 } 86 87 #define stage2_apply_range_resched(mmu, addr, end, fn) \ 88 stage2_apply_range(mmu, addr, end, fn, true) 89 90 /* 91 * Get the maximum number of page-tables pages needed to split a range 92 * of blocks into PAGE_SIZE PTEs. It assumes the range is already 93 * mapped at level 2, or at level 1 if allowed. 
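 *
 * Editorial worked example (an illustrative sketch, not from the original
 * source; it assumes a 4KiB granule where PUD_SIZE is 1GiB and PMD_SIZE is
 * 2MiB, and that the first term applies because level-1 (PUD) blocks are in
 * use):
 *
 *	range = SZ_1G;
 *	n  = DIV_ROUND_UP(range, PUD_SIZE);	// 1:   one PMD table to shatter the 1GiB block
 *	n += DIV_ROUND_UP(range, PMD_SIZE);	// 512: one PTE table per 2MiB block
 *	// n == 513 page-table pages in the worst case
 *
 * This is an upper bound; ranges already mapped at a finer granularity need
 * fewer pages.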
94 */ 95 static int kvm_mmu_split_nr_page_tables(u64 range) 96 { 97 int n = 0; 98 99 if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2) 100 n += DIV_ROUND_UP(range, PUD_SIZE); 101 n += DIV_ROUND_UP(range, PMD_SIZE); 102 return n; 103 } 104 105 static bool need_split_memcache_topup_or_resched(struct kvm *kvm) 106 { 107 struct kvm_mmu_memory_cache *cache; 108 u64 chunk_size, min; 109 110 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) 111 return true; 112 113 chunk_size = kvm->arch.mmu.split_page_chunk_size; 114 min = kvm_mmu_split_nr_page_tables(chunk_size); 115 cache = &kvm->arch.mmu.split_page_cache; 116 return kvm_mmu_memory_cache_nr_free_objects(cache) < min; 117 } 118 119 static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr, 120 phys_addr_t end) 121 { 122 struct kvm_mmu_memory_cache *cache; 123 struct kvm_pgtable *pgt; 124 int ret, cache_capacity; 125 u64 next, chunk_size; 126 127 lockdep_assert_held_write(&kvm->mmu_lock); 128 129 chunk_size = kvm->arch.mmu.split_page_chunk_size; 130 cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size); 131 132 if (chunk_size == 0) 133 return 0; 134 135 cache = &kvm->arch.mmu.split_page_cache; 136 137 do { 138 if (need_split_memcache_topup_or_resched(kvm)) { 139 write_unlock(&kvm->mmu_lock); 140 cond_resched(); 141 /* Eager page splitting is best-effort. */ 142 ret = __kvm_mmu_topup_memory_cache(cache, 143 cache_capacity, 144 cache_capacity); 145 write_lock(&kvm->mmu_lock); 146 if (ret) 147 break; 148 } 149 150 pgt = kvm->arch.mmu.pgt; 151 if (!pgt) 152 return -EINVAL; 153 154 next = __stage2_range_addr_end(addr, end, chunk_size); 155 ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache); 156 if (ret) 157 break; 158 } while (addr = next, addr != end); 159 160 return ret; 161 } 162 163 static bool memslot_is_logging(struct kvm_memory_slot *memslot) 164 { 165 return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); 166 } 167 168 /** 169 * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8 170 * @kvm: pointer to kvm structure. 
171 * 172 * Interface to HYP function to flush all VM TLB entries 173 */ 174 int kvm_arch_flush_remote_tlbs(struct kvm *kvm) 175 { 176 if (is_protected_kvm_enabled()) 177 kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); 178 else 179 kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); 180 return 0; 181 } 182 183 int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, 184 gfn_t gfn, u64 nr_pages) 185 { 186 u64 size = nr_pages << PAGE_SHIFT; 187 u64 addr = gfn << PAGE_SHIFT; 188 189 if (is_protected_kvm_enabled()) 190 kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); 191 else 192 kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size); 193 return 0; 194 } 195 196 static bool kvm_is_device_pfn(unsigned long pfn) 197 { 198 return !pfn_is_map_memory(pfn); 199 } 200 201 static void *stage2_memcache_zalloc_page(void *arg) 202 { 203 struct kvm_mmu_memory_cache *mc = arg; 204 void *virt; 205 206 /* Allocated with __GFP_ZERO, so no need to zero */ 207 virt = kvm_mmu_memory_cache_alloc(mc); 208 if (virt) 209 kvm_account_pgtable_pages(virt, 1); 210 return virt; 211 } 212 213 static void *kvm_host_zalloc_pages_exact(size_t size) 214 { 215 return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); 216 } 217 218 static void *kvm_s2_zalloc_pages_exact(size_t size) 219 { 220 void *virt = kvm_host_zalloc_pages_exact(size); 221 222 if (virt) 223 kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT)); 224 return virt; 225 } 226 227 static void kvm_s2_free_pages_exact(void *virt, size_t size) 228 { 229 kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT)); 230 free_pages_exact(virt, size); 231 } 232 233 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops; 234 235 static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head) 236 { 237 struct page *page = container_of(head, struct page, rcu_head); 238 void *pgtable = page_to_virt(page); 239 s8 level = page_private(page); 240 241 KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level); 242 } 243 244 static void stage2_free_unlinked_table(void *addr, s8 level) 245 { 246 struct page *page = virt_to_page(addr); 247 248 set_page_private(page, (unsigned long)level); 249 call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb); 250 } 251 252 static void kvm_host_get_page(void *addr) 253 { 254 get_page(virt_to_page(addr)); 255 } 256 257 static void kvm_host_put_page(void *addr) 258 { 259 put_page(virt_to_page(addr)); 260 } 261 262 static void kvm_s2_put_page(void *addr) 263 { 264 struct page *p = virt_to_page(addr); 265 /* Dropping last refcount, the page will be freed */ 266 if (page_count(p) == 1) 267 kvm_account_pgtable_pages(addr, -1); 268 put_page(p); 269 } 270 271 static int kvm_host_page_count(void *addr) 272 { 273 return page_count(virt_to_page(addr)); 274 } 275 276 static phys_addr_t kvm_host_pa(void *addr) 277 { 278 return __pa(addr); 279 } 280 281 static void *kvm_host_va(phys_addr_t phys) 282 { 283 return __va(phys); 284 } 285 286 static void clean_dcache_guest_page(void *va, size_t size) 287 { 288 __clean_dcache_guest_page(va, size); 289 } 290 291 static void invalidate_icache_guest_page(void *va, size_t size) 292 { 293 __invalidate_icache_guest_page(va, size); 294 } 295 296 /* 297 * Unmapping vs dcache management: 298 * 299 * If a guest maps certain memory pages as uncached, all writes will 300 * bypass the data cache and go directly to RAM. However, the CPUs 301 * can still speculate reads (not writes) and fill cache lines with 302 * data. 
303 * 304 * Those cache lines will be *clean* cache lines though, so a 305 * clean+invalidate operation is equivalent to an invalidate 306 * operation, because no cache lines are marked dirty. 307 * 308 * Those clean cache lines could be filled prior to an uncached write 309 * by the guest, and the cache coherent IO subsystem would therefore 310 * end up writing old data to disk. 311 * 312 * This is why right after unmapping a page/section and invalidating 313 * the corresponding TLBs, we flush to make sure the IO subsystem will 314 * never hit in the cache. 315 * 316 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as 317 * we then fully enforce cacheability of RAM, no matter what the guest 318 * does. 319 */ 320 /** 321 * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range 322 * @mmu: The KVM stage-2 MMU pointer 323 * @start: The intermediate physical base address of the range to unmap 324 * @size: The size of the area to unmap 325 * @may_block: Whether or not we are permitted to block 326 * 327 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must 328 * be called while holding mmu_lock (unless for freeing the stage2 pgd before 329 * destroying the VM), otherwise another faulting VCPU may come in and mess 330 * with things behind our backs. 331 */ 332 static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size, 333 bool may_block) 334 { 335 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 336 phys_addr_t end = start + size; 337 338 lockdep_assert_held_write(&kvm->mmu_lock); 339 WARN_ON(size & ~PAGE_MASK); 340 WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap), 341 may_block)); 342 } 343 344 void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, 345 u64 size, bool may_block) 346 { 347 __unmap_stage2_range(mmu, start, size, may_block); 348 } 349 350 void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) 351 { 352 stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush)); 353 } 354 355 static void stage2_flush_memslot(struct kvm *kvm, 356 struct kvm_memory_slot *memslot) 357 { 358 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; 359 phys_addr_t end = addr + PAGE_SIZE * memslot->npages; 360 361 kvm_stage2_flush_range(&kvm->arch.mmu, addr, end); 362 } 363 364 /** 365 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2 366 * @kvm: The struct kvm pointer 367 * 368 * Go through the stage 2 page tables and invalidate any cache lines 369 * backing memory already mapped to the VM. 
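 *
 * Editorial note: kvm_stage2_flush_range() above is built on
 * stage2_apply_range_resched(), so a flush over a large memslot is chopped
 * into kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL)-sized chunks and the
 * mmu_lock can be dropped between chunks via cond_resched_rwlock_write()
 * rather than being held across the whole region.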
370 */ 371 static void stage2_flush_vm(struct kvm *kvm) 372 { 373 struct kvm_memslots *slots; 374 struct kvm_memory_slot *memslot; 375 int idx, bkt; 376 377 idx = srcu_read_lock(&kvm->srcu); 378 write_lock(&kvm->mmu_lock); 379 380 slots = kvm_memslots(kvm); 381 kvm_for_each_memslot(memslot, bkt, slots) 382 stage2_flush_memslot(kvm, memslot); 383 384 kvm_nested_s2_flush(kvm); 385 386 write_unlock(&kvm->mmu_lock); 387 srcu_read_unlock(&kvm->srcu, idx); 388 } 389 390 /** 391 * free_hyp_pgds - free Hyp-mode page tables 392 */ 393 void __init free_hyp_pgds(void) 394 { 395 mutex_lock(&kvm_hyp_pgd_mutex); 396 if (hyp_pgtable) { 397 kvm_pgtable_hyp_destroy(hyp_pgtable); 398 kfree(hyp_pgtable); 399 hyp_pgtable = NULL; 400 } 401 mutex_unlock(&kvm_hyp_pgd_mutex); 402 } 403 404 static bool kvm_host_owns_hyp_mappings(void) 405 { 406 if (is_kernel_in_hyp_mode()) 407 return false; 408 409 if (static_branch_likely(&kvm_protected_mode_initialized)) 410 return false; 411 412 /* 413 * This can happen at boot time when __create_hyp_mappings() is called 414 * after the hyp protection has been enabled, but the static key has 415 * not been flipped yet. 416 */ 417 if (!hyp_pgtable && is_protected_kvm_enabled()) 418 return false; 419 420 WARN_ON(!hyp_pgtable); 421 422 return true; 423 } 424 425 int __create_hyp_mappings(unsigned long start, unsigned long size, 426 unsigned long phys, enum kvm_pgtable_prot prot) 427 { 428 int err; 429 430 if (WARN_ON(!kvm_host_owns_hyp_mappings())) 431 return -EINVAL; 432 433 mutex_lock(&kvm_hyp_pgd_mutex); 434 err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot); 435 mutex_unlock(&kvm_hyp_pgd_mutex); 436 437 return err; 438 } 439 440 static phys_addr_t kvm_kaddr_to_phys(void *kaddr) 441 { 442 if (!is_vmalloc_addr(kaddr)) { 443 BUG_ON(!virt_addr_valid(kaddr)); 444 return __pa(kaddr); 445 } else { 446 return page_to_phys(vmalloc_to_page(kaddr)) + 447 offset_in_page(kaddr); 448 } 449 } 450 451 struct hyp_shared_pfn { 452 u64 pfn; 453 int count; 454 struct rb_node node; 455 }; 456 457 static DEFINE_MUTEX(hyp_shared_pfns_lock); 458 static struct rb_root hyp_shared_pfns = RB_ROOT; 459 460 static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node, 461 struct rb_node **parent) 462 { 463 struct hyp_shared_pfn *this; 464 465 *node = &hyp_shared_pfns.rb_node; 466 *parent = NULL; 467 while (**node) { 468 this = container_of(**node, struct hyp_shared_pfn, node); 469 *parent = **node; 470 if (this->pfn < pfn) 471 *node = &((**node)->rb_left); 472 else if (this->pfn > pfn) 473 *node = &((**node)->rb_right); 474 else 475 return this; 476 } 477 478 return NULL; 479 } 480 481 static int share_pfn_hyp(u64 pfn) 482 { 483 struct rb_node **node, *parent; 484 struct hyp_shared_pfn *this; 485 int ret = 0; 486 487 mutex_lock(&hyp_shared_pfns_lock); 488 this = find_shared_pfn(pfn, &node, &parent); 489 if (this) { 490 this->count++; 491 goto unlock; 492 } 493 494 this = kzalloc(sizeof(*this), GFP_KERNEL); 495 if (!this) { 496 ret = -ENOMEM; 497 goto unlock; 498 } 499 500 this->pfn = pfn; 501 this->count = 1; 502 rb_link_node(&this->node, parent, node); 503 rb_insert_color(&this->node, &hyp_shared_pfns); 504 ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn, 1); 505 unlock: 506 mutex_unlock(&hyp_shared_pfns_lock); 507 508 return ret; 509 } 510 511 static int unshare_pfn_hyp(u64 pfn) 512 { 513 struct rb_node **node, *parent; 514 struct hyp_shared_pfn *this; 515 int ret = 0; 516 517 mutex_lock(&hyp_shared_pfns_lock); 518 this = find_shared_pfn(pfn, &node, &parent); 519 if 
(WARN_ON(!this)) { 520 ret = -ENOENT; 521 goto unlock; 522 } 523 524 this->count--; 525 if (this->count) 526 goto unlock; 527 528 rb_erase(&this->node, &hyp_shared_pfns); 529 kfree(this); 530 ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn, 1); 531 unlock: 532 mutex_unlock(&hyp_shared_pfns_lock); 533 534 return ret; 535 } 536 537 int kvm_share_hyp(void *from, void *to) 538 { 539 phys_addr_t start, end, cur; 540 u64 pfn; 541 int ret; 542 543 if (is_kernel_in_hyp_mode()) 544 return 0; 545 546 /* 547 * The share hcall maps things in the 'fixed-offset' region of the hyp 548 * VA space, so we can only share physically contiguous data-structures 549 * for now. 550 */ 551 if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to)) 552 return -EINVAL; 553 554 if (kvm_host_owns_hyp_mappings()) 555 return create_hyp_mappings(from, to, PAGE_HYP); 556 557 start = ALIGN_DOWN(__pa(from), PAGE_SIZE); 558 end = PAGE_ALIGN(__pa(to)); 559 for (cur = start; cur < end; cur += PAGE_SIZE) { 560 pfn = __phys_to_pfn(cur); 561 ret = share_pfn_hyp(pfn); 562 if (ret) 563 return ret; 564 } 565 566 return 0; 567 } 568 569 void kvm_unshare_hyp(void *from, void *to) 570 { 571 phys_addr_t start, end, cur; 572 u64 pfn; 573 574 if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from) 575 return; 576 577 start = ALIGN_DOWN(__pa(from), PAGE_SIZE); 578 end = PAGE_ALIGN(__pa(to)); 579 for (cur = start; cur < end; cur += PAGE_SIZE) { 580 pfn = __phys_to_pfn(cur); 581 WARN_ON(unshare_pfn_hyp(pfn)); 582 } 583 } 584 585 /** 586 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode 587 * @from: The virtual kernel start address of the range 588 * @to: The virtual kernel end address of the range (exclusive) 589 * @prot: The protection to be applied to this range 590 * 591 * The same virtual address as the kernel virtual address is also used 592 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying 593 * physical pages. 594 */ 595 int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) 596 { 597 phys_addr_t phys_addr; 598 unsigned long virt_addr; 599 unsigned long start = kern_hyp_va((unsigned long)from); 600 unsigned long end = kern_hyp_va((unsigned long)to); 601 602 if (is_kernel_in_hyp_mode()) 603 return 0; 604 605 if (!kvm_host_owns_hyp_mappings()) 606 return -EPERM; 607 608 start = start & PAGE_MASK; 609 end = PAGE_ALIGN(end); 610 611 for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { 612 int err; 613 614 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); 615 err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr, 616 prot); 617 if (err) 618 return err; 619 } 620 621 return 0; 622 } 623 624 static int __hyp_alloc_private_va_range(unsigned long base) 625 { 626 lockdep_assert_held(&kvm_hyp_pgd_mutex); 627 628 if (!PAGE_ALIGNED(base)) 629 return -EINVAL; 630 631 /* 632 * Verify that BIT(VA_BITS - 1) hasn't been flipped by 633 * allocating the new area, as it would indicate we've 634 * overflowed the idmap/IO address range. 635 */ 636 if ((base ^ io_map_base) & BIT(VA_BITS - 1)) 637 return -ENOMEM; 638 639 io_map_base = base; 640 641 return 0; 642 } 643 644 /** 645 * hyp_alloc_private_va_range - Allocates a private VA range. 646 * @size: The size of the VA range to reserve. 647 * @haddr: The hypervisor virtual start address of the allocation. 648 * 649 * The private virtual address (VA) range is allocated below io_map_base 650 * and aligned based on the order of @size. 
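 *
 * Editorial example (an illustrative sketch, assuming a 4KiB PAGE_SIZE): a
 * request for 6KiB is rounded up to two pages (8KiB) by PAGE_ALIGN(), the
 * running io_map_base is moved down by that amount, and the new, lower
 * io_map_base is what gets returned through @haddr. In other words the
 * allocator hands out page-granular ranges growing downwards from
 * io_map_base.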
651 * 652 * Return: 0 on success or negative error code on failure. 653 */ 654 int hyp_alloc_private_va_range(size_t size, unsigned long *haddr) 655 { 656 unsigned long base; 657 int ret = 0; 658 659 mutex_lock(&kvm_hyp_pgd_mutex); 660 661 /* 662 * This assumes that we have enough space below the idmap 663 * page to allocate our VAs. If not, the check in 664 * __hyp_alloc_private_va_range() will kick. A potential 665 * alternative would be to detect that overflow and switch 666 * to an allocation above the idmap. 667 * 668 * The allocated size is always a multiple of PAGE_SIZE. 669 */ 670 size = PAGE_ALIGN(size); 671 base = io_map_base - size; 672 ret = __hyp_alloc_private_va_range(base); 673 674 mutex_unlock(&kvm_hyp_pgd_mutex); 675 676 if (!ret) 677 *haddr = base; 678 679 return ret; 680 } 681 682 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, 683 unsigned long *haddr, 684 enum kvm_pgtable_prot prot) 685 { 686 unsigned long addr; 687 int ret = 0; 688 689 if (!kvm_host_owns_hyp_mappings()) { 690 addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping, 691 phys_addr, size, prot); 692 if (IS_ERR_VALUE(addr)) 693 return addr; 694 *haddr = addr; 695 696 return 0; 697 } 698 699 size = PAGE_ALIGN(size + offset_in_page(phys_addr)); 700 ret = hyp_alloc_private_va_range(size, &addr); 701 if (ret) 702 return ret; 703 704 ret = __create_hyp_mappings(addr, size, phys_addr, prot); 705 if (ret) 706 return ret; 707 708 *haddr = addr + offset_in_page(phys_addr); 709 return ret; 710 } 711 712 int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr) 713 { 714 unsigned long base; 715 size_t size; 716 int ret; 717 718 mutex_lock(&kvm_hyp_pgd_mutex); 719 /* 720 * Efficient stack verification using the NVHE_STACK_SHIFT bit implies 721 * an alignment of our allocation on the order of the size. 722 */ 723 size = NVHE_STACK_SIZE * 2; 724 base = ALIGN_DOWN(io_map_base - size, size); 725 726 ret = __hyp_alloc_private_va_range(base); 727 728 mutex_unlock(&kvm_hyp_pgd_mutex); 729 730 if (ret) { 731 kvm_err("Cannot allocate hyp stack guard page\n"); 732 return ret; 733 } 734 735 /* 736 * Since the stack grows downwards, map the stack to the page 737 * at the higher address and leave the lower guard page 738 * unbacked. 739 * 740 * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1 741 * and addresses corresponding to the guard page have the 742 * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection. 
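 *
 * Resulting layout, as an editorial sketch (G is shorthand for
 * NVHE_STACK_SIZE; base is 2 * G aligned thanks to the ALIGN_DOWN() above):
 *
 *	[base,		base + G)	guard page(s), deliberately left unmapped
 *	[base + G,	base + 2 * G)	the stack itself, mapped below
 *	*haddr = base + 2 * G		initial stack top (the stack grows down)
 *
 * Because base is 2 * G aligned, bit NVHE_STACK_SHIFT is 1 for every
 * address in the mapped half and 0 for every address in the guard half,
 * which is exactly the property the overflow check relies on.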
743 */ 744 ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE, 745 phys_addr, PAGE_HYP); 746 if (ret) 747 kvm_err("Cannot map hyp stack\n"); 748 749 *haddr = base + size; 750 751 return ret; 752 } 753 754 /** 755 * create_hyp_io_mappings - Map IO into both kernel and HYP 756 * @phys_addr: The physical start address which gets mapped 757 * @size: Size of the region being mapped 758 * @kaddr: Kernel VA for this mapping 759 * @haddr: HYP VA for this mapping 760 */ 761 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, 762 void __iomem **kaddr, 763 void __iomem **haddr) 764 { 765 unsigned long addr; 766 int ret; 767 768 if (is_protected_kvm_enabled()) 769 return -EPERM; 770 771 *kaddr = ioremap(phys_addr, size); 772 if (!*kaddr) 773 return -ENOMEM; 774 775 if (is_kernel_in_hyp_mode()) { 776 *haddr = *kaddr; 777 return 0; 778 } 779 780 ret = __create_hyp_private_mapping(phys_addr, size, 781 &addr, PAGE_HYP_DEVICE); 782 if (ret) { 783 iounmap(*kaddr); 784 *kaddr = NULL; 785 *haddr = NULL; 786 return ret; 787 } 788 789 *haddr = (void __iomem *)addr; 790 return 0; 791 } 792 793 /** 794 * create_hyp_exec_mappings - Map an executable range into HYP 795 * @phys_addr: The physical start address which gets mapped 796 * @size: Size of the region being mapped 797 * @haddr: HYP VA for this mapping 798 */ 799 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, 800 void **haddr) 801 { 802 unsigned long addr; 803 int ret; 804 805 BUG_ON(is_kernel_in_hyp_mode()); 806 807 ret = __create_hyp_private_mapping(phys_addr, size, 808 &addr, PAGE_HYP_EXEC); 809 if (ret) { 810 *haddr = NULL; 811 return ret; 812 } 813 814 *haddr = (void *)addr; 815 return 0; 816 } 817 818 static struct kvm_pgtable_mm_ops kvm_user_mm_ops = { 819 /* We shouldn't need any other callback to walk the PT */ 820 .phys_to_virt = kvm_host_va, 821 }; 822 823 static int get_user_mapping_size(struct kvm *kvm, u64 addr) 824 { 825 struct kvm_pgtable pgt = { 826 .pgd = (kvm_pteref_t)kvm->mm->pgd, 827 .ia_bits = vabits_actual, 828 .start_level = (KVM_PGTABLE_LAST_LEVEL - 829 ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1), 830 .mm_ops = &kvm_user_mm_ops, 831 }; 832 unsigned long flags; 833 kvm_pte_t pte = 0; /* Keep GCC quiet... */ 834 s8 level = S8_MAX; 835 int ret; 836 837 /* 838 * Disable IRQs so that we hazard against a concurrent 839 * teardown of the userspace page tables (which relies on 840 * IPI-ing threads). 841 */ 842 local_irq_save(flags); 843 ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); 844 local_irq_restore(flags); 845 846 if (ret) 847 return ret; 848 849 /* 850 * Not seeing an error, but not updating level? Something went 851 * deeply wrong... 852 */ 853 if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL)) 854 return -EFAULT; 855 if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL)) 856 return -EFAULT; 857 858 /* Oops, the userspace PTs are gone... 
Replay the fault */ 859 if (!kvm_pte_valid(pte)) 860 return -EAGAIN; 861 862 return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); 863 } 864 865 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { 866 .zalloc_page = stage2_memcache_zalloc_page, 867 .zalloc_pages_exact = kvm_s2_zalloc_pages_exact, 868 .free_pages_exact = kvm_s2_free_pages_exact, 869 .free_unlinked_table = stage2_free_unlinked_table, 870 .get_page = kvm_host_get_page, 871 .put_page = kvm_s2_put_page, 872 .page_count = kvm_host_page_count, 873 .phys_to_virt = kvm_host_va, 874 .virt_to_phys = kvm_host_pa, 875 .dcache_clean_inval_poc = clean_dcache_guest_page, 876 .icache_inval_pou = invalidate_icache_guest_page, 877 }; 878 879 static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type) 880 { 881 u32 kvm_ipa_limit = get_kvm_ipa_limit(); 882 u64 mmfr0, mmfr1; 883 u32 phys_shift; 884 885 if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) 886 return -EINVAL; 887 888 phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); 889 if (is_protected_kvm_enabled()) { 890 phys_shift = kvm_ipa_limit; 891 } else if (phys_shift) { 892 if (phys_shift > kvm_ipa_limit || 893 phys_shift < ARM64_MIN_PARANGE_BITS) 894 return -EINVAL; 895 } else { 896 phys_shift = KVM_PHYS_SHIFT; 897 if (phys_shift > kvm_ipa_limit) { 898 pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", 899 current->comm); 900 return -EINVAL; 901 } 902 } 903 904 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); 905 mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); 906 mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); 907 908 return 0; 909 } 910 911 /** 912 * kvm_init_stage2_mmu - Initialise a S2 MMU structure 913 * @kvm: The pointer to the KVM structure 914 * @mmu: The pointer to the s2 MMU structure 915 * @type: The machine type of the virtual machine 916 * 917 * Allocates only the stage-2 HW PGD level table(s). 918 * Note we don't need locking here as this is only called in two cases: 919 * 920 * - when the VM is created, which can't race against anything 921 * 922 * - when secondary kvm_s2_mmu structures are initialised for NV 923 * guests, and the caller must hold kvm->lock as this is called on a 924 * per-vcpu basis. 925 */ 926 int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) 927 { 928 int cpu, err; 929 struct kvm_pgtable *pgt; 930 931 /* 932 * If we already have our page tables in place, and the 933 * MMU context is the canonical one, we have a bug somewhere, 934 * as this is only supposed to ever happen once per VM. 935 * 936 * Otherwise, we're building nested page tables, and that's 937 * probably because userspace called KVM_ARM_VCPU_INIT more 938 * than once on the same vcpu. Since that's actually legal, 939 * don't kick up a fuss and leave gracefully.
940 */ 941 if (mmu->pgt != NULL) { 942 if (kvm_is_nested_s2_mmu(kvm, mmu)) 943 return 0; 944 945 kvm_err("kvm_arch already initialized?\n"); 946 return -EINVAL; 947 } 948 949 err = kvm_init_ipa_range(mmu, type); 950 if (err) 951 return err; 952 953 pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT); 954 if (!pgt) 955 return -ENOMEM; 956 957 mmu->arch = &kvm->arch; 958 err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops); 959 if (err) 960 goto out_free_pgtable; 961 962 mmu->pgt = pgt; 963 if (is_protected_kvm_enabled()) 964 return 0; 965 966 mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran)); 967 if (!mmu->last_vcpu_ran) { 968 err = -ENOMEM; 969 goto out_destroy_pgtable; 970 } 971 972 for_each_possible_cpu(cpu) 973 *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1; 974 975 /* The eager page splitting is disabled by default */ 976 mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; 977 mmu->split_page_cache.gfp_zero = __GFP_ZERO; 978 979 mmu->pgd_phys = __pa(pgt->pgd); 980 981 if (kvm_is_nested_s2_mmu(kvm, mmu)) 982 kvm_init_nested_s2_mmu(mmu); 983 984 return 0; 985 986 out_destroy_pgtable: 987 KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); 988 out_free_pgtable: 989 kfree(pgt); 990 return err; 991 } 992 993 void kvm_uninit_stage2_mmu(struct kvm *kvm) 994 { 995 kvm_free_stage2_pgd(&kvm->arch.mmu); 996 kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); 997 } 998 999 static void stage2_unmap_memslot(struct kvm *kvm, 1000 struct kvm_memory_slot *memslot) 1001 { 1002 hva_t hva = memslot->userspace_addr; 1003 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; 1004 phys_addr_t size = PAGE_SIZE * memslot->npages; 1005 hva_t reg_end = hva + size; 1006 1007 /* 1008 * A memory region could potentially cover multiple VMAs, and any holes 1009 * between them, so iterate over all of them to find out if we should 1010 * unmap any of them. 1011 * 1012 * +--------------------------------------------+ 1013 * +---------------+----------------+ +----------------+ 1014 * | : VMA 1 | VMA 2 | | VMA 3 : | 1015 * +---------------+----------------+ +----------------+ 1016 * | memory region | 1017 * +--------------------------------------------+ 1018 */ 1019 do { 1020 struct vm_area_struct *vma; 1021 hva_t vm_start, vm_end; 1022 1023 vma = find_vma_intersection(current->mm, hva, reg_end); 1024 if (!vma) 1025 break; 1026 1027 /* 1028 * Take the intersection of this VMA with the memory region 1029 */ 1030 vm_start = max(hva, vma->vm_start); 1031 vm_end = min(reg_end, vma->vm_end); 1032 1033 if (!(vma->vm_flags & VM_PFNMAP)) { 1034 gpa_t gpa = addr + (vm_start - memslot->userspace_addr); 1035 kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true); 1036 } 1037 hva = vm_end; 1038 } while (hva < reg_end); 1039 } 1040 1041 /** 1042 * stage2_unmap_vm - Unmap Stage-2 RAM mappings 1043 * @kvm: The struct kvm pointer 1044 * 1045 * Go through the memregions and unmap any regular RAM 1046 * backing memory already mapped to the VM. 
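 *
 * (Editorial note: nothing is lost permanently here; the next guest access
 * to an unmapped IPA faults back into user_mem_abort(), which rebuilds the
 * stage-2 mapping on demand.)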
1047 */ 1048 void stage2_unmap_vm(struct kvm *kvm) 1049 { 1050 struct kvm_memslots *slots; 1051 struct kvm_memory_slot *memslot; 1052 int idx, bkt; 1053 1054 idx = srcu_read_lock(&kvm->srcu); 1055 mmap_read_lock(current->mm); 1056 write_lock(&kvm->mmu_lock); 1057 1058 slots = kvm_memslots(kvm); 1059 kvm_for_each_memslot(memslot, bkt, slots) 1060 stage2_unmap_memslot(kvm, memslot); 1061 1062 kvm_nested_s2_unmap(kvm, true); 1063 1064 write_unlock(&kvm->mmu_lock); 1065 mmap_read_unlock(current->mm); 1066 srcu_read_unlock(&kvm->srcu, idx); 1067 } 1068 1069 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) 1070 { 1071 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 1072 struct kvm_pgtable *pgt = NULL; 1073 1074 write_lock(&kvm->mmu_lock); 1075 pgt = mmu->pgt; 1076 if (pgt) { 1077 mmu->pgd_phys = 0; 1078 mmu->pgt = NULL; 1079 free_percpu(mmu->last_vcpu_ran); 1080 } 1081 write_unlock(&kvm->mmu_lock); 1082 1083 if (pgt) { 1084 KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); 1085 kfree(pgt); 1086 } 1087 } 1088 1089 static void hyp_mc_free_fn(void *addr, void *mc) 1090 { 1091 struct kvm_hyp_memcache *memcache = mc; 1092 1093 if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) 1094 kvm_account_pgtable_pages(addr, -1); 1095 1096 free_page((unsigned long)addr); 1097 } 1098 1099 static void *hyp_mc_alloc_fn(void *mc) 1100 { 1101 struct kvm_hyp_memcache *memcache = mc; 1102 void *addr; 1103 1104 addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); 1105 if (addr && memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) 1106 kvm_account_pgtable_pages(addr, 1); 1107 1108 return addr; 1109 } 1110 1111 void free_hyp_memcache(struct kvm_hyp_memcache *mc) 1112 { 1113 if (!is_protected_kvm_enabled()) 1114 return; 1115 1116 kfree(mc->mapping); 1117 __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc); 1118 } 1119 1120 int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) 1121 { 1122 if (!is_protected_kvm_enabled()) 1123 return 0; 1124 1125 if (!mc->mapping) { 1126 mc->mapping = kzalloc(sizeof(struct pkvm_mapping), GFP_KERNEL_ACCOUNT); 1127 if (!mc->mapping) 1128 return -ENOMEM; 1129 } 1130 1131 return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn, 1132 kvm_host_pa, mc); 1133 } 1134 1135 /** 1136 * kvm_phys_addr_ioremap - map a device range to guest IPA 1137 * 1138 * @kvm: The KVM pointer 1139 * @guest_ipa: The IPA at which to insert the mapping 1140 * @pa: The physical address of the device 1141 * @size: The size of the mapping 1142 * @writable: Whether or not to create a writable mapping 1143 */ 1144 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, 1145 phys_addr_t pa, unsigned long size, bool writable) 1146 { 1147 phys_addr_t addr; 1148 int ret = 0; 1149 struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO }; 1150 struct kvm_s2_mmu *mmu = &kvm->arch.mmu; 1151 struct kvm_pgtable *pgt = mmu->pgt; 1152 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE | 1153 KVM_PGTABLE_PROT_R | 1154 (writable ? 
KVM_PGTABLE_PROT_W : 0); 1155 1156 if (is_protected_kvm_enabled()) 1157 return -EPERM; 1158 1159 size += offset_in_page(guest_ipa); 1160 guest_ipa &= PAGE_MASK; 1161 1162 for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) { 1163 ret = kvm_mmu_topup_memory_cache(&cache, 1164 kvm_mmu_cache_min_pages(mmu)); 1165 if (ret) 1166 break; 1167 1168 write_lock(&kvm->mmu_lock); 1169 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE, 1170 pa, prot, &cache, 0); 1171 write_unlock(&kvm->mmu_lock); 1172 if (ret) 1173 break; 1174 1175 pa += PAGE_SIZE; 1176 } 1177 1178 kvm_mmu_free_memory_cache(&cache); 1179 return ret; 1180 } 1181 1182 /** 1183 * kvm_stage2_wp_range() - write protect stage2 memory region range 1184 * @mmu: The KVM stage-2 MMU pointer 1185 * @addr: Start address of range 1186 * @end: End address of range 1187 */ 1188 void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) 1189 { 1190 stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect)); 1191 } 1192 1193 /** 1194 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot 1195 * @kvm: The KVM pointer 1196 * @slot: The memory slot to write protect 1197 * 1198 * Called to start logging dirty pages after memory region 1199 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns 1200 * all present PUD, PMD and PTEs are write protected in the memory region. 1201 * Afterwards read of dirty page log can be called. 1202 * 1203 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, 1204 * serializing operations for VM memory regions. 1205 */ 1206 static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) 1207 { 1208 struct kvm_memslots *slots = kvm_memslots(kvm); 1209 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); 1210 phys_addr_t start, end; 1211 1212 if (WARN_ON_ONCE(!memslot)) 1213 return; 1214 1215 start = memslot->base_gfn << PAGE_SHIFT; 1216 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1217 1218 write_lock(&kvm->mmu_lock); 1219 kvm_stage2_wp_range(&kvm->arch.mmu, start, end); 1220 kvm_nested_s2_wp(kvm); 1221 write_unlock(&kvm->mmu_lock); 1222 kvm_flush_remote_tlbs_memslot(kvm, memslot); 1223 } 1224 1225 /** 1226 * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE 1227 * pages for memory slot 1228 * @kvm: The KVM pointer 1229 * @slot: The memory slot to split 1230 * 1231 * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired, 1232 * serializing operations for VM memory regions. 1233 */ 1234 static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot) 1235 { 1236 struct kvm_memslots *slots; 1237 struct kvm_memory_slot *memslot; 1238 phys_addr_t start, end; 1239 1240 lockdep_assert_held(&kvm->slots_lock); 1241 1242 slots = kvm_memslots(kvm); 1243 memslot = id_to_memslot(slots, slot); 1244 1245 start = memslot->base_gfn << PAGE_SHIFT; 1246 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1247 1248 write_lock(&kvm->mmu_lock); 1249 kvm_mmu_split_huge_pages(kvm, start, end); 1250 write_unlock(&kvm->mmu_lock); 1251 } 1252 1253 /* 1254 * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages. 
1255 * @kvm: The KVM pointer 1256 * @slot: The memory slot associated with mask 1257 * @gfn_offset: The gfn offset in memory slot 1258 * @mask: The mask of pages at offset 'gfn_offset' in this memory 1259 * slot to enable dirty logging on 1260 * 1261 * Writes protect selected pages to enable dirty logging, and then 1262 * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock. 1263 */ 1264 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, 1265 struct kvm_memory_slot *slot, 1266 gfn_t gfn_offset, unsigned long mask) 1267 { 1268 phys_addr_t base_gfn = slot->base_gfn + gfn_offset; 1269 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; 1270 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; 1271 1272 lockdep_assert_held_write(&kvm->mmu_lock); 1273 1274 kvm_stage2_wp_range(&kvm->arch.mmu, start, end); 1275 1276 /* 1277 * Eager-splitting is done when manual-protect is set. We 1278 * also check for initially-all-set because we can avoid 1279 * eager-splitting if initially-all-set is false. 1280 * Initially-all-set equal false implies that huge-pages were 1281 * already split when enabling dirty logging: no need to do it 1282 * again. 1283 */ 1284 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) 1285 kvm_mmu_split_huge_pages(kvm, start, end); 1286 1287 kvm_nested_s2_wp(kvm); 1288 } 1289 1290 static void kvm_send_hwpoison_signal(unsigned long address, short lsb) 1291 { 1292 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); 1293 } 1294 1295 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, 1296 unsigned long hva, 1297 unsigned long map_size) 1298 { 1299 gpa_t gpa_start; 1300 hva_t uaddr_start, uaddr_end; 1301 size_t size; 1302 1303 /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */ 1304 if (map_size == PAGE_SIZE) 1305 return true; 1306 1307 size = memslot->npages * PAGE_SIZE; 1308 1309 gpa_start = memslot->base_gfn << PAGE_SHIFT; 1310 1311 uaddr_start = memslot->userspace_addr; 1312 uaddr_end = uaddr_start + size; 1313 1314 /* 1315 * Pages belonging to memslots that don't have the same alignment 1316 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2 1317 * PMD/PUD entries, because we'll end up mapping the wrong pages. 1318 * 1319 * Consider a layout like the following: 1320 * 1321 * memslot->userspace_addr: 1322 * +-----+--------------------+--------------------+---+ 1323 * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| 1324 * +-----+--------------------+--------------------+---+ 1325 * 1326 * memslot->base_gfn << PAGE_SHIFT: 1327 * +---+--------------------+--------------------+-----+ 1328 * |abc|def Stage-2 block | Stage-2 block |tvxyz| 1329 * +---+--------------------+--------------------+-----+ 1330 * 1331 * If we create those stage-2 blocks, we'll end up with this incorrect 1332 * mapping: 1333 * d -> f 1334 * e -> g 1335 * f -> h 1336 */ 1337 if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) 1338 return false; 1339 1340 /* 1341 * Next, let's make sure we're not trying to map anything not covered 1342 * by the memslot. This means we have to prohibit block size mappings 1343 * for the beginning and end of a non-block aligned and non-block sized 1344 * memory slot (illustrated by the head and tail parts of the 1345 * userspace view above containing pages 'abcde' and 'xyz', 1346 * respectively). 
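 *
 * Editorial example of the first check (an illustrative sketch with
 * map_size = PMD_SIZE = 2MiB under a 4KiB granule): a memslot whose IPA
 * base is 0x40000000 but whose userspace address is 0x7f0000100000 has
 * offsets of 0x0 and 0x100000 respectively within a 2MiB block, so no
 * single block can cover the same bytes on both sides and the function
 * returns false for PMD-sized mappings.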
1347 * 1348 * Note that it doesn't matter if we do the check using the 1349 * userspace_addr or the base_gfn, as both are equally aligned (per 1350 * the check above) and equally sized. 1351 */ 1352 return (hva & ~(map_size - 1)) >= uaddr_start && 1353 (hva & ~(map_size - 1)) + map_size <= uaddr_end; 1354 } 1355 1356 /* 1357 * Check if the given hva is backed by a transparent huge page (THP) and 1358 * whether it can be mapped using block mapping in stage2. If so, adjust 1359 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently 1360 * supported. This will need to be updated to support other THP sizes. 1361 * 1362 * Returns the size of the mapping. 1363 */ 1364 static long 1365 transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, 1366 unsigned long hva, kvm_pfn_t *pfnp, 1367 phys_addr_t *ipap) 1368 { 1369 kvm_pfn_t pfn = *pfnp; 1370 1371 /* 1372 * Make sure the adjustment is done only for THP pages. Also make 1373 * sure that the HVA and IPA are sufficiently aligned and that the 1374 * block map is contained within the memslot. 1375 */ 1376 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { 1377 int sz = get_user_mapping_size(kvm, hva); 1378 1379 if (sz < 0) 1380 return sz; 1381 1382 if (sz < PMD_SIZE) 1383 return PAGE_SIZE; 1384 1385 *ipap &= PMD_MASK; 1386 pfn &= ~(PTRS_PER_PMD - 1); 1387 *pfnp = pfn; 1388 1389 return PMD_SIZE; 1390 } 1391 1392 /* Use page mapping if we cannot use block mapping. */ 1393 return PAGE_SIZE; 1394 } 1395 1396 static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva) 1397 { 1398 unsigned long pa; 1399 1400 if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP)) 1401 return huge_page_shift(hstate_vma(vma)); 1402 1403 if (!(vma->vm_flags & VM_PFNMAP)) 1404 return PAGE_SHIFT; 1405 1406 VM_BUG_ON(is_vm_hugetlb_page(vma)); 1407 1408 pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start); 1409 1410 #ifndef __PAGETABLE_PMD_FOLDED 1411 if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) && 1412 ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start && 1413 ALIGN(hva, PUD_SIZE) <= vma->vm_end) 1414 return PUD_SHIFT; 1415 #endif 1416 1417 if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) && 1418 ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start && 1419 ALIGN(hva, PMD_SIZE) <= vma->vm_end) 1420 return PMD_SHIFT; 1421 1422 return PAGE_SHIFT; 1423 } 1424 1425 /* 1426 * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be 1427 * able to see the page's tags and therefore they must be initialised first. If 1428 * PG_mte_tagged is set, tags have already been initialised. 
1429 * 1430 * The race in the test/set of the PG_mte_tagged flag is handled by: 1431 * - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs 1432 * racing to santise the same page 1433 * - mmap_lock protects between a VM faulting a page in and the VMM performing 1434 * an mprotect() to add VM_MTE 1435 */ 1436 static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, 1437 unsigned long size) 1438 { 1439 unsigned long i, nr_pages = size >> PAGE_SHIFT; 1440 struct page *page = pfn_to_page(pfn); 1441 struct folio *folio = page_folio(page); 1442 1443 if (!kvm_has_mte(kvm)) 1444 return; 1445 1446 if (folio_test_hugetlb(folio)) { 1447 /* Hugetlb has MTE flags set on head page only */ 1448 if (folio_try_hugetlb_mte_tagging(folio)) { 1449 for (i = 0; i < nr_pages; i++, page++) 1450 mte_clear_page_tags(page_address(page)); 1451 folio_set_hugetlb_mte_tagged(folio); 1452 } 1453 return; 1454 } 1455 1456 for (i = 0; i < nr_pages; i++, page++) { 1457 if (try_page_mte_tagging(page)) { 1458 mte_clear_page_tags(page_address(page)); 1459 set_page_mte_tagged(page); 1460 } 1461 } 1462 } 1463 1464 static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) 1465 { 1466 return vma->vm_flags & VM_MTE_ALLOWED; 1467 } 1468 1469 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, 1470 struct kvm_s2_trans *nested, 1471 struct kvm_memory_slot *memslot, unsigned long hva, 1472 bool fault_is_perm) 1473 { 1474 int ret = 0; 1475 bool write_fault, writable, force_pte = false; 1476 bool exec_fault, mte_allowed; 1477 bool device = false, vfio_allow_any_uc = false; 1478 unsigned long mmu_seq; 1479 phys_addr_t ipa = fault_ipa; 1480 struct kvm *kvm = vcpu->kvm; 1481 struct vm_area_struct *vma; 1482 short vma_shift; 1483 void *memcache; 1484 gfn_t gfn; 1485 kvm_pfn_t pfn; 1486 bool logging_active = memslot_is_logging(memslot); 1487 long vma_pagesize, fault_granule; 1488 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; 1489 struct kvm_pgtable *pgt; 1490 struct page *page; 1491 enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED; 1492 1493 if (fault_is_perm) 1494 fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); 1495 write_fault = kvm_is_write_fault(vcpu); 1496 exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); 1497 VM_BUG_ON(write_fault && exec_fault); 1498 1499 if (fault_is_perm && !write_fault && !exec_fault) { 1500 kvm_err("Unexpected L2 read permission error\n"); 1501 return -EFAULT; 1502 } 1503 1504 /* 1505 * Permission faults just need to update the existing leaf entry, 1506 * and so normally don't require allocations from the memcache. The 1507 * only exception to this is when dirty logging is enabled at runtime 1508 * and a write fault needs to collapse a block entry into a table. 1509 */ 1510 if (!fault_is_perm || (logging_active && write_fault)) { 1511 int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); 1512 1513 if (!is_protected_kvm_enabled()) { 1514 memcache = &vcpu->arch.mmu_page_cache; 1515 ret = kvm_mmu_topup_memory_cache(memcache, min_pages); 1516 } else { 1517 memcache = &vcpu->arch.pkvm_memcache; 1518 ret = topup_hyp_memcache(memcache, min_pages); 1519 } 1520 if (ret) 1521 return ret; 1522 } 1523 1524 /* 1525 * Let's check if we will get back a huge page backed by hugetlbfs, or 1526 * get block mapping for device MMIO region. 
1527 */ 1528 mmap_read_lock(current->mm); 1529 vma = vma_lookup(current->mm, hva); 1530 if (unlikely(!vma)) { 1531 kvm_err("Failed to find VMA for hva 0x%lx\n", hva); 1532 mmap_read_unlock(current->mm); 1533 return -EFAULT; 1534 } 1535 1536 /* 1537 * logging_active is guaranteed to never be true for VM_PFNMAP 1538 * memslots. 1539 */ 1540 if (logging_active || is_protected_kvm_enabled()) { 1541 force_pte = true; 1542 vma_shift = PAGE_SHIFT; 1543 } else { 1544 vma_shift = get_vma_page_shift(vma, hva); 1545 } 1546 1547 switch (vma_shift) { 1548 #ifndef __PAGETABLE_PMD_FOLDED 1549 case PUD_SHIFT: 1550 if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) 1551 break; 1552 fallthrough; 1553 #endif 1554 case CONT_PMD_SHIFT: 1555 vma_shift = PMD_SHIFT; 1556 fallthrough; 1557 case PMD_SHIFT: 1558 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) 1559 break; 1560 fallthrough; 1561 case CONT_PTE_SHIFT: 1562 vma_shift = PAGE_SHIFT; 1563 force_pte = true; 1564 fallthrough; 1565 case PAGE_SHIFT: 1566 break; 1567 default: 1568 WARN_ONCE(1, "Unknown vma_shift %d", vma_shift); 1569 } 1570 1571 vma_pagesize = 1UL << vma_shift; 1572 1573 if (nested) { 1574 unsigned long max_map_size; 1575 1576 max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE; 1577 1578 ipa = kvm_s2_trans_output(nested); 1579 1580 /* 1581 * If we're about to create a shadow stage 2 entry, then we 1582 * can only create a block mapping if the guest stage 2 page 1583 * table uses at least as big a mapping. 1584 */ 1585 max_map_size = min(kvm_s2_trans_size(nested), max_map_size); 1586 1587 /* 1588 * Be careful that if the mapping size falls between 1589 * two host sizes, take the smallest of the two. 1590 */ 1591 if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE) 1592 max_map_size = PMD_SIZE; 1593 else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE) 1594 max_map_size = PAGE_SIZE; 1595 1596 force_pte = (max_map_size == PAGE_SIZE); 1597 vma_pagesize = min(vma_pagesize, (long)max_map_size); 1598 } 1599 1600 /* 1601 * Both the canonical IPA and fault IPA must be hugepage-aligned to 1602 * ensure we find the right PFN and lay down the mapping in the right 1603 * place. 1604 */ 1605 if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) { 1606 fault_ipa &= ~(vma_pagesize - 1); 1607 ipa &= ~(vma_pagesize - 1); 1608 } 1609 1610 gfn = ipa >> PAGE_SHIFT; 1611 mte_allowed = kvm_vma_mte_allowed(vma); 1612 1613 vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; 1614 1615 /* Don't use the VMA after the unlock -- it may have vanished */ 1616 vma = NULL; 1617 1618 /* 1619 * Read mmu_invalidate_seq so that KVM can detect if the results of 1620 * vma_lookup() or __kvm_faultin_pfn() become stale prior to 1621 * acquiring kvm->mmu_lock. 1622 * 1623 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs 1624 * with the smp_wmb() in kvm_mmu_invalidate_end(). 1625 */ 1626 mmu_seq = vcpu->kvm->mmu_invalidate_seq; 1627 mmap_read_unlock(current->mm); 1628 1629 pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, 1630 &writable, &page); 1631 if (pfn == KVM_PFN_ERR_HWPOISON) { 1632 kvm_send_hwpoison_signal(hva, vma_shift); 1633 return 0; 1634 } 1635 if (is_error_noslot_pfn(pfn)) 1636 return -EFAULT; 1637 1638 if (kvm_is_device_pfn(pfn)) { 1639 /* 1640 * If the page was identified as device early by looking at 1641 * the VMA flags, vma_pagesize is already representing the 1642 * largest quantity we can map. 
If instead it was mapped 1643 * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE 1644 * and must not be upgraded. 1645 * 1646 * In both cases, we don't let transparent_hugepage_adjust() 1647 * change things at the last minute. 1648 */ 1649 device = true; 1650 } else if (logging_active && !write_fault) { 1651 /* 1652 * Only actually map the page as writable if this was a write 1653 * fault. 1654 */ 1655 writable = false; 1656 } 1657 1658 if (exec_fault && device) 1659 return -ENOEXEC; 1660 1661 /* 1662 * Potentially reduce shadow S2 permissions to match the guest's own 1663 * S2. For exec faults, we'd only reach this point if the guest 1664 * actually allowed it (see kvm_s2_handle_perm_fault). 1665 * 1666 * Also encode the level of the original translation in the SW bits 1667 * of the leaf entry as a proxy for the span of that translation. 1668 * This will be retrieved on TLB invalidation from the guest and 1669 * used to limit the invalidation scope if a TTL hint or a range 1670 * isn't provided. 1671 */ 1672 if (nested) { 1673 writable &= kvm_s2_trans_writable(nested); 1674 if (!kvm_s2_trans_readable(nested)) 1675 prot &= ~KVM_PGTABLE_PROT_R; 1676 1677 prot |= kvm_encode_nested_level(nested); 1678 } 1679 1680 kvm_fault_lock(kvm); 1681 pgt = vcpu->arch.hw_mmu->pgt; 1682 if (mmu_invalidate_retry(kvm, mmu_seq)) { 1683 ret = -EAGAIN; 1684 goto out_unlock; 1685 } 1686 1687 /* 1688 * If we are not forced to use page mapping, check if we are 1689 * backed by a THP and thus use block mapping if possible. 1690 */ 1691 if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) { 1692 if (fault_is_perm && fault_granule > PAGE_SIZE) 1693 vma_pagesize = fault_granule; 1694 else 1695 vma_pagesize = transparent_hugepage_adjust(kvm, memslot, 1696 hva, &pfn, 1697 &fault_ipa); 1698 1699 if (vma_pagesize < 0) { 1700 ret = vma_pagesize; 1701 goto out_unlock; 1702 } 1703 } 1704 1705 if (!fault_is_perm && !device && kvm_has_mte(kvm)) { 1706 /* Check the VMM hasn't introduced a new disallowed VMA */ 1707 if (mte_allowed) { 1708 sanitise_mte_tags(kvm, pfn, vma_pagesize); 1709 } else { 1710 ret = -EFAULT; 1711 goto out_unlock; 1712 } 1713 } 1714 1715 if (writable) 1716 prot |= KVM_PGTABLE_PROT_W; 1717 1718 if (exec_fault) 1719 prot |= KVM_PGTABLE_PROT_X; 1720 1721 if (device) { 1722 if (vfio_allow_any_uc) 1723 prot |= KVM_PGTABLE_PROT_NORMAL_NC; 1724 else 1725 prot |= KVM_PGTABLE_PROT_DEVICE; 1726 } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) && 1727 (!nested || kvm_s2_trans_executable(nested))) { 1728 prot |= KVM_PGTABLE_PROT_X; 1729 } 1730 1731 /* 1732 * Under the premise of getting a FSC_PERM fault, we just need to relax 1733 * permissions only if vma_pagesize equals fault_granule. Otherwise, 1734 * kvm_pgtable_stage2_map() should be called to change block size. 1735 */ 1736 if (fault_is_perm && vma_pagesize == fault_granule) { 1737 /* 1738 * Drop the SW bits in favour of those stored in the 1739 * PTE, which will be preserved. 1740 */ 1741 prot &= ~KVM_NV_GUEST_MAP_SZ; 1742 ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags); 1743 } else { 1744 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize, 1745 __pfn_to_phys(pfn), prot, 1746 memcache, flags); 1747 } 1748 1749 out_unlock: 1750 kvm_release_faultin_page(kvm, page, !!ret, writable); 1751 kvm_fault_unlock(kvm); 1752 1753 /* Mark the page dirty only if the fault is handled successfully */ 1754 if (writable && !ret) 1755 mark_page_dirty_in_slot(kvm, memslot, gfn); 1756 1757 return ret != -EAGAIN ? 
ret : 0; 1758 } 1759 1760 /* Resolve the access fault by making the page young again. */ 1761 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) 1762 { 1763 enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED; 1764 struct kvm_s2_mmu *mmu; 1765 1766 trace_kvm_access_fault(fault_ipa); 1767 1768 read_lock(&vcpu->kvm->mmu_lock); 1769 mmu = vcpu->arch.hw_mmu; 1770 KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags); 1771 read_unlock(&vcpu->kvm->mmu_lock); 1772 } 1773 1774 /** 1775 * kvm_handle_guest_abort - handles all 2nd stage aborts 1776 * @vcpu: the VCPU pointer 1777 * 1778 * Any abort that gets to the host is almost guaranteed to be caused by a 1779 * missing second stage translation table entry, which can mean that either the 1780 * guest simply needs more memory and we must allocate an appropriate page or it 1781 * can mean that the guest tried to access I/O memory, which is emulated by user 1782 * space. The distinction is based on the IPA causing the fault and whether this 1783 * memory region has been registered as standard RAM by user space. 1784 */ 1785 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) 1786 { 1787 struct kvm_s2_trans nested_trans, *nested = NULL; 1788 unsigned long esr; 1789 phys_addr_t fault_ipa; /* The address we faulted on */ 1790 phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */ 1791 struct kvm_memory_slot *memslot; 1792 unsigned long hva; 1793 bool is_iabt, write_fault, writable; 1794 gfn_t gfn; 1795 int ret, idx; 1796 1797 /* Synchronous External Abort? */ 1798 if (kvm_vcpu_abt_issea(vcpu)) { 1799 /* 1800 * For RAS the host kernel may handle this abort. 1801 * There is no need to pass the error into the guest. 1802 */ 1803 if (kvm_handle_guest_sea()) 1804 kvm_inject_vabt(vcpu); 1805 1806 return 1; 1807 } 1808 1809 esr = kvm_vcpu_get_esr(vcpu); 1810 1811 /* 1812 * The fault IPA should be reliable at this point as we're not dealing 1813 * with an SEA. 1814 */ 1815 ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); 1816 if (KVM_BUG_ON(ipa == INVALID_GPA, vcpu->kvm)) 1817 return -EFAULT; 1818 1819 is_iabt = kvm_vcpu_trap_is_iabt(vcpu); 1820 1821 if (esr_fsc_is_translation_fault(esr)) { 1822 /* Beyond sanitised PARange (which is the IPA limit) */ 1823 if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) { 1824 kvm_inject_size_fault(vcpu); 1825 return 1; 1826 } 1827 1828 /* Falls between the IPA range and the PARange? */ 1829 if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) { 1830 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); 1831 1832 if (is_iabt) 1833 kvm_inject_pabt(vcpu, fault_ipa); 1834 else 1835 kvm_inject_dabt(vcpu, fault_ipa); 1836 return 1; 1837 } 1838 } 1839 1840 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu), 1841 kvm_vcpu_get_hfar(vcpu), fault_ipa); 1842 1843 /* Check the stage-2 fault is trans. fault or write fault */ 1844 if (!esr_fsc_is_translation_fault(esr) && 1845 !esr_fsc_is_permission_fault(esr) && 1846 !esr_fsc_is_access_flag_fault(esr)) { 1847 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", 1848 kvm_vcpu_trap_get_class(vcpu), 1849 (unsigned long)kvm_vcpu_trap_get_fault(vcpu), 1850 (unsigned long)kvm_vcpu_get_esr(vcpu)); 1851 return -EFAULT; 1852 } 1853 1854 idx = srcu_read_lock(&vcpu->kvm->srcu); 1855 1856 /* 1857 * We may have faulted on a shadow stage 2 page table if we are 1858 * running a nested guest. 
In this case, we have to resolve the L2 1859 * IPA to the L1 IPA first, before knowing what kind of memory should 1860 * back the L1 IPA. 1861 * 1862 * If the shadow stage 2 page table walk faults, then we simply inject 1863 * this to the guest and carry on. 1864 * 1865 * If there are no shadow S2 PTs because S2 is disabled, there is 1866 * nothing to walk and we treat it as a 1:1 before going through the 1867 * canonical translation. 1868 */ 1869 if (kvm_is_nested_s2_mmu(vcpu->kvm,vcpu->arch.hw_mmu) && 1870 vcpu->arch.hw_mmu->nested_stage2_enabled) { 1871 u32 esr; 1872 1873 ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans); 1874 if (ret) { 1875 esr = kvm_s2_trans_esr(&nested_trans); 1876 kvm_inject_s2_fault(vcpu, esr); 1877 goto out_unlock; 1878 } 1879 1880 ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans); 1881 if (ret) { 1882 esr = kvm_s2_trans_esr(&nested_trans); 1883 kvm_inject_s2_fault(vcpu, esr); 1884 goto out_unlock; 1885 } 1886 1887 ipa = kvm_s2_trans_output(&nested_trans); 1888 nested = &nested_trans; 1889 } 1890 1891 gfn = ipa >> PAGE_SHIFT; 1892 memslot = gfn_to_memslot(vcpu->kvm, gfn); 1893 hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); 1894 write_fault = kvm_is_write_fault(vcpu); 1895 if (kvm_is_error_hva(hva) || (write_fault && !writable)) { 1896 /* 1897 * The guest has put either its instructions or its page-tables 1898 * somewhere it shouldn't have. Userspace won't be able to do 1899 * anything about this (there's no syndrome for a start), so 1900 * re-inject the abort back into the guest. 1901 */ 1902 if (is_iabt) { 1903 ret = -ENOEXEC; 1904 goto out; 1905 } 1906 1907 if (kvm_vcpu_abt_iss1tw(vcpu)) { 1908 kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); 1909 ret = 1; 1910 goto out_unlock; 1911 } 1912 1913 /* 1914 * Check for a cache maintenance operation. Since we 1915 * ended-up here, we know it is outside of any memory 1916 * slot. But we can't find out if that is for a device, 1917 * or if the guest is just being stupid. The only thing 1918 * we know for sure is that this range cannot be cached. 1919 * 1920 * So let's assume that the guest is just being 1921 * cautious, and skip the instruction. 1922 */ 1923 if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) { 1924 kvm_incr_pc(vcpu); 1925 ret = 1; 1926 goto out_unlock; 1927 } 1928 1929 /* 1930 * The IPA is reported as [MAX:12], so we need to 1931 * complement it with the bottom 12 bits from the 1932 * faulting VA. This is always 12 bits, irrespective 1933 * of the page size. 
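 *
 * Editorial example (illustrative values only): a data abort at guest
 * VA 0xffff800010325a40 whose IPA page is 0x8f632000 yields
 * ipa = 0x8f632000 | 0xa40 = 0x8f632a40, which is what io_mem_abort()
 * below receives.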
		ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
		ret = io_mem_abort(vcpu, ipa);
		goto out_unlock;
	}

	/* Userspace should not be able to register out-of-bounds IPAs */
	VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu));

	if (esr_fsc_is_access_flag_fault(esr)) {
		handle_access_fault(vcpu, fault_ipa);
		ret = 1;
		goto out_unlock;
	}

	ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
			     esr_fsc_is_permission_fault(esr));
	if (ret == 0)
		ret = 1;
out:
	if (ret == -ENOEXEC) {
		kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
		ret = 1;
	}
out_unlock:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	return ret;
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	if (!kvm->arch.mmu.pgt)
		return false;

	__unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
			     (range->end - range->start) << PAGE_SHIFT,
			     range->may_block);

	kvm_nested_s2_unmap(kvm, range->may_block);
	return false;
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	u64 size = (range->end - range->start) << PAGE_SHIFT;

	if (!kvm->arch.mmu.pgt)
		return false;

	return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
							       range->start << PAGE_SHIFT,
							       size, true);
	/*
	 * TODO: Handle nested_mmu structures here using the reverse mapping in
	 * a later version of patch series.
	 */
}

bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	u64 size = (range->end - range->start) << PAGE_SHIFT;

	if (!kvm->arch.mmu.pgt)
		return false;

	return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
							       range->start << PAGE_SHIFT,
							       size, false);
}

phys_addr_t kvm_mmu_get_httbr(void)
{
	return __pa(hyp_pgtable->pgd);
}

phys_addr_t kvm_get_idmap_vector(void)
{
	return hyp_idmap_vector;
}

static int kvm_map_idmap_text(void)
{
	unsigned long size = hyp_idmap_end - hyp_idmap_start;
	int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
					PAGE_HYP_EXEC);
	if (err)
		kvm_err("Failed to idmap %lx-%lx\n",
			hyp_idmap_start, hyp_idmap_end);

	return err;
}

static void *kvm_hyp_zalloc_page(void *arg)
{
	return (void *)get_zeroed_page(GFP_KERNEL);
}

static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
	.zalloc_page = kvm_hyp_zalloc_page,
	.get_page = kvm_host_get_page,
	.put_page = kvm_host_put_page,
	.phys_to_virt = kvm_host_va,
	.virt_to_phys = kvm_host_pa,
};

int __init kvm_mmu_init(u32 *hyp_va_bits)
{
	int err;
	u32 idmap_bits;
	u32 kernel_bits;

	hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
	hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
	hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);

	/*
	 * We rely on the linker script to ensure at build time that the HYP
	 * init code does not cross a page boundary.
	 */
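	/*
	 * Illustration (hypothetical addresses): if the init text sits in a
	 * single 4K page, say start == 0x10a61000 and end == 0x10a62000, then
	 * start ^ (end - 1) == 0xfff and the AND with PAGE_MASK below is zero.
	 * Had the text spilled into a second page, a page-frame bit would
	 * survive the mask and the BUG_ON would fire.
	 */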
	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);

	/*
	 * The ID map is always configured for 48 bits of translation, which
	 * may be fewer than the number of VA bits used by the regular kernel
	 * stage 1, when VA_BITS=52.
	 *
	 * At EL2, there is only one TTBR register, and we can't switch between
	 * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom
	 * line: we need to use the extended range with *both* our translation
	 * tables.
	 *
	 * So use the maximum of the idmap VA bits and the regular kernel stage
	 * 1 VA bits to ensure that the hypervisor can both ID map its code page
	 * and map any kernel memory.
	 */
	idmap_bits = IDMAP_VA_BITS;
	kernel_bits = vabits_actual;
	*hyp_va_bits = max(idmap_bits, kernel_bits);

	kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
	kvm_debug("HYP VA range: %lx:%lx\n",
		  kern_hyp_va(PAGE_OFFSET),
		  kern_hyp_va((unsigned long)high_memory - 1));

	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
	    hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) &&
	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
		/*
		 * The idmap page is intersecting with the VA space; it is not
		 * safe to continue further.
		 */
		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
		err = -EINVAL;
		goto out;
	}

	hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
	if (!hyp_pgtable) {
		kvm_err("Hyp mode page-table not allocated\n");
		err = -ENOMEM;
		goto out;
	}

	err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops);
	if (err)
		goto out_free_pgtable;

	err = kvm_map_idmap_text();
	if (err)
		goto out_destroy_pgtable;

	io_map_base = hyp_idmap_start;
	__hyp_va_bits = *hyp_va_bits;
	return 0;

out_destroy_pgtable:
	kvm_pgtable_hyp_destroy(hyp_pgtable);
out_free_pgtable:
	kfree(hyp_pgtable);
	hyp_pgtable = NULL;
out:
	return err;
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;

	/*
	 * At this point the memslot has been committed and there is an
	 * allocated dirty_bitmap[]; dirty pages will be tracked while the
	 * memory slot is write protected.
	 */
	if (log_dirty_pages) {

		if (change == KVM_MR_DELETE)
			return;

		/*
		 * Huge and normal pages are write-protected and split
		 * in either of these two cases:
		 *
		 * 1. with initial-all-set: gradually with CLEAR ioctls,
		 */
		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
			return;
		/*
		 * or
		 * 2. without initial-all-set: all in one shot when
		 *    enabling dirty logging.
		 */
		kvm_mmu_wp_memory_region(kvm, new->id);
		kvm_mmu_split_memory_region(kvm, new->id);
	} else {
		/*
		 * Free any leftovers from the eager page splitting cache. Do
		 * this when deleting, moving, disabling dirty logging, or
		 * creating the memslot (a nop). Doing it for deletes makes
		 * sure we don't leak memory, and there's no need to keep the
		 * cache around for any of the other cases.
		 */
		kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
	}
}

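/*
 * kvm_arch_prepare_memory_region() vets a slot before it is committed: the
 * region must fit within the guest IPA space, and every VMA backing the
 * userspace range must be acceptable (MTE-capable when the VM has MTE
 * enabled, and not a VM_PFNMAP mapping if dirty logging is requested).
 */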
int kvm_arch_prepare_memory_region(struct kvm *kvm,
				   const struct kvm_memory_slot *old,
				   struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	hva_t hva, reg_end;
	int ret = 0;

	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
	    change != KVM_MR_FLAGS_ONLY)
		return 0;

	/*
	 * Prevent userspace from creating a memory region outside of the IPA
	 * space addressable by the guest.
	 */
	if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
		return -EFAULT;

	hva = new->userspace_addr;
	reg_end = hva + (new->npages << PAGE_SHIFT);

	mmap_read_lock(current->mm);
	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma;

		vma = find_vma_intersection(current->mm, hva, reg_end);
		if (!vma)
			break;

		if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
			ret = -EINVAL;
			break;
		}

		if (vma->vm_flags & VM_PFNMAP) {
			/* IO region dirty page logging not allowed */
			if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
				ret = -EINVAL;
				break;
			}
		}
		hva = min(reg_end, vma->vm_end);
	} while (hva < reg_end);

	mmap_read_unlock(current->mm);
	return ret;
}

void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
}

void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = slot->npages << PAGE_SHIFT;

	write_lock(&kvm->mmu_lock);
	kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true);
	kvm_nested_s2_unmap(kvm, true);
	write_unlock(&kvm->mmu_lock);
}

/*
 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
 *
 * Main problems:
 * - S/W ops are local to a CPU (not broadcast)
 * - We have line migration behind our back (speculation)
 * - System caches don't support S/W at all (damn!)
 *
 * In the face of the above, the best we can do is to try and convert
 * S/W ops to VA ops. Because the guest is not allowed to infer the
 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
 * which is a rather good thing for us.
 *
 * Also, it is only used when turning caches on/off ("The expected
 * usage of the cache maintenance instructions that operate by set/way
 * is associated with the cache maintenance instructions associated
 * with the powerdown and powerup of caches, if this is required by
 * the implementation.").
 *
 * We use the following policy:
 *
 * - If we trap a S/W operation, we enable VM trapping to detect
 *   caches being turned on/off, and do a full clean.
 *
 * - We flush the caches on both caches being turned on and off.
 *
 * - Once the caches are enabled, we stop trapping VM ops.
 */
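/*
 * Rough flow implied by the policy above: a trapped S/W op lands in
 * kvm_set_way_flush(), which flushes the guest's stage-2 memory and sets
 * HCR_EL2.TVM; the resulting traps on the virtual memory control registers
 * end up in kvm_toggle_cache(), which flushes again whenever the cache
 * enable state changes and clears HCR_EL2.TVM once the caches are back on.
 */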
void kvm_set_way_flush(struct kvm_vcpu *vcpu)
{
	unsigned long hcr = *vcpu_hcr(vcpu);

	/*
	 * If this is the first time we do a S/W operation
	 * (i.e. HCR_TVM not set) flush the whole memory, and set the
	 * VM trapping.
	 *
	 * Otherwise, rely on the VM trapping to wait for the MMU +
	 * Caches to be turned off. At that point, we'll be able to
	 * clean the caches again.
	 */
	if (!(hcr & HCR_TVM)) {
		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
					vcpu_has_cache_enabled(vcpu));
		stage2_flush_vm(vcpu->kvm);
		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
	}
}

void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
{
	bool now_enabled = vcpu_has_cache_enabled(vcpu);

	/*
	 * If switching the MMU+caches on, need to invalidate the caches.
	 * If switching it off, need to clean the caches.
	 * Clean + invalidate does the trick always.
	 */
	if (now_enabled != was_enabled)
		stage2_flush_vm(vcpu->kvm);

	/* Caches are now on, stop trapping VM ops (until a S/W op) */
	if (now_enabled)
		*vcpu_hcr(vcpu) &= ~HCR_TVM;

	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
}