// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/kvm_ras.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>

#include "trace.h"

static struct kvm_pgtable *hyp_pgtable;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

static unsigned long __ro_after_init hyp_idmap_start;
static unsigned long __ro_after_init hyp_idmap_end;
static phys_addr_t __ro_after_init hyp_idmap_vector;

static unsigned long __ro_after_init io_map_base;

static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
					   phys_addr_t size)
{
	phys_addr_t boundary = ALIGN_DOWN(addr + size, size);

	return (boundary - 1 < end - 1) ? boundary : end;
}

static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);

	return __stage2_range_addr_end(addr, end, size);
}

/*
 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
 * long will also starve other vCPUs. We have to also make sure that the page
 * tables are not freed while we released the lock.
 */
static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
			      phys_addr_t end,
			      int (*fn)(struct kvm_pgtable *, u64, u64),
			      bool resched)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	int ret;
	u64 next;

	do {
		struct kvm_pgtable *pgt = mmu->pgt;
		if (!pgt)
			return -EINVAL;

		next = stage2_range_addr_end(addr, end);
		ret = fn(pgt, addr, next - addr);
		if (ret)
			break;

		if (resched && next != end)
			cond_resched_rwlock_write(&kvm->mmu_lock);
	} while (addr = next, addr != end);

	return ret;
}

#define stage2_apply_range_resched(mmu, addr, end, fn)			\
	stage2_apply_range(mmu, addr, end, fn, true)

/*
 * Get the maximum number of page-tables pages needed to split a range
 * of blocks into PAGE_SIZE PTEs. It assumes the range is already
 * mapped at level 2, or at level 1 if allowed.
 */
static int kvm_mmu_split_nr_page_tables(u64 range)
{
	int n = 0;

	if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2)
		n += DIV_ROUND_UP(range, PUD_SIZE);
	n += DIV_ROUND_UP(range, PMD_SIZE);
	return n;
}

static bool need_split_memcache_topup_or_resched(struct kvm *kvm)
{
	struct kvm_mmu_memory_cache *cache;
	u64 chunk_size, min;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
		return true;

	chunk_size = kvm->arch.mmu.split_page_chunk_size;
	min = kvm_mmu_split_nr_page_tables(chunk_size);
	cache = &kvm->arch.mmu.split_page_cache;
	return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
}

/*
 * Eagerly split all block mappings in the range [addr, end) into PAGE_SIZE
 * PTEs, refilling the split-page cache (and dropping the mmu_lock to do so)
 * whenever it runs low.
 */
static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
				    phys_addr_t end)
{
	struct kvm_mmu_memory_cache *cache;
	struct kvm_pgtable *pgt;
	int ret, cache_capacity;
	u64 next, chunk_size;

	lockdep_assert_held_write(&kvm->mmu_lock);

	chunk_size = kvm->arch.mmu.split_page_chunk_size;
	cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size);

	if (chunk_size == 0)
		return 0;

	cache = &kvm->arch.mmu.split_page_cache;

	do {
		if (need_split_memcache_topup_or_resched(kvm)) {
			write_unlock(&kvm->mmu_lock);
			cond_resched();
			/* Eager page splitting is best-effort. */
			ret = __kvm_mmu_topup_memory_cache(cache,
							   cache_capacity,
							   cache_capacity);
			write_lock(&kvm->mmu_lock);
			if (ret)
				break;
		}

		pgt = kvm->arch.mmu.pgt;
		if (!pgt)
			return -EINVAL;

		next = __stage2_range_addr_end(addr, end, chunk_size);
		ret = kvm_pgtable_stage2_split(pgt, addr, next - addr, cache);
		if (ret)
			break;
	} while (addr = next, addr != end);

	return ret;
}

static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
}

/**
 * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8
 * @kvm:	pointer to kvm structure.
 *
 * Interface to HYP function to flush all VM TLB entries
 */
int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
	kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
	return 0;
}

int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
				     gfn_t gfn, u64 nr_pages)
{
	kvm_tlb_flush_vmid_range(&kvm->arch.mmu,
				 gfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
	return 0;
}

static bool kvm_is_device_pfn(unsigned long pfn)
{
	return !pfn_is_map_memory(pfn);
}

static void *stage2_memcache_zalloc_page(void *arg)
{
	struct kvm_mmu_memory_cache *mc = arg;
	void *virt;

	/* Allocated with __GFP_ZERO, so no need to zero */
	virt = kvm_mmu_memory_cache_alloc(mc);
	if (virt)
		kvm_account_pgtable_pages(virt, 1);
	return virt;
}

static void *kvm_host_zalloc_pages_exact(size_t size)
{
	return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
}

static void *kvm_s2_zalloc_pages_exact(size_t size)
{
	void *virt = kvm_host_zalloc_pages_exact(size);

	if (virt)
		kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT));
	return virt;
}

static void kvm_s2_free_pages_exact(void *virt, size_t size)
{
	kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT));
	free_pages_exact(virt, size);
}

static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;

static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head)
{
	struct page *page = container_of(head, struct page, rcu_head);
	void *pgtable = page_to_virt(page);
	s8 level = page_private(page);

	kvm_pgtable_stage2_free_unlinked(&kvm_s2_mm_ops, pgtable, level);
}

static void stage2_free_unlinked_table(void *addr, s8 level)
{
	struct page *page = virt_to_page(addr);

	set_page_private(page, (unsigned long)level);
	call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb);
}

static void kvm_host_get_page(void *addr)
{
	get_page(virt_to_page(addr));
}

static void kvm_host_put_page(void *addr)
{
	put_page(virt_to_page(addr));
}

static void kvm_s2_put_page(void *addr)
{
	struct page *p = virt_to_page(addr);
	/* Dropping last refcount, the page will be freed */
	if (page_count(p) == 1)
		kvm_account_pgtable_pages(addr, -1);
	put_page(p);
}

static int kvm_host_page_count(void *addr)
{
	return page_count(virt_to_page(addr));
}

static phys_addr_t kvm_host_pa(void *addr)
{
	return __pa(addr);
}

static void *kvm_host_va(phys_addr_t phys)
{
	return __va(phys);
}

static void clean_dcache_guest_page(void *va, size_t size)
{
	__clean_dcache_guest_page(va, size);
}

static void invalidate_icache_guest_page(void *va, size_t size)
{
	__invalidate_icache_guest_page(va, size);
}

/*
 * Unmapping vs dcache management:
 *
 * If a guest maps certain memory pages as uncached, all writes will
 * bypass the data cache and go directly to RAM. However, the CPUs
 * can still speculate reads (not writes) and fill cache lines with
 * data.
 *
 * Those cache lines will be *clean* cache lines though, so a
 * clean+invalidate operation is equivalent to an invalidate
 * operation, because no cache lines are marked dirty.
 *
 * Those clean cache lines could be filled prior to an uncached write
 * by the guest, and the cache coherent IO subsystem would therefore
 * end up writing old data to disk.
 *
 * This is why right after unmapping a page/section and invalidating
 * the corresponding TLBs, we flush to make sure the IO subsystem will
 * never hit in the cache.
 *
 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
 * we then fully enforce cacheability of RAM, no matter what the guest
 * does.
 */
/**
 * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 * @mmu:   The KVM stage-2 MMU pointer
 * @start: The intermediate physical base address of the range to unmap
 * @size:  The size of the area to unmap
 * @may_block: Whether or not we are permitted to block
 *
 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 * destroying the VM), otherwise another faulting VCPU may come in and mess
 * with things behind our backs.
 */
static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
				 bool may_block)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	phys_addr_t end = start + size;

	lockdep_assert_held_write(&kvm->mmu_lock);
	WARN_ON(size & ~PAGE_MASK);
	WARN_ON(stage2_apply_range(mmu, start, end, kvm_pgtable_stage2_unmap,
				   may_block));
}

void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
{
	__unmap_stage2_range(mmu, start, size, true);
}

void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
	stage2_apply_range_resched(mmu, addr, end, kvm_pgtable_stage2_flush);
}

static void stage2_flush_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;

	kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
}

/**
 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 * @kvm: The struct kvm pointer
 *
 * Go through the stage 2 page tables and invalidate any cache lines
 * backing memory already mapped to the VM.
 */
static void stage2_flush_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx, bkt;

	idx = srcu_read_lock(&kvm->srcu);
	write_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, bkt, slots)
		stage2_flush_memslot(kvm, memslot);

	kvm_nested_s2_flush(kvm);

	write_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

/**
 * free_hyp_pgds - free Hyp-mode page tables
 */
void __init free_hyp_pgds(void)
{
	mutex_lock(&kvm_hyp_pgd_mutex);
	if (hyp_pgtable) {
		kvm_pgtable_hyp_destroy(hyp_pgtable);
		kfree(hyp_pgtable);
		hyp_pgtable = NULL;
	}
	mutex_unlock(&kvm_hyp_pgd_mutex);
}

static bool kvm_host_owns_hyp_mappings(void)
{
	if (is_kernel_in_hyp_mode())
		return false;

	if (static_branch_likely(&kvm_protected_mode_initialized))
		return false;

	/*
	 * This can happen at boot time when __create_hyp_mappings() is called
	 * after the hyp protection has been enabled, but the static key has
	 * not been flipped yet.
	 */
	if (!hyp_pgtable && is_protected_kvm_enabled())
		return false;

	WARN_ON(!hyp_pgtable);

	return true;
}

int __create_hyp_mappings(unsigned long start, unsigned long size,
			  unsigned long phys, enum kvm_pgtable_prot prot)
{
	int err;

	if (WARN_ON(!kvm_host_owns_hyp_mappings()))
		return -EINVAL;

	mutex_lock(&kvm_hyp_pgd_mutex);
	err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
	mutex_unlock(&kvm_hyp_pgd_mutex);

	return err;
}

static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
{
	if (!is_vmalloc_addr(kaddr)) {
		BUG_ON(!virt_addr_valid(kaddr));
		return __pa(kaddr);
	} else {
		return page_to_phys(vmalloc_to_page(kaddr)) +
		       offset_in_page(kaddr);
	}
}

struct hyp_shared_pfn {
	u64 pfn;
	int count;
	struct rb_node node;
};

static DEFINE_MUTEX(hyp_shared_pfns_lock);
static struct rb_root hyp_shared_pfns = RB_ROOT;

static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node,
					      struct rb_node **parent)
{
	struct hyp_shared_pfn *this;

	*node = &hyp_shared_pfns.rb_node;
	*parent = NULL;
	while (**node) {
		this = container_of(**node, struct hyp_shared_pfn, node);
		*parent = **node;
		if (this->pfn < pfn)
			*node = &((**node)->rb_left);
		else if (this->pfn > pfn)
			*node = &((**node)->rb_right);
		else
			return this;
	}

	return NULL;
}

static int share_pfn_hyp(u64 pfn)
{
	struct rb_node **node, *parent;
	struct hyp_shared_pfn *this;
	int ret = 0;

	mutex_lock(&hyp_shared_pfns_lock);
	this = find_shared_pfn(pfn, &node, &parent);
	if (this) {
		this->count++;
		goto unlock;
	}

	this = kzalloc(sizeof(*this), GFP_KERNEL);
	if (!this) {
		ret = -ENOMEM;
		goto unlock;
	}

	this->pfn = pfn;
	this->count = 1;
	rb_link_node(&this->node, parent, node);
	rb_insert_color(&this->node, &hyp_shared_pfns);
	ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn, 1);
unlock:
	mutex_unlock(&hyp_shared_pfns_lock);

	return ret;
}

static int unshare_pfn_hyp(u64 pfn)
{
	struct rb_node **node, *parent;
	struct hyp_shared_pfn *this;
	int ret = 0;

	mutex_lock(&hyp_shared_pfns_lock);
	this = find_shared_pfn(pfn, &node, &parent);
	if (WARN_ON(!this)) {
		ret = -ENOENT;
		goto unlock;
	}

	this->count--;
	if (this->count)
		goto unlock;

	rb_erase(&this->node, &hyp_shared_pfns);
	kfree(this);
	ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn, 1);
unlock:
	mutex_unlock(&hyp_shared_pfns_lock);

	return ret;
}

int kvm_share_hyp(void *from, void *to)
{
	phys_addr_t start, end, cur;
	u64 pfn;
	int ret;

	if (is_kernel_in_hyp_mode())
		return 0;

	/*
	 * The share hcall maps things in the 'fixed-offset' region of the hyp
	 * VA space, so we can only share physically contiguous data-structures
	 * for now.
	 */
	if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to))
		return -EINVAL;

	if (kvm_host_owns_hyp_mappings())
		return create_hyp_mappings(from, to, PAGE_HYP);

	start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
	end = PAGE_ALIGN(__pa(to));
	for (cur = start; cur < end; cur += PAGE_SIZE) {
		pfn = __phys_to_pfn(cur);
		ret = share_pfn_hyp(pfn);
		if (ret)
			return ret;
	}

	return 0;
}

void kvm_unshare_hyp(void *from, void *to)
{
	phys_addr_t start, end, cur;
	u64 pfn;

	if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from)
		return;

	start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
	end = PAGE_ALIGN(__pa(to));
	for (cur = start; cur < end; cur += PAGE_SIZE) {
		pfn = __phys_to_pfn(cur);
		WARN_ON(unshare_pfn_hyp(pfn));
	}
}

/**
 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 * @from:	The virtual kernel start address of the range
 * @to:		The virtual kernel end address of the range (exclusive)
 * @prot:	The protection to be applied to this range
 *
 * The same virtual address as the kernel virtual address is also used
 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 * physical pages.
 */
int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
{
	phys_addr_t phys_addr;
	unsigned long virt_addr;
	unsigned long start = kern_hyp_va((unsigned long)from);
	unsigned long end = kern_hyp_va((unsigned long)to);

	if (is_kernel_in_hyp_mode())
		return 0;

	if (!kvm_host_owns_hyp_mappings())
		return -EPERM;

	start = start & PAGE_MASK;
	end = PAGE_ALIGN(end);

	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
		int err;

		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
		err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
					    prot);
		if (err)
			return err;
	}

	return 0;
}

static int __hyp_alloc_private_va_range(unsigned long base)
{
	lockdep_assert_held(&kvm_hyp_pgd_mutex);

	if (!PAGE_ALIGNED(base))
		return -EINVAL;

	/*
	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
	 * allocating the new area, as it would indicate we've
	 * overflowed the idmap/IO address range.
	 */
	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
		return -ENOMEM;

	io_map_base = base;

	return 0;
}

/**
 * hyp_alloc_private_va_range - Allocates a private VA range.
 * @size:	The size of the VA range to reserve.
 * @haddr:	The hypervisor virtual start address of the allocation.
 *
 * The private virtual address (VA) range is allocated below io_map_base
 * and aligned based on the order of @size.
 *
 * Return: 0 on success or negative error code on failure.
 */
int hyp_alloc_private_va_range(size_t size, unsigned long *haddr)
{
	unsigned long base;
	int ret = 0;

	mutex_lock(&kvm_hyp_pgd_mutex);

	/*
	 * This assumes that we have enough space below the idmap
	 * page to allocate our VAs. If not, the check in
	 * __hyp_alloc_private_va_range() will kick. A potential
	 * alternative would be to detect that overflow and switch
	 * to an allocation above the idmap.
	 *
	 * The allocated size is always a multiple of PAGE_SIZE.
	 */
	size = PAGE_ALIGN(size);
	base = io_map_base - size;
	ret = __hyp_alloc_private_va_range(base);

	mutex_unlock(&kvm_hyp_pgd_mutex);

	if (!ret)
		*haddr = base;

	return ret;
}

static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
					unsigned long *haddr,
					enum kvm_pgtable_prot prot)
{
	unsigned long addr;
	int ret = 0;

	if (!kvm_host_owns_hyp_mappings()) {
		addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
					 phys_addr, size, prot);
		if (IS_ERR_VALUE(addr))
			return addr;
		*haddr = addr;

		return 0;
	}

	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
	ret = hyp_alloc_private_va_range(size, &addr);
	if (ret)
		return ret;

	ret = __create_hyp_mappings(addr, size, phys_addr, prot);
	if (ret)
		return ret;

	*haddr = addr + offset_in_page(phys_addr);
	return ret;
}

int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr)
{
	unsigned long base;
	size_t size;
	int ret;

	mutex_lock(&kvm_hyp_pgd_mutex);
	/*
	 * Efficient stack verification using the PAGE_SHIFT bit implies
	 * an alignment of our allocation on the order of the size.
	 */
	size = PAGE_SIZE * 2;
	base = ALIGN_DOWN(io_map_base - size, size);

	ret = __hyp_alloc_private_va_range(base);

	mutex_unlock(&kvm_hyp_pgd_mutex);

	if (ret) {
		kvm_err("Cannot allocate hyp stack guard page\n");
		return ret;
	}

	/*
	 * Since the stack grows downwards, map the stack to the page
	 * at the higher address and leave the lower guard page
	 * unbacked.
	 *
	 * Any valid stack address now has the PAGE_SHIFT bit as 1
	 * and addresses corresponding to the guard page have the
	 * PAGE_SHIFT bit as 0 - this is used for overflow detection.
	 */
	ret = __create_hyp_mappings(base + PAGE_SIZE, PAGE_SIZE, phys_addr,
				    PAGE_HYP);
	if (ret)
		kvm_err("Cannot map hyp stack\n");

	*haddr = base + size;

	return ret;
}

/**
 * create_hyp_io_mappings - Map IO into both kernel and HYP
 * @phys_addr:	The physical start address which gets mapped
 * @size:	Size of the region being mapped
 * @kaddr:	Kernel VA for this mapping
 * @haddr:	HYP VA for this mapping
 */
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
			   void __iomem **kaddr,
			   void __iomem **haddr)
{
	unsigned long addr;
	int ret;

	if (is_protected_kvm_enabled())
		return -EPERM;

	*kaddr = ioremap(phys_addr, size);
	if (!*kaddr)
		return -ENOMEM;

	if (is_kernel_in_hyp_mode()) {
		*haddr = *kaddr;
		return 0;
	}

	ret = __create_hyp_private_mapping(phys_addr, size,
					   &addr, PAGE_HYP_DEVICE);
	if (ret) {
		iounmap(*kaddr);
		*kaddr = NULL;
		*haddr = NULL;
		return ret;
	}

	*haddr = (void __iomem *)addr;
	return 0;
}

/**
 * create_hyp_exec_mappings - Map an executable range into HYP
 * @phys_addr:	The physical start address which gets mapped
 * @size:	Size of the region being mapped
 * @haddr:	HYP VA for this mapping
 */
int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
			     void **haddr)
{
	unsigned long addr;
	int ret;

	BUG_ON(is_kernel_in_hyp_mode());

	ret = __create_hyp_private_mapping(phys_addr, size,
					   &addr, PAGE_HYP_EXEC);
	if (ret) {
		*haddr = NULL;
		return ret;
	}

	*haddr = (void *)addr;
	return 0;
}

static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
	/* We shouldn't need any other callback to walk the PT */
	.phys_to_virt		= kvm_host_va,
};

static int get_user_mapping_size(struct kvm *kvm, u64 addr)
{
	struct kvm_pgtable pgt = {
		.pgd		= (kvm_pteref_t)kvm->mm->pgd,
		.ia_bits	= vabits_actual,
		.start_level	= (KVM_PGTABLE_LAST_LEVEL -
				   ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1),
		.mm_ops		= &kvm_user_mm_ops,
	};
	unsigned long flags;
	kvm_pte_t pte = 0;	/* Keep GCC quiet... */
	s8 level = S8_MAX;
	int ret;

	/*
	 * Disable IRQs so that we hazard against a concurrent
	 * teardown of the userspace page tables (which relies on
	 * IPI-ing threads).
	 */
	local_irq_save(flags);
	ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
	local_irq_restore(flags);

	if (ret)
		return ret;

	/*
	 * Not seeing an error, but not updating level? Something went
	 * deeply wrong...
	 */
	if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL))
		return -EFAULT;
	if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL))
		return -EFAULT;

	/* Oops, the userspace PTs are gone... Replay the fault */
	if (!kvm_pte_valid(pte))
		return -EAGAIN;

	return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
}

static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
	.zalloc_page		= stage2_memcache_zalloc_page,
	.zalloc_pages_exact	= kvm_s2_zalloc_pages_exact,
	.free_pages_exact	= kvm_s2_free_pages_exact,
	.free_unlinked_table	= stage2_free_unlinked_table,
	.get_page		= kvm_host_get_page,
	.put_page		= kvm_s2_put_page,
	.page_count		= kvm_host_page_count,
	.phys_to_virt		= kvm_host_va,
	.virt_to_phys		= kvm_host_pa,
	.dcache_clean_inval_poc	= clean_dcache_guest_page,
	.icache_inval_pou	= invalidate_icache_guest_page,
};

/*
 * Validate the IPA size requested for the VM and derive the VTCR_EL2
 * value for this stage-2 MMU from the sanitised ID registers.
 */
static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
{
	u32 kvm_ipa_limit = get_kvm_ipa_limit();
	u64 mmfr0, mmfr1;
	u32 phys_shift;

	if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
		return -EINVAL;

	phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
	if (is_protected_kvm_enabled()) {
		phys_shift = kvm_ipa_limit;
	} else if (phys_shift) {
		if (phys_shift > kvm_ipa_limit ||
		    phys_shift < ARM64_MIN_PARANGE_BITS)
			return -EINVAL;
	} else {
		phys_shift = KVM_PHYS_SHIFT;
		if (phys_shift > kvm_ipa_limit) {
			pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n",
				     current->comm);
			return -EINVAL;
		}
	}

	mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
	mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
	mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);

	return 0;
}

/**
 * kvm_init_stage2_mmu - Initialise a S2 MMU structure
 * @kvm:	The pointer to the KVM structure
 * @mmu:	The pointer to the s2 MMU structure
 * @type:	The machine type of the virtual machine
 *
 * Allocates only the stage-2 HW PGD level table(s).
 * Note we don't need locking here as this is only called in two cases:
 *
 * - when the VM is created, which can't race against anything
 *
 * - when secondary kvm_s2_mmu structures are initialised for NV
 *   guests, and the caller must hold kvm->lock as this is called on a
 *   per-vcpu basis.
 */
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
{
	int cpu, err;
	struct kvm_pgtable *pgt;

	/*
	 * If we already have our page tables in place, and the MMU
	 * context is the canonical one, we have a bug somewhere, as
	 * this is only supposed to ever happen once per VM.
	 *
	 * Otherwise, we're building nested page tables, and that's
	 * probably because userspace called KVM_ARM_VCPU_INIT more
	 * than once on the same vcpu. Since that's actually legal,
	 * don't kick up a fuss and leave gracefully.
	 */
	if (mmu->pgt != NULL) {
		if (kvm_is_nested_s2_mmu(kvm, mmu))
			return 0;

		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

	err = kvm_init_ipa_range(mmu, type);
	if (err)
		return err;

	pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
	if (!pgt)
		return -ENOMEM;

	mmu->arch = &kvm->arch;
	err = kvm_pgtable_stage2_init(pgt, mmu, &kvm_s2_mm_ops);
	if (err)
		goto out_free_pgtable;

	mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
	if (!mmu->last_vcpu_ran) {
		err = -ENOMEM;
		goto out_destroy_pgtable;
	}

	for_each_possible_cpu(cpu)
		*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;

	/* The eager page splitting is disabled by default */
	mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
	mmu->split_page_cache.gfp_zero = __GFP_ZERO;

	mmu->pgt = pgt;
	mmu->pgd_phys = __pa(pgt->pgd);

	if (kvm_is_nested_s2_mmu(kvm, mmu))
		kvm_init_nested_s2_mmu(mmu);

	return 0;

out_destroy_pgtable:
	kvm_pgtable_stage2_destroy(pgt);
out_free_pgtable:
	kfree(pgt);
	return err;
}

void kvm_uninit_stage2_mmu(struct kvm *kvm)
{
	kvm_free_stage2_pgd(&kvm->arch.mmu);
	kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
}

static void stage2_unmap_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	hva_t hva = memslot->userspace_addr;
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = PAGE_SIZE * memslot->npages;
	hva_t reg_end = hva + size;

	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we should
	 * unmap any of them.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma;
		hva_t vm_start, vm_end;

		vma = find_vma_intersection(current->mm, hva, reg_end);
		if (!vma)
			break;

		/*
		 * Take the intersection of this VMA with the memory region
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (!(vma->vm_flags & VM_PFNMAP)) {
			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
			kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
		}
		hva = vm_end;
	} while (hva < reg_end);
}

/**
 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 * @kvm: The struct kvm pointer
 *
 * Go through the memregions and unmap any regular RAM
 * backing memory already mapped to the VM.
 */
void stage2_unmap_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx, bkt;

	idx = srcu_read_lock(&kvm->srcu);
	mmap_read_lock(current->mm);
	write_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, bkt, slots)
		stage2_unmap_memslot(kvm, memslot);

	kvm_nested_s2_unmap(kvm);

	write_unlock(&kvm->mmu_lock);
	mmap_read_unlock(current->mm);
	srcu_read_unlock(&kvm->srcu, idx);
}

void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	struct kvm_pgtable *pgt = NULL;

	write_lock(&kvm->mmu_lock);
	pgt = mmu->pgt;
	if (pgt) {
		mmu->pgd_phys = 0;
		mmu->pgt = NULL;
		free_percpu(mmu->last_vcpu_ran);
	}
	write_unlock(&kvm->mmu_lock);

	if (pgt) {
		kvm_pgtable_stage2_destroy(pgt);
		kfree(pgt);
	}
}

static void hyp_mc_free_fn(void *addr, void *unused)
{
	free_page((unsigned long)addr);
}

static void *hyp_mc_alloc_fn(void *unused)
{
	return (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
}

void free_hyp_memcache(struct kvm_hyp_memcache *mc)
{
	if (is_protected_kvm_enabled())
		__free_hyp_memcache(mc, hyp_mc_free_fn,
				    kvm_host_va, NULL);
}

int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
{
	if (!is_protected_kvm_enabled())
		return 0;

	return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn,
				    kvm_host_pa, NULL);
}

/**
 * kvm_phys_addr_ioremap - map a device range to guest IPA
 *
 * @kvm:	The KVM pointer
 * @guest_ipa:	The IPA at which to insert the mapping
 * @pa:		The physical address of the device
 * @size:	The size of the mapping
 * @writable:	Whether or not to create a writable mapping
 */
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
			  phys_addr_t pa, unsigned long size, bool writable)
{
	phys_addr_t addr;
	int ret = 0;
	struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
	struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
	struct kvm_pgtable *pgt = mmu->pgt;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
				     KVM_PGTABLE_PROT_R |
				     (writable ? KVM_PGTABLE_PROT_W : 0);

	if (is_protected_kvm_enabled())
		return -EPERM;

	size += offset_in_page(guest_ipa);
	guest_ipa &= PAGE_MASK;

	for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
		ret = kvm_mmu_topup_memory_cache(&cache,
						 kvm_mmu_cache_min_pages(mmu));
		if (ret)
			break;

		write_lock(&kvm->mmu_lock);
		ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
					     &cache, 0);
		write_unlock(&kvm->mmu_lock);
		if (ret)
			break;

		pa += PAGE_SIZE;
	}

	kvm_mmu_free_memory_cache(&cache);
	return ret;
}

/**
 * kvm_stage2_wp_range() - write protect stage2 memory region range
 * @mmu:	The KVM stage-2 MMU pointer
 * @addr:	Start address of range
 * @end:	End address of range
 */
void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
	stage2_apply_range_resched(mmu, addr, end, kvm_pgtable_stage2_wrprotect);
}

/**
 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
 * @kvm:	The KVM pointer
 * @slot:	The memory slot to write protect
 *
 * Called to start logging dirty pages after memory region
 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
 * all present PUD, PMD and PTEs are write protected in the memory region.
 * Afterwards read of dirty page log can be called.
 *
 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
 * serializing operations for VM memory regions.
 */
static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
	phys_addr_t start, end;

	if (WARN_ON_ONCE(!memslot))
		return;

	start = memslot->base_gfn << PAGE_SHIFT;
	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

	write_lock(&kvm->mmu_lock);
	kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
	kvm_nested_s2_wp(kvm);
	write_unlock(&kvm->mmu_lock);
	kvm_flush_remote_tlbs_memslot(kvm, memslot);
}

/**
 * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE
 *				   pages for memory slot
 * @kvm:	The KVM pointer
 * @slot:	The memory slot to split
 *
 * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired,
 * serializing operations for VM memory regions.
 */
static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	phys_addr_t start, end;

	lockdep_assert_held(&kvm->slots_lock);

	slots = kvm_memslots(kvm);
	memslot = id_to_memslot(slots, slot);

	start = memslot->base_gfn << PAGE_SHIFT;
	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

	write_lock(&kvm->mmu_lock);
	kvm_mmu_split_huge_pages(kvm, start, end);
	write_unlock(&kvm->mmu_lock);
}

/*
 * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages.
 * @kvm:	The KVM pointer
 * @slot:	The memory slot associated with mask
 * @gfn_offset:	The gfn offset in memory slot
 * @mask:	The mask of pages at offset 'gfn_offset' in this memory
 *		slot to enable dirty logging on
 *
 * Writes protect selected pages to enable dirty logging, and then
 * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		gfn_t gfn_offset, unsigned long mask)
{
	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
	phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;

	lockdep_assert_held_write(&kvm->mmu_lock);

	kvm_stage2_wp_range(&kvm->arch.mmu, start, end);

	/*
	 * Eager-splitting is done when manual-protect is set.  We
	 * also check for initially-all-set because we can avoid
	 * eager-splitting if initially-all-set is false.
	 * Initially-all-set equal false implies that huge-pages were
	 * already split when enabling dirty logging: no need to do it
	 * again.
	 */
	if (kvm_dirty_log_manual_protect_and_init_set(kvm))
		kvm_mmu_split_huge_pages(kvm, start, end);

	kvm_nested_s2_wp(kvm);
}

static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
{
	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
}

static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
					       unsigned long hva,
					       unsigned long map_size)
{
	gpa_t gpa_start;
	hva_t uaddr_start, uaddr_end;
	size_t size;

	/* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
	if (map_size == PAGE_SIZE)
		return true;

	size = memslot->npages * PAGE_SIZE;

	gpa_start = memslot->base_gfn << PAGE_SHIFT;

	uaddr_start = memslot->userspace_addr;
	uaddr_end = uaddr_start + size;

	/*
	 * Pages belonging to memslots that don't have the same alignment
	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
	 *
	 * Consider a layout like the following:
	 *
	 *    memslot->userspace_addr:
	 *    +-----+--------------------+--------------------+---+
	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
	 *    +-----+--------------------+--------------------+---+
	 *
	 *    memslot->base_gfn << PAGE_SHIFT:
	 *      +---+--------------------+--------------------+-----+
	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
	 *      +---+--------------------+--------------------+-----+
	 *
	 * If we create those stage-2 blocks, we'll end up with this incorrect
	 * mapping:
	 *   d -> f
	 *   e -> g
	 *   f -> h
	 */
	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
		return false;

	/*
	 * Next, let's make sure we're not trying to map anything not covered
	 * by the memslot. This means we have to prohibit block size mappings
	 * for the beginning and end of a non-block aligned and non-block sized
	 * memory slot (illustrated by the head and tail parts of the
	 * userspace view above containing pages 'abcde' and 'xyz',
	 * respectively).
	 *
	 * Note that it doesn't matter if we do the check using the
	 * userspace_addr or the base_gfn, as both are equally aligned (per
	 * the check above) and equally sized.
	 */
	return (hva & ~(map_size - 1)) >= uaddr_start &&
	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
}

/*
 * Check if the given hva is backed by a transparent huge page (THP) and
 * whether it can be mapped using block mapping in stage2. If so, adjust
 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
 * supported. This will need to be updated to support other THP sizes.
 *
 * Returns the size of the mapping.
 */
static long
transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
			    unsigned long hva, kvm_pfn_t *pfnp,
			    phys_addr_t *ipap)
{
	kvm_pfn_t pfn = *pfnp;

	/*
	 * Make sure the adjustment is done only for THP pages. Also make
	 * sure that the HVA and IPA are sufficiently aligned and that the
	 * block map is contained within the memslot.
	 */
	if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
		int sz = get_user_mapping_size(kvm, hva);

		if (sz < 0)
			return sz;

		if (sz < PMD_SIZE)
			return PAGE_SIZE;

		*ipap &= PMD_MASK;
		pfn &= ~(PTRS_PER_PMD - 1);
		*pfnp = pfn;

		return PMD_SIZE;
	}

	/* Use page mapping if we cannot use block mapping. */
	return PAGE_SIZE;
}

/*
 * Return the page shift used to back @hva: the hugetlb page shift for
 * hugetlbfs VMAs, PUD/PMD shifts for suitably sized and aligned VM_PFNMAP
 * regions, or PAGE_SHIFT otherwise.
 */
static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
{
	unsigned long pa;

	if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
		return huge_page_shift(hstate_vma(vma));

	if (!(vma->vm_flags & VM_PFNMAP))
		return PAGE_SHIFT;

	VM_BUG_ON(is_vm_hugetlb_page(vma));

	pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);

#ifndef __PAGETABLE_PMD_FOLDED
	if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
	    ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
	    ALIGN(hva, PUD_SIZE) <= vma->vm_end)
		return PUD_SHIFT;
#endif

	if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
	    ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
	    ALIGN(hva, PMD_SIZE) <= vma->vm_end)
		return PMD_SHIFT;

	return PAGE_SHIFT;
}

/*
 * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be
 * able to see the page's tags and therefore they must be initialised first. If
 * PG_mte_tagged is set, tags have already been initialised.
 *
 * The race in the test/set of the PG_mte_tagged flag is handled by:
 * - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs
 *   racing to sanitise the same page
 * - mmap_lock protects between a VM faulting a page in and the VMM performing
 *   an mprotect() to add VM_MTE
 */
static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
			      unsigned long size)
{
	unsigned long i, nr_pages = size >> PAGE_SHIFT;
	struct page *page = pfn_to_page(pfn);

	if (!kvm_has_mte(kvm))
		return;

	for (i = 0; i < nr_pages; i++, page++) {
		if (try_page_mte_tagging(page)) {
			mte_clear_page_tags(page_address(page));
			set_page_mte_tagged(page);
		}
	}
}

static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
{
	return vma->vm_flags & VM_MTE_ALLOWED;
}

/*
 * Handle a stage-2 fault on memory backed by a memslot: resolve the host
 * mapping for @hva, work out the largest mapping size that the memslot,
 * the VMA and (for nested guests) the guest's own stage-2 permit, and
 * install or relax the corresponding stage-2 entry.
 */
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			  struct kvm_s2_trans *nested,
			  struct kvm_memory_slot *memslot, unsigned long hva,
			  bool fault_is_perm)
{
	int ret = 0;
	bool write_fault, writable, force_pte = false;
	bool exec_fault, mte_allowed;
	bool device = false, vfio_allow_any_uc = false;
	unsigned long mmu_seq;
	phys_addr_t ipa = fault_ipa;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
	struct vm_area_struct *vma;
	short vma_shift;
	gfn_t gfn;
	kvm_pfn_t pfn;
	bool logging_active = memslot_is_logging(memslot);
	long vma_pagesize, fault_granule;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
	struct kvm_pgtable *pgt;

	if (fault_is_perm)
		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
	write_fault = kvm_is_write_fault(vcpu);
	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
	VM_BUG_ON(write_fault && exec_fault);

	if (fault_is_perm && !write_fault && !exec_fault) {
		kvm_err("Unexpected L2 read permission error\n");
		return -EFAULT;
	}

	/*
	 * Permission faults just need to update the existing leaf entry,
	 * and so normally don't require allocations from the memcache. The
	 * only exception to this is when dirty logging is enabled at runtime
	 * and a write fault needs to collapse a block entry into a table.
	 */
	if (!fault_is_perm || (logging_active && write_fault)) {
		ret = kvm_mmu_topup_memory_cache(memcache,
						 kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu));
		if (ret)
			return ret;
	}

	/*
	 * Let's check if we will get back a huge page backed by hugetlbfs, or
	 * get block mapping for device MMIO region.
	 */
	mmap_read_lock(current->mm);
	vma = vma_lookup(current->mm, hva);
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
		mmap_read_unlock(current->mm);
		return -EFAULT;
	}

	/*
	 * logging_active is guaranteed to never be true for VM_PFNMAP
	 * memslots.
	 */
	if (logging_active) {
		force_pte = true;
		vma_shift = PAGE_SHIFT;
	} else {
		vma_shift = get_vma_page_shift(vma, hva);
	}

	switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
	case PUD_SHIFT:
		if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
			break;
		fallthrough;
#endif
	case CONT_PMD_SHIFT:
		vma_shift = PMD_SHIFT;
		fallthrough;
	case PMD_SHIFT:
		if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
			break;
		fallthrough;
	case CONT_PTE_SHIFT:
		vma_shift = PAGE_SHIFT;
		force_pte = true;
		fallthrough;
	case PAGE_SHIFT:
		break;
	default:
		WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
	}

	vma_pagesize = 1UL << vma_shift;

	if (nested) {
		unsigned long max_map_size;

		max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE;

		ipa = kvm_s2_trans_output(nested);

		/*
		 * If we're about to create a shadow stage 2 entry, then we
		 * can only create a block mapping if the guest stage 2 page
		 * table uses at least as big a mapping.
		 */
		max_map_size = min(kvm_s2_trans_size(nested), max_map_size);

		/*
		 * Be careful that if the mapping size falls between
		 * two host sizes, take the smallest of the two.
		 */
		if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE)
			max_map_size = PMD_SIZE;
		else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE)
			max_map_size = PAGE_SIZE;

		force_pte = (max_map_size == PAGE_SIZE);
		vma_pagesize = min(vma_pagesize, (long)max_map_size);
	}

	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
		fault_ipa &= ~(vma_pagesize - 1);

	gfn = ipa >> PAGE_SHIFT;
	mte_allowed = kvm_vma_mte_allowed(vma);

	vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;

	/* Don't use the VMA after the unlock -- it may have vanished */
	vma = NULL;

	/*
	 * Read mmu_invalidate_seq so that KVM can detect if the results of
	 * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to
	 * acquiring kvm->mmu_lock.
	 *
	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
	 * with the smp_wmb() in kvm_mmu_invalidate_end().
	 */
	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
	mmap_read_unlock(current->mm);

	pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
				   write_fault, &writable, NULL);
	if (pfn == KVM_PFN_ERR_HWPOISON) {
		kvm_send_hwpoison_signal(hva, vma_shift);
		return 0;
	}
	if (is_error_noslot_pfn(pfn))
		return -EFAULT;

	if (kvm_is_device_pfn(pfn)) {
		/*
		 * If the page was identified as device early by looking at
		 * the VMA flags, vma_pagesize is already representing the
		 * largest quantity we can map.  If instead it was mapped
		 * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
		 * and must not be upgraded.
		 *
		 * In both cases, we don't let transparent_hugepage_adjust()
		 * change things at the last minute.
		 */
		device = true;
	} else if (logging_active && !write_fault) {
		/*
		 * Only actually map the page as writable if this was a write
		 * fault.
		 */
		writable = false;
	}

	if (exec_fault && device)
		return -ENOEXEC;

	/*
	 * Potentially reduce shadow S2 permissions to match the guest's own
	 * S2. For exec faults, we'd only reach this point if the guest
	 * actually allowed it (see kvm_s2_handle_perm_fault).
	 *
	 * Also encode the level of the original translation in the SW bits
	 * of the leaf entry as a proxy for the span of that translation.
	 * This will be retrieved on TLB invalidation from the guest and
	 * used to limit the invalidation scope if a TTL hint or a range
	 * isn't provided.
	 */
	if (nested) {
		writable &= kvm_s2_trans_writable(nested);
		if (!kvm_s2_trans_readable(nested))
			prot &= ~KVM_PGTABLE_PROT_R;

		prot |= kvm_encode_nested_level(nested);
	}

	read_lock(&kvm->mmu_lock);
	pgt = vcpu->arch.hw_mmu->pgt;
	if (mmu_invalidate_retry(kvm, mmu_seq)) {
		ret = -EAGAIN;
		goto out_unlock;
	}

	/*
	 * If we are not forced to use page mapping, check if we are
	 * backed by a THP and thus use block mapping if possible.
	 */
	if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
		if (fault_is_perm && fault_granule > PAGE_SIZE)
			vma_pagesize = fault_granule;
		else
			vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
								   hva, &pfn,
								   &fault_ipa);

		if (vma_pagesize < 0) {
			ret = vma_pagesize;
			goto out_unlock;
		}
	}

	if (!fault_is_perm && !device && kvm_has_mte(kvm)) {
		/* Check the VMM hasn't introduced a new disallowed VMA */
		if (mte_allowed) {
			sanitise_mte_tags(kvm, pfn, vma_pagesize);
		} else {
			ret = -EFAULT;
			goto out_unlock;
		}
	}

	if (writable)
		prot |= KVM_PGTABLE_PROT_W;

	if (exec_fault)
		prot |= KVM_PGTABLE_PROT_X;

	if (device) {
		if (vfio_allow_any_uc)
			prot |= KVM_PGTABLE_PROT_NORMAL_NC;
		else
			prot |= KVM_PGTABLE_PROT_DEVICE;
	} else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
		   (!nested || kvm_s2_trans_executable(nested))) {
		prot |= KVM_PGTABLE_PROT_X;
	}

	/*
	 * Under the premise of getting a FSC_PERM fault, we just need to relax
	 * permissions only if vma_pagesize equals fault_granule. Otherwise,
	 * kvm_pgtable_stage2_map() should be called to change block size.
	 */
	if (fault_is_perm && vma_pagesize == fault_granule) {
		/*
		 * Drop the SW bits in favour of those stored in the
		 * PTE, which will be preserved.
		 */
		prot &= ~KVM_NV_GUEST_MAP_SZ;
		ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
	} else {
		ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
					     __pfn_to_phys(pfn), prot,
					     memcache,
					     KVM_PGTABLE_WALK_HANDLE_FAULT |
					     KVM_PGTABLE_WALK_SHARED);
	}

out_unlock:
	read_unlock(&kvm->mmu_lock);

	/* Mark the page dirty only if the fault is handled successfully */
	if (writable && !ret) {
		kvm_set_pfn_dirty(pfn);
		mark_page_dirty_in_slot(kvm, memslot, gfn);
	}

	kvm_release_pfn_clean(pfn);
	return ret != -EAGAIN ? ret : 0;
}

/* Resolve the access fault by making the page young again. */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
	kvm_pte_t pte;
	struct kvm_s2_mmu *mmu;

	trace_kvm_access_fault(fault_ipa);

	read_lock(&vcpu->kvm->mmu_lock);
	mmu = vcpu->arch.hw_mmu;
	pte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
	read_unlock(&vcpu->kvm->mmu_lock);

	if (kvm_pte_valid(pte))
		kvm_set_pfn_accessed(kvm_pte_to_pfn(pte));
}

/**
 * kvm_handle_guest_abort - handles all 2nd stage aborts
 * @vcpu:	the VCPU pointer
 *
 * Any abort that gets to the host is almost guaranteed to be caused by a
 * missing second stage translation table entry, which can mean that either the
 * guest simply needs more memory and we must allocate an appropriate page or it
 * can mean that the guest tried to access I/O memory, which is emulated by user
 * space. The distinction is based on the IPA causing the fault and whether this
 * memory region has been registered as standard RAM by user space.
 */
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
{
	struct kvm_s2_trans nested_trans, *nested = NULL;
	unsigned long esr;
	phys_addr_t fault_ipa; /* The address we faulted on */
	phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */
	struct kvm_memory_slot *memslot;
	unsigned long hva;
	bool is_iabt, write_fault, writable;
	gfn_t gfn;
	int ret, idx;

	esr = kvm_vcpu_get_esr(vcpu);

	ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);

	if (esr_fsc_is_translation_fault(esr)) {
		/* Beyond sanitised PARange (which is the IPA limit) */
		if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
			kvm_inject_size_fault(vcpu);
			return 1;
		}

		/* Falls between the IPA range and the PARange? */
		if (fault_ipa >= BIT_ULL(vcpu->arch.hw_mmu->pgt->ia_bits)) {
			fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);

			if (is_iabt)
				kvm_inject_pabt(vcpu, fault_ipa);
			else
				kvm_inject_dabt(vcpu, fault_ipa);
			return 1;
		}
	}

	/* Synchronous External Abort? */
	if (kvm_vcpu_abt_issea(vcpu)) {
		/*
		 * For RAS the host kernel may handle this abort.
		 * There is no need to pass the error into the guest.
		 */
		if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
			kvm_inject_vabt(vcpu);

		return 1;
	}

	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
			      kvm_vcpu_get_hfar(vcpu), fault_ipa);

	/* Check the stage-2 fault is trans. fault or write fault */
	if (!esr_fsc_is_translation_fault(esr) &&
	    !esr_fsc_is_permission_fault(esr) &&
	    !esr_fsc_is_access_flag_fault(esr)) {
		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
			kvm_vcpu_trap_get_class(vcpu),
			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
			(unsigned long)kvm_vcpu_get_esr(vcpu));
		return -EFAULT;
	}

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	/*
	 * We may have faulted on a shadow stage 2 page table if we are
	 * running a nested guest.  In this case, we have to resolve the L2
	 * IPA to the L1 IPA first, before knowing what kind of memory should
	 * back the L1 IPA.
	 *
	 * If the shadow stage 2 page table walk faults, then we simply inject
	 * this to the guest and carry on.
	 *
	 * If there are no shadow S2 PTs because S2 is disabled, there is
	 * nothing to walk and we treat it as a 1:1 before going through the
	 * canonical translation.
	 */
	if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu) &&
	    vcpu->arch.hw_mmu->nested_stage2_enabled) {
		u32 esr;

		ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
		if (ret) {
			esr = kvm_s2_trans_esr(&nested_trans);
			kvm_inject_s2_fault(vcpu, esr);
			goto out_unlock;
		}

		ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans);
		if (ret) {
			esr = kvm_s2_trans_esr(&nested_trans);
			kvm_inject_s2_fault(vcpu, esr);
			goto out_unlock;
		}

		ipa = kvm_s2_trans_output(&nested_trans);
		nested = &nested_trans;
	}

	gfn = ipa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(vcpu->kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
	write_fault = kvm_is_write_fault(vcpu);
	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
		/*
		 * The guest has put either its instructions or its page-tables
		 * somewhere it shouldn't have. Userspace won't be able to do
		 * anything about this (there's no syndrome for a start), so
		 * re-inject the abort back into the guest.
		 */
		if (is_iabt) {
			ret = -ENOEXEC;
			goto out;
		}

		if (kvm_vcpu_abt_iss1tw(vcpu)) {
			kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
			ret = 1;
			goto out_unlock;
		}

		/*
		 * Check for a cache maintenance operation. Since we
		 * ended-up here, we know it is outside of any memory
		 * slot. But we can't find out if that is for a device,
		 * or if the guest is just being stupid. The only thing
		 * we know for sure is that this range cannot be cached.
		 *
		 * So let's assume that the guest is just being
		 * cautious, and skip the instruction.
		 */
		if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
			kvm_incr_pc(vcpu);
			ret = 1;
			goto out_unlock;
		}

		/*
		 * The IPA is reported as [MAX:12], so we need to
		 * complement it with the bottom 12 bits from the
		 * faulting VA. This is always 12 bits, irrespective
		 * of the page size.
		 */
		ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
		ret = io_mem_abort(vcpu, ipa);
		goto out_unlock;
	}

	/* Userspace should not be able to register out-of-bounds IPAs */
	VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu));

	if (esr_fsc_is_access_flag_fault(esr)) {
		handle_access_fault(vcpu, fault_ipa);
		ret = 1;
		goto out_unlock;
	}

	ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
			     esr_fsc_is_permission_fault(esr));
	if (ret == 0)
		ret = 1;
out:
	if (ret == -ENOEXEC) {
		kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
		ret = 1;
	}
out_unlock:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	return ret;
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	if (!kvm->arch.mmu.pgt)
		return false;

	__unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
			     (range->end - range->start) << PAGE_SHIFT,
			     range->may_block);

	kvm_nested_s2_unmap(kvm);
	return false;
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	u64 size = (range->end - range->start) << PAGE_SHIFT;

	if (!kvm->arch.mmu.pgt)
		return false;

	return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
						   range->start << PAGE_SHIFT,
						   size, true);
	/*
	 * TODO: Handle nested_mmu structures here using the reverse mapping in
	 * a later version of patch series.
	 */
}

bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	u64 size = (range->end - range->start) << PAGE_SHIFT;

	if (!kvm->arch.mmu.pgt)
		return false;

	return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
						   range->start << PAGE_SHIFT,
						   size, false);
}

phys_addr_t kvm_mmu_get_httbr(void)
{
	return __pa(hyp_pgtable->pgd);
}

phys_addr_t kvm_get_idmap_vector(void)
{
	return hyp_idmap_vector;
}

static int kvm_map_idmap_text(void)
{
	unsigned long size = hyp_idmap_end - hyp_idmap_start;
	int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
					PAGE_HYP_EXEC);
	if (err)
		kvm_err("Failed to idmap %lx-%lx\n",
			hyp_idmap_start, hyp_idmap_end);

	return err;
}

static void *kvm_hyp_zalloc_page(void *arg)
{
	return (void *)get_zeroed_page(GFP_KERNEL);
}

static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
	.zalloc_page		= kvm_hyp_zalloc_page,
	.get_page		= kvm_host_get_page,
	.put_page		= kvm_host_put_page,
	.phys_to_virt		= kvm_host_va,
	.virt_to_phys		= kvm_host_pa,
};

int __init kvm_mmu_init(u32 *hyp_va_bits)
{
	int err;
	u32 idmap_bits;
	u32 kernel_bits;

	hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
	hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
	hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);

	/*
	 * We rely on the linker script to ensure at build time that the HYP
	 * init code does not cross a page boundary.
1990 */ 1991 BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); 1992 1993 /* 1994 * The ID map is always configured for 48 bits of translation, which 1995 * may be fewer than the number of VA bits used by the regular kernel 1996 * stage 1, when VA_BITS=52. 1997 * 1998 * At EL2, there is only one TTBR register, and we can't switch between 1999 * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom 2000 * line: we need to use the extended range with *both* our translation 2001 * tables. 2002 * 2003 * So use the maximum of the idmap VA bits and the regular kernel stage 2004 * 1 VA bits to assure that the hypervisor can both ID map its code page 2005 * and map any kernel memory. 2006 */ 2007 idmap_bits = IDMAP_VA_BITS; 2008 kernel_bits = vabits_actual; 2009 *hyp_va_bits = max(idmap_bits, kernel_bits); 2010 2011 kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits); 2012 kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); 2013 kvm_debug("HYP VA range: %lx:%lx\n", 2014 kern_hyp_va(PAGE_OFFSET), 2015 kern_hyp_va((unsigned long)high_memory - 1)); 2016 2017 if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) && 2018 hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) && 2019 hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) { 2020 /* 2021 * The idmap page is intersecting with the VA space, 2022 * it is not safe to continue further. 2023 */ 2024 kvm_err("IDMAP intersecting with HYP VA, unable to continue\n"); 2025 err = -EINVAL; 2026 goto out; 2027 } 2028 2029 hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL); 2030 if (!hyp_pgtable) { 2031 kvm_err("Hyp mode page-table not allocated\n"); 2032 err = -ENOMEM; 2033 goto out; 2034 } 2035 2036 err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops); 2037 if (err) 2038 goto out_free_pgtable; 2039 2040 err = kvm_map_idmap_text(); 2041 if (err) 2042 goto out_destroy_pgtable; 2043 2044 io_map_base = hyp_idmap_start; 2045 return 0; 2046 2047 out_destroy_pgtable: 2048 kvm_pgtable_hyp_destroy(hyp_pgtable); 2049 out_free_pgtable: 2050 kfree(hyp_pgtable); 2051 hyp_pgtable = NULL; 2052 out: 2053 return err; 2054 } 2055 2056 void kvm_arch_commit_memory_region(struct kvm *kvm, 2057 struct kvm_memory_slot *old, 2058 const struct kvm_memory_slot *new, 2059 enum kvm_mr_change change) 2060 { 2061 bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES; 2062 2063 /* 2064 * At this point memslot has been committed and there is an 2065 * allocated dirty_bitmap[], dirty pages will be tracked while the 2066 * memory slot is write protected. 2067 */ 2068 if (log_dirty_pages) { 2069 2070 if (change == KVM_MR_DELETE) 2071 return; 2072 2073 /* 2074 * Huge and normal pages are write-protected and split 2075 * on either of these two cases: 2076 * 2077 * 1. with initial-all-set: gradually with CLEAR ioctls, 2078 */ 2079 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) 2080 return; 2081 /* 2082 * or 2083 * 2. without initial-all-set: all in one shot when 2084 * enabling dirty logging. 2085 */ 2086 kvm_mmu_wp_memory_region(kvm, new->id); 2087 kvm_mmu_split_memory_region(kvm, new->id); 2088 } else { 2089 /* 2090 * Free any leftovers from the eager page splitting cache. Do 2091 * this when deleting, moving, disabling dirty logging, or 2092 * creating the memslot (a nop). Doing it for deletes makes 2093 * sure we don't leak memory, and there's no need to keep the 2094 * cache around for any of the other cases. 
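 * The cache is simply refilled on demand the next time eager page
 * splitting needs it, so there is no downside to freeing it here.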
2095 */ 2096 kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); 2097 } 2098 } 2099 2100 int kvm_arch_prepare_memory_region(struct kvm *kvm, 2101 const struct kvm_memory_slot *old, 2102 struct kvm_memory_slot *new, 2103 enum kvm_mr_change change) 2104 { 2105 hva_t hva, reg_end; 2106 int ret = 0; 2107 2108 if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && 2109 change != KVM_MR_FLAGS_ONLY) 2110 return 0; 2111 2112 /* 2113 * Prevent userspace from creating a memory region outside of the IPA 2114 * space addressable by the KVM guest IPA space. 2115 */ 2116 if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT)) 2117 return -EFAULT; 2118 2119 hva = new->userspace_addr; 2120 reg_end = hva + (new->npages << PAGE_SHIFT); 2121 2122 mmap_read_lock(current->mm); 2123 /* 2124 * A memory region could potentially cover multiple VMAs, and any holes 2125 * between them, so iterate over all of them. 2126 * 2127 * +--------------------------------------------+ 2128 * +---------------+----------------+ +----------------+ 2129 * | : VMA 1 | VMA 2 | | VMA 3 : | 2130 * +---------------+----------------+ +----------------+ 2131 * | memory region | 2132 * +--------------------------------------------+ 2133 */ 2134 do { 2135 struct vm_area_struct *vma; 2136 2137 vma = find_vma_intersection(current->mm, hva, reg_end); 2138 if (!vma) 2139 break; 2140 2141 if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) { 2142 ret = -EINVAL; 2143 break; 2144 } 2145 2146 if (vma->vm_flags & VM_PFNMAP) { 2147 /* IO region dirty page logging not allowed */ 2148 if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { 2149 ret = -EINVAL; 2150 break; 2151 } 2152 } 2153 hva = min(reg_end, vma->vm_end); 2154 } while (hva < reg_end); 2155 2156 mmap_read_unlock(current->mm); 2157 return ret; 2158 } 2159 2160 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) 2161 { 2162 } 2163 2164 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) 2165 { 2166 } 2167 2168 void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 2169 struct kvm_memory_slot *slot) 2170 { 2171 gpa_t gpa = slot->base_gfn << PAGE_SHIFT; 2172 phys_addr_t size = slot->npages << PAGE_SHIFT; 2173 2174 write_lock(&kvm->mmu_lock); 2175 kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size); 2176 kvm_nested_s2_unmap(kvm); 2177 write_unlock(&kvm->mmu_lock); 2178 } 2179 2180 /* 2181 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). 2182 * 2183 * Main problems: 2184 * - S/W ops are local to a CPU (not broadcast) 2185 * - We have line migration behind our back (speculation) 2186 * - System caches don't support S/W at all (damn!) 2187 * 2188 * In the face of the above, the best we can do is to try and convert 2189 * S/W ops to VA ops. Because the guest is not allowed to infer the 2190 * S/W to PA mapping, it can only use S/W to nuke the whole cache, 2191 * which is a rather good thing for us. 2192 * 2193 * Also, it is only used when turning caches on/off ("The expected 2194 * usage of the cache maintenance instructions that operate by set/way 2195 * is associated with the cache maintenance instructions associated 2196 * with the powerdown and powerup of caches, if this is required by 2197 * the implementation."). 2198 * 2199 * We use the following policy: 2200 * 2201 * - If we trap a S/W operation, we enable VM trapping to detect 2202 * caches being turned on/off, and do a full clean. 2203 * 2204 * - We flush the caches on both caches being turned on and off. 
2205 * 2206 * - Once the caches are enabled, we stop trapping VM ops. 2207 */ 2208 void kvm_set_way_flush(struct kvm_vcpu *vcpu) 2209 { 2210 unsigned long hcr = *vcpu_hcr(vcpu); 2211 2212 /* 2213 * If this is the first time we do a S/W operation 2214 * (i.e. HCR_TVM not set) flush the whole memory, and set the 2215 * VM trapping. 2216 * 2217 * Otherwise, rely on the VM trapping to wait for the MMU + 2218 * Caches to be turned off. At that point, we'll be able to 2219 * clean the caches again. 2220 */ 2221 if (!(hcr & HCR_TVM)) { 2222 trace_kvm_set_way_flush(*vcpu_pc(vcpu), 2223 vcpu_has_cache_enabled(vcpu)); 2224 stage2_flush_vm(vcpu->kvm); 2225 *vcpu_hcr(vcpu) = hcr | HCR_TVM; 2226 } 2227 } 2228 2229 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled) 2230 { 2231 bool now_enabled = vcpu_has_cache_enabled(vcpu); 2232 2233 /* 2234 * If switching the MMU+caches on, need to invalidate the caches. 2235 * If switching it off, need to clean the caches. 2236 * Clean + invalidate does the trick always. 2237 */ 2238 if (now_enabled != was_enabled) 2239 stage2_flush_vm(vcpu->kvm); 2240 2241 /* Caches are now on, stop trapping VM ops (until a S/W op) */ 2242 if (now_enabled) 2243 *vcpu_hcr(vcpu) &= ~HCR_TVM; 2244 2245 trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled); 2246 } 2247