// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2020 - Google LLC
 * Author: Quentin Perret <qperret@google.com>
 */

#include <linux/init.h>
#include <linux/interval_tree_generic.h>
#include <linux/kmemleak.h>
#include <linux/kvm_host.h>
#include <asm/kvm_mmu.h>
#include <linux/memblock.h>
#include <linux/mutex.h>

#include <asm/kvm_pkvm.h>

#include "hyp_constants.h"

DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);

static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory);
static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr);

phys_addr_t hyp_mem_base;
phys_addr_t hyp_mem_size;

static int __init register_memblock_regions(void)
{
	struct memblock_region *reg;

	for_each_mem_region(reg) {
		if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS)
			return -ENOMEM;

		hyp_memory[*hyp_memblock_nr_ptr] = *reg;
		(*hyp_memblock_nr_ptr)++;
	}

	return 0;
}

void __init kvm_hyp_reserve(void)
{
	u64 hyp_mem_pages = 0;
	int ret;

	if (!is_hyp_mode_available() || is_kernel_in_hyp_mode())
		return;

	if (kvm_get_mode() != KVM_MODE_PROTECTED)
		return;

	ret = register_memblock_regions();
	if (ret) {
		*hyp_memblock_nr_ptr = 0;
		kvm_err("Failed to register hyp memblocks: %d\n", ret);
		return;
	}

	hyp_mem_pages += hyp_s1_pgtable_pages();
	hyp_mem_pages += host_s2_pgtable_pages();
	hyp_mem_pages += hyp_vm_table_pages();
	hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE);
	hyp_mem_pages += pkvm_selftest_pages();
	hyp_mem_pages += hyp_ffa_proxy_pages();

	/*
	 * Try to allocate a PMD-aligned region to reduce TLB pressure once
	 * this is unmapped from the host stage-2, and fall back to PAGE_SIZE.
	 */
	hyp_mem_size = hyp_mem_pages << PAGE_SHIFT;
	hyp_mem_base = memblock_phys_alloc(ALIGN(hyp_mem_size, PMD_SIZE),
					   PMD_SIZE);
	if (!hyp_mem_base)
		hyp_mem_base = memblock_phys_alloc(hyp_mem_size, PAGE_SIZE);
	else
		hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE);

	if (!hyp_mem_base) {
		kvm_err("Failed to reserve hyp memory\n");
		return;
	}

	kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20,
		 hyp_mem_base);
}
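
/*
 * Undo the EL2 side of VM creation: finalize the teardown if the hyp VM has
 * been created, or simply unreserve the handle if only the reservation had
 * succeeded, then release any memcache pages still held by the host.
 */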
static void __pkvm_destroy_hyp_vm(struct kvm *kvm)
{
	if (pkvm_hyp_vm_is_created(kvm)) {
		WARN_ON(kvm_call_hyp_nvhe(__pkvm_finalize_teardown_vm,
					  kvm->arch.pkvm.handle));
	} else if (kvm->arch.pkvm.handle) {
		/*
		 * The VM could have been reserved but hyp initialization has
		 * failed. Make sure to unreserve it.
		 */
		kvm_call_hyp_nvhe(__pkvm_unreserve_vm, kvm->arch.pkvm.handle);
	}

	kvm->arch.pkvm.handle = 0;
	kvm->arch.pkvm.is_created = false;
	free_hyp_memcache(&kvm->arch.pkvm.teardown_mc);
	free_hyp_memcache(&kvm->arch.pkvm.stage2_teardown_mc);
}

static int __pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu)
{
	size_t hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE);
	pkvm_handle_t handle = vcpu->kvm->arch.pkvm.handle;
	void *hyp_vcpu;
	int ret;

	vcpu->arch.pkvm_memcache.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2;

	hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT);
	if (!hyp_vcpu)
		return -ENOMEM;

	ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, vcpu, hyp_vcpu);
	if (!ret)
		vcpu_set_flag(vcpu, VCPU_PKVM_FINALIZED);
	else
		free_pages_exact(hyp_vcpu, hyp_vcpu_sz);

	return ret;
}

/*
 * Allocates and donates memory for hypervisor VM structs at EL2.
 *
 * Allocates space for the VM state, which includes the hyp vm as well as
 * the hyp vcpus.
 *
 * Stores an opaque handle in the kvm struct for future reference.
 *
 * Return 0 on success, negative error code on failure.
 */
static int __pkvm_create_hyp_vm(struct kvm *kvm)
{
	size_t pgd_sz, hyp_vm_sz;
	void *pgd, *hyp_vm;
	int ret;

	if (kvm->created_vcpus < 1)
		return -EINVAL;

	pgd_sz = kvm_pgtable_stage2_pgd_size(kvm->arch.mmu.vtcr);

	/*
	 * The PGD pages will be reclaimed using a hyp_memcache which implies
	 * page granularity. So, use alloc_pages_exact() to get individual
	 * refcounts.
	 */
	pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT);
	if (!pgd)
		return -ENOMEM;

	/* Allocate memory to donate to hyp for vm and vcpu pointers. */
	hyp_vm_sz = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE,
					size_mul(sizeof(void *),
						 kvm->created_vcpus)));
	hyp_vm = alloc_pages_exact(hyp_vm_sz, GFP_KERNEL_ACCOUNT);
	if (!hyp_vm) {
		ret = -ENOMEM;
		goto free_pgd;
	}

	/* Donate the VM memory to hyp and let hyp initialize it. */
	ret = kvm_call_hyp_nvhe(__pkvm_init_vm, kvm, hyp_vm, pgd);
	if (ret)
		goto free_vm;

	kvm->arch.pkvm.is_created = true;
	kvm->arch.pkvm.stage2_teardown_mc.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2;
	kvm_account_pgtable_pages(pgd, pgd_sz / PAGE_SIZE);

	return 0;
free_vm:
	free_pages_exact(hyp_vm, hyp_vm_sz);
free_pgd:
	free_pages_exact(pgd, pgd_sz);
	return ret;
}

bool pkvm_hyp_vm_is_created(struct kvm *kvm)
{
	return READ_ONCE(kvm->arch.pkvm.is_created);
}
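
/*
 * Create the EL2 counterpart of the VM if it does not exist yet, serialised
 * against concurrent memslot updates via kvm->slots_lock.
 */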
int pkvm_create_hyp_vm(struct kvm *kvm)
{
	int ret = 0;

	/*
	 * Synchronise with kvm_arch_prepare_memory_region(), as we
	 * prevent memslot modifications on a pVM that has been run.
	 */
	mutex_lock(&kvm->slots_lock);
	mutex_lock(&kvm->arch.config_lock);
	if (!pkvm_hyp_vm_is_created(kvm))
		ret = __pkvm_create_hyp_vm(kvm);
	mutex_unlock(&kvm->arch.config_lock);
	mutex_unlock(&kvm->slots_lock);

	return ret;
}

int pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu)
{
	int ret = 0;

	mutex_lock(&vcpu->kvm->arch.config_lock);
	if (!vcpu_get_flag(vcpu, VCPU_PKVM_FINALIZED))
		ret = __pkvm_create_hyp_vcpu(vcpu);
	mutex_unlock(&vcpu->kvm->arch.config_lock);

	return ret;
}

void pkvm_destroy_hyp_vm(struct kvm *kvm)
{
	mutex_lock(&kvm->arch.config_lock);
	__pkvm_destroy_hyp_vm(kvm);
	mutex_unlock(&kvm->arch.config_lock);
}

int pkvm_init_host_vm(struct kvm *kvm, unsigned long type)
{
	int ret;
	bool protected = type & KVM_VM_TYPE_ARM_PROTECTED;

	if (pkvm_hyp_vm_is_created(kvm))
		return -EINVAL;

	/* VM is already reserved, no need to proceed. */
	if (kvm->arch.pkvm.handle)
		return 0;

	/* Reserve the VM in hyp and obtain a hyp handle for the VM. */
	ret = kvm_call_hyp_nvhe(__pkvm_reserve_vm);
	if (ret < 0)
		return ret;

	kvm->arch.pkvm.handle = ret;
	kvm->arch.pkvm.is_protected = protected;
	if (protected) {
		pr_warn_once("kvm: protected VMs are experimental and for development only, tainting kernel\n");
		add_taint(TAINT_USER, LOCKDEP_STILL_OK);
	}

	return 0;
}

static void __init _kvm_host_prot_finalize(void *arg)
{
	int *err = arg;

	if (WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize)))
		WRITE_ONCE(*err, -EINVAL);
}

static int __init pkvm_drop_host_privileges(void)
{
	int ret = 0;

	/*
	 * Flip the static key upfront as that may no longer be possible
	 * once the host stage 2 is installed.
	 */
	static_branch_enable(&kvm_protected_mode_initialized);
	on_each_cpu(_kvm_host_prot_finalize, &ret, 1);
	return ret;
}

static int __init finalize_pkvm(void)
{
	int ret;

	if (!is_protected_kvm_enabled() || !is_kvm_arm_initialised())
		return 0;

	/*
	 * Exclude HYP sections from kmemleak so that they don't get peeked
	 * at, which would end badly once inaccessible.
	 */
	kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start);
	kmemleak_free_part(__hyp_data_start, __hyp_data_end - __hyp_data_start);
	kmemleak_free_part(__hyp_rodata_start, __hyp_rodata_end - __hyp_rodata_start);
	kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size);

	ret = pkvm_drop_host_privileges();
	if (ret)
		pr_err("Failed to finalize Hyp protection: %d\n", ret);

	return ret;
}
device_initcall_sync(finalize_pkvm);
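
/*
 * The stage-2 page tables of a pKVM guest are managed at EL2, so the host
 * keeps its own record of the mappings it has installed on behalf of a
 * guest: an interval tree of pkvm_mapping nodes, keyed by guest frame number.
 */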
static u64 __pkvm_mapping_start(struct pkvm_mapping *m)
{
	return m->gfn * PAGE_SIZE;
}

static u64 __pkvm_mapping_end(struct pkvm_mapping *m)
{
	return (m->gfn + m->nr_pages) * PAGE_SIZE - 1;
}

INTERVAL_TREE_DEFINE(struct pkvm_mapping, node, u64, __subtree_last,
		     __pkvm_mapping_start, __pkvm_mapping_end, static,
		     pkvm_mapping);

/*
 * __tmp is updated to iter_first(pkvm_mappings) *before* entering the body of
 * the loop to allow freeing of __map inline.
 */
#define for_each_mapping_in_range_safe(__pgt, __start, __end, __map)			\
	for (struct pkvm_mapping *__tmp = pkvm_mapping_iter_first(&(__pgt)->pkvm_mappings, \
								  __start, __end - 1);	\
	     __tmp && ({								\
			__map = __tmp;							\
			__tmp = pkvm_mapping_iter_next(__map, __start, __end - 1);	\
			true;								\
		       });								\
	    )

int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
			     struct kvm_pgtable_mm_ops *mm_ops)
{
	pgt->pkvm_mappings = RB_ROOT_CACHED;
	pgt->mmu = mmu;

	return 0;
}

/* Reclaim pages donated to a dying protected guest back to the host. */
static int __pkvm_pgtable_stage2_reclaim(struct kvm_pgtable *pgt, u64 start, u64 end)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
	pkvm_handle_t handle = kvm->arch.pkvm.handle;
	struct pkvm_mapping *mapping;
	int ret;

	for_each_mapping_in_range_safe(pgt, start, end, mapping) {
		struct page *page;

		ret = kvm_call_hyp_nvhe(__pkvm_reclaim_dying_guest_page,
					handle, mapping->gfn);
		if (WARN_ON(ret))
			continue;

		page = pfn_to_page(mapping->pfn);
		WARN_ON_ONCE(mapping->nr_pages != 1);
		unpin_user_pages_dirty_lock(&page, 1, true);
		account_locked_vm(current->mm, 1, false);
		pkvm_mapping_remove(mapping, &pgt->pkvm_mappings);
		kfree(mapping);
	}

	return 0;
}

/* Unshare pages previously shared with a non-protected guest. */
static int __pkvm_pgtable_stage2_unshare(struct kvm_pgtable *pgt, u64 start, u64 end)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
	pkvm_handle_t handle = kvm->arch.pkvm.handle;
	struct pkvm_mapping *mapping;
	int ret;

	for_each_mapping_in_range_safe(pgt, start, end, mapping) {
		ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn,
					mapping->nr_pages);
		if (WARN_ON(ret))
			return ret;
		pkvm_mapping_remove(mapping, &pgt->pkvm_mappings);
		kfree(mapping);
	}

	return 0;
}

void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
				       u64 addr, u64 size)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
	pkvm_handle_t handle = kvm->arch.pkvm.handle;

	if (!handle)
		return;

	if (pkvm_hyp_vm_is_created(kvm) && !kvm->arch.pkvm.is_dying) {
		WARN_ON(kvm_call_hyp_nvhe(__pkvm_start_teardown_vm, handle));
		kvm->arch.pkvm.is_dying = true;
	}

	if (kvm_vm_is_protected(kvm))
		__pkvm_pgtable_stage2_reclaim(pgt, addr, addr + size);
	else
		__pkvm_pgtable_stage2_unshare(pgt, addr, addr + size);
}

void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt)
{
	/* Expected to be called after all pKVM mappings have been released. */
	WARN_ON_ONCE(!RB_EMPTY_ROOT(&pgt->pkvm_mappings.rb_root));
}
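
/*
 * Establish a stage-2 mapping on behalf of a guest: protected VMs have the
 * page donated to them (single page, RWX only), non-protected VMs have it
 * shared. The pkvm_mapping tracking the range comes pre-allocated in the hyp
 * memcache and is inserted into the interval tree on success.
 */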
int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
			    u64 phys, enum kvm_pgtable_prot prot,
			    void *mc, enum kvm_pgtable_walk_flags flags)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
	struct pkvm_mapping *mapping = NULL;
	struct kvm_hyp_memcache *cache = mc;
	u64 gfn = addr >> PAGE_SHIFT;
	u64 pfn = phys >> PAGE_SHIFT;
	u64 end = addr + size;
	int ret;

	lockdep_assert_held_write(&kvm->mmu_lock);
	mapping = pkvm_mapping_iter_first(&pgt->pkvm_mappings, addr, end - 1);

	if (kvm_vm_is_protected(kvm)) {
		/* Protected VMs are mapped using RWX page-granular mappings */
		if (WARN_ON_ONCE(size != PAGE_SIZE))
			return -EINVAL;

		if (WARN_ON_ONCE(prot != KVM_PGTABLE_PROT_RWX))
			return -EINVAL;

		/*
		 * We either raced with another vCPU or the guest PTE
		 * has been poisoned by an erroneous host access.
		 */
		if (mapping) {
			ret = kvm_call_hyp_nvhe(__pkvm_vcpu_in_poison_fault);
			return ret ? -EFAULT : -EAGAIN;
		}

		ret = kvm_call_hyp_nvhe(__pkvm_host_donate_guest, pfn, gfn);
	} else {
		if (WARN_ON_ONCE(size != PAGE_SIZE && size != PMD_SIZE))
			return -EINVAL;

		/*
		 * We either raced with another vCPU or we're changing between
		 * page and block mappings. As per user_mem_abort(), same-size
		 * permission faults are handled in the relax_perms() path.
		 */
		if (mapping) {
			if (size == (mapping->nr_pages * PAGE_SIZE))
				return -EAGAIN;

			/*
			 * Remove _any_ pkvm_mapping overlapping with the range,
			 * bigger or smaller.
			 */
			ret = __pkvm_pgtable_stage2_unshare(pgt, addr, end);
			if (ret)
				return ret;

			mapping = NULL;
		}

		ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn,
					size / PAGE_SIZE, prot);
	}

	if (WARN_ON(ret))
		return ret;

	swap(mapping, cache->mapping);
	mapping->gfn = gfn;
	mapping->pfn = pfn;
	mapping->nr_pages = size / PAGE_SIZE;
	pkvm_mapping_insert(mapping, &pgt->pkvm_mappings);

	return ret;
}

int pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);

	if (WARN_ON(kvm_vm_is_protected(kvm)))
		return -EPERM;

	lockdep_assert_held_write(&kvm->mmu_lock);

	return __pkvm_pgtable_stage2_unshare(pgt, addr, addr + size);
}

int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
	pkvm_handle_t handle = kvm->arch.pkvm.handle;
	struct pkvm_mapping *mapping;
	int ret = 0;

	if (WARN_ON(kvm_vm_is_protected(kvm)))
		return -EPERM;

	lockdep_assert_held(&kvm->mmu_lock);
	for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) {
		ret = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle, mapping->gfn,
					mapping->nr_pages);
		if (WARN_ON(ret))
			break;
	}

	return ret;
}

int pkvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
	struct pkvm_mapping *mapping;

	lockdep_assert_held(&kvm->mmu_lock);
	for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping)
		__clean_dcache_guest_page(pfn_to_kaddr(mapping->pfn),
					  PAGE_SIZE * mapping->nr_pages);

	return 0;
}
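
/*
 * Test whether any page in the range has been accessed by the guest,
 * optionally clearing the young state. Non-protected VMs only.
 */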
bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt,
					  u64 addr, u64 size, bool mkold)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
	pkvm_handle_t handle = kvm->arch.pkvm.handle;
	struct pkvm_mapping *mapping;
	bool young = false;

	if (WARN_ON(kvm_vm_is_protected(kvm)))
		return false;

	lockdep_assert_held(&kvm->mmu_lock);
	for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping)
		young |= kvm_call_hyp_nvhe(__pkvm_host_test_clear_young_guest, handle, mapping->gfn,
					   mapping->nr_pages, mkold);

	return young;
}

int pkvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot,
				    enum kvm_pgtable_walk_flags flags)
{
	if (WARN_ON(kvm_vm_is_protected(kvm_s2_mmu_to_kvm(pgt->mmu))))
		return -EPERM;

	return kvm_call_hyp_nvhe(__pkvm_host_relax_perms_guest, addr >> PAGE_SHIFT, prot);
}

void pkvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr,
				 enum kvm_pgtable_walk_flags flags)
{
	if (WARN_ON(kvm_vm_is_protected(kvm_s2_mmu_to_kvm(pgt->mmu))))
		return;

	WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_mkyoung_guest, addr >> PAGE_SHIFT));
}

void pkvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level)
{
	WARN_ON_ONCE(1);
}

kvm_pte_t *pkvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level,
					       enum kvm_pgtable_prot prot, void *mc, bool force_pte)
{
	WARN_ON_ONCE(1);
	return NULL;
}

int pkvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
			      struct kvm_mmu_memory_cache *mc)
{
	WARN_ON_ONCE(1);
	return -EINVAL;
}

/*
 * Forcefully reclaim a page from the guest, zeroing its contents and
 * poisoning the stage-2 pte so that pages can no longer be mapped at
 * the same IPA. The page remains pinned until the guest is destroyed.
 */
bool pkvm_force_reclaim_guest_page(phys_addr_t phys)
{
	int ret = kvm_call_hyp_nvhe(__pkvm_force_reclaim_guest_page, phys);

	return !ret || ret == -EAGAIN;
}