1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2021 Google LLC 4 * Author: Fuad Tabba <tabba@google.com> 5 */ 6 7 #include <kvm/arm_hypercalls.h> 8 9 #include <linux/kvm_host.h> 10 #include <linux/mm.h> 11 12 #include <asm/kvm_emulate.h> 13 14 #include <nvhe/mem_protect.h> 15 #include <nvhe/memory.h> 16 #include <nvhe/pkvm.h> 17 #include <nvhe/trap_handler.h> 18 19 /* Used by icache_is_aliasing(). */ 20 unsigned long __icache_flags; 21 22 /* Used by kvm_get_vttbr(). */ 23 unsigned int kvm_arm_vmid_bits; 24 25 unsigned int kvm_host_sve_max_vl; 26 27 /* 28 * The currently loaded hyp vCPU for each physical CPU. Used in protected mode 29 * for both protected and non-protected VMs. 30 */ 31 static DEFINE_PER_CPU(struct pkvm_hyp_vcpu *, loaded_hyp_vcpu); 32 33 static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu) 34 { 35 vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; 36 37 if (has_hvhe()) 38 vcpu->arch.hcr_el2 |= HCR_E2H; 39 40 if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) { 41 /* route synchronous external abort exceptions to EL2 */ 42 vcpu->arch.hcr_el2 |= HCR_TEA; 43 /* trap error record accesses */ 44 vcpu->arch.hcr_el2 |= HCR_TERR; 45 } 46 47 if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) 48 vcpu->arch.hcr_el2 |= HCR_FWB; 49 50 if (cpus_have_final_cap(ARM64_HAS_EVT) && 51 !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE) && 52 kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0) == read_cpuid(CTR_EL0)) 53 vcpu->arch.hcr_el2 |= HCR_TID4; 54 else 55 vcpu->arch.hcr_el2 |= HCR_TID2; 56 57 if (vcpu_has_ptrauth(vcpu)) 58 vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); 59 60 if (kvm_has_mte(vcpu->kvm)) 61 vcpu->arch.hcr_el2 |= HCR_ATA; 62 } 63 64 static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu) 65 { 66 struct kvm *kvm = vcpu->kvm; 67 u64 val = vcpu->arch.hcr_el2; 68 69 /* No support for AArch32. */ 70 val |= HCR_RW; 71 72 /* 73 * Always trap: 74 * - Feature id registers: to control features exposed to guests 75 * - Implementation-defined features 76 */ 77 val |= HCR_TACR | HCR_TIDCP | HCR_TID3 | HCR_TID1; 78 79 if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) { 80 val |= HCR_TERR | HCR_TEA; 81 val &= ~(HCR_FIEN); 82 } 83 84 if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP)) 85 val &= ~(HCR_AMVOFFEN); 86 87 if (!kvm_has_mte(kvm)) { 88 val |= HCR_TID5; 89 val &= ~(HCR_DCT | HCR_ATA); 90 } 91 92 if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP)) 93 val |= HCR_TLOR; 94 95 vcpu->arch.hcr_el2 = val; 96 } 97 98 static void pvm_init_traps_mdcr(struct kvm_vcpu *vcpu) 99 { 100 struct kvm *kvm = vcpu->kvm; 101 u64 val = vcpu->arch.mdcr_el2; 102 103 if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP)) { 104 val |= MDCR_EL2_TPM | MDCR_EL2_TPMCR; 105 val &= ~(MDCR_EL2_HPME | MDCR_EL2_MTPME | MDCR_EL2_HPMN_MASK); 106 } 107 108 if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DebugVer, IMP)) 109 val |= MDCR_EL2_TDRA | MDCR_EL2_TDA; 110 111 if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DoubleLock, IMP)) 112 val |= MDCR_EL2_TDOSA; 113 114 if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP)) { 115 val |= MDCR_EL2_TPMS; 116 val &= ~MDCR_EL2_E2PB_MASK; 117 } 118 119 if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP)) 120 val |= MDCR_EL2_TTRF; 121 122 if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, IMP)) 123 val &= ~MDCR_EL2_E2TB_MASK; 124 125 /* Trap Debug Communications Channel registers */ 126 if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, FGT, IMP)) 127 val |= MDCR_EL2_TDCC; 128 129 vcpu->arch.mdcr_el2 = val; 130 } 131 132 /* 133 * Check that cpu features that are neither trapped nor supported are not 134 * enabled for protected VMs. 135 */ 136 static int pkvm_check_pvm_cpu_features(struct kvm_vcpu *vcpu) 137 { 138 struct kvm *kvm = vcpu->kvm; 139 140 /* No AArch32 support for protected guests. */ 141 if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL0, AARCH32) || 142 kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL1, AARCH32)) 143 return -EINVAL; 144 145 /* 146 * Linux guests assume support for floating-point and Advanced SIMD. Do 147 * not change the trapping behavior for these from the KVM default. 148 */ 149 if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, FP, IMP) || 150 !kvm_has_feat(kvm, ID_AA64PFR0_EL1, AdvSIMD, IMP)) 151 return -EINVAL; 152 153 /* No SME support in KVM right now. Check to catch if it changes. */ 154 if (kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP)) 155 return -EINVAL; 156 157 return 0; 158 } 159 160 /* 161 * Initialize trap register values in protected mode. 162 */ 163 static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) 164 { 165 struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; 166 int ret; 167 168 vcpu->arch.mdcr_el2 = 0; 169 170 pkvm_vcpu_reset_hcr(vcpu); 171 172 if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu))) { 173 struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; 174 175 /* Trust the host for non-protected vcpu features. */ 176 vcpu->arch.hcrx_el2 = host_vcpu->arch.hcrx_el2; 177 return 0; 178 } 179 180 ret = pkvm_check_pvm_cpu_features(vcpu); 181 if (ret) 182 return ret; 183 184 pvm_init_traps_hcr(vcpu); 185 pvm_init_traps_mdcr(vcpu); 186 vcpu_set_hcrx(vcpu); 187 188 return 0; 189 } 190 191 /* 192 * Start the VM table handle at the offset defined instead of at 0. 193 * Mainly for sanity checking and debugging. 194 */ 195 #define HANDLE_OFFSET 0x1000 196 197 /* 198 * Marks a reserved but not yet used entry in the VM table. 199 */ 200 #define RESERVED_ENTRY ((void *)0xa110ca7ed) 201 202 static unsigned int vm_handle_to_idx(pkvm_handle_t handle) 203 { 204 return handle - HANDLE_OFFSET; 205 } 206 207 static pkvm_handle_t idx_to_vm_handle(unsigned int idx) 208 { 209 return idx + HANDLE_OFFSET; 210 } 211 212 /* 213 * Spinlock for protecting state related to the VM table. Protects writes 214 * to 'vm_table', 'nr_table_entries', and other per-vm state on initialization. 215 * Also protects reads and writes to 'last_hyp_vcpu_lookup'. 216 */ 217 DEFINE_HYP_SPINLOCK(vm_table_lock); 218 219 /* 220 * A table that tracks all VMs in protected mode. 221 * Allocated during hyp initialization and setup. 222 */ 223 static struct pkvm_hyp_vm **vm_table; 224 225 void pkvm_hyp_vm_table_init(void *tbl) 226 { 227 BUILD_BUG_ON((u64)HANDLE_OFFSET + KVM_MAX_PVMS > (pkvm_handle_t)-1); 228 WARN_ON(vm_table); 229 vm_table = tbl; 230 } 231 232 /* 233 * Return the hyp vm structure corresponding to the handle. 234 */ 235 struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle) 236 { 237 unsigned int idx = vm_handle_to_idx(handle); 238 239 hyp_assert_lock_held(&vm_table_lock); 240 241 if (unlikely(idx >= KVM_MAX_PVMS)) 242 return NULL; 243 244 /* A reserved entry doesn't represent an initialized VM. */ 245 if (unlikely(vm_table[idx] == RESERVED_ENTRY)) 246 return NULL; 247 248 return vm_table[idx]; 249 } 250 251 struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, 252 unsigned int vcpu_idx) 253 { 254 struct pkvm_hyp_vcpu *hyp_vcpu = NULL; 255 struct pkvm_hyp_vm *hyp_vm; 256 257 /* Cannot load a new vcpu without putting the old one first. */ 258 if (__this_cpu_read(loaded_hyp_vcpu)) 259 return NULL; 260 261 hyp_spin_lock(&vm_table_lock); 262 hyp_vm = get_vm_by_handle(handle); 263 if (!hyp_vm || hyp_vm->kvm.arch.pkvm.is_dying) 264 goto unlock; 265 266 if (hyp_vm->kvm.created_vcpus <= vcpu_idx) 267 goto unlock; 268 269 /* Pairs with smp_store_release() in register_hyp_vcpu(). */ 270 hyp_vcpu = smp_load_acquire(&hyp_vm->vcpus[vcpu_idx]); 271 if (!hyp_vcpu) 272 goto unlock; 273 274 /* Ensure vcpu isn't loaded on more than one cpu simultaneously. */ 275 if (unlikely(hyp_vcpu->loaded_hyp_vcpu)) { 276 hyp_vcpu = NULL; 277 goto unlock; 278 } 279 280 hyp_vcpu->loaded_hyp_vcpu = this_cpu_ptr(&loaded_hyp_vcpu); 281 hyp_page_ref_inc(hyp_virt_to_page(hyp_vm)); 282 unlock: 283 hyp_spin_unlock(&vm_table_lock); 284 285 if (hyp_vcpu) 286 __this_cpu_write(loaded_hyp_vcpu, hyp_vcpu); 287 return hyp_vcpu; 288 } 289 290 void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) 291 { 292 struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); 293 294 hyp_spin_lock(&vm_table_lock); 295 hyp_vcpu->loaded_hyp_vcpu = NULL; 296 __this_cpu_write(loaded_hyp_vcpu, NULL); 297 hyp_page_ref_dec(hyp_virt_to_page(hyp_vm)); 298 hyp_spin_unlock(&vm_table_lock); 299 } 300 301 struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void) 302 { 303 return __this_cpu_read(loaded_hyp_vcpu); 304 305 } 306 307 struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle) 308 { 309 struct pkvm_hyp_vm *hyp_vm; 310 311 hyp_spin_lock(&vm_table_lock); 312 hyp_vm = get_vm_by_handle(handle); 313 if (hyp_vm) 314 hyp_page_ref_inc(hyp_virt_to_page(hyp_vm)); 315 hyp_spin_unlock(&vm_table_lock); 316 317 return hyp_vm; 318 } 319 320 void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm) 321 { 322 hyp_spin_lock(&vm_table_lock); 323 hyp_page_ref_dec(hyp_virt_to_page(hyp_vm)); 324 hyp_spin_unlock(&vm_table_lock); 325 } 326 327 struct pkvm_hyp_vm *get_np_pkvm_hyp_vm(pkvm_handle_t handle) 328 { 329 struct pkvm_hyp_vm *hyp_vm = get_pkvm_hyp_vm(handle); 330 331 if (hyp_vm && pkvm_hyp_vm_is_protected(hyp_vm)) { 332 put_pkvm_hyp_vm(hyp_vm); 333 hyp_vm = NULL; 334 } 335 336 return hyp_vm; 337 } 338 339 static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struct kvm *host_kvm) 340 { 341 struct kvm *kvm = &hyp_vm->kvm; 342 unsigned long host_arch_flags = READ_ONCE(host_kvm->arch.flags); 343 DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES); 344 345 /* CTR_EL0 is always under host control, even for protected VMs. */ 346 hyp_vm->kvm.arch.ctr_el0 = host_kvm->arch.ctr_el0; 347 348 /* Preserve the vgic model so that GICv3 emulation works */ 349 hyp_vm->kvm.arch.vgic.vgic_model = host_kvm->arch.vgic.vgic_model; 350 351 /* No restrictions for non-protected VMs. */ 352 if (!kvm_vm_is_protected(kvm)) { 353 hyp_vm->kvm.arch.flags = host_arch_flags; 354 hyp_vm->kvm.arch.flags &= ~BIT_ULL(KVM_ARCH_FLAG_ID_REGS_INITIALIZED); 355 356 bitmap_copy(kvm->arch.vcpu_features, 357 host_kvm->arch.vcpu_features, 358 KVM_VCPU_MAX_FEATURES); 359 360 if (test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &host_arch_flags)) 361 hyp_vm->kvm.arch.midr_el1 = host_kvm->arch.midr_el1; 362 363 return; 364 } 365 366 if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_MTE)) 367 kvm->arch.flags |= host_arch_flags & BIT(KVM_ARCH_FLAG_MTE_ENABLED); 368 369 bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES); 370 371 set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features); 372 373 if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_PMU_V3)) 374 set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features); 375 376 if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_PTRAUTH_ADDRESS)) 377 set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features); 378 379 if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_PTRAUTH_GENERIC)) 380 set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features); 381 382 if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_SVE)) { 383 set_bit(KVM_ARM_VCPU_SVE, allowed_features); 384 kvm->arch.flags |= host_arch_flags & BIT(KVM_ARCH_FLAG_GUEST_HAS_SVE); 385 } 386 387 bitmap_and(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features, 388 allowed_features, KVM_VCPU_MAX_FEATURES); 389 } 390 391 static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) 392 { 393 if (host_vcpu) 394 hyp_unpin_shared_mem(host_vcpu, host_vcpu + 1); 395 } 396 397 static void unpin_host_sve_state(struct pkvm_hyp_vcpu *hyp_vcpu) 398 { 399 void *sve_state; 400 401 if (!vcpu_has_feature(&hyp_vcpu->vcpu, KVM_ARM_VCPU_SVE)) 402 return; 403 404 sve_state = hyp_vcpu->vcpu.arch.sve_state; 405 hyp_unpin_shared_mem(sve_state, 406 sve_state + vcpu_sve_state_size(&hyp_vcpu->vcpu)); 407 } 408 409 static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[], 410 unsigned int nr_vcpus) 411 { 412 int i; 413 414 for (i = 0; i < nr_vcpus; i++) { 415 struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vcpus[i]; 416 417 if (!hyp_vcpu) 418 continue; 419 420 unpin_host_vcpu(hyp_vcpu->host_vcpu); 421 unpin_host_sve_state(hyp_vcpu); 422 } 423 } 424 425 static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, 426 unsigned int nr_vcpus, pkvm_handle_t handle) 427 { 428 struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu; 429 int idx = vm_handle_to_idx(handle); 430 431 hyp_vm->kvm.arch.pkvm.handle = handle; 432 433 hyp_vm->host_kvm = host_kvm; 434 hyp_vm->kvm.created_vcpus = nr_vcpus; 435 hyp_vm->kvm.arch.pkvm.is_protected = READ_ONCE(host_kvm->arch.pkvm.is_protected); 436 hyp_vm->kvm.arch.pkvm.is_created = true; 437 hyp_vm->kvm.arch.flags = 0; 438 pkvm_init_features_from_host(hyp_vm, host_kvm); 439 440 /* VMID 0 is reserved for the host */ 441 atomic64_set(&mmu->vmid.id, idx + 1); 442 443 mmu->vtcr = host_mmu.arch.mmu.vtcr; 444 mmu->arch = &hyp_vm->kvm.arch; 445 mmu->pgt = &hyp_vm->pgt; 446 } 447 448 static int pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu) 449 { 450 struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; 451 unsigned int sve_max_vl; 452 size_t sve_state_size; 453 void *sve_state; 454 int ret = 0; 455 456 if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) { 457 vcpu_clear_flag(vcpu, VCPU_SVE_FINALIZED); 458 return 0; 459 } 460 461 /* Limit guest vector length to the maximum supported by the host. */ 462 sve_max_vl = min(READ_ONCE(host_vcpu->arch.sve_max_vl), kvm_host_sve_max_vl); 463 sve_state_size = sve_state_size_from_vl(sve_max_vl); 464 sve_state = kern_hyp_va(READ_ONCE(host_vcpu->arch.sve_state)); 465 466 if (!sve_state || !sve_state_size) { 467 ret = -EINVAL; 468 goto err; 469 } 470 471 ret = hyp_pin_shared_mem(sve_state, sve_state + sve_state_size); 472 if (ret) 473 goto err; 474 475 vcpu->arch.sve_state = sve_state; 476 vcpu->arch.sve_max_vl = sve_max_vl; 477 478 return 0; 479 err: 480 clear_bit(KVM_ARM_VCPU_SVE, vcpu->kvm->arch.vcpu_features); 481 return ret; 482 } 483 484 static int vm_copy_id_regs(struct pkvm_hyp_vcpu *hyp_vcpu) 485 { 486 struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); 487 const struct kvm *host_kvm = hyp_vm->host_kvm; 488 struct kvm *kvm = &hyp_vm->kvm; 489 490 if (!test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &host_kvm->arch.flags)) 491 return -EINVAL; 492 493 if (test_and_set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags)) 494 return 0; 495 496 memcpy(kvm->arch.id_regs, host_kvm->arch.id_regs, sizeof(kvm->arch.id_regs)); 497 498 return 0; 499 } 500 501 static int pkvm_vcpu_init_sysregs(struct pkvm_hyp_vcpu *hyp_vcpu) 502 { 503 int ret = 0; 504 505 if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) 506 kvm_init_pvm_id_regs(&hyp_vcpu->vcpu); 507 else 508 ret = vm_copy_id_regs(hyp_vcpu); 509 510 return ret; 511 } 512 513 static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, 514 struct pkvm_hyp_vm *hyp_vm, 515 struct kvm_vcpu *host_vcpu) 516 { 517 int ret = 0; 518 519 if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1)) 520 return -EBUSY; 521 522 hyp_vcpu->host_vcpu = host_vcpu; 523 524 hyp_vcpu->vcpu.kvm = &hyp_vm->kvm; 525 hyp_vcpu->vcpu.vcpu_id = READ_ONCE(host_vcpu->vcpu_id); 526 hyp_vcpu->vcpu.vcpu_idx = READ_ONCE(host_vcpu->vcpu_idx); 527 528 hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; 529 hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); 530 hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; 531 532 ret = pkvm_vcpu_init_sysregs(hyp_vcpu); 533 if (ret) 534 goto done; 535 536 ret = pkvm_vcpu_init_traps(hyp_vcpu); 537 if (ret) 538 goto done; 539 540 ret = pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu); 541 done: 542 if (ret) 543 unpin_host_vcpu(host_vcpu); 544 return ret; 545 } 546 547 static int find_free_vm_table_entry(void) 548 { 549 int i; 550 551 for (i = 0; i < KVM_MAX_PVMS; ++i) { 552 if (!vm_table[i]) 553 return i; 554 } 555 556 return -ENOMEM; 557 } 558 559 /* 560 * Reserve a VM table entry. 561 * 562 * Return a unique handle to the VM on success, 563 * negative error code on failure. 564 */ 565 static int allocate_vm_table_entry(void) 566 { 567 int idx; 568 569 hyp_assert_lock_held(&vm_table_lock); 570 571 /* 572 * Initializing protected state might have failed, yet a malicious 573 * host could trigger this function. Thus, ensure that 'vm_table' 574 * exists. 575 */ 576 if (unlikely(!vm_table)) 577 return -EINVAL; 578 579 idx = find_free_vm_table_entry(); 580 if (unlikely(idx < 0)) 581 return idx; 582 583 vm_table[idx] = RESERVED_ENTRY; 584 585 return idx; 586 } 587 588 static int __insert_vm_table_entry(pkvm_handle_t handle, 589 struct pkvm_hyp_vm *hyp_vm) 590 { 591 unsigned int idx; 592 593 hyp_assert_lock_held(&vm_table_lock); 594 595 /* 596 * Initializing protected state might have failed, yet a malicious 597 * host could trigger this function. Thus, ensure that 'vm_table' 598 * exists. 599 */ 600 if (unlikely(!vm_table)) 601 return -EINVAL; 602 603 idx = vm_handle_to_idx(handle); 604 if (unlikely(idx >= KVM_MAX_PVMS)) 605 return -EINVAL; 606 607 if (unlikely(vm_table[idx] != RESERVED_ENTRY)) 608 return -EINVAL; 609 610 vm_table[idx] = hyp_vm; 611 612 return 0; 613 } 614 615 /* 616 * Insert a pointer to the initialized VM into the VM table. 617 * 618 * Return 0 on success, or negative error code on failure. 619 */ 620 static int insert_vm_table_entry(pkvm_handle_t handle, 621 struct pkvm_hyp_vm *hyp_vm) 622 { 623 int ret; 624 625 hyp_spin_lock(&vm_table_lock); 626 ret = __insert_vm_table_entry(handle, hyp_vm); 627 hyp_spin_unlock(&vm_table_lock); 628 629 return ret; 630 } 631 632 /* 633 * Deallocate and remove the VM table entry corresponding to the handle. 634 */ 635 static void remove_vm_table_entry(pkvm_handle_t handle) 636 { 637 hyp_assert_lock_held(&vm_table_lock); 638 vm_table[vm_handle_to_idx(handle)] = NULL; 639 } 640 641 static size_t pkvm_get_hyp_vm_size(unsigned int nr_vcpus) 642 { 643 return size_add(sizeof(struct pkvm_hyp_vm), 644 size_mul(sizeof(struct pkvm_hyp_vcpu *), nr_vcpus)); 645 } 646 647 static void *map_donated_memory_noclear(unsigned long host_va, size_t size) 648 { 649 void *va = (void *)kern_hyp_va(host_va); 650 651 if (!PAGE_ALIGNED(va)) 652 return NULL; 653 654 if (__pkvm_host_donate_hyp(hyp_virt_to_pfn(va), 655 PAGE_ALIGN(size) >> PAGE_SHIFT)) 656 return NULL; 657 658 return va; 659 } 660 661 static void *map_donated_memory(unsigned long host_va, size_t size) 662 { 663 void *va = map_donated_memory_noclear(host_va, size); 664 665 if (va) 666 memset(va, 0, size); 667 668 return va; 669 } 670 671 static void __unmap_donated_memory(void *va, size_t size) 672 { 673 kvm_flush_dcache_to_poc(va, size); 674 WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(va), 675 PAGE_ALIGN(size) >> PAGE_SHIFT)); 676 } 677 678 static void unmap_donated_memory(void *va, size_t size) 679 { 680 if (!va) 681 return; 682 683 memset(va, 0, size); 684 __unmap_donated_memory(va, size); 685 } 686 687 static void unmap_donated_memory_noclear(void *va, size_t size) 688 { 689 if (!va) 690 return; 691 692 __unmap_donated_memory(va, size); 693 } 694 695 /* 696 * Reserves an entry in the hypervisor for a new VM in protected mode. 697 * 698 * Return a unique handle to the VM on success, negative error code on failure. 699 */ 700 int __pkvm_reserve_vm(void) 701 { 702 int ret; 703 704 hyp_spin_lock(&vm_table_lock); 705 ret = allocate_vm_table_entry(); 706 hyp_spin_unlock(&vm_table_lock); 707 708 if (ret < 0) 709 return ret; 710 711 return idx_to_vm_handle(ret); 712 } 713 714 /* 715 * Removes a reserved entry, but only if is hasn't been used yet. 716 * Otherwise, the VM needs to be destroyed. 717 */ 718 void __pkvm_unreserve_vm(pkvm_handle_t handle) 719 { 720 unsigned int idx = vm_handle_to_idx(handle); 721 722 if (unlikely(!vm_table)) 723 return; 724 725 hyp_spin_lock(&vm_table_lock); 726 if (likely(idx < KVM_MAX_PVMS && vm_table[idx] == RESERVED_ENTRY)) 727 remove_vm_table_entry(handle); 728 hyp_spin_unlock(&vm_table_lock); 729 } 730 731 #ifdef CONFIG_NVHE_EL2_DEBUG 732 static struct pkvm_hyp_vm selftest_vm = { 733 .kvm = { 734 .arch = { 735 .mmu = { 736 .arch = &selftest_vm.kvm.arch, 737 .pgt = &selftest_vm.pgt, 738 }, 739 }, 740 }, 741 }; 742 743 static struct pkvm_hyp_vcpu selftest_vcpu = { 744 .vcpu = { 745 .arch = { 746 .hw_mmu = &selftest_vm.kvm.arch.mmu, 747 }, 748 .kvm = &selftest_vm.kvm, 749 }, 750 }; 751 752 struct pkvm_hyp_vcpu *init_selftest_vm(void *virt) 753 { 754 struct hyp_page *p = hyp_virt_to_page(virt); 755 unsigned long min_pages, seeded = 0; 756 int i; 757 758 selftest_vm.kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; 759 WARN_ON(kvm_guest_prepare_stage2(&selftest_vm, virt)); 760 761 /* 762 * Mirror pkvm_refill_memcache() for the share/donate pre-checks; 763 * the selftest invokes those functions directly and would 764 * otherwise see an empty memcache. 765 */ 766 min_pages = kvm_mmu_cache_min_pages(&selftest_vm.kvm.arch.mmu); 767 768 for (i = 0; i < pkvm_selftest_pages(); i++) { 769 if (p[i].refcount) 770 continue; 771 p[i].refcount = 1; 772 if (seeded < min_pages) { 773 push_hyp_memcache(&selftest_vcpu.vcpu.arch.pkvm_memcache, 774 hyp_page_to_virt(&p[i]), hyp_virt_to_phys); 775 seeded++; 776 } else { 777 hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i])); 778 } 779 } 780 781 selftest_vm.kvm.arch.pkvm.handle = __pkvm_reserve_vm(); 782 insert_vm_table_entry(selftest_vm.kvm.arch.pkvm.handle, &selftest_vm); 783 return &selftest_vcpu; 784 } 785 786 void teardown_selftest_vm(void) 787 { 788 hyp_spin_lock(&vm_table_lock); 789 remove_vm_table_entry(selftest_vm.kvm.arch.pkvm.handle); 790 hyp_spin_unlock(&vm_table_lock); 791 } 792 #endif /* CONFIG_NVHE_EL2_DEBUG */ 793 794 /* 795 * Initialize the hypervisor copy of the VM state using host-donated memory. 796 * 797 * Unmap the donated memory from the host at stage 2. 798 * 799 * host_kvm: A pointer to the host's struct kvm. 800 * vm_hva: The host va of the area being donated for the VM state. 801 * Must be page aligned. 802 * pgd_hva: The host va of the area being donated for the stage-2 PGD for 803 * the VM. Must be page aligned. Its size is implied by the VM's 804 * VTCR. 805 * 806 * Return 0 success, negative error code on failure. 807 */ 808 int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, 809 unsigned long pgd_hva) 810 { 811 struct pkvm_hyp_vm *hyp_vm = NULL; 812 size_t vm_size, pgd_size; 813 unsigned int nr_vcpus; 814 pkvm_handle_t handle; 815 void *pgd = NULL; 816 int ret; 817 818 ret = hyp_pin_shared_mem(host_kvm, host_kvm + 1); 819 if (ret) 820 return ret; 821 822 nr_vcpus = READ_ONCE(host_kvm->created_vcpus); 823 if (nr_vcpus < 1) { 824 ret = -EINVAL; 825 goto err_unpin_kvm; 826 } 827 828 handle = READ_ONCE(host_kvm->arch.pkvm.handle); 829 if (unlikely(handle < HANDLE_OFFSET)) { 830 ret = -EINVAL; 831 goto err_unpin_kvm; 832 } 833 834 vm_size = pkvm_get_hyp_vm_size(nr_vcpus); 835 pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.mmu.vtcr); 836 837 ret = -ENOMEM; 838 839 hyp_vm = map_donated_memory(vm_hva, vm_size); 840 if (!hyp_vm) 841 goto err_remove_mappings; 842 843 pgd = map_donated_memory_noclear(pgd_hva, pgd_size); 844 if (!pgd) 845 goto err_remove_mappings; 846 847 init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus, handle); 848 849 ret = kvm_guest_prepare_stage2(hyp_vm, pgd); 850 if (ret) 851 goto err_remove_mappings; 852 853 /* Must be called last since this publishes the VM. */ 854 ret = insert_vm_table_entry(handle, hyp_vm); 855 if (ret) 856 goto err_destroy_stage2; 857 858 return 0; 859 860 err_destroy_stage2: 861 kvm_guest_destroy_stage2(hyp_vm); 862 err_remove_mappings: 863 unmap_donated_memory(hyp_vm, vm_size); 864 unmap_donated_memory(pgd, pgd_size); 865 err_unpin_kvm: 866 hyp_unpin_shared_mem(host_kvm, host_kvm + 1); 867 return ret; 868 } 869 870 /* 871 * Initialize the hypervisor copy of the vCPU state using host-donated memory. 872 * 873 * handle: The hypervisor handle for the vm. 874 * host_vcpu: A pointer to the corresponding host vcpu. 875 * vcpu_hva: The host va of the area being donated for the vcpu state. 876 * Must be page aligned. The size of the area must be equal to 877 * the page-aligned size of 'struct pkvm_hyp_vcpu'. 878 * Return 0 on success, negative error code on failure. 879 */ 880 static int register_hyp_vcpu(struct pkvm_hyp_vm *hyp_vm, 881 struct pkvm_hyp_vcpu *hyp_vcpu) 882 { 883 unsigned int idx = hyp_vcpu->vcpu.vcpu_idx; 884 885 if (idx >= hyp_vm->kvm.created_vcpus) 886 return -EINVAL; 887 888 if (hyp_vm->vcpus[idx]) 889 return -EINVAL; 890 891 /* 892 * Ensure the hyp_vcpu is initialised before publishing it to 893 * the vCPU-load path via 'hyp_vm->vcpus[]'. 894 */ 895 smp_store_release(&hyp_vm->vcpus[idx], hyp_vcpu); 896 return 0; 897 } 898 899 int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, 900 unsigned long vcpu_hva) 901 { 902 struct pkvm_hyp_vcpu *hyp_vcpu; 903 struct pkvm_hyp_vm *hyp_vm; 904 int ret; 905 906 hyp_vcpu = map_donated_memory(vcpu_hva, sizeof(*hyp_vcpu)); 907 if (!hyp_vcpu) 908 return -ENOMEM; 909 910 hyp_spin_lock(&vm_table_lock); 911 912 hyp_vm = get_vm_by_handle(handle); 913 if (!hyp_vm) { 914 ret = -ENOENT; 915 goto unlock; 916 } 917 918 ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu); 919 if (ret) 920 goto unlock; 921 922 ret = register_hyp_vcpu(hyp_vm, hyp_vcpu); 923 if (ret) { 924 unpin_host_vcpu(host_vcpu); 925 unpin_host_sve_state(hyp_vcpu); 926 } 927 unlock: 928 hyp_spin_unlock(&vm_table_lock); 929 930 if (ret) 931 unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu)); 932 return ret; 933 } 934 935 static void 936 teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size) 937 { 938 size = PAGE_ALIGN(size); 939 memset(addr, 0, size); 940 941 for (void *start = addr; start < addr + size; start += PAGE_SIZE) 942 push_hyp_memcache(mc, start, hyp_virt_to_phys); 943 944 unmap_donated_memory_noclear(addr, size); 945 } 946 947 int __pkvm_reclaim_dying_guest_page(pkvm_handle_t handle, u64 gfn) 948 { 949 struct pkvm_hyp_vm *hyp_vm = get_pkvm_hyp_vm(handle); 950 int ret = -EINVAL; 951 952 if (!hyp_vm) 953 return ret; 954 955 if (hyp_vm->kvm.arch.pkvm.is_dying) 956 ret = __pkvm_host_reclaim_page_guest(gfn, hyp_vm); 957 958 put_pkvm_hyp_vm(hyp_vm); 959 return ret; 960 } 961 962 static struct pkvm_hyp_vm *get_pkvm_unref_hyp_vm_locked(pkvm_handle_t handle) 963 { 964 struct pkvm_hyp_vm *hyp_vm; 965 966 hyp_assert_lock_held(&vm_table_lock); 967 968 hyp_vm = get_vm_by_handle(handle); 969 if (!hyp_vm || hyp_page_count(hyp_vm)) 970 return NULL; 971 972 return hyp_vm; 973 } 974 975 int __pkvm_start_teardown_vm(pkvm_handle_t handle) 976 { 977 struct pkvm_hyp_vm *hyp_vm; 978 int ret = 0; 979 980 hyp_spin_lock(&vm_table_lock); 981 hyp_vm = get_pkvm_unref_hyp_vm_locked(handle); 982 if (!hyp_vm || hyp_vm->kvm.arch.pkvm.is_dying) { 983 ret = -EINVAL; 984 goto unlock; 985 } 986 987 hyp_vm->kvm.arch.pkvm.is_dying = true; 988 unlock: 989 hyp_spin_unlock(&vm_table_lock); 990 991 return ret; 992 } 993 994 int __pkvm_finalize_teardown_vm(pkvm_handle_t handle) 995 { 996 struct kvm_hyp_memcache *mc, *stage2_mc; 997 struct pkvm_hyp_vm *hyp_vm; 998 struct kvm *host_kvm; 999 unsigned int idx; 1000 size_t vm_size; 1001 int err; 1002 1003 hyp_spin_lock(&vm_table_lock); 1004 hyp_vm = get_pkvm_unref_hyp_vm_locked(handle); 1005 if (!hyp_vm || !hyp_vm->kvm.arch.pkvm.is_dying) { 1006 err = -EINVAL; 1007 goto err_unlock; 1008 } 1009 1010 host_kvm = hyp_vm->host_kvm; 1011 1012 /* Ensure the VMID is clean before it can be reallocated */ 1013 __kvm_tlb_flush_vmid(&hyp_vm->kvm.arch.mmu); 1014 remove_vm_table_entry(handle); 1015 hyp_spin_unlock(&vm_table_lock); 1016 1017 /* Reclaim guest pages (including page-table pages) */ 1018 mc = &host_kvm->arch.pkvm.teardown_mc; 1019 stage2_mc = &host_kvm->arch.pkvm.stage2_teardown_mc; 1020 reclaim_pgtable_pages(hyp_vm, stage2_mc); 1021 unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->kvm.created_vcpus); 1022 1023 /* Push the metadata pages to the teardown memcache */ 1024 for (idx = 0; idx < hyp_vm->kvm.created_vcpus; ++idx) { 1025 struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx]; 1026 struct kvm_hyp_memcache *vcpu_mc; 1027 1028 if (!hyp_vcpu) 1029 continue; 1030 1031 vcpu_mc = &hyp_vcpu->vcpu.arch.pkvm_memcache; 1032 1033 while (vcpu_mc->nr_pages) { 1034 void *addr = pop_hyp_memcache(vcpu_mc, hyp_phys_to_virt); 1035 1036 push_hyp_memcache(stage2_mc, addr, hyp_virt_to_phys); 1037 unmap_donated_memory_noclear(addr, PAGE_SIZE); 1038 } 1039 1040 teardown_donated_memory(mc, hyp_vcpu, sizeof(*hyp_vcpu)); 1041 } 1042 1043 vm_size = pkvm_get_hyp_vm_size(hyp_vm->kvm.created_vcpus); 1044 teardown_donated_memory(mc, hyp_vm, vm_size); 1045 hyp_unpin_shared_mem(host_kvm, host_kvm + 1); 1046 return 0; 1047 1048 err_unlock: 1049 hyp_spin_unlock(&vm_table_lock); 1050 return err; 1051 } 1052 1053 static u64 __pkvm_memshare_page_req(struct kvm_vcpu *vcpu, u64 ipa) 1054 { 1055 u64 elr; 1056 1057 /* Fake up a data abort (level 3 translation fault on write) */ 1058 vcpu->arch.fault.esr_el2 = (ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT) | 1059 ESR_ELx_WNR | ESR_ELx_FSC_FAULT | 1060 FIELD_PREP(ESR_ELx_FSC_LEVEL, 3); 1061 1062 /* Shuffle the IPA around into the HPFAR */ 1063 vcpu->arch.fault.hpfar_el2 = (HPFAR_EL2_NS | (ipa >> 8)) & HPFAR_MASK; 1064 1065 /* This is a virtual address. 0's good. Let's go with 0. */ 1066 vcpu->arch.fault.far_el2 = 0; 1067 1068 /* Rewind the ELR so we return to the HVC once the IPA is mapped */ 1069 elr = read_sysreg(elr_el2); 1070 elr -= 4; 1071 write_sysreg(elr, elr_el2); 1072 1073 return ARM_EXCEPTION_TRAP; 1074 } 1075 1076 static bool pkvm_memshare_call(u64 *ret, struct kvm_vcpu *vcpu, u64 *exit_code) 1077 { 1078 struct pkvm_hyp_vcpu *hyp_vcpu; 1079 u64 ipa = smccc_get_arg1(vcpu); 1080 1081 if (!PAGE_ALIGNED(ipa)) 1082 goto out_guest; 1083 1084 hyp_vcpu = container_of(vcpu, struct pkvm_hyp_vcpu, vcpu); 1085 switch (__pkvm_guest_share_host(hyp_vcpu, hyp_phys_to_pfn(ipa))) { 1086 case 0: 1087 ret[0] = SMCCC_RET_SUCCESS; 1088 goto out_guest; 1089 case -ENOENT: 1090 /* 1091 * Convert the exception into a data abort so that the page 1092 * being shared is mapped into the guest next time. 1093 */ 1094 *exit_code = __pkvm_memshare_page_req(vcpu, ipa); 1095 goto out_host; 1096 } 1097 1098 out_guest: 1099 return true; 1100 out_host: 1101 return false; 1102 } 1103 1104 static void pkvm_memunshare_call(u64 *ret, struct kvm_vcpu *vcpu) 1105 { 1106 struct pkvm_hyp_vcpu *hyp_vcpu; 1107 u64 ipa = smccc_get_arg1(vcpu); 1108 1109 if (!PAGE_ALIGNED(ipa)) 1110 return; 1111 1112 hyp_vcpu = container_of(vcpu, struct pkvm_hyp_vcpu, vcpu); 1113 if (!__pkvm_guest_unshare_host(hyp_vcpu, hyp_phys_to_pfn(ipa))) 1114 ret[0] = SMCCC_RET_SUCCESS; 1115 } 1116 1117 /* 1118 * Handler for protected VM HVC calls. 1119 * 1120 * Returns true if the hypervisor has handled the exit (and control 1121 * should return to the guest) or false if it hasn't (and the handling 1122 * should be performed by the host). 1123 */ 1124 bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code) 1125 { 1126 u64 val[4] = { SMCCC_RET_INVALID_PARAMETER }; 1127 bool handled = true; 1128 1129 switch (smccc_get_function(vcpu)) { 1130 case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID: 1131 val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES); 1132 val[0] |= BIT(ARM_SMCCC_KVM_FUNC_HYP_MEMINFO); 1133 val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_SHARE); 1134 val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_UNSHARE); 1135 break; 1136 case ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID: 1137 if (smccc_get_arg1(vcpu) || 1138 smccc_get_arg2(vcpu) || 1139 smccc_get_arg3(vcpu)) { 1140 break; 1141 } 1142 1143 val[0] = PAGE_SIZE; 1144 break; 1145 case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID: 1146 if (smccc_get_arg2(vcpu) || 1147 smccc_get_arg3(vcpu)) { 1148 break; 1149 } 1150 1151 handled = pkvm_memshare_call(val, vcpu, exit_code); 1152 break; 1153 case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID: 1154 if (smccc_get_arg2(vcpu) || 1155 smccc_get_arg3(vcpu)) { 1156 break; 1157 } 1158 1159 pkvm_memunshare_call(val, vcpu); 1160 break; 1161 default: 1162 /* Punt everything else back to the host, for now. */ 1163 handled = false; 1164 } 1165 1166 if (handled) 1167 smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]); 1168 return handled; 1169 } 1170