1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/cleanup.h> 3 #include <linux/cpu.h> 4 #include <asm/cpufeature.h> 5 #include <asm/fpu/xcr.h> 6 #include <linux/misc_cgroup.h> 7 #include <linux/mmu_context.h> 8 #include <asm/tdx.h> 9 #include "capabilities.h" 10 #include "mmu.h" 11 #include "x86_ops.h" 12 #include "lapic.h" 13 #include "tdx.h" 14 #include "vmx.h" 15 #include "mmu/spte.h" 16 #include "common.h" 17 #include "posted_intr.h" 18 #include "irq.h" 19 #include <trace/events/kvm.h> 20 #include "trace.h" 21 22 #pragma GCC poison to_vmx 23 24 #undef pr_fmt 25 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 26 27 #define pr_tdx_error(__fn, __err) \ 28 pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err) 29 30 #define __pr_tdx_error_N(__fn_str, __err, __fmt, ...) \ 31 pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__) 32 33 #define pr_tdx_error_1(__fn, __err, __rcx) \ 34 __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx) 35 36 #define pr_tdx_error_2(__fn, __err, __rcx, __rdx) \ 37 __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx) 38 39 #define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \ 40 __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8) 41 42 bool enable_tdx __ro_after_init; 43 module_param_named(tdx, enable_tdx, bool, 0444); 44 45 #define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51)) 46 #define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47)) 47 48 static enum cpuhp_state tdx_cpuhp_state; 49 50 static const struct tdx_sys_info *tdx_sysinfo; 51 52 void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err) 53 { 54 KVM_BUG_ON(1, tdx->vcpu.kvm); 55 pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err); 56 } 57 58 void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field, 59 u64 val, u64 err) 60 { 61 KVM_BUG_ON(1, tdx->vcpu.kvm); 62 pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err); 63 } 64 65 #define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE) 66 67 static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm) 68 { 69 return container_of(kvm, struct kvm_tdx, kvm); 70 } 71 72 static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu) 73 { 74 return container_of(vcpu, struct vcpu_tdx, vcpu); 75 } 76 77 static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf) 78 { 79 u64 val = KVM_SUPPORTED_TD_ATTRS; 80 81 if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1) 82 return 0; 83 84 val &= td_conf->attributes_fixed0; 85 86 return val; 87 } 88 89 static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf) 90 { 91 u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss; 92 93 if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1) 94 return 0; 95 96 val &= td_conf->xfam_fixed0; 97 98 return val; 99 } 100 101 static int tdx_get_guest_phys_addr_bits(const u32 eax) 102 { 103 return (eax & GENMASK(23, 16)) >> 16; 104 } 105 106 static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits) 107 { 108 return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16; 109 } 110 111 #define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM)) 112 113 static bool has_tsx(const struct kvm_cpuid_entry2 *entry) 114 { 115 return entry->function == 7 && entry->index == 0 && 116 (entry->ebx & TDX_FEATURE_TSX); 117 } 118 119 static void clear_tsx(struct kvm_cpuid_entry2 *entry) 120 { 121 entry->ebx &= 
~TDX_FEATURE_TSX; 122 } 123 124 static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry) 125 { 126 return entry->function == 7 && entry->index == 0 && 127 (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG)); 128 } 129 130 static void clear_waitpkg(struct kvm_cpuid_entry2 *entry) 131 { 132 entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG); 133 } 134 135 static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry) 136 { 137 if (has_tsx(entry)) 138 clear_tsx(entry); 139 140 if (has_waitpkg(entry)) 141 clear_waitpkg(entry); 142 } 143 144 static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry) 145 { 146 return has_tsx(entry) || has_waitpkg(entry); 147 } 148 149 #define KVM_TDX_CPUID_NO_SUBLEAF ((__u32)-1) 150 151 static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx) 152 { 153 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; 154 155 entry->function = (u32)td_conf->cpuid_config_leaves[idx]; 156 entry->index = td_conf->cpuid_config_leaves[idx] >> 32; 157 entry->eax = (u32)td_conf->cpuid_config_values[idx][0]; 158 entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32; 159 entry->ecx = (u32)td_conf->cpuid_config_values[idx][1]; 160 entry->edx = td_conf->cpuid_config_values[idx][1] >> 32; 161 162 if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF) 163 entry->index = 0; 164 165 /* 166 * The TDX module doesn't allow configuring the guest phys addr bits 167 * (EAX[23:16]). However, KVM uses it as an interface to the userspace 168 * to configure the GPAW. Report these bits as configurable. 169 */ 170 if (entry->function == 0x80000008) 171 entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff); 172 173 tdx_clear_unsupported_cpuid(entry); 174 } 175 176 #define TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT BIT(1) 177 178 static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf, 179 struct kvm_tdx_capabilities *caps) 180 { 181 int i; 182 183 caps->supported_attrs = tdx_get_supported_attrs(td_conf); 184 if (!caps->supported_attrs) 185 return -EIO; 186 187 caps->supported_xfam = tdx_get_supported_xfam(td_conf); 188 if (!caps->supported_xfam) 189 return -EIO; 190 191 caps->cpuid.nent = td_conf->num_cpuid_config; 192 193 caps->user_tdvmcallinfo_1_r11 = 194 TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT; 195 196 for (i = 0; i < td_conf->num_cpuid_config; i++) 197 td_init_cpuid_entry2(&caps->cpuid.entries[i], i); 198 199 return 0; 200 } 201 202 /* 203 * Some SEAMCALLs acquire the TDX module globally, and can fail with 204 * TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs. 205 */ 206 static DEFINE_MUTEX(tdx_lock); 207 208 static atomic_t nr_configured_hkid; 209 210 static bool tdx_operand_busy(u64 err) 211 { 212 return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY; 213 } 214 215 216 /* 217 * A per-CPU list of TD vCPUs associated with a given CPU. 218 * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU 219 * list. 220 * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of 221 * the old CPU during the IPI callback running on the old CPU, and then added 222 * to the per-CPU list of the new CPU. 223 * - When a TD is tearing down, all vCPUs are disassociated from their current 224 * running CPUs and removed from the per-CPU list during the IPI callback 225 * running on those CPUs. 226 * - When a CPU is brought down, traverse the per-CPU list to disassociate all 227 * associated TD vCPUs and remove them from the per-CPU list. 
228 */ 229 static DEFINE_PER_CPU(struct list_head, associated_tdvcpus); 230 231 static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu) 232 { 233 return to_tdx(vcpu)->vp_enter_args.r10; 234 } 235 236 static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu) 237 { 238 return to_tdx(vcpu)->vp_enter_args.r11; 239 } 240 241 static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu, 242 long val) 243 { 244 to_tdx(vcpu)->vp_enter_args.r10 = val; 245 } 246 247 static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu, 248 unsigned long val) 249 { 250 to_tdx(vcpu)->vp_enter_args.r11 = val; 251 } 252 253 static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx) 254 { 255 tdx_guest_keyid_free(kvm_tdx->hkid); 256 kvm_tdx->hkid = -1; 257 atomic_dec(&nr_configured_hkid); 258 misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); 259 put_misc_cg(kvm_tdx->misc_cg); 260 kvm_tdx->misc_cg = NULL; 261 } 262 263 static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx) 264 { 265 return kvm_tdx->hkid > 0; 266 } 267 268 static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu) 269 { 270 lockdep_assert_irqs_disabled(); 271 272 list_del(&to_tdx(vcpu)->cpu_list); 273 274 /* 275 * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1, 276 * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU 277 * to its list before it's deleted from this CPU's list. 278 */ 279 smp_wmb(); 280 281 vcpu->cpu = -1; 282 } 283 284 static void tdx_no_vcpus_enter_start(struct kvm *kvm) 285 { 286 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 287 288 lockdep_assert_held_write(&kvm->mmu_lock); 289 290 WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true); 291 292 kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); 293 } 294 295 static void tdx_no_vcpus_enter_stop(struct kvm *kvm) 296 { 297 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 298 299 lockdep_assert_held_write(&kvm->mmu_lock); 300 301 WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false); 302 } 303 304 /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */ 305 static int __tdx_reclaim_page(struct page *page) 306 { 307 u64 err, rcx, rdx, r8; 308 309 err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8); 310 311 /* 312 * No need to check for TDX_OPERAND_BUSY; all TD pages are freed 313 * before the HKID is released and control pages have also been 314 * released at this point, so there is no possibility of contention. 315 */ 316 if (WARN_ON_ONCE(err)) { 317 pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8); 318 return -EIO; 319 } 320 return 0; 321 } 322 323 static int tdx_reclaim_page(struct page *page) 324 { 325 int r; 326 327 r = __tdx_reclaim_page(page); 328 if (!r) 329 tdx_quirk_reset_page(page); 330 return r; 331 } 332 333 334 /* 335 * Reclaim the TD control page(s) which are crypto-protected by TDX guest's 336 * private KeyID. Assume the cache associated with the TDX private KeyID has 337 * been flushed. 338 */ 339 static void tdx_reclaim_control_page(struct page *ctrl_page) 340 { 341 /* 342 * Leak the page if the kernel failed to reclaim the page. 343 * The kernel cannot use it safely anymore. 
344 */ 345 if (tdx_reclaim_page(ctrl_page)) 346 return; 347 348 __free_page(ctrl_page); 349 } 350 351 struct tdx_flush_vp_arg { 352 struct kvm_vcpu *vcpu; 353 u64 err; 354 }; 355 356 static void tdx_flush_vp(void *_arg) 357 { 358 struct tdx_flush_vp_arg *arg = _arg; 359 struct kvm_vcpu *vcpu = arg->vcpu; 360 u64 err; 361 362 arg->err = 0; 363 lockdep_assert_irqs_disabled(); 364 365 /* Task migration can race with CPU offlining. */ 366 if (unlikely(vcpu->cpu != raw_smp_processor_id())) 367 return; 368 369 /* 370 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The 371 * list tracking still needs to be updated so that it's correct if/when 372 * the vCPU does get initialized. 373 */ 374 if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) { 375 /* 376 * No need to retry. TDX Resources needed for TDH.VP.FLUSH are: 377 * TDVPR as exclusive, TDR as shared, and TDCS as shared. This 378 * vp flush function is called when destructing vCPU/TD or vCPU 379 * migration. No other thread uses TDVPR in those cases. 380 */ 381 err = tdh_vp_flush(&to_tdx(vcpu)->vp); 382 if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) { 383 /* 384 * This function is called in IPI context. Do not use 385 * printk to avoid console semaphore. 386 * The caller prints out the error message, instead. 387 */ 388 if (err) 389 arg->err = err; 390 } 391 } 392 393 tdx_disassociate_vp(vcpu); 394 } 395 396 static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu) 397 { 398 struct tdx_flush_vp_arg arg = { 399 .vcpu = vcpu, 400 }; 401 int cpu = vcpu->cpu; 402 403 if (unlikely(cpu == -1)) 404 return; 405 406 smp_call_function_single(cpu, tdx_flush_vp, &arg, 1); 407 if (KVM_BUG_ON(arg.err, vcpu->kvm)) 408 pr_tdx_error(TDH_VP_FLUSH, arg.err); 409 } 410 411 void tdx_disable_virtualization_cpu(void) 412 { 413 int cpu = raw_smp_processor_id(); 414 struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu); 415 struct tdx_flush_vp_arg arg; 416 struct vcpu_tdx *tdx, *tmp; 417 unsigned long flags; 418 419 local_irq_save(flags); 420 /* Safe variant needed as tdx_disassociate_vp() deletes the entry. */ 421 list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) { 422 arg.vcpu = &tdx->vcpu; 423 tdx_flush_vp(&arg); 424 } 425 local_irq_restore(flags); 426 427 /* 428 * Flush cache now if kexec is possible: this is necessary to avoid 429 * having dirty private memory cachelines when the new kernel boots, 430 * but WBINVD is a relatively expensive operation and doing it during 431 * kexec can exacerbate races in native_stop_other_cpus(). Do it 432 * now, since this is a safe moment and there is going to be no more 433 * TDX activity on this CPU from this point on. 434 */ 435 tdx_cpu_flush_cache_for_kexec(); 436 } 437 438 #define TDX_SEAMCALL_RETRIES 10000 439 440 static void smp_func_do_phymem_cache_wb(void *unused) 441 { 442 u64 err = 0; 443 bool resume; 444 int i; 445 446 /* 447 * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private 448 * KeyID on the package or core. The TDX module may not finish the 449 * cache flush but return TDX_INTERRUPTED_RESUMEABLE instead. The 450 * kernel should retry it until it returns success w/o rescheduling. 
451 */ 452 for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) { 453 resume = !!err; 454 err = tdh_phymem_cache_wb(resume); 455 switch (err) { 456 case TDX_INTERRUPTED_RESUMABLE: 457 continue; 458 case TDX_NO_HKID_READY_TO_WBCACHE: 459 err = TDX_SUCCESS; /* Already done by other thread */ 460 fallthrough; 461 default: 462 goto out; 463 } 464 } 465 466 out: 467 if (WARN_ON_ONCE(err)) 468 pr_tdx_error(TDH_PHYMEM_CACHE_WB, err); 469 } 470 471 void tdx_mmu_release_hkid(struct kvm *kvm) 472 { 473 bool packages_allocated, targets_allocated; 474 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 475 cpumask_var_t packages, targets; 476 struct kvm_vcpu *vcpu; 477 unsigned long j; 478 int i; 479 u64 err; 480 481 if (!is_hkid_assigned(kvm_tdx)) 482 return; 483 484 packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL); 485 targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL); 486 cpus_read_lock(); 487 488 kvm_for_each_vcpu(j, vcpu, kvm) 489 tdx_flush_vp_on_cpu(vcpu); 490 491 /* 492 * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock 493 * and can fail with TDX_OPERAND_BUSY when it fails to get the lock. 494 * Multiple TDX guests can be destroyed simultaneously. Take the 495 * mutex to prevent it from getting error. 496 */ 497 mutex_lock(&tdx_lock); 498 499 /* 500 * Releasing HKID is in vm_destroy(). 501 * After the above flushing vps, there should be no more vCPU 502 * associations, as all vCPU fds have been released at this stage. 503 */ 504 err = tdh_mng_vpflushdone(&kvm_tdx->td); 505 if (err == TDX_FLUSHVP_NOT_DONE) 506 goto out; 507 if (KVM_BUG_ON(err, kvm)) { 508 pr_tdx_error(TDH_MNG_VPFLUSHDONE, err); 509 pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n", 510 kvm_tdx->hkid); 511 goto out; 512 } 513 514 for_each_online_cpu(i) { 515 if (packages_allocated && 516 cpumask_test_and_set_cpu(topology_physical_package_id(i), 517 packages)) 518 continue; 519 if (targets_allocated) 520 cpumask_set_cpu(i, targets); 521 } 522 if (targets_allocated) 523 on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true); 524 else 525 on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true); 526 /* 527 * In the case of error in smp_func_do_phymem_cache_wb(), the following 528 * tdh_mng_key_freeid() will fail. 529 */ 530 err = tdh_mng_key_freeid(&kvm_tdx->td); 531 if (KVM_BUG_ON(err, kvm)) { 532 pr_tdx_error(TDH_MNG_KEY_FREEID, err); 533 pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n", 534 kvm_tdx->hkid); 535 } else { 536 tdx_hkid_free(kvm_tdx); 537 } 538 539 out: 540 mutex_unlock(&tdx_lock); 541 cpus_read_unlock(); 542 free_cpumask_var(targets); 543 free_cpumask_var(packages); 544 } 545 546 static void tdx_reclaim_td_control_pages(struct kvm *kvm) 547 { 548 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 549 u64 err; 550 int i; 551 552 /* 553 * tdx_mmu_release_hkid() failed to reclaim HKID. Something went wrong 554 * heavily with TDX module. Give up freeing TD pages. As the function 555 * already warned, don't warn it again. 556 */ 557 if (is_hkid_assigned(kvm_tdx)) 558 return; 559 560 if (kvm_tdx->td.tdcs_pages) { 561 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { 562 if (!kvm_tdx->td.tdcs_pages[i]) 563 continue; 564 565 tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]); 566 } 567 kfree(kvm_tdx->td.tdcs_pages); 568 kvm_tdx->td.tdcs_pages = NULL; 569 } 570 571 if (!kvm_tdx->td.tdr_page) 572 return; 573 574 if (__tdx_reclaim_page(kvm_tdx->td.tdr_page)) 575 return; 576 577 /* 578 * Use a SEAMCALL to ask the TDX module to flush the cache based on the 579 * KeyID. 
TDX module may access TDR while operating on TD (Especially 580 * when it is reclaiming TDCS). 581 */ 582 err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td); 583 if (KVM_BUG_ON(err, kvm)) { 584 pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); 585 return; 586 } 587 tdx_quirk_reset_page(kvm_tdx->td.tdr_page); 588 589 __free_page(kvm_tdx->td.tdr_page); 590 kvm_tdx->td.tdr_page = NULL; 591 } 592 593 void tdx_vm_destroy(struct kvm *kvm) 594 { 595 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 596 597 tdx_reclaim_td_control_pages(kvm); 598 599 kvm_tdx->state = TD_STATE_UNINITIALIZED; 600 } 601 602 static int tdx_do_tdh_mng_key_config(void *param) 603 { 604 struct kvm_tdx *kvm_tdx = param; 605 u64 err; 606 607 /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */ 608 err = tdh_mng_key_config(&kvm_tdx->td); 609 610 if (KVM_BUG_ON(err, &kvm_tdx->kvm)) { 611 pr_tdx_error(TDH_MNG_KEY_CONFIG, err); 612 return -EIO; 613 } 614 615 return 0; 616 } 617 618 int tdx_vm_init(struct kvm *kvm) 619 { 620 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 621 622 kvm->arch.has_protected_state = true; 623 /* 624 * TDX Module doesn't allow the hypervisor to modify the EOI-bitmap, 625 * i.e. all EOIs are accelerated and never trigger exits. 626 */ 627 kvm->arch.has_protected_eoi = true; 628 kvm->arch.has_private_mem = true; 629 kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT; 630 631 /* 632 * Because guest TD is protected, VMM can't parse the instruction in TD. 633 * Instead, guest uses MMIO hypercall. For unmodified device driver, 634 * #VE needs to be injected for MMIO and #VE handler in TD converts MMIO 635 * instruction into MMIO hypercall. 636 * 637 * SPTE value for MMIO needs to be setup so that #VE is injected into 638 * TD instead of triggering EPT MISCONFIG. 639 * - RWX=0 so that EPT violation is triggered. 640 * - suppress #VE bit is cleared to inject #VE. 641 */ 642 kvm_mmu_set_mmio_spte_value(kvm, 0); 643 644 /* 645 * TDX has its own limit of maximum vCPUs it can support for all 646 * TDX guests in addition to KVM_MAX_VCPUS. TDX module reports 647 * such limit via the MAX_VCPU_PER_TD global metadata. In 648 * practice, it reflects the number of logical CPUs that ALL 649 * platforms that the TDX module supports can possibly have. 650 * 651 * Limit TDX guest's maximum vCPUs to the number of logical CPUs 652 * the platform has. Simply forwarding the MAX_VCPU_PER_TD to 653 * userspace would result in an unpredictable ABI. 654 */ 655 kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus()); 656 657 kvm_tdx->state = TD_STATE_UNINITIALIZED; 658 659 return 0; 660 } 661 662 int tdx_vcpu_create(struct kvm_vcpu *vcpu) 663 { 664 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 665 struct vcpu_tdx *tdx = to_tdx(vcpu); 666 667 if (kvm_tdx->state != TD_STATE_INITIALIZED) 668 return -EIO; 669 670 /* 671 * TDX module mandates APICv, which requires an in-kernel local APIC. 672 * Disallow an in-kernel I/O APIC, because level-triggered interrupts 673 * and thus the I/O APIC as a whole can't be faithfully emulated in KVM. 
674 */ 675 if (!irqchip_split(vcpu->kvm)) 676 return -EINVAL; 677 678 fpstate_set_confidential(&vcpu->arch.guest_fpu); 679 vcpu->arch.apic->guest_apic_protected = true; 680 INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list); 681 682 vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX; 683 684 vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH; 685 vcpu->arch.cr0_guest_owned_bits = -1ul; 686 vcpu->arch.cr4_guest_owned_bits = -1ul; 687 688 /* KVM can't change TSC offset/multiplier as TDX module manages them. */ 689 vcpu->arch.guest_tsc_protected = true; 690 vcpu->arch.tsc_offset = kvm_tdx->tsc_offset; 691 vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset; 692 vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier; 693 vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier; 694 695 vcpu->arch.guest_state_protected = 696 !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG); 697 698 if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE) 699 vcpu->arch.xfd_no_write_intercept = true; 700 701 tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR; 702 __pi_set_sn(&tdx->vt.pi_desc); 703 704 tdx->state = VCPU_TD_STATE_UNINITIALIZED; 705 706 return 0; 707 } 708 709 void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 710 { 711 struct vcpu_tdx *tdx = to_tdx(vcpu); 712 713 vmx_vcpu_pi_load(vcpu, cpu); 714 if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm))) 715 return; 716 717 tdx_flush_vp_on_cpu(vcpu); 718 719 KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm); 720 local_irq_disable(); 721 /* 722 * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure 723 * vcpu->cpu is read before tdx->cpu_list. 724 */ 725 smp_rmb(); 726 727 list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu)); 728 local_irq_enable(); 729 } 730 731 bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu) 732 { 733 /* 734 * KVM can't get the interrupt status of TDX guest and it assumes 735 * interrupt is always allowed unless TDX guest calls TDVMCALL with HLT, 736 * which passes the interrupt blocked flag. 737 */ 738 return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT || 739 !to_tdx(vcpu)->vp_enter_args.r12; 740 } 741 742 static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu) 743 { 744 u64 vcpu_state_details; 745 746 if (pi_has_pending_interrupt(vcpu)) 747 return true; 748 749 /* 750 * Only check RVI pending for HALTED case with IRQ enabled. 751 * For non-HLT cases, KVM doesn't care about STI/SS shadows. And if the 752 * interrupt was pending before TD exit, then it _must_ be blocked, 753 * otherwise the interrupt would have been serviced at the instruction 754 * boundary. 755 */ 756 if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT || 757 to_tdx(vcpu)->vp_enter_args.r12) 758 return false; 759 760 vcpu_state_details = 761 td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH); 762 763 return tdx_vcpu_state_details_intr_pending(vcpu_state_details); 764 } 765 766 /* 767 * Compared to vmx_prepare_switch_to_guest(), there is not much to do 768 * as SEAMCALL/SEAMRET calls take care of most of save and restore. 
769 */ 770 void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 771 { 772 struct vcpu_vt *vt = to_vt(vcpu); 773 774 if (vt->guest_state_loaded) 775 return; 776 777 if (likely(is_64bit_mm(current->mm))) 778 vt->msr_host_kernel_gs_base = current->thread.gsbase; 779 else 780 vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); 781 782 vt->guest_state_loaded = true; 783 } 784 785 struct tdx_uret_msr { 786 u32 msr; 787 unsigned int slot; 788 u64 defval; 789 }; 790 791 static struct tdx_uret_msr tdx_uret_msrs[] = { 792 {.msr = MSR_SYSCALL_MASK, .defval = 0x20200 }, 793 {.msr = MSR_STAR,}, 794 {.msr = MSR_LSTAR,}, 795 {.msr = MSR_TSC_AUX,}, 796 }; 797 798 static void tdx_user_return_msr_update_cache(void) 799 { 800 int i; 801 802 for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) 803 kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot, 804 tdx_uret_msrs[i].defval); 805 } 806 807 static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu) 808 { 809 struct vcpu_vt *vt = to_vt(vcpu); 810 struct vcpu_tdx *tdx = to_tdx(vcpu); 811 812 if (!vt->guest_state_loaded) 813 return; 814 815 ++vcpu->stat.host_state_reload; 816 wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base); 817 818 if (tdx->guest_entered) { 819 tdx_user_return_msr_update_cache(); 820 tdx->guest_entered = false; 821 } 822 823 vt->guest_state_loaded = false; 824 } 825 826 void tdx_vcpu_put(struct kvm_vcpu *vcpu) 827 { 828 vmx_vcpu_pi_put(vcpu); 829 tdx_prepare_switch_to_host(vcpu); 830 } 831 832 void tdx_vcpu_free(struct kvm_vcpu *vcpu) 833 { 834 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 835 struct vcpu_tdx *tdx = to_tdx(vcpu); 836 int i; 837 838 /* 839 * It is not possible to reclaim pages while hkid is assigned. It might 840 * be assigned if: 841 * 1. the TD VM is being destroyed but freeing hkid failed, in which 842 * case the pages are leaked 843 * 2. 
TD VCPU creation failed and this on the error path, in which case 844 * there is nothing to do anyway 845 */ 846 if (is_hkid_assigned(kvm_tdx)) 847 return; 848 849 if (tdx->vp.tdcx_pages) { 850 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { 851 if (tdx->vp.tdcx_pages[i]) 852 tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]); 853 } 854 kfree(tdx->vp.tdcx_pages); 855 tdx->vp.tdcx_pages = NULL; 856 } 857 if (tdx->vp.tdvpr_page) { 858 tdx_reclaim_control_page(tdx->vp.tdvpr_page); 859 tdx->vp.tdvpr_page = 0; 860 tdx->vp.tdvpr_pa = 0; 861 } 862 863 tdx->state = VCPU_TD_STATE_UNINITIALIZED; 864 } 865 866 int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu) 867 { 868 if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED || 869 to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE)) 870 return -EINVAL; 871 872 return 1; 873 } 874 875 static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu) 876 { 877 switch (tdvmcall_leaf(vcpu)) { 878 case EXIT_REASON_CPUID: 879 case EXIT_REASON_HLT: 880 case EXIT_REASON_IO_INSTRUCTION: 881 case EXIT_REASON_MSR_READ: 882 case EXIT_REASON_MSR_WRITE: 883 return tdvmcall_leaf(vcpu); 884 case EXIT_REASON_EPT_VIOLATION: 885 return EXIT_REASON_EPT_MISCONFIG; 886 default: 887 break; 888 } 889 890 return EXIT_REASON_TDCALL; 891 } 892 893 static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu) 894 { 895 struct vcpu_tdx *tdx = to_tdx(vcpu); 896 u32 exit_reason; 897 898 switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) { 899 case TDX_SUCCESS: 900 case TDX_NON_RECOVERABLE_VCPU: 901 case TDX_NON_RECOVERABLE_TD: 902 case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE: 903 case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE: 904 break; 905 default: 906 return -1u; 907 } 908 909 exit_reason = tdx->vp_enter_ret; 910 911 switch (exit_reason) { 912 case EXIT_REASON_TDCALL: 913 if (tdvmcall_exit_type(vcpu)) 914 return EXIT_REASON_VMCALL; 915 916 return tdcall_to_vmx_exit_reason(vcpu); 917 case EXIT_REASON_EPT_MISCONFIG: 918 /* 919 * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in 920 * non-instrumentable code with interrupts disabled. 921 */ 922 return -1u; 923 default: 924 break; 925 } 926 927 return exit_reason; 928 } 929 930 static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu) 931 { 932 struct vcpu_tdx *tdx = to_tdx(vcpu); 933 struct vcpu_vt *vt = to_vt(vcpu); 934 935 guest_state_enter_irqoff(); 936 937 tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args); 938 939 vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu); 940 941 vt->exit_qualification = tdx->vp_enter_args.rcx; 942 tdx->ext_exit_qualification = tdx->vp_enter_args.rdx; 943 tdx->exit_gpa = tdx->vp_enter_args.r8; 944 vt->exit_intr_info = tdx->vp_enter_args.r9; 945 946 vmx_handle_nmi(vcpu); 947 948 guest_state_exit_irqoff(); 949 } 950 951 static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu) 952 { 953 return vmx_get_exit_reason(vcpu).failed_vmentry && 954 vmx_get_exit_reason(vcpu).full != -1u; 955 } 956 957 static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) 958 { 959 u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret; 960 961 /* 962 * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation 963 * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER. 964 * 965 * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both 966 * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target 967 * vCPUs leaving fastpath so that interrupt can be enabled to ensure the 968 * IPIs can be delivered. 
Return EXIT_FASTPATH_EXIT_HANDLED instead of 969 * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the 970 * requester may be blocked endlessly. 971 */ 972 if (unlikely(tdx_operand_busy(vp_enter_ret))) 973 return EXIT_FASTPATH_EXIT_HANDLED; 974 975 return EXIT_FASTPATH_NONE; 976 } 977 978 #define TDX_REGS_AVAIL_SET (BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \ 979 BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \ 980 BIT_ULL(VCPU_REGS_RAX) | \ 981 BIT_ULL(VCPU_REGS_RBX) | \ 982 BIT_ULL(VCPU_REGS_RCX) | \ 983 BIT_ULL(VCPU_REGS_RDX) | \ 984 BIT_ULL(VCPU_REGS_RBP) | \ 985 BIT_ULL(VCPU_REGS_RSI) | \ 986 BIT_ULL(VCPU_REGS_RDI) | \ 987 BIT_ULL(VCPU_REGS_R8) | \ 988 BIT_ULL(VCPU_REGS_R9) | \ 989 BIT_ULL(VCPU_REGS_R10) | \ 990 BIT_ULL(VCPU_REGS_R11) | \ 991 BIT_ULL(VCPU_REGS_R12) | \ 992 BIT_ULL(VCPU_REGS_R13) | \ 993 BIT_ULL(VCPU_REGS_R14) | \ 994 BIT_ULL(VCPU_REGS_R15)) 995 996 static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu) 997 { 998 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 999 1000 /* 1001 * All TDX hosts support PKRU; but even if they didn't, 1002 * vcpu->arch.host_pkru would be 0 and the wrpkru would be 1003 * skipped. 1004 */ 1005 if (vcpu->arch.host_pkru != 0) 1006 wrpkru(vcpu->arch.host_pkru); 1007 1008 if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0)) 1009 xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0); 1010 1011 /* 1012 * Likewise, even if a TDX hosts didn't support XSS both arms of 1013 * the comparison would be 0 and the wrmsrl would be skipped. 1014 */ 1015 if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss)) 1016 wrmsrl(MSR_IA32_XSS, kvm_host.xss); 1017 } 1018 1019 #define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \ 1020 DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \ 1021 DEBUGCTLMSR_FREEZE_IN_SMM) 1022 1023 fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) 1024 { 1025 struct vcpu_tdx *tdx = to_tdx(vcpu); 1026 struct vcpu_vt *vt = to_vt(vcpu); 1027 1028 /* 1029 * WARN if KVM wants to force an immediate exit, as the TDX module does 1030 * not guarantee entry into the guest, i.e. it's possible for KVM to 1031 * _think_ it completed entry to the guest and forced an immediate exit 1032 * without actually having done so. Luckily, KVM never needs to force 1033 * an immediate exit for TDX (KVM can't do direct event injection, so 1034 * just WARN and continue on. 1035 */ 1036 WARN_ON_ONCE(run_flags); 1037 1038 /* 1039 * Wait until retry of SEPT-zap-related SEAMCALL completes before 1040 * allowing vCPU entry to avoid contention with tdh_vp_enter() and 1041 * TDCALLs. 
1042 */ 1043 if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap))) 1044 return EXIT_FASTPATH_EXIT_HANDLED; 1045 1046 trace_kvm_entry(vcpu, run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT); 1047 1048 if (pi_test_on(&vt->pi_desc)) { 1049 apic->send_IPI_self(POSTED_INTR_VECTOR); 1050 1051 if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) & 1052 APIC_VECTOR_MASK, &vt->pi_desc)) 1053 kvm_wait_lapic_expire(vcpu); 1054 } 1055 1056 tdx_vcpu_enter_exit(vcpu); 1057 1058 if (vcpu->arch.host_debugctl & ~TDX_DEBUGCTL_PRESERVED) 1059 update_debugctlmsr(vcpu->arch.host_debugctl); 1060 1061 tdx_load_host_xsave_state(vcpu); 1062 tdx->guest_entered = true; 1063 1064 vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET; 1065 1066 if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) 1067 return EXIT_FASTPATH_NONE; 1068 1069 if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) 1070 return EXIT_FASTPATH_NONE; 1071 1072 if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) 1073 kvm_machine_check(); 1074 1075 trace_kvm_exit(vcpu, KVM_ISA_VMX); 1076 1077 if (unlikely(tdx_failed_vmentry(vcpu))) 1078 return EXIT_FASTPATH_NONE; 1079 1080 return tdx_exit_handlers_fastpath(vcpu); 1081 } 1082 1083 void tdx_inject_nmi(struct kvm_vcpu *vcpu) 1084 { 1085 ++vcpu->stat.nmi_injections; 1086 td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1); 1087 /* 1088 * From KVM's perspective, NMI injection is completed right after 1089 * writing to PEND_NMI. KVM doesn't care whether an NMI is injected by 1090 * the TDX module or not. 1091 */ 1092 vcpu->arch.nmi_injected = false; 1093 /* 1094 * TDX doesn't support KVM to request NMI window exit. If there is 1095 * still a pending vNMI, KVM is not able to inject it along with the 1096 * one pending in TDX module in a back-to-back way. Since the previous 1097 * vNMI is still pending in TDX module, i.e. it has not been delivered 1098 * to TDX guest yet, it's OK to collapse the pending vNMI into the 1099 * previous one. The guest is expected to handle all the NMI sources 1100 * when handling the first vNMI. 1101 */ 1102 vcpu->arch.nmi_pending = 0; 1103 } 1104 1105 static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu) 1106 { 1107 u32 intr_info = vmx_get_intr_info(vcpu); 1108 1109 /* 1110 * Machine checks are handled by handle_exception_irqoff(), or by 1111 * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on 1112 * VM-Entry. NMIs are handled by tdx_vcpu_enter_exit(). 1113 */ 1114 if (is_nmi(intr_info) || is_machine_check(intr_info)) 1115 return 1; 1116 1117 vcpu->run->exit_reason = KVM_EXIT_EXCEPTION; 1118 vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK; 1119 vcpu->run->ex.error_code = 0; 1120 1121 return 0; 1122 } 1123 1124 static int complete_hypercall_exit(struct kvm_vcpu *vcpu) 1125 { 1126 tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret); 1127 return 1; 1128 } 1129 1130 static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu) 1131 { 1132 kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10); 1133 kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11); 1134 kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12); 1135 kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13); 1136 kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14); 1137 1138 return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit); 1139 } 1140 1141 /* 1142 * Split into chunks and check interrupt pending between chunks. This allows 1143 * for timely injection of interrupts to prevent issues with guest lockup 1144 * detection. 
1145 */ 1146 #define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024) 1147 static void __tdx_map_gpa(struct vcpu_tdx *tdx); 1148 1149 static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu) 1150 { 1151 struct vcpu_tdx *tdx = to_tdx(vcpu); 1152 1153 if (vcpu->run->hypercall.ret) { 1154 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1155 tdx->vp_enter_args.r11 = tdx->map_gpa_next; 1156 return 1; 1157 } 1158 1159 tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN; 1160 if (tdx->map_gpa_next >= tdx->map_gpa_end) 1161 return 1; 1162 1163 /* 1164 * Stop processing the remaining part if there is a pending interrupt, 1165 * which could be qualified to deliver. Skip checking pending RVI for 1166 * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt(). 1167 */ 1168 if (kvm_vcpu_has_events(vcpu)) { 1169 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY); 1170 tdx->vp_enter_args.r11 = tdx->map_gpa_next; 1171 return 1; 1172 } 1173 1174 __tdx_map_gpa(tdx); 1175 return 0; 1176 } 1177 1178 static void __tdx_map_gpa(struct vcpu_tdx *tdx) 1179 { 1180 u64 gpa = tdx->map_gpa_next; 1181 u64 size = tdx->map_gpa_end - tdx->map_gpa_next; 1182 1183 if (size > TDX_MAP_GPA_MAX_LEN) 1184 size = TDX_MAP_GPA_MAX_LEN; 1185 1186 tdx->vcpu.run->exit_reason = KVM_EXIT_HYPERCALL; 1187 tdx->vcpu.run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; 1188 /* 1189 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) 1190 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that 1191 * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting 1192 * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. 1193 */ 1194 tdx->vcpu.run->hypercall.ret = 0; 1195 tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm)); 1196 tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE; 1197 tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ? 1198 KVM_MAP_GPA_RANGE_ENCRYPTED : 1199 KVM_MAP_GPA_RANGE_DECRYPTED; 1200 tdx->vcpu.run->hypercall.flags = KVM_EXIT_HYPERCALL_LONG_MODE; 1201 1202 tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa; 1203 } 1204 1205 static int tdx_map_gpa(struct kvm_vcpu *vcpu) 1206 { 1207 struct vcpu_tdx *tdx = to_tdx(vcpu); 1208 u64 gpa = tdx->vp_enter_args.r12; 1209 u64 size = tdx->vp_enter_args.r13; 1210 u64 ret; 1211 1212 /* 1213 * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires 1214 * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE 1215 * bit set. This is a base call so it should always be supported, but 1216 * KVM has no way to ensure that userspace implements the GHCI correctly. 1217 * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error 1218 * to the guest. 
1219 */ 1220 if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { 1221 ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; 1222 goto error; 1223 } 1224 1225 if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) || 1226 !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) || 1227 (vt_is_tdx_private_gpa(vcpu->kvm, gpa) != 1228 vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) { 1229 ret = TDVMCALL_STATUS_INVALID_OPERAND; 1230 goto error; 1231 } 1232 1233 if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) { 1234 ret = TDVMCALL_STATUS_ALIGN_ERROR; 1235 goto error; 1236 } 1237 1238 tdx->map_gpa_end = gpa + size; 1239 tdx->map_gpa_next = gpa; 1240 1241 __tdx_map_gpa(tdx); 1242 return 0; 1243 1244 error: 1245 tdvmcall_set_return_code(vcpu, ret); 1246 tdx->vp_enter_args.r11 = gpa; 1247 return 1; 1248 } 1249 1250 static int tdx_report_fatal_error(struct kvm_vcpu *vcpu) 1251 { 1252 struct vcpu_tdx *tdx = to_tdx(vcpu); 1253 u64 *regs = vcpu->run->system_event.data; 1254 u64 *module_regs = &tdx->vp_enter_args.r8; 1255 int index = VCPU_REGS_RAX; 1256 1257 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; 1258 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL; 1259 vcpu->run->system_event.ndata = 16; 1260 1261 /* Dump 16 general-purpose registers to userspace in ascending order. */ 1262 regs[index++] = tdx->vp_enter_ret; 1263 regs[index++] = tdx->vp_enter_args.rcx; 1264 regs[index++] = tdx->vp_enter_args.rdx; 1265 regs[index++] = tdx->vp_enter_args.rbx; 1266 regs[index++] = 0; 1267 regs[index++] = 0; 1268 regs[index++] = tdx->vp_enter_args.rsi; 1269 regs[index] = tdx->vp_enter_args.rdi; 1270 for (index = 0; index < 8; index++) 1271 regs[VCPU_REGS_R8 + index] = module_regs[index]; 1272 1273 return 0; 1274 } 1275 1276 static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu) 1277 { 1278 u32 eax, ebx, ecx, edx; 1279 struct vcpu_tdx *tdx = to_tdx(vcpu); 1280 1281 /* EAX and ECX for cpuid is stored in R12 and R13. 
*/ 1282 eax = tdx->vp_enter_args.r12; 1283 ecx = tdx->vp_enter_args.r13; 1284 1285 kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false); 1286 1287 tdx->vp_enter_args.r12 = eax; 1288 tdx->vp_enter_args.r13 = ebx; 1289 tdx->vp_enter_args.r14 = ecx; 1290 tdx->vp_enter_args.r15 = edx; 1291 1292 return 1; 1293 } 1294 1295 static int tdx_complete_pio_out(struct kvm_vcpu *vcpu) 1296 { 1297 vcpu->arch.pio.count = 0; 1298 return 1; 1299 } 1300 1301 static int tdx_complete_pio_in(struct kvm_vcpu *vcpu) 1302 { 1303 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 1304 unsigned long val = 0; 1305 int ret; 1306 1307 ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size, 1308 vcpu->arch.pio.port, &val, 1); 1309 1310 WARN_ON_ONCE(!ret); 1311 1312 tdvmcall_set_return_val(vcpu, val); 1313 1314 return 1; 1315 } 1316 1317 static int tdx_emulate_io(struct kvm_vcpu *vcpu) 1318 { 1319 struct vcpu_tdx *tdx = to_tdx(vcpu); 1320 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 1321 unsigned long val = 0; 1322 unsigned int port; 1323 u64 size, write; 1324 int ret; 1325 1326 ++vcpu->stat.io_exits; 1327 1328 size = tdx->vp_enter_args.r12; 1329 write = tdx->vp_enter_args.r13; 1330 port = tdx->vp_enter_args.r14; 1331 1332 if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) { 1333 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1334 return 1; 1335 } 1336 1337 if (write) { 1338 val = tdx->vp_enter_args.r15; 1339 ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1); 1340 } else { 1341 ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1); 1342 } 1343 1344 if (!ret) 1345 vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out : 1346 tdx_complete_pio_in; 1347 else if (!write) 1348 tdvmcall_set_return_val(vcpu, val); 1349 1350 return ret; 1351 } 1352 1353 static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu) 1354 { 1355 unsigned long val = 0; 1356 gpa_t gpa; 1357 int size; 1358 1359 gpa = vcpu->mmio_fragments[0].gpa; 1360 size = vcpu->mmio_fragments[0].len; 1361 1362 memcpy(&val, vcpu->run->mmio.data, size); 1363 tdvmcall_set_return_val(vcpu, val); 1364 trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val); 1365 return 1; 1366 } 1367 1368 static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size, 1369 unsigned long val) 1370 { 1371 if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 1372 trace_kvm_fast_mmio(gpa); 1373 return 0; 1374 } 1375 1376 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val); 1377 if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val)) 1378 return -EOPNOTSUPP; 1379 1380 return 0; 1381 } 1382 1383 static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size) 1384 { 1385 unsigned long val; 1386 1387 if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val)) 1388 return -EOPNOTSUPP; 1389 1390 tdvmcall_set_return_val(vcpu, val); 1391 trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val); 1392 return 0; 1393 } 1394 1395 static int tdx_emulate_mmio(struct kvm_vcpu *vcpu) 1396 { 1397 struct vcpu_tdx *tdx = to_tdx(vcpu); 1398 int size, write, r; 1399 unsigned long val; 1400 gpa_t gpa; 1401 1402 size = tdx->vp_enter_args.r12; 1403 write = tdx->vp_enter_args.r13; 1404 gpa = tdx->vp_enter_args.r14; 1405 val = write ? 
tdx->vp_enter_args.r15 : 0; 1406 1407 if (size != 1 && size != 2 && size != 4 && size != 8) 1408 goto error; 1409 if (write != 0 && write != 1) 1410 goto error; 1411 1412 /* 1413 * TDG.VP.VMCALL<MMIO> allows only shared GPA, it makes no sense to 1414 * do MMIO emulation for private GPA. 1415 */ 1416 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) || 1417 vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1)) 1418 goto error; 1419 1420 gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm)); 1421 1422 if (write) 1423 r = tdx_mmio_write(vcpu, gpa, size, val); 1424 else 1425 r = tdx_mmio_read(vcpu, gpa, size); 1426 if (!r) 1427 /* Kernel completed device emulation. */ 1428 return 1; 1429 1430 /* Request the device emulation to userspace device model. */ 1431 vcpu->mmio_is_write = write; 1432 if (!write) 1433 vcpu->arch.complete_userspace_io = tdx_complete_mmio_read; 1434 1435 vcpu->run->mmio.phys_addr = gpa; 1436 vcpu->run->mmio.len = size; 1437 vcpu->run->mmio.is_write = write; 1438 vcpu->run->exit_reason = KVM_EXIT_MMIO; 1439 1440 if (write) { 1441 memcpy(vcpu->run->mmio.data, &val, size); 1442 } else { 1443 vcpu->mmio_fragments[0].gpa = gpa; 1444 vcpu->mmio_fragments[0].len = size; 1445 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL); 1446 } 1447 return 0; 1448 1449 error: 1450 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1451 return 1; 1452 } 1453 1454 static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu) 1455 { 1456 struct vcpu_tdx *tdx = to_tdx(vcpu); 1457 1458 tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret); 1459 1460 /* 1461 * For now, there is no TDVMCALL beyond GHCI base API supported by KVM 1462 * directly without the support from userspace, just set the value 1463 * returned from userspace. 1464 */ 1465 tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11; 1466 tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12; 1467 tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13; 1468 tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14; 1469 1470 return 1; 1471 } 1472 1473 static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu) 1474 { 1475 struct vcpu_tdx *tdx = to_tdx(vcpu); 1476 1477 switch (tdx->vp_enter_args.r12) { 1478 case 0: 1479 tdx->vp_enter_args.r11 = 0; 1480 tdx->vp_enter_args.r12 = 0; 1481 tdx->vp_enter_args.r13 = 0; 1482 tdx->vp_enter_args.r14 = 0; 1483 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS); 1484 return 1; 1485 case 1: 1486 vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12; 1487 vcpu->run->exit_reason = KVM_EXIT_TDX; 1488 vcpu->run->tdx.flags = 0; 1489 vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO; 1490 vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS; 1491 vcpu->run->tdx.get_tdvmcall_info.r11 = 0; 1492 vcpu->run->tdx.get_tdvmcall_info.r12 = 0; 1493 vcpu->run->tdx.get_tdvmcall_info.r13 = 0; 1494 vcpu->run->tdx.get_tdvmcall_info.r14 = 0; 1495 vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info; 1496 return 0; 1497 default: 1498 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1499 return 1; 1500 } 1501 } 1502 1503 static int tdx_complete_simple(struct kvm_vcpu *vcpu) 1504 { 1505 tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret); 1506 return 1; 1507 } 1508 1509 static int tdx_get_quote(struct kvm_vcpu *vcpu) 1510 { 1511 struct vcpu_tdx *tdx = to_tdx(vcpu); 1512 u64 gpa = tdx->vp_enter_args.r12; 1513 u64 size = tdx->vp_enter_args.r13; 1514 1515 /* The gpa of buffer must have shared bit set. 
*/ 1516 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { 1517 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1518 return 1; 1519 } 1520 1521 vcpu->run->exit_reason = KVM_EXIT_TDX; 1522 vcpu->run->tdx.flags = 0; 1523 vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE; 1524 vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; 1525 vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm)); 1526 vcpu->run->tdx.get_quote.size = size; 1527 1528 vcpu->arch.complete_userspace_io = tdx_complete_simple; 1529 1530 return 0; 1531 } 1532 1533 static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu) 1534 { 1535 struct vcpu_tdx *tdx = to_tdx(vcpu); 1536 u64 vector = tdx->vp_enter_args.r12; 1537 1538 if (vector < 32 || vector > 255) { 1539 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1540 return 1; 1541 } 1542 1543 vcpu->run->exit_reason = KVM_EXIT_TDX; 1544 vcpu->run->tdx.flags = 0; 1545 vcpu->run->tdx.nr = TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT; 1546 vcpu->run->tdx.setup_event_notify.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; 1547 vcpu->run->tdx.setup_event_notify.vector = vector; 1548 1549 vcpu->arch.complete_userspace_io = tdx_complete_simple; 1550 1551 return 0; 1552 } 1553 1554 static int handle_tdvmcall(struct kvm_vcpu *vcpu) 1555 { 1556 switch (tdvmcall_leaf(vcpu)) { 1557 case TDVMCALL_MAP_GPA: 1558 return tdx_map_gpa(vcpu); 1559 case TDVMCALL_REPORT_FATAL_ERROR: 1560 return tdx_report_fatal_error(vcpu); 1561 case TDVMCALL_GET_TD_VM_CALL_INFO: 1562 return tdx_get_td_vm_call_info(vcpu); 1563 case TDVMCALL_GET_QUOTE: 1564 return tdx_get_quote(vcpu); 1565 case TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: 1566 return tdx_setup_event_notify_interrupt(vcpu); 1567 default: 1568 break; 1569 } 1570 1571 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED); 1572 return 1; 1573 } 1574 1575 void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level) 1576 { 1577 u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 : 1578 TDX_SHARED_BIT_PWL_4; 1579 1580 if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm)) 1581 return; 1582 1583 td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa); 1584 } 1585 1586 static void tdx_unpin(struct kvm *kvm, struct page *page) 1587 { 1588 put_page(page); 1589 } 1590 1591 static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, 1592 enum pg_level level, struct page *page) 1593 { 1594 int tdx_level = pg_level_to_tdx_sept_level(level); 1595 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1596 gpa_t gpa = gfn_to_gpa(gfn); 1597 u64 entry, level_state; 1598 u64 err; 1599 1600 err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state); 1601 if (unlikely(tdx_operand_busy(err))) { 1602 tdx_unpin(kvm, page); 1603 return -EBUSY; 1604 } 1605 1606 if (KVM_BUG_ON(err, kvm)) { 1607 pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state); 1608 tdx_unpin(kvm, page); 1609 return -EIO; 1610 } 1611 1612 return 0; 1613 } 1614 1615 /* 1616 * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the 1617 * callback tdx_gmem_post_populate() then maps pages into private memory. 1618 * through the a seamcall TDH.MEM.PAGE.ADD(). The SEAMCALL also requires the 1619 * private EPT structures for the page to have been built before, which is 1620 * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that 1621 * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD(). 
1622 * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there 1623 * are no half-initialized shared EPT pages. 1624 */ 1625 static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn, 1626 enum pg_level level, kvm_pfn_t pfn) 1627 { 1628 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1629 1630 if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm)) 1631 return -EINVAL; 1632 1633 /* nr_premapped will be decreased when tdh_mem_page_add() is called. */ 1634 atomic64_inc(&kvm_tdx->nr_premapped); 1635 return 0; 1636 } 1637 1638 static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, 1639 enum pg_level level, kvm_pfn_t pfn) 1640 { 1641 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1642 struct page *page = pfn_to_page(pfn); 1643 1644 /* TODO: handle large pages. */ 1645 if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) 1646 return -EINVAL; 1647 1648 /* 1649 * Because guest_memfd doesn't support page migration with 1650 * a_ops->migrate_folio (yet), no callback is triggered for KVM on page 1651 * migration. Until guest_memfd supports page migration, prevent page 1652 * migration. 1653 * TODO: Once guest_memfd introduces callback on page migration, 1654 * implement it and remove get_page/put_page(). 1655 */ 1656 get_page(page); 1657 1658 /* 1659 * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching 1660 * barrier in tdx_td_finalize(). 1661 */ 1662 smp_rmb(); 1663 if (likely(kvm_tdx->state == TD_STATE_RUNNABLE)) 1664 return tdx_mem_page_aug(kvm, gfn, level, page); 1665 1666 return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn); 1667 } 1668 1669 static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn, 1670 enum pg_level level, struct page *page) 1671 { 1672 int tdx_level = pg_level_to_tdx_sept_level(level); 1673 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1674 gpa_t gpa = gfn_to_gpa(gfn); 1675 u64 err, entry, level_state; 1676 1677 /* TODO: handle large pages. */ 1678 if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) 1679 return -EINVAL; 1680 1681 if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm)) 1682 return -EINVAL; 1683 1684 /* 1685 * When zapping private page, write lock is held. So no race condition 1686 * with other vcpu sept operation. 1687 * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs. 1688 */ 1689 err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, 1690 &level_state); 1691 1692 if (unlikely(tdx_operand_busy(err))) { 1693 /* 1694 * The second retry is expected to succeed after kicking off all 1695 * other vCPUs and prevent them from invoking TDH.VP.ENTER. 
1696 */ 1697 tdx_no_vcpus_enter_start(kvm); 1698 err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, 1699 &level_state); 1700 tdx_no_vcpus_enter_stop(kvm); 1701 } 1702 1703 if (KVM_BUG_ON(err, kvm)) { 1704 pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state); 1705 return -EIO; 1706 } 1707 1708 err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page); 1709 1710 if (KVM_BUG_ON(err, kvm)) { 1711 pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); 1712 return -EIO; 1713 } 1714 tdx_quirk_reset_page(page); 1715 tdx_unpin(kvm, page); 1716 return 0; 1717 } 1718 1719 static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, 1720 enum pg_level level, void *private_spt) 1721 { 1722 int tdx_level = pg_level_to_tdx_sept_level(level); 1723 gpa_t gpa = gfn_to_gpa(gfn); 1724 struct page *page = virt_to_page(private_spt); 1725 u64 err, entry, level_state; 1726 1727 err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry, 1728 &level_state); 1729 if (unlikely(tdx_operand_busy(err))) 1730 return -EBUSY; 1731 1732 if (KVM_BUG_ON(err, kvm)) { 1733 pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state); 1734 return -EIO; 1735 } 1736 1737 return 0; 1738 } 1739 1740 /* 1741 * Check if the error returned from a SEPT zap SEAMCALL is due to that a page is 1742 * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called 1743 * successfully. 1744 * 1745 * Since tdh_mem_sept_add() must have been invoked successfully before a 1746 * non-leaf entry present in the mirrored page table, the SEPT ZAP related 1747 * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead 1748 * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the 1749 * SEPT. 1750 * 1751 * Further check if the returned entry from SEPT walking is with RWX permissions 1752 * to filter out anything unexpected. 1753 * 1754 * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from 1755 * level_state returned from a SEAMCALL error is the same as that passed into 1756 * the SEAMCALL. 1757 */ 1758 static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err, 1759 u64 entry, int level) 1760 { 1761 if (!err || kvm_tdx->state == TD_STATE_RUNNABLE) 1762 return false; 1763 1764 if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX)) 1765 return false; 1766 1767 if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK))) 1768 return false; 1769 1770 return true; 1771 } 1772 1773 static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn, 1774 enum pg_level level, struct page *page) 1775 { 1776 int tdx_level = pg_level_to_tdx_sept_level(level); 1777 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1778 gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level); 1779 u64 err, entry, level_state; 1780 1781 /* For now large page isn't supported yet. 
*/ 1782 WARN_ON_ONCE(level != PG_LEVEL_4K); 1783 1784 err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); 1785 1786 if (unlikely(tdx_operand_busy(err))) { 1787 /* After no vCPUs enter, the second retry is expected to succeed */ 1788 tdx_no_vcpus_enter_start(kvm); 1789 err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); 1790 tdx_no_vcpus_enter_stop(kvm); 1791 } 1792 if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) && 1793 !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) { 1794 atomic64_dec(&kvm_tdx->nr_premapped); 1795 tdx_unpin(kvm, page); 1796 return 0; 1797 } 1798 1799 if (KVM_BUG_ON(err, kvm)) { 1800 pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state); 1801 return -EIO; 1802 } 1803 return 1; 1804 } 1805 1806 /* 1807 * Ensure shared and private EPTs to be flushed on all vCPUs. 1808 * tdh_mem_track() is the only caller that increases TD epoch. An increase in 1809 * the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are 1810 * running in guest mode with the value "N - 1". 1811 * 1812 * A successful execution of tdh_mem_track() ensures that vCPUs can only run in 1813 * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch 1814 * being increased to "N + 1". 1815 * 1816 * Kicking off all vCPUs after that further results in no vCPUs can run in guest 1817 * mode with TD epoch value "N", which unblocks the next tdh_mem_track() (e.g. 1818 * to increase TD epoch to "N + 2"). 1819 * 1820 * TDX module will flush EPT on the next TD enter and make vCPUs to run in 1821 * guest mode with TD epoch value "N + 1". 1822 * 1823 * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by 1824 * waiting empty IPI handler ack_kick(). 1825 * 1826 * No action is required to the vCPUs being kicked off since the kicking off 1827 * occurs certainly after TD epoch increment and before the next 1828 * tdh_mem_track(). 1829 */ 1830 static void tdx_track(struct kvm *kvm) 1831 { 1832 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1833 u64 err; 1834 1835 /* If TD isn't finalized, it's before any vcpu running. */ 1836 if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE)) 1837 return; 1838 1839 lockdep_assert_held_write(&kvm->mmu_lock); 1840 1841 err = tdh_mem_track(&kvm_tdx->td); 1842 if (unlikely(tdx_operand_busy(err))) { 1843 /* After no vCPUs enter, the second retry is expected to succeed */ 1844 tdx_no_vcpus_enter_start(kvm); 1845 err = tdh_mem_track(&kvm_tdx->td); 1846 tdx_no_vcpus_enter_stop(kvm); 1847 } 1848 1849 if (KVM_BUG_ON(err, kvm)) 1850 pr_tdx_error(TDH_MEM_TRACK, err); 1851 1852 kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); 1853 } 1854 1855 static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, 1856 enum pg_level level, void *private_spt) 1857 { 1858 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1859 1860 /* 1861 * free_external_spt() is only called after hkid is freed when TD is 1862 * tearing down. 1863 * KVM doesn't (yet) zap page table pages in mirror page table while 1864 * TD is active, though guest pages mapped in mirror page table could be 1865 * zapped during TD is active, e.g. for shared <-> private conversion 1866 * and slot move/deletion. 1867 */ 1868 if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm)) 1869 return -EINVAL; 1870 1871 /* 1872 * The HKID assigned to this TD was already freed and cache was 1873 * already flushed. We don't have to flush again. 
1874 */ 1875 return tdx_reclaim_page(virt_to_page(private_spt)); 1876 } 1877 1878 static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, 1879 enum pg_level level, kvm_pfn_t pfn) 1880 { 1881 struct page *page = pfn_to_page(pfn); 1882 int ret; 1883 1884 /* 1885 * HKID is released after all private pages have been removed, and set 1886 * before any might be populated. Warn if zapping is attempted when 1887 * there can't be anything populated in the private EPT. 1888 */ 1889 if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm)) 1890 return -EINVAL; 1891 1892 ret = tdx_sept_zap_private_spte(kvm, gfn, level, page); 1893 if (ret <= 0) 1894 return ret; 1895 1896 /* 1897 * TDX requires TLB tracking before dropping private page. Do 1898 * it here, although it is also done later. 1899 */ 1900 tdx_track(kvm); 1901 1902 return tdx_sept_drop_private_spte(kvm, gfn, level, page); 1903 } 1904 1905 void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, 1906 int trig_mode, int vector) 1907 { 1908 struct kvm_vcpu *vcpu = apic->vcpu; 1909 struct vcpu_tdx *tdx = to_tdx(vcpu); 1910 1911 /* TDX supports only posted interrupt. No lapic emulation. */ 1912 __vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector); 1913 1914 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); 1915 } 1916 1917 static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu) 1918 { 1919 u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK; 1920 u64 eq = vmx_get_exit_qual(vcpu); 1921 1922 if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION) 1923 return false; 1924 1925 return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN); 1926 } 1927 1928 static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu) 1929 { 1930 unsigned long exit_qual; 1931 gpa_t gpa = to_tdx(vcpu)->exit_gpa; 1932 bool local_retry = false; 1933 int ret; 1934 1935 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { 1936 if (tdx_is_sept_violation_unexpected_pending(vcpu)) { 1937 pr_warn("Guest access before accepting 0x%llx on vCPU %d\n", 1938 gpa, vcpu->vcpu_id); 1939 kvm_vm_dead(vcpu->kvm); 1940 return -EIO; 1941 } 1942 /* 1943 * Always treat SEPT violations as write faults. Ignore the 1944 * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations. 1945 * TD private pages are always RWX in the SEPT tables, 1946 * i.e. they're always mapped writable. Just as importantly, 1947 * treating SEPT violations as write faults is necessary to 1948 * avoid COW allocations, which will cause TDAUGPAGE failures 1949 * due to aliasing a single HPA to multiple GPAs. 1950 */ 1951 exit_qual = EPT_VIOLATION_ACC_WRITE; 1952 1953 /* Only private GPA triggers zero-step mitigation */ 1954 local_retry = true; 1955 } else { 1956 exit_qual = vmx_get_exit_qual(vcpu); 1957 /* 1958 * EPT violation due to instruction fetch should never be 1959 * triggered from shared memory in TDX guest. If such EPT 1960 * violation occurs, treat it as broken hardware. 1961 */ 1962 if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm)) 1963 return -EIO; 1964 } 1965 1966 trace_kvm_page_fault(vcpu, gpa, exit_qual); 1967 1968 /* 1969 * To minimize TDH.VP.ENTER invocations, retry locally for private GPA 1970 * mapping in TDX. 1971 * 1972 * KVM may return RET_PF_RETRY for private GPA due to 1973 * - contentions when atomically updating SPTEs of the mirror page table 1974 * - in-progress GFN invalidation or memslot removal. 
1975 * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD, 1976 * caused by contentions with TDH.VP.ENTER (with zero-step mitigation) 1977 * or certain TDCALLs. 1978 * 1979 * If TDH.VP.ENTER is invoked more times than the threshold set by the 1980 * TDX module before KVM resolves the private GPA mapping, the TDX 1981 * module will activate zero-step mitigation during TDH.VP.ENTER. This 1982 * process acquires an SEPT tree lock in the TDX module, leading to 1983 * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD 1984 * operations on other vCPUs. 1985 * 1986 * Breaking out of local retries for kvm_vcpu_has_events() is for 1987 * interrupt injection. kvm_vcpu_has_events() should not see pending 1988 * events for TDX. Since KVM can't determine if IRQs (or NMIs) are 1989 * blocked by TDs, false positives are inevitable i.e., KVM may re-enter 1990 * the guest even if the IRQ/NMI can't be delivered. 1991 * 1992 * Note: even without breaking out of local retries, zero-step 1993 * mitigation may still occur due to 1994 * - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT, 1995 * - a single RIP causing EPT violations for more GFNs than the 1996 * threshold count. 1997 * This is safe, as triggering zero-step mitigation only introduces 1998 * contentions to page installation SEAMCALLs on other vCPUs, which will 1999 * handle retries locally in their EPT violation handlers. 2000 */ 2001 while (1) { 2002 struct kvm_memory_slot *slot; 2003 2004 ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual); 2005 2006 if (ret != RET_PF_RETRY || !local_retry) 2007 break; 2008 2009 if (kvm_vcpu_has_events(vcpu) || signal_pending(current)) 2010 break; 2011 2012 if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) { 2013 ret = -EIO; 2014 break; 2015 } 2016 2017 /* 2018 * Bail if the memslot is invalid, i.e. is being deleted, as 2019 * faulting in will never succeed and this task needs to drop 2020 * SRCU in order to let memslot deletion complete. 2021 */ 2022 slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(gpa)); 2023 if (slot && slot->flags & KVM_MEMSLOT_INVALID) 2024 break; 2025 2026 cond_resched(); 2027 } 2028 return ret; 2029 } 2030 2031 int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) 2032 { 2033 if (err) { 2034 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 2035 return 1; 2036 } 2037 2038 if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ) 2039 tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu)); 2040 2041 return 1; 2042 } 2043 2044 2045 int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath) 2046 { 2047 struct vcpu_tdx *tdx = to_tdx(vcpu); 2048 u64 vp_enter_ret = tdx->vp_enter_ret; 2049 union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); 2050 2051 if (fastpath != EXIT_FASTPATH_NONE) 2052 return 1; 2053 2054 if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) { 2055 KVM_BUG_ON(1, vcpu->kvm); 2056 return -EIO; 2057 } 2058 2059 /* 2060 * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and 2061 * TDX_SEAMCALL_VMFAILINVALID. 2062 */ 2063 if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) { 2064 KVM_BUG_ON(!kvm_rebooting, vcpu->kvm); 2065 goto unhandled_exit; 2066 } 2067 2068 if (unlikely(tdx_failed_vmentry(vcpu))) { 2069 /* 2070 * If the guest state is protected, that means off-TD debug is 2071 * not enabled, TDX_NON_RECOVERABLE must be set. 
2072 */ 2073 WARN_ON_ONCE(vcpu->arch.guest_state_protected && 2074 !(vp_enter_ret & TDX_NON_RECOVERABLE)); 2075 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2076 vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full; 2077 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; 2078 return 0; 2079 } 2080 2081 if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) && 2082 exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) { 2083 kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret); 2084 goto unhandled_exit; 2085 } 2086 2087 WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT && 2088 (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS); 2089 2090 switch (exit_reason.basic) { 2091 case EXIT_REASON_TRIPLE_FAULT: 2092 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 2093 vcpu->mmio_needed = 0; 2094 return 0; 2095 case EXIT_REASON_EXCEPTION_NMI: 2096 return tdx_handle_exception_nmi(vcpu); 2097 case EXIT_REASON_EXTERNAL_INTERRUPT: 2098 ++vcpu->stat.irq_exits; 2099 return 1; 2100 case EXIT_REASON_CPUID: 2101 return tdx_emulate_cpuid(vcpu); 2102 case EXIT_REASON_HLT: 2103 return kvm_emulate_halt_noskip(vcpu); 2104 case EXIT_REASON_TDCALL: 2105 return handle_tdvmcall(vcpu); 2106 case EXIT_REASON_VMCALL: 2107 return tdx_emulate_vmcall(vcpu); 2108 case EXIT_REASON_IO_INSTRUCTION: 2109 return tdx_emulate_io(vcpu); 2110 case EXIT_REASON_MSR_READ: 2111 kvm_rcx_write(vcpu, tdx->vp_enter_args.r12); 2112 return kvm_emulate_rdmsr(vcpu); 2113 case EXIT_REASON_MSR_WRITE: 2114 kvm_rcx_write(vcpu, tdx->vp_enter_args.r12); 2115 kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u); 2116 kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32); 2117 return kvm_emulate_wrmsr(vcpu); 2118 case EXIT_REASON_EPT_MISCONFIG: 2119 return tdx_emulate_mmio(vcpu); 2120 case EXIT_REASON_EPT_VIOLATION: 2121 return tdx_handle_ept_violation(vcpu); 2122 case EXIT_REASON_OTHER_SMI: 2123 /* 2124 * Unlike VMX, SMI in SEAM non-root mode (i.e. when 2125 * TD guest vCPU is running) will cause VM exit to TDX module, 2126 * then SEAMRET to KVM. Once it exits to KVM, SMI is delivered 2127 * and handled by kernel handler right away. 2128 * 2129 * The Other SMI exit can also be caused by the SEAM non-root 2130 * machine check delivered via Machine Check System Management 2131 * Interrupt (MSMI), but it has already been handled by the 2132 * kernel machine check handler, i.e., the memory page has been 2133 * marked as poisoned and it won't be freed to the free list 2134 * when the TDX guest is terminated (the TDX module marks the 2135 * guest as dead and prevent it from further running when 2136 * machine check happens in SEAM non-root). 2137 * 2138 * - A MSMI will not reach here, it's handled as non_recoverable 2139 * case above. 2140 * - If it's not an MSMI, no need to do anything here. 
2141 */ 2142 return 1; 2143 default: 2144 break; 2145 } 2146 2147 unhandled_exit: 2148 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 2149 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 2150 vcpu->run->internal.ndata = 2; 2151 vcpu->run->internal.data[0] = vp_enter_ret; 2152 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; 2153 return 0; 2154 } 2155 2156 void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, 2157 u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) 2158 { 2159 struct vcpu_tdx *tdx = to_tdx(vcpu); 2160 2161 *reason = tdx->vt.exit_reason.full; 2162 if (*reason != -1u) { 2163 *info1 = vmx_get_exit_qual(vcpu); 2164 *info2 = tdx->ext_exit_qualification; 2165 *intr_info = vmx_get_intr_info(vcpu); 2166 } else { 2167 *info1 = 0; 2168 *info2 = 0; 2169 *intr_info = 0; 2170 } 2171 2172 *error_code = 0; 2173 } 2174 2175 bool tdx_has_emulated_msr(u32 index) 2176 { 2177 switch (index) { 2178 case MSR_IA32_UCODE_REV: 2179 case MSR_IA32_ARCH_CAPABILITIES: 2180 case MSR_IA32_POWER_CTL: 2181 case MSR_IA32_CR_PAT: 2182 case MSR_MTRRcap: 2183 case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: 2184 case MSR_MTRRdefType: 2185 case MSR_IA32_TSC_DEADLINE: 2186 case MSR_IA32_MISC_ENABLE: 2187 case MSR_PLATFORM_INFO: 2188 case MSR_MISC_FEATURES_ENABLES: 2189 case MSR_IA32_APICBASE: 2190 case MSR_EFER: 2191 case MSR_IA32_FEAT_CTL: 2192 case MSR_IA32_MCG_CAP: 2193 case MSR_IA32_MCG_STATUS: 2194 case MSR_IA32_MCG_CTL: 2195 case MSR_IA32_MCG_EXT_CTL: 2196 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: 2197 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: 2198 /* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */ 2199 case MSR_KVM_POLL_CONTROL: 2200 return true; 2201 case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: 2202 /* 2203 * x2APIC registers that are virtualized by the CPU can't be 2204 * emulated, KVM doesn't have access to the virtual APIC page. 2205 */ 2206 switch (index) { 2207 case X2APIC_MSR(APIC_TASKPRI): 2208 case X2APIC_MSR(APIC_PROCPRI): 2209 case X2APIC_MSR(APIC_EOI): 2210 case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR): 2211 case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR): 2212 case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR): 2213 return false; 2214 default: 2215 return true; 2216 } 2217 default: 2218 return false; 2219 } 2220 } 2221 2222 static bool tdx_is_read_only_msr(u32 index) 2223 { 2224 return index == MSR_IA32_APICBASE || index == MSR_EFER || 2225 index == MSR_IA32_FEAT_CTL; 2226 } 2227 2228 int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 2229 { 2230 switch (msr->index) { 2231 case MSR_IA32_FEAT_CTL: 2232 /* 2233 * MCE and MCA are advertised via cpuid. Guest kernel could 2234 * check if LMCE is enabled or not. 
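 * The code below reports FEAT_CTL_LMCE_ENABLED only when userspace has
 * set MCG_LMCE_P in vcpu->arch.mcg_cap; everything else is fixed at
 * FEAT_CTL_LOCKED.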
2235 */
2236 msr->data = FEAT_CTL_LOCKED;
2237 if (vcpu->arch.mcg_cap & MCG_LMCE_P)
2238 msr->data |= FEAT_CTL_LMCE_ENABLED;
2239 return 0;
2240 case MSR_IA32_MCG_EXT_CTL:
2241 if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
2242 return 1;
2243 msr->data = vcpu->arch.mcg_ext_ctl;
2244 return 0;
2245 default:
2246 if (!tdx_has_emulated_msr(msr->index))
2247 return 1;
2248 
2249 return kvm_get_msr_common(vcpu, msr);
2250 }
2251 }
2252 
2253 int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2254 {
2255 switch (msr->index) {
2256 case MSR_IA32_MCG_EXT_CTL:
2257 if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
2258 (msr->data & ~MCG_EXT_CTL_LMCE_EN))
2259 return 1;
2260 vcpu->arch.mcg_ext_ctl = msr->data;
2261 return 0;
2262 default:
2263 if (tdx_is_read_only_msr(msr->index))
2264 return 1;
2265 
2266 if (!tdx_has_emulated_msr(msr->index))
2267 return 1;
2268 
2269 return kvm_set_msr_common(vcpu, msr);
2270 }
2271 }
2272 
2273 static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
2274 {
2275 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2276 struct kvm_tdx_capabilities __user *user_caps;
2277 struct kvm_tdx_capabilities *caps = NULL;
2278 u32 nr_user_entries;
2279 int ret = 0;
2280 
2281 /* flags is reserved for future use */
2282 if (cmd->flags)
2283 return -EINVAL;
2284 
2285 caps = kzalloc(sizeof(*caps) +
2286 sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config,
2287 GFP_KERNEL);
2288 if (!caps)
2289 return -ENOMEM;
2290 
2291 user_caps = u64_to_user_ptr(cmd->data);
2292 if (get_user(nr_user_entries, &user_caps->cpuid.nent)) {
2293 ret = -EFAULT;
2294 goto out;
2295 }
2296 
2297 if (nr_user_entries < td_conf->num_cpuid_config) {
2298 ret = -E2BIG;
2299 goto out;
2300 }
2301 
2302 ret = init_kvm_tdx_caps(td_conf, caps);
2303 if (ret)
2304 goto out;
2305 
2306 if (copy_to_user(user_caps, caps, sizeof(*caps))) {
2307 ret = -EFAULT;
2308 goto out;
2309 }
2310 
2311 if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries,
2312 caps->cpuid.nent *
2313 sizeof(caps->cpuid.entries[0])))
2314 ret = -EFAULT;
2315 
2316 out:
2317 /* kfree() accepts NULL. */
2318 kfree(caps);
2319 return ret;
2320 }
2321 
2322 /*
2323 * KVM reports the guest physical address bits in CPUID.0x80000008.EAX[23:16],
2324 * which is similar to TDX's GPAW. Use this field as the interface for
2325 * userspace to configure the GPAW and EPT level for TDs.
2326 *
2327 * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level
2328 * 5; value 48 means GPAW-48 and EPT level 4. GPAW-48 is always supported;
2329 * value 52 is only supported when the platform supports 5-level
2330 * EPT.
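 *
 * For example, userspace requests GPAW-52 by setting
 * CPUID.0x80000008.EAX[23:16] = 52 in KVM_TDX_INIT_VM; KVM then programs
 * VMX_EPTP_PWL_5 and sets TDX_CONFIG_FLAGS_MAX_GPAW below. Values other
 * than 48 and 52 are rejected with -EINVAL, and 52 additionally requires
 * 5-level EPT support on the host.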
2331 */
2332 static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
2333 struct td_params *td_params)
2334 {
2335 const struct kvm_cpuid_entry2 *entry;
2336 int guest_pa;
2337 
2338 entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0);
2339 if (!entry)
2340 return -EINVAL;
2341 
2342 guest_pa = tdx_get_guest_phys_addr_bits(entry->eax);
2343 
2344 if (guest_pa != 48 && guest_pa != 52)
2345 return -EINVAL;
2346 
2347 if (guest_pa == 52 && !cpu_has_vmx_ept_5levels())
2348 return -EINVAL;
2349 
2350 td_params->eptp_controls = VMX_EPTP_MT_WB;
2351 if (guest_pa == 52) {
2352 td_params->eptp_controls |= VMX_EPTP_PWL_5;
2353 td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW;
2354 } else {
2355 td_params->eptp_controls |= VMX_EPTP_PWL_4;
2356 }
2357 
2358 return 0;
2359 }
2360 
2361 static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
2362 struct td_params *td_params)
2363 {
2364 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2365 const struct kvm_cpuid_entry2 *entry;
2366 struct tdx_cpuid_value *value;
2367 int i, copy_cnt = 0;
2368 
2369 /*
2370 * td_params.cpuid_values: The number and the order of the cpuid_values
2371 * must match those of struct tdsysinfo.{num_cpuid_config, cpuid_configs}.
2372 * It's assumed that td_params was zeroed.
2373 */
2374 for (i = 0; i < td_conf->num_cpuid_config; i++) {
2375 struct kvm_cpuid_entry2 tmp;
2376 
2377 td_init_cpuid_entry2(&tmp, i);
2378 
2379 entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
2380 tmp.function, tmp.index);
2381 if (!entry)
2382 continue;
2383 
2384 if (tdx_unsupported_cpuid(entry))
2385 return -EINVAL;
2386 
2387 copy_cnt++;
2388 
2389 value = &td_params->cpuid_values[i];
2390 value->eax = entry->eax;
2391 value->ebx = entry->ebx;
2392 value->ecx = entry->ecx;
2393 value->edx = entry->edx;
2394 
2395 /*
2396 * The TDX module does not accept nonzero bits 16..23 in
2397 * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls().
2398 */
2399 if (tmp.function == 0x80000008)
2400 value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0);
2401 }
2402 
2403 /*
2404 * Rely on the TDX module to reject invalid configurations, but it can't
2405 * check leaves that don't have a proper slot in td_params->cpuid_values
2406 * to be stuck into. So fail if there were entries that didn't get copied
2407 * to td_params.
2408 */ 2409 if (copy_cnt != cpuid->nent) 2410 return -EINVAL; 2411 2412 return 0; 2413 } 2414 2415 static int setup_tdparams(struct kvm *kvm, struct td_params *td_params, 2416 struct kvm_tdx_init_vm *init_vm) 2417 { 2418 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; 2419 struct kvm_cpuid2 *cpuid = &init_vm->cpuid; 2420 int ret; 2421 2422 if (kvm->created_vcpus) 2423 return -EBUSY; 2424 2425 if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf)) 2426 return -EINVAL; 2427 2428 if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf)) 2429 return -EINVAL; 2430 2431 td_params->max_vcpus = kvm->max_vcpus; 2432 td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1; 2433 td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1; 2434 2435 td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD; 2436 td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz); 2437 2438 ret = setup_tdparams_eptp_controls(cpuid, td_params); 2439 if (ret) 2440 return ret; 2441 2442 ret = setup_tdparams_cpuids(cpuid, td_params); 2443 if (ret) 2444 return ret; 2445 2446 #define MEMCPY_SAME_SIZE(dst, src) \ 2447 do { \ 2448 BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \ 2449 memcpy((dst), (src), sizeof(dst)); \ 2450 } while (0) 2451 2452 MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid); 2453 MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner); 2454 MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig); 2455 2456 return 0; 2457 } 2458 2459 static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, 2460 u64 *seamcall_err) 2461 { 2462 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 2463 cpumask_var_t packages; 2464 struct page **tdcs_pages = NULL; 2465 struct page *tdr_page; 2466 int ret, i; 2467 u64 err, rcx; 2468 2469 *seamcall_err = 0; 2470 ret = tdx_guest_keyid_alloc(); 2471 if (ret < 0) 2472 return ret; 2473 kvm_tdx->hkid = ret; 2474 kvm_tdx->misc_cg = get_current_misc_cg(); 2475 ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); 2476 if (ret) 2477 goto free_hkid; 2478 2479 ret = -ENOMEM; 2480 2481 atomic_inc(&nr_configured_hkid); 2482 2483 tdr_page = alloc_page(GFP_KERNEL); 2484 if (!tdr_page) 2485 goto free_hkid; 2486 2487 kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE; 2488 /* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */ 2489 kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1; 2490 tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages), 2491 GFP_KERNEL); 2492 if (!tdcs_pages) 2493 goto free_tdr; 2494 2495 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { 2496 tdcs_pages[i] = alloc_page(GFP_KERNEL); 2497 if (!tdcs_pages[i]) 2498 goto free_tdcs; 2499 } 2500 2501 if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) 2502 goto free_tdcs; 2503 2504 cpus_read_lock(); 2505 2506 /* 2507 * Need at least one CPU of the package to be online in order to 2508 * program all packages for host key id. Check it. 2509 */ 2510 for_each_present_cpu(i) 2511 cpumask_set_cpu(topology_physical_package_id(i), packages); 2512 for_each_online_cpu(i) 2513 cpumask_clear_cpu(topology_physical_package_id(i), packages); 2514 if (!cpumask_empty(packages)) { 2515 ret = -EIO; 2516 /* 2517 * Because it's hard for human operator to figure out the 2518 * reason, warn it. 2519 */ 2520 #define MSG_ALLPKG "All packages need to have online CPU to create TD. 
Online CPU and retry.\n" 2521 pr_warn_ratelimited(MSG_ALLPKG); 2522 goto free_packages; 2523 } 2524 2525 /* 2526 * TDH.MNG.CREATE tries to grab the global TDX module and fails 2527 * with TDX_OPERAND_BUSY when it fails to grab. Take the global 2528 * lock to prevent it from failure. 2529 */ 2530 mutex_lock(&tdx_lock); 2531 kvm_tdx->td.tdr_page = tdr_page; 2532 err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid); 2533 mutex_unlock(&tdx_lock); 2534 2535 if (err == TDX_RND_NO_ENTROPY) { 2536 ret = -EAGAIN; 2537 goto free_packages; 2538 } 2539 2540 if (WARN_ON_ONCE(err)) { 2541 pr_tdx_error(TDH_MNG_CREATE, err); 2542 ret = -EIO; 2543 goto free_packages; 2544 } 2545 2546 for_each_online_cpu(i) { 2547 int pkg = topology_physical_package_id(i); 2548 2549 if (cpumask_test_and_set_cpu(pkg, packages)) 2550 continue; 2551 2552 /* 2553 * Program the memory controller in the package with an 2554 * encryption key associated to a TDX private host key id 2555 * assigned to this TDR. Concurrent operations on same memory 2556 * controller results in TDX_OPERAND_BUSY. No locking needed 2557 * beyond the cpus_read_lock() above as it serializes against 2558 * hotplug and the first online CPU of the package is always 2559 * used. We never have two CPUs in the same socket trying to 2560 * program the key. 2561 */ 2562 ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config, 2563 kvm_tdx, true); 2564 if (ret) 2565 break; 2566 } 2567 cpus_read_unlock(); 2568 free_cpumask_var(packages); 2569 if (ret) { 2570 i = 0; 2571 goto teardown; 2572 } 2573 2574 kvm_tdx->td.tdcs_pages = tdcs_pages; 2575 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { 2576 err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]); 2577 if (err == TDX_RND_NO_ENTROPY) { 2578 /* Here it's hard to allow userspace to retry. */ 2579 ret = -EAGAIN; 2580 goto teardown; 2581 } 2582 if (WARN_ON_ONCE(err)) { 2583 pr_tdx_error(TDH_MNG_ADDCX, err); 2584 ret = -EIO; 2585 goto teardown; 2586 } 2587 } 2588 2589 err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx); 2590 if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) { 2591 /* 2592 * Because a user gives operands, don't warn. 2593 * Return a hint to the user because it's sometimes hard for the 2594 * user to figure out which operand is invalid. SEAMCALL status 2595 * code includes which operand caused invalid operand error. 2596 */ 2597 *seamcall_err = err; 2598 ret = -EINVAL; 2599 goto teardown; 2600 } else if (WARN_ON_ONCE(err)) { 2601 pr_tdx_error_1(TDH_MNG_INIT, err, rcx); 2602 ret = -EIO; 2603 goto teardown; 2604 } 2605 2606 return 0; 2607 2608 /* 2609 * The sequence for freeing resources from a partially initialized TD 2610 * varies based on where in the initialization flow failure occurred. 2611 * Simply use the full teardown and destroy, which naturally play nice 2612 * with partial initialization. 
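 * Pages that were never successfully added to the TD are freed directly
 * below (starting at 'i'), while anything already handed to the TDX module
 * is released via tdx_mmu_release_hkid() and
 * tdx_reclaim_td_control_pages().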
2613 */
2614 teardown:
2615 /* Only free pages not yet added, so start at 'i' */
2616 for (; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2617 if (tdcs_pages[i]) {
2618 __free_page(tdcs_pages[i]);
2619 tdcs_pages[i] = NULL;
2620 }
2621 }
2622 if (!kvm_tdx->td.tdcs_pages)
2623 kfree(tdcs_pages);
2624 
2625 tdx_mmu_release_hkid(kvm);
2626 tdx_reclaim_td_control_pages(kvm);
2627 
2628 return ret;
2629 
2630 free_packages:
2631 cpus_read_unlock();
2632 free_cpumask_var(packages);
2633 
2634 free_tdcs:
2635 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2636 if (tdcs_pages[i])
2637 __free_page(tdcs_pages[i]);
2638 }
2639 kfree(tdcs_pages);
2640 kvm_tdx->td.tdcs_pages = NULL;
2641 
2642 free_tdr:
2643 if (tdr_page)
2644 __free_page(tdr_page);
2645 kvm_tdx->td.tdr_page = 0;
2646 
2647 free_hkid:
2648 tdx_hkid_free(kvm_tdx);
2649 
2650 return ret;
2651 }
2652 
2653 static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id,
2654 u64 *data)
2655 {
2656 u64 err;
2657 
2658 err = tdh_mng_rd(&tdx->td, field_id, data);
2659 
2660 return err;
2661 }
2662 
2663 #define TDX_MD_UNREADABLE_LEAF_MASK GENMASK(30, 7)
2664 #define TDX_MD_UNREADABLE_SUBLEAF_MASK GENMASK(31, 7)
2665 
2666 static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
2667 bool sub_leaf_set, int *entry_index,
2668 struct kvm_cpuid_entry2 *out)
2669 {
2670 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2671 u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES;
2672 u64 ebx_eax, edx_ecx;
2673 u64 err = 0;
2674 
2675 if (sub_leaf > 0b1111111)
2676 return -EINVAL;
2677 
2678 if (*entry_index >= KVM_MAX_CPUID_ENTRIES)
2679 return -EINVAL;
2680 
2681 if (leaf & TDX_MD_UNREADABLE_LEAF_MASK ||
2682 sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK)
2683 return -EINVAL;
2684 
2685 /*
2686 * bit 23:17, RESERVED: reserved, must be 0;
2687 * bit 16, LEAF_31: leaf number bit 31;
2688 * bit 15:9, LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are
2689 * implicitly 0;
2690 * bit 8, SUBLEAF_NA: sub-leaf not applicable flag;
2691 * bit 7:1, SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1,
2692 * SUBLEAF_6_0 is all ones.
2693 * sub-leaf bits 31:7 are implicitly 0;
2694 * bit 0, ELEMENT_I: Element index within field;
2695 */
2696 field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16;
2697 field_id |= (leaf & 0x7f) << 9;
2698 if (sub_leaf_set)
2699 field_id |= (sub_leaf & 0x7f) << 1;
2700 else
2701 field_id |= 0x1fe;
2702 
2703 err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax);
2704 if (err) /* TODO: check for specific errors */
2705 goto err_out;
2706 
2707 out->eax = (u32) ebx_eax;
2708 out->ebx = (u32) (ebx_eax >> 32);
2709 
2710 field_id++;
2711 err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx);
2712 /*
2713 * It would be weird for reading edx_ecx to fail when reading ebx_eax
2714 * succeeded.
2715 */
2716 if (WARN_ON_ONCE(err))
2717 goto err_out;
2718 
2719 out->ecx = (u32) edx_ecx;
2720 out->edx = (u32) (edx_ecx >> 32);
2721 
2722 out->function = leaf;
2723 out->index = sub_leaf;
2724 out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0;
2725 
2726 /*
2727 * Work around missing support on old TDX modules: fetch the
2728 * guest maxpa from gfn_direct_bits.
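 * E.g., with a 4-level private EPT the shared bit is GPA bit 47
 * (TDX_SHARED_BIT_PWL_4), so gfn_to_gpa(kvm_gfn_direct_bits()) is 1 << 47
 * and the reported guest MAXPA below works out to 48; with 5-level EPT
 * (bit 51) it works out to 52.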
2729 */
2730 if (leaf == 0x80000008) {
2731 gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
2732 unsigned int g_maxpa = __ffs(gpa_bits) + 1;
2733 
2734 out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa);
2735 }
2736 
2737 (*entry_index)++;
2738 
2739 return 0;
2740 
2741 err_out:
2742 out->eax = 0;
2743 out->ebx = 0;
2744 out->ecx = 0;
2745 out->edx = 0;
2746 
2747 return -EIO;
2748 }
2749 
2750 static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
2751 {
2752 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2753 struct kvm_tdx_init_vm *init_vm;
2754 struct td_params *td_params = NULL;
2755 int ret;
2756 
2757 BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
2758 BUILD_BUG_ON(sizeof(struct td_params) != 1024);
2759 
2760 if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
2761 return -EINVAL;
2762 
2763 if (cmd->flags)
2764 return -EINVAL;
2765 
2766 init_vm = kmalloc(sizeof(*init_vm) +
2767 sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
2768 GFP_KERNEL);
2769 if (!init_vm)
2770 return -ENOMEM;
2771 
2772 if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
2773 ret = -EFAULT;
2774 goto out;
2775 }
2776 
2777 if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
2778 ret = -E2BIG;
2779 goto out;
2780 }
2781 
2782 if (copy_from_user(init_vm->cpuid.entries,
2783 u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
2784 flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
2785 ret = -EFAULT;
2786 goto out;
2787 }
2788 
2789 if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
2790 ret = -EINVAL;
2791 goto out;
2792 }
2793 
2794 if (init_vm->cpuid.padding) {
2795 ret = -EINVAL;
2796 goto out;
2797 }
2798 
2799 td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
2800 if (!td_params) {
2801 ret = -ENOMEM;
2802 goto out;
2803 }
2804 
2805 ret = setup_tdparams(kvm, td_params, init_vm);
2806 if (ret)
2807 goto out;
2808 
2809 ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
2810 if (ret)
2811 goto out;
2812 
2813 kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
2814 kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER);
2815 kvm_tdx->attributes = td_params->attributes;
2816 kvm_tdx->xfam = td_params->xfam;
2817 
2818 if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW)
2819 kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5;
2820 else
2821 kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4;
2822 
2823 kvm_tdx->state = TD_STATE_INITIALIZED;
2824 out:
2825 /* kfree() accepts NULL. */
2826 kfree(init_vm);
2827 kfree(td_params);
2828 
2829 return ret;
2830 }
2831 
2832 void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
2833 {
2834 /*
2835 * flush_tlb_current() is invoked the first time the vCPU runs or when
2836 * the root of the shared EPT is invalidated.
2837 * KVM only needs to flush the shared EPT because the TDX module handles
2838 * TLB invalidation for the private EPT in tdh_vp_enter().
2839 *
2840 * A single context invalidation for the shared EPT can be performed here.
2841 * However, this single context invalidation requires the private EPTP
2842 * rather than the shared EPTP to flush the shared EPT, as the shared EPT
2843 * uses the private EPTP as its ASID for TLB invalidation.
2844 *
2845 * To avoid reading back the private EPTP, perform a global invalidation
2846 * for the shared EPT instead, to keep this function simple.
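 * (ept_sync_global() issues an INVEPT of the global type, which
 * invalidates mappings associated with all EPTPs, so the shared EPT is
 * covered without knowing its EPTP.)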
2847 */ 2848 ept_sync_global(); 2849 } 2850 2851 void tdx_flush_tlb_all(struct kvm_vcpu *vcpu) 2852 { 2853 /* 2854 * TDX has called tdx_track() in tdx_sept_remove_private_spte() to 2855 * ensure that private EPT will be flushed on the next TD enter. No need 2856 * to call tdx_track() here again even when this callback is a result of 2857 * zapping private EPT. 2858 * 2859 * Due to the lack of the context to determine which EPT has been 2860 * affected by zapping, invoke invept() directly here for both shared 2861 * EPT and private EPT for simplicity, though it's not necessary for 2862 * private EPT. 2863 */ 2864 ept_sync_global(); 2865 } 2866 2867 static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd) 2868 { 2869 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 2870 2871 guard(mutex)(&kvm->slots_lock); 2872 2873 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) 2874 return -EINVAL; 2875 /* 2876 * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue 2877 * TDH.MEM.PAGE.ADD(). 2878 */ 2879 if (atomic64_read(&kvm_tdx->nr_premapped)) 2880 return -EINVAL; 2881 2882 cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td); 2883 if (tdx_operand_busy(cmd->hw_error)) 2884 return -EBUSY; 2885 if (KVM_BUG_ON(cmd->hw_error, kvm)) { 2886 pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error); 2887 return -EIO; 2888 } 2889 2890 kvm_tdx->state = TD_STATE_RUNNABLE; 2891 /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */ 2892 smp_wmb(); 2893 kvm->arch.pre_fault_allowed = true; 2894 return 0; 2895 } 2896 2897 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) 2898 { 2899 struct kvm_tdx_cmd tdx_cmd; 2900 int r; 2901 2902 if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd))) 2903 return -EFAULT; 2904 2905 /* 2906 * Userspace should never set hw_error. It is used to fill 2907 * hardware-defined error by the kernel. 2908 */ 2909 if (tdx_cmd.hw_error) 2910 return -EINVAL; 2911 2912 mutex_lock(&kvm->lock); 2913 2914 switch (tdx_cmd.id) { 2915 case KVM_TDX_CAPABILITIES: 2916 r = tdx_get_capabilities(&tdx_cmd); 2917 break; 2918 case KVM_TDX_INIT_VM: 2919 r = tdx_td_init(kvm, &tdx_cmd); 2920 break; 2921 case KVM_TDX_FINALIZE_VM: 2922 r = tdx_td_finalize(kvm, &tdx_cmd); 2923 break; 2924 default: 2925 r = -EINVAL; 2926 goto out; 2927 } 2928 2929 if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd))) 2930 r = -EFAULT; 2931 2932 out: 2933 mutex_unlock(&kvm->lock); 2934 return r; 2935 } 2936 2937 /* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */ 2938 static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) 2939 { 2940 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 2941 struct vcpu_tdx *tdx = to_tdx(vcpu); 2942 struct page *page; 2943 int ret, i; 2944 u64 err; 2945 2946 page = alloc_page(GFP_KERNEL); 2947 if (!page) 2948 return -ENOMEM; 2949 tdx->vp.tdvpr_page = page; 2950 2951 /* 2952 * page_to_phys() does not work in 'noinstr' code, like guest 2953 * entry via tdh_vp_enter(). Precalculate and store it instead 2954 * of doing it at runtime later. 
2955 */
2956 tdx->vp.tdvpr_pa = page_to_phys(tdx->vp.tdvpr_page);
2957 
2958 tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
2959 GFP_KERNEL);
2960 if (!tdx->vp.tdcx_pages) {
2961 ret = -ENOMEM;
2962 goto free_tdvpr;
2963 }
2964 
2965 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2966 page = alloc_page(GFP_KERNEL);
2967 if (!page) {
2968 ret = -ENOMEM;
2969 goto free_tdcx;
2970 }
2971 tdx->vp.tdcx_pages[i] = page;
2972 }
2973 
2974 err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
2975 if (KVM_BUG_ON(err, vcpu->kvm)) {
2976 ret = -EIO;
2977 pr_tdx_error(TDH_VP_CREATE, err);
2978 goto free_tdcx;
2979 }
2980 
2981 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2982 err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
2983 if (KVM_BUG_ON(err, vcpu->kvm)) {
2984 pr_tdx_error(TDH_VP_ADDCX, err);
2985 /*
2986 * Pages already added are reclaimed by the vcpu_free
2987 * method, but the rest are freed here.
2988 */
2989 for (; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2990 __free_page(tdx->vp.tdcx_pages[i]);
2991 tdx->vp.tdcx_pages[i] = NULL;
2992 }
2993 return -EIO;
2994 }
2995 }
2996 
2997 err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
2998 if (KVM_BUG_ON(err, vcpu->kvm)) {
2999 pr_tdx_error(TDH_VP_INIT, err);
3000 return -EIO;
3001 }
3002 
3003 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3004 
3005 return 0;
3006 
3007 free_tdcx:
3008 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
3009 if (tdx->vp.tdcx_pages[i])
3010 __free_page(tdx->vp.tdcx_pages[i]);
3011 tdx->vp.tdcx_pages[i] = NULL;
3012 }
3013 kfree(tdx->vp.tdcx_pages);
3014 tdx->vp.tdcx_pages = NULL;
3015 
3016 free_tdvpr:
3017 if (tdx->vp.tdvpr_page)
3018 __free_page(tdx->vp.tdvpr_page);
3019 tdx->vp.tdvpr_page = 0;
3020 tdx->vp.tdvpr_pa = 0;
3021 
3022 return ret;
3023 }
3024 
3025 /* Sometimes reads multiple subleafs. Return how many entries were written. */
3026 static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index,
3027 struct kvm_cpuid_entry2 *output_e)
3028 {
3029 int sub_leaf = 0;
3030 int ret;
3031 
3032 /* First try without a subleaf */
3033 ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e);
3034 
3035 /* On success, or for an invalid leaf, just give up */
3036 if (ret != -EIO)
3037 return ret;
3038 
3039 /*
3040 * If the try without a subleaf failed, try reading subleafs until
3041 * failure. The TDX module only supports 6 bits of subleaf index.
3042 */
3043 while (1) {
3044 /* Keep reading subleafs until there is a failure.
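 * tdx_read_cpuid() bumps *entry_index on each successful read, so
 * output_e is advanced in lockstep below.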
*/ 3045 if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e)) 3046 return !sub_leaf; 3047 3048 sub_leaf++; 3049 output_e++; 3050 } 3051 3052 return 0; 3053 } 3054 3055 static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) 3056 { 3057 struct kvm_cpuid2 __user *output, *td_cpuid; 3058 int r = 0, i = 0, leaf; 3059 u32 level; 3060 3061 output = u64_to_user_ptr(cmd->data); 3062 td_cpuid = kzalloc(sizeof(*td_cpuid) + 3063 sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES, 3064 GFP_KERNEL); 3065 if (!td_cpuid) 3066 return -ENOMEM; 3067 3068 if (copy_from_user(td_cpuid, output, sizeof(*output))) { 3069 r = -EFAULT; 3070 goto out; 3071 } 3072 3073 /* Read max CPUID for normal range */ 3074 if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) { 3075 r = -EIO; 3076 goto out; 3077 } 3078 level = td_cpuid->entries[0].eax; 3079 3080 for (leaf = 1; leaf <= level; leaf++) 3081 tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]); 3082 3083 /* Read max CPUID for extended range */ 3084 if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) { 3085 r = -EIO; 3086 goto out; 3087 } 3088 level = td_cpuid->entries[i - 1].eax; 3089 3090 for (leaf = 0x80000001; leaf <= level; leaf++) 3091 tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]); 3092 3093 if (td_cpuid->nent < i) 3094 r = -E2BIG; 3095 td_cpuid->nent = i; 3096 3097 if (copy_to_user(output, td_cpuid, sizeof(*output))) { 3098 r = -EFAULT; 3099 goto out; 3100 } 3101 3102 if (r == -E2BIG) 3103 goto out; 3104 3105 if (copy_to_user(output->entries, td_cpuid->entries, 3106 td_cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 3107 r = -EFAULT; 3108 3109 out: 3110 kfree(td_cpuid); 3111 3112 return r; 3113 } 3114 3115 static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) 3116 { 3117 u64 apic_base; 3118 struct vcpu_tdx *tdx = to_tdx(vcpu); 3119 int ret; 3120 3121 if (cmd->flags) 3122 return -EINVAL; 3123 3124 if (tdx->state != VCPU_TD_STATE_UNINITIALIZED) 3125 return -EINVAL; 3126 3127 /* 3128 * TDX requires X2APIC, userspace is responsible for configuring guest 3129 * CPUID accordingly. 3130 */ 3131 apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC | 3132 (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0); 3133 if (kvm_apic_set_base(vcpu, apic_base, true)) 3134 return -EINVAL; 3135 3136 ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data); 3137 if (ret) 3138 return ret; 3139 3140 td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR); 3141 td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc)); 3142 td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR); 3143 3144 tdx->state = VCPU_TD_STATE_INITIALIZED; 3145 3146 return 0; 3147 } 3148 3149 void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 3150 { 3151 /* 3152 * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all 3153 * INIT events. 3154 * 3155 * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as 3156 * userspace needs to define the vCPU model before KVM can initialize 3157 * vCPU state, e.g. to enable x2APIC. 
3158 */ 3159 WARN_ON_ONCE(init_event); 3160 } 3161 3162 struct tdx_gmem_post_populate_arg { 3163 struct kvm_vcpu *vcpu; 3164 __u32 flags; 3165 }; 3166 3167 static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, 3168 void __user *src, int order, void *_arg) 3169 { 3170 u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS; 3171 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 3172 struct tdx_gmem_post_populate_arg *arg = _arg; 3173 struct kvm_vcpu *vcpu = arg->vcpu; 3174 gpa_t gpa = gfn_to_gpa(gfn); 3175 u8 level = PG_LEVEL_4K; 3176 struct page *src_page; 3177 int ret, i; 3178 u64 err, entry, level_state; 3179 3180 /* 3181 * Get the source page if it has been faulted in. Return failure if the 3182 * source page has been swapped out or unmapped in primary memory. 3183 */ 3184 ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page); 3185 if (ret < 0) 3186 return ret; 3187 if (ret != 1) 3188 return -ENOMEM; 3189 3190 ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level); 3191 if (ret < 0) 3192 goto out; 3193 3194 /* 3195 * The private mem cannot be zapped after kvm_tdp_map_page() 3196 * because all paths are covered by slots_lock and the 3197 * filemap invalidate lock. Check that they are indeed enough. 3198 */ 3199 if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) { 3200 scoped_guard(read_lock, &kvm->mmu_lock) { 3201 if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) { 3202 ret = -EIO; 3203 goto out; 3204 } 3205 } 3206 } 3207 3208 ret = 0; 3209 err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn), 3210 src_page, &entry, &level_state); 3211 if (err) { 3212 ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO; 3213 goto out; 3214 } 3215 3216 if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) 3217 atomic64_dec(&kvm_tdx->nr_premapped); 3218 3219 if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) { 3220 for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) { 3221 err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, 3222 &level_state); 3223 if (err) { 3224 ret = -EIO; 3225 break; 3226 } 3227 } 3228 } 3229 3230 out: 3231 put_page(src_page); 3232 return ret; 3233 } 3234 3235 static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) 3236 { 3237 struct vcpu_tdx *tdx = to_tdx(vcpu); 3238 struct kvm *kvm = vcpu->kvm; 3239 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 3240 struct kvm_tdx_init_mem_region region; 3241 struct tdx_gmem_post_populate_arg arg; 3242 long gmem_ret; 3243 int ret; 3244 3245 if (tdx->state != VCPU_TD_STATE_INITIALIZED) 3246 return -EINVAL; 3247 3248 guard(mutex)(&kvm->slots_lock); 3249 3250 /* Once TD is finalized, the initial guest memory is fixed. 
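 * i.e. once kvm_tdx->state is TD_STATE_RUNNABLE (set by
 * KVM_TDX_FINALIZE_VM), no more pages can be added with TDH.MEM.PAGE.ADD,
 * so KVM_TDX_INIT_MEM_REGION is rejected here.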
*/
3251 if (kvm_tdx->state == TD_STATE_RUNNABLE)
3252 return -EINVAL;
3253 
3254 if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
3255 return -EINVAL;
3256 
3257 if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region)))
3258 return -EFAULT;
3259 
3260 if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) ||
3261 !region.nr_pages ||
3262 region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
3263 !vt_is_tdx_private_gpa(kvm, region.gpa) ||
3264 !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
3265 return -EINVAL;
3266 
3267 kvm_mmu_reload(vcpu);
3268 ret = 0;
3269 while (region.nr_pages) {
3270 if (signal_pending(current)) {
3271 ret = -EINTR;
3272 break;
3273 }
3274 
3275 arg = (struct tdx_gmem_post_populate_arg) {
3276 .vcpu = vcpu,
3277 .flags = cmd->flags,
3278 };
3279 gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
3280 u64_to_user_ptr(region.source_addr),
3281 1, tdx_gmem_post_populate, &arg);
3282 if (gmem_ret < 0) {
3283 ret = gmem_ret;
3284 break;
3285 }
3286 
3287 if (gmem_ret != 1) {
3288 ret = -EIO;
3289 break;
3290 }
3291 
3292 region.source_addr += PAGE_SIZE;
3293 region.gpa += PAGE_SIZE;
3294 region.nr_pages--;
3295 
3296 cond_resched();
3297 }
3298 
3299 if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region)))
3300 ret = -EFAULT;
3301 return ret;
3302 }
3303 
3304 int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
3305 {
3306 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
3307 struct kvm_tdx_cmd cmd;
3308 int ret;
3309 
3310 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
3311 return -EINVAL;
3312 
3313 if (copy_from_user(&cmd, argp, sizeof(cmd)))
3314 return -EFAULT;
3315 
3316 if (cmd.hw_error)
3317 return -EINVAL;
3318 
3319 switch (cmd.id) {
3320 case KVM_TDX_INIT_VCPU:
3321 ret = tdx_vcpu_init(vcpu, &cmd);
3322 break;
3323 case KVM_TDX_INIT_MEM_REGION:
3324 ret = tdx_vcpu_init_mem_region(vcpu, &cmd);
3325 break;
3326 case KVM_TDX_GET_CPUID:
3327 ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
3328 break;
3329 default:
3330 ret = -EINVAL;
3331 break;
3332 }
3333 
3334 return ret;
3335 }
3336 
3337 int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
3338 {
3339 if (!is_private)
3340 return 0;
3341 
3342 return PG_LEVEL_4K;
3343 }
3344 
3345 static int tdx_online_cpu(unsigned int cpu)
3346 {
3347 unsigned long flags;
3348 int r;
3349 
3350 /* Sanity check that the CPU is already in post-VMXON */
3351 WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE));
3352 
3353 local_irq_save(flags);
3354 r = tdx_cpu_enable();
3355 local_irq_restore(flags);
3356 
3357 return r;
3358 }
3359 
3360 static int tdx_offline_cpu(unsigned int cpu)
3361 {
3362 int i;
3363 
3364 /* No TD is running. Allow any CPU to go offline. */
3365 if (!atomic_read(&nr_configured_hkid))
3366 return 0;
3367 
3368 /*
3369 * In order to reclaim a TDX HKID (i.e. when deleting a guest TD),
3370 * TDH.PHYMEM.PAGE.WBINVD must be called on all packages to program all
3371 * memory controllers with PCONFIG. If there are active TDX HKIDs, refuse
3372 * to offline the last online CPU of a package.
3373 */
3374 for_each_online_cpu(i) {
3375 /*
3376 * Found another online CPU on the same package.
3377 * Allow the offline.
3378 */
3379 if (i != cpu && topology_physical_package_id(i) ==
3380 topology_physical_package_id(cpu))
3381 return 0;
3382 }
3383 
3384 /*
3385 * This is the last online CPU of this package. Don't offline it.
3386 *
3387 * Because it's hard for a human operator to understand the
3388 * reason, print a warning.
3389 */ 3390 #define MSG_ALLPKG_ONLINE \ 3391 "TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n" 3392 pr_warn_ratelimited(MSG_ALLPKG_ONLINE); 3393 return -EBUSY; 3394 } 3395 3396 static void __do_tdx_cleanup(void) 3397 { 3398 /* 3399 * Once TDX module is initialized, it cannot be disabled and 3400 * re-initialized again w/o runtime update (which isn't 3401 * supported by kernel). Only need to remove the cpuhp here. 3402 * The TDX host core code tracks TDX status and can handle 3403 * 'multiple enabling' scenario. 3404 */ 3405 WARN_ON_ONCE(!tdx_cpuhp_state); 3406 cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state); 3407 tdx_cpuhp_state = 0; 3408 } 3409 3410 static void __tdx_cleanup(void) 3411 { 3412 cpus_read_lock(); 3413 __do_tdx_cleanup(); 3414 cpus_read_unlock(); 3415 } 3416 3417 static int __init __do_tdx_bringup(void) 3418 { 3419 int r; 3420 3421 /* 3422 * TDX-specific cpuhp callback to call tdx_cpu_enable() on all 3423 * online CPUs before calling tdx_enable(), and on any new 3424 * going-online CPU to make sure it is ready for TDX guest. 3425 */ 3426 r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN, 3427 "kvm/cpu/tdx:online", 3428 tdx_online_cpu, tdx_offline_cpu); 3429 if (r < 0) 3430 return r; 3431 3432 tdx_cpuhp_state = r; 3433 3434 r = tdx_enable(); 3435 if (r) 3436 __do_tdx_cleanup(); 3437 3438 return r; 3439 } 3440 3441 static int __init __tdx_bringup(void) 3442 { 3443 const struct tdx_sys_info_td_conf *td_conf; 3444 int r, i; 3445 3446 for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) { 3447 /* 3448 * Check if MSRs (tdx_uret_msrs) can be saved/restored 3449 * before returning to user space. 3450 * 3451 * this_cpu_ptr(user_return_msrs)->registered isn't checked 3452 * because the registration is done at vcpu runtime by 3453 * tdx_user_return_msr_update_cache(). 3454 */ 3455 tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr); 3456 if (tdx_uret_msrs[i].slot == -1) { 3457 /* If any MSR isn't supported, it is a KVM bug */ 3458 pr_err("MSR %x isn't included by kvm_find_user_return_msr\n", 3459 tdx_uret_msrs[i].msr); 3460 return -EIO; 3461 } 3462 } 3463 3464 /* 3465 * Enabling TDX requires enabling hardware virtualization first, 3466 * as making SEAMCALLs requires CPU being in post-VMXON state. 3467 */ 3468 r = kvm_enable_virtualization(); 3469 if (r) 3470 return r; 3471 3472 cpus_read_lock(); 3473 r = __do_tdx_bringup(); 3474 cpus_read_unlock(); 3475 3476 if (r) 3477 goto tdx_bringup_err; 3478 3479 r = -EINVAL; 3480 /* Get TDX global information for later use */ 3481 tdx_sysinfo = tdx_get_sysinfo(); 3482 if (WARN_ON_ONCE(!tdx_sysinfo)) 3483 goto get_sysinfo_err; 3484 3485 /* Check TDX module and KVM capabilities */ 3486 if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) || 3487 !tdx_get_supported_xfam(&tdx_sysinfo->td_conf)) 3488 goto get_sysinfo_err; 3489 3490 if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)) 3491 goto get_sysinfo_err; 3492 3493 /* 3494 * TDX has its own limit of maximum vCPUs it can support for all 3495 * TDX guests in addition to KVM_MAX_VCPUS. Userspace needs to 3496 * query TDX guest's maximum vCPUs by checking KVM_CAP_MAX_VCPU 3497 * extension on per-VM basis. 3498 * 3499 * TDX module reports such limit via the MAX_VCPU_PER_TD global 3500 * metadata. Different modules may report different values. 3501 * Some old module may also not support this metadata (in which 3502 * case this limit is U16_MAX). 
3503 * 3504 * In practice, the reported value reflects the maximum logical 3505 * CPUs that ALL the platforms that the module supports can 3506 * possibly have. 3507 * 3508 * Simply forwarding the MAX_VCPU_PER_TD to userspace could 3509 * result in an unpredictable ABI. KVM instead always advertise 3510 * the number of logical CPUs the platform has as the maximum 3511 * vCPUs for TDX guests. 3512 * 3513 * Make sure MAX_VCPU_PER_TD reported by TDX module is not 3514 * smaller than the number of logical CPUs, otherwise KVM will 3515 * report an unsupported value to userspace. 3516 * 3517 * Note, a platform with TDX enabled in the BIOS cannot support 3518 * physical CPU hotplug, and TDX requires the BIOS has marked 3519 * all logical CPUs in MADT table as enabled. Just use 3520 * num_present_cpus() for the number of logical CPUs. 3521 */ 3522 td_conf = &tdx_sysinfo->td_conf; 3523 if (td_conf->max_vcpus_per_td < num_present_cpus()) { 3524 pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n", 3525 td_conf->max_vcpus_per_td, num_present_cpus()); 3526 goto get_sysinfo_err; 3527 } 3528 3529 if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) 3530 goto get_sysinfo_err; 3531 3532 /* 3533 * Leave hardware virtualization enabled after TDX is enabled 3534 * successfully. TDX CPU hotplug depends on this. 3535 */ 3536 return 0; 3537 3538 get_sysinfo_err: 3539 __tdx_cleanup(); 3540 tdx_bringup_err: 3541 kvm_disable_virtualization(); 3542 return r; 3543 } 3544 3545 void tdx_cleanup(void) 3546 { 3547 if (enable_tdx) { 3548 misc_cg_set_capacity(MISC_CG_RES_TDX, 0); 3549 __tdx_cleanup(); 3550 kvm_disable_virtualization(); 3551 } 3552 } 3553 3554 int __init tdx_bringup(void) 3555 { 3556 int r, i; 3557 3558 /* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */ 3559 for_each_possible_cpu(i) 3560 INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i)); 3561 3562 if (!enable_tdx) 3563 return 0; 3564 3565 if (!enable_ept) { 3566 pr_err("EPT is required for TDX\n"); 3567 goto success_disable_tdx; 3568 } 3569 3570 if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) { 3571 pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n"); 3572 goto success_disable_tdx; 3573 } 3574 3575 if (!enable_apicv) { 3576 pr_err("APICv is required for TDX\n"); 3577 goto success_disable_tdx; 3578 } 3579 3580 if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) { 3581 pr_err("tdx: OSXSAVE is required for TDX\n"); 3582 goto success_disable_tdx; 3583 } 3584 3585 if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) { 3586 pr_err("tdx: MOVDIR64B is required for TDX\n"); 3587 goto success_disable_tdx; 3588 } 3589 3590 if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) { 3591 pr_err("Self-snoop is required for TDX\n"); 3592 goto success_disable_tdx; 3593 } 3594 3595 if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) { 3596 pr_err("tdx: no TDX private KeyIDs available\n"); 3597 goto success_disable_tdx; 3598 } 3599 3600 if (!enable_virt_at_load) { 3601 pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n"); 3602 goto success_disable_tdx; 3603 } 3604 3605 /* 3606 * Ideally KVM should probe whether TDX module has been loaded 3607 * first and then try to bring it up. But TDX needs to use SEAMCALL 3608 * to probe whether the module is loaded (there is no CPUID or MSR 3609 * for that), and making SEAMCALL requires enabling virtualization 3610 * first, just like the rest steps of bringing up TDX module. 
3611 *
3612 * So, for simplicity, do everything in __tdx_bringup(); the first
3613 * SEAMCALL will return -ENODEV when the module is not loaded. The
3614 * only complication is having to make sure that initialization
3615 * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other
3616 * cases.
3617 */
3618 r = __tdx_bringup();
3619 if (r) {
3620 /*
3621 * Only disable TDX; don't fail module load if the TDX module
3622 * could not be loaded. There is no need to print a message
3623 * saying "module is not loaded" because that was printed when
3624 * the first SEAMCALL failed. Don't bother unwinding the S-EPT
3625 * hooks or vm_size, as kvm_x86_ops have already been finalized
3626 * (and are intentionally not exported). The S-EPT code is
3627 * unreachable, and allocating a few more bytes per VM in a
3628 * should-be-rare failure scenario is a non-issue.
3629 */
3630 if (r == -ENODEV)
3631 goto success_disable_tdx;
3632 
3633 enable_tdx = 0;
3634 }
3635 
3636 return r;
3637 
3638 success_disable_tdx:
3639 enable_tdx = 0;
3640 return 0;
3641 }
3642 
3643 void __init tdx_hardware_setup(void)
3644 {
3645 KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx);
3646 
3647 /*
3648 * Note, if the TDX module can't be loaded, KVM TDX support will be
3649 * disabled but KVM will continue loading (see tdx_bringup()).
3650 */
3651 vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));
3652 
3653 vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
3654 vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
3655 vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
3656 vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
3657 vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
3658 }
3659 