// SPDX-License-Identifier: GPL-2.0
#include <linux/cleanup.h>
#include <linux/cpu.h>
#include <asm/cpufeature.h>
#include <asm/fpu/xcr.h>
#include <linux/misc_cgroup.h>
#include <linux/mmu_context.h>
#include <asm/tdx.h>
#include "capabilities.h"
#include "mmu.h"
#include "x86_ops.h"
#include "lapic.h"
#include "tdx.h"
#include "vmx.h"
#include "mmu/spte.h"
#include "common.h"
#include "posted_intr.h"
#include "irq.h"
#include <trace/events/kvm.h>
#include "trace.h"

#pragma GCC poison to_vmx

#undef pr_fmt
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#define pr_tdx_error(__fn, __err)	\
	pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err)

#define __pr_tdx_error_N(__fn_str, __err, __fmt, ...)	\
	pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__)

#define pr_tdx_error_1(__fn, __err, __rcx)	\
	__pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx)

#define pr_tdx_error_2(__fn, __err, __rcx, __rdx)	\
	__pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx)

#define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8)	\
	__pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8)

bool enable_tdx __ro_after_init;
module_param_named(tdx, enable_tdx, bool, 0444);

#define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51))
#define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47))

static enum cpuhp_state tdx_cpuhp_state;

static const struct tdx_sys_info *tdx_sysinfo;

void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err)
{
	KVM_BUG_ON(1, tdx->vcpu.kvm);
	pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err);
}

void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field,
		      u64 val, u64 err)
{
	KVM_BUG_ON(1, tdx->vcpu.kvm);
	pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err);
}

#define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE)

static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm)
{
	return container_of(kvm, struct kvm_tdx, kvm);
}

static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
{
	return container_of(vcpu, struct vcpu_tdx, vcpu);
}

static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf)
{
	u64 val = KVM_SUPPORTED_TD_ATTRS;

	if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1)
		return 0;

	val &= td_conf->attributes_fixed0;

	return val;
}

static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf)
{
	u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss;

	if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1)
		return 0;

	val &= td_conf->xfam_fixed0;

	return val;
}

static int tdx_get_guest_phys_addr_bits(const u32 eax)
{
	return (eax & GENMASK(23, 16)) >> 16;
}

static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits)
{
	return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16;
}

#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))

static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
{
	return entry->function == 7 && entry->index == 0 &&
	       (entry->ebx & TDX_FEATURE_TSX);
}

static void clear_tsx(struct kvm_cpuid_entry2 *entry)
{
	entry->ebx &= ~TDX_FEATURE_TSX;
}

static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
{
	return entry->function == 7 && entry->index == 0 &&
	       (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
}

static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
{
	entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
}

static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
{
	if (has_tsx(entry))
		clear_tsx(entry);

	if (has_waitpkg(entry))
		clear_waitpkg(entry);
}

static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
{
	return has_tsx(entry) || has_waitpkg(entry);
}

#define KVM_TDX_CPUID_NO_SUBLEAF ((__u32)-1)

static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
{
	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;

	entry->function = (u32)td_conf->cpuid_config_leaves[idx];
	entry->index = td_conf->cpuid_config_leaves[idx] >> 32;
	entry->eax = (u32)td_conf->cpuid_config_values[idx][0];
	entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32;
	entry->ecx = (u32)td_conf->cpuid_config_values[idx][1];
	entry->edx = td_conf->cpuid_config_values[idx][1] >> 32;

	if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF)
		entry->index = 0;

	/*
	 * The TDX module doesn't allow configuring the guest phys addr bits
	 * (EAX[23:16]). However, KVM uses it as an interface to the userspace
	 * to configure the GPAW. Report these bits as configurable.
	 */
	if (entry->function == 0x80000008)
		entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);

	tdx_clear_unsupported_cpuid(entry);
}

static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
			     struct kvm_tdx_capabilities *caps)
{
	int i;

	caps->supported_attrs = tdx_get_supported_attrs(td_conf);
	if (!caps->supported_attrs)
		return -EIO;

	caps->supported_xfam = tdx_get_supported_xfam(td_conf);
	if (!caps->supported_xfam)
		return -EIO;

	caps->cpuid.nent = td_conf->num_cpuid_config;

	for (i = 0; i < td_conf->num_cpuid_config; i++)
		td_init_cpuid_entry2(&caps->cpuid.entries[i], i);

	return 0;
}

/*
 * Some SEAMCALLs acquire the TDX module globally, and can fail with
 * TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs.
 */
static DEFINE_MUTEX(tdx_lock);

static atomic_t nr_configured_hkid;

static bool tdx_operand_busy(u64 err)
{
	return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
}


/*
 * A per-CPU list of TD vCPUs associated with a given CPU.
 * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU
 * list.
 * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of
 *   the old CPU during the IPI callback running on the old CPU, and then added
 *   to the per-CPU list of the new CPU.
 * - When a TD is tearing down, all vCPUs are disassociated from their current
 *   running CPUs and removed from the per-CPU list during the IPI callback
 *   running on those CPUs.
 * - When a CPU is brought down, traverse the per-CPU list to disassociate all
 *   associated TD vCPUs and remove them from the per-CPU list.
 */
static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);

static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu)
{
	return to_tdx(vcpu)->vp_enter_args.r10;
}

static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu)
{
	return to_tdx(vcpu)->vp_enter_args.r11;
}

static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu,
						     long val)
{
	to_tdx(vcpu)->vp_enter_args.r10 = val;
}

static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu,
						    unsigned long val)
{
	to_tdx(vcpu)->vp_enter_args.r11 = val;
}

static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
{
	tdx_guest_keyid_free(kvm_tdx->hkid);
	kvm_tdx->hkid = -1;
	atomic_dec(&nr_configured_hkid);
	misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
	put_misc_cg(kvm_tdx->misc_cg);
	kvm_tdx->misc_cg = NULL;
}

static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
{
	return kvm_tdx->hkid > 0;
}

static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
{
	lockdep_assert_irqs_disabled();

	list_del(&to_tdx(vcpu)->cpu_list);

	/*
	 * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
	 * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
	 * to its list before it's deleted from this CPU's list.
	 */
	smp_wmb();

	vcpu->cpu = -1;
}

static void tdx_clear_page(struct page *page)
{
	const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0));
	void *dest = page_to_virt(page);
	unsigned long i;

	/*
	 * The page could have been poisoned. MOVDIR64B also clears
	 * the poison bit so the kernel can safely use the page again.
	 */
	for (i = 0; i < PAGE_SIZE; i += 64)
		movdir64b(dest + i, zero_page);
	/*
	 * MOVDIR64B store uses WC buffer. Prevent following memory reads
	 * from seeing potentially poisoned cache.
	 */
	__mb();
}

static void tdx_no_vcpus_enter_start(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	lockdep_assert_held_write(&kvm->mmu_lock);

	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);

	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}

static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	lockdep_assert_held_write(&kvm->mmu_lock);

	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
}

/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
static int __tdx_reclaim_page(struct page *page)
{
	u64 err, rcx, rdx, r8;

	err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8);

	/*
	 * No need to check for TDX_OPERAND_BUSY; all TD pages are freed
	 * before the HKID is released and control pages have also been
	 * released at this point, so there is no possibility of contention.
	 */
	if (WARN_ON_ONCE(err)) {
		pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
		return -EIO;
	}
	return 0;
}

static int tdx_reclaim_page(struct page *page)
{
	int r;

	r = __tdx_reclaim_page(page);
	if (!r)
		tdx_clear_page(page);
	return r;
}


/*
 * Reclaim the TD control page(s) which are crypto-protected by TDX guest's
 * private KeyID. Assume the cache associated with the TDX private KeyID has
 * been flushed.
352 */ 353 static void tdx_reclaim_control_page(struct page *ctrl_page) 354 { 355 /* 356 * Leak the page if the kernel failed to reclaim the page. 357 * The kernel cannot use it safely anymore. 358 */ 359 if (tdx_reclaim_page(ctrl_page)) 360 return; 361 362 __free_page(ctrl_page); 363 } 364 365 struct tdx_flush_vp_arg { 366 struct kvm_vcpu *vcpu; 367 u64 err; 368 }; 369 370 static void tdx_flush_vp(void *_arg) 371 { 372 struct tdx_flush_vp_arg *arg = _arg; 373 struct kvm_vcpu *vcpu = arg->vcpu; 374 u64 err; 375 376 arg->err = 0; 377 lockdep_assert_irqs_disabled(); 378 379 /* Task migration can race with CPU offlining. */ 380 if (unlikely(vcpu->cpu != raw_smp_processor_id())) 381 return; 382 383 /* 384 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The 385 * list tracking still needs to be updated so that it's correct if/when 386 * the vCPU does get initialized. 387 */ 388 if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) { 389 /* 390 * No need to retry. TDX Resources needed for TDH.VP.FLUSH are: 391 * TDVPR as exclusive, TDR as shared, and TDCS as shared. This 392 * vp flush function is called when destructing vCPU/TD or vCPU 393 * migration. No other thread uses TDVPR in those cases. 394 */ 395 err = tdh_vp_flush(&to_tdx(vcpu)->vp); 396 if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) { 397 /* 398 * This function is called in IPI context. Do not use 399 * printk to avoid console semaphore. 400 * The caller prints out the error message, instead. 401 */ 402 if (err) 403 arg->err = err; 404 } 405 } 406 407 tdx_disassociate_vp(vcpu); 408 } 409 410 static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu) 411 { 412 struct tdx_flush_vp_arg arg = { 413 .vcpu = vcpu, 414 }; 415 int cpu = vcpu->cpu; 416 417 if (unlikely(cpu == -1)) 418 return; 419 420 smp_call_function_single(cpu, tdx_flush_vp, &arg, 1); 421 if (KVM_BUG_ON(arg.err, vcpu->kvm)) 422 pr_tdx_error(TDH_VP_FLUSH, arg.err); 423 } 424 425 void tdx_disable_virtualization_cpu(void) 426 { 427 int cpu = raw_smp_processor_id(); 428 struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu); 429 struct tdx_flush_vp_arg arg; 430 struct vcpu_tdx *tdx, *tmp; 431 unsigned long flags; 432 433 local_irq_save(flags); 434 /* Safe variant needed as tdx_disassociate_vp() deletes the entry. */ 435 list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) { 436 arg.vcpu = &tdx->vcpu; 437 tdx_flush_vp(&arg); 438 } 439 local_irq_restore(flags); 440 } 441 442 #define TDX_SEAMCALL_RETRIES 10000 443 444 static void smp_func_do_phymem_cache_wb(void *unused) 445 { 446 u64 err = 0; 447 bool resume; 448 int i; 449 450 /* 451 * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private 452 * KeyID on the package or core. The TDX module may not finish the 453 * cache flush but return TDX_INTERRUPTED_RESUMEABLE instead. The 454 * kernel should retry it until it returns success w/o rescheduling. 
455 */ 456 for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) { 457 resume = !!err; 458 err = tdh_phymem_cache_wb(resume); 459 switch (err) { 460 case TDX_INTERRUPTED_RESUMABLE: 461 continue; 462 case TDX_NO_HKID_READY_TO_WBCACHE: 463 err = TDX_SUCCESS; /* Already done by other thread */ 464 fallthrough; 465 default: 466 goto out; 467 } 468 } 469 470 out: 471 if (WARN_ON_ONCE(err)) 472 pr_tdx_error(TDH_PHYMEM_CACHE_WB, err); 473 } 474 475 void tdx_mmu_release_hkid(struct kvm *kvm) 476 { 477 bool packages_allocated, targets_allocated; 478 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 479 cpumask_var_t packages, targets; 480 struct kvm_vcpu *vcpu; 481 unsigned long j; 482 int i; 483 u64 err; 484 485 if (!is_hkid_assigned(kvm_tdx)) 486 return; 487 488 packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL); 489 targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL); 490 cpus_read_lock(); 491 492 kvm_for_each_vcpu(j, vcpu, kvm) 493 tdx_flush_vp_on_cpu(vcpu); 494 495 /* 496 * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock 497 * and can fail with TDX_OPERAND_BUSY when it fails to get the lock. 498 * Multiple TDX guests can be destroyed simultaneously. Take the 499 * mutex to prevent it from getting error. 500 */ 501 mutex_lock(&tdx_lock); 502 503 /* 504 * Releasing HKID is in vm_destroy(). 505 * After the above flushing vps, there should be no more vCPU 506 * associations, as all vCPU fds have been released at this stage. 507 */ 508 err = tdh_mng_vpflushdone(&kvm_tdx->td); 509 if (err == TDX_FLUSHVP_NOT_DONE) 510 goto out; 511 if (KVM_BUG_ON(err, kvm)) { 512 pr_tdx_error(TDH_MNG_VPFLUSHDONE, err); 513 pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n", 514 kvm_tdx->hkid); 515 goto out; 516 } 517 518 for_each_online_cpu(i) { 519 if (packages_allocated && 520 cpumask_test_and_set_cpu(topology_physical_package_id(i), 521 packages)) 522 continue; 523 if (targets_allocated) 524 cpumask_set_cpu(i, targets); 525 } 526 if (targets_allocated) 527 on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true); 528 else 529 on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true); 530 /* 531 * In the case of error in smp_func_do_phymem_cache_wb(), the following 532 * tdh_mng_key_freeid() will fail. 533 */ 534 err = tdh_mng_key_freeid(&kvm_tdx->td); 535 if (KVM_BUG_ON(err, kvm)) { 536 pr_tdx_error(TDH_MNG_KEY_FREEID, err); 537 pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n", 538 kvm_tdx->hkid); 539 } else { 540 tdx_hkid_free(kvm_tdx); 541 } 542 543 out: 544 mutex_unlock(&tdx_lock); 545 cpus_read_unlock(); 546 free_cpumask_var(targets); 547 free_cpumask_var(packages); 548 } 549 550 static void tdx_reclaim_td_control_pages(struct kvm *kvm) 551 { 552 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 553 u64 err; 554 int i; 555 556 /* 557 * tdx_mmu_release_hkid() failed to reclaim HKID. Something went wrong 558 * heavily with TDX module. Give up freeing TD pages. As the function 559 * already warned, don't warn it again. 560 */ 561 if (is_hkid_assigned(kvm_tdx)) 562 return; 563 564 if (kvm_tdx->td.tdcs_pages) { 565 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { 566 if (!kvm_tdx->td.tdcs_pages[i]) 567 continue; 568 569 tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]); 570 } 571 kfree(kvm_tdx->td.tdcs_pages); 572 kvm_tdx->td.tdcs_pages = NULL; 573 } 574 575 if (!kvm_tdx->td.tdr_page) 576 return; 577 578 if (__tdx_reclaim_page(kvm_tdx->td.tdr_page)) 579 return; 580 581 /* 582 * Use a SEAMCALL to ask the TDX module to flush the cache based on the 583 * KeyID. 
	 * The TDX module may access TDR while operating on the TD (especially
	 * when it is reclaiming TDCS).
	 */
	err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
		return;
	}
	tdx_clear_page(kvm_tdx->td.tdr_page);

	__free_page(kvm_tdx->td.tdr_page);
	kvm_tdx->td.tdr_page = NULL;
}

void tdx_vm_destroy(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	tdx_reclaim_td_control_pages(kvm);

	kvm_tdx->state = TD_STATE_UNINITIALIZED;
}

static int tdx_do_tdh_mng_key_config(void *param)
{
	struct kvm_tdx *kvm_tdx = param;
	u64 err;

	/* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
	err = tdh_mng_key_config(&kvm_tdx->td);

	if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
		pr_tdx_error(TDH_MNG_KEY_CONFIG, err);
		return -EIO;
	}

	return 0;
}

int tdx_vm_init(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	kvm->arch.has_protected_state = true;
	kvm->arch.has_private_mem = true;
	kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;

	/*
	 * Because the guest TD is protected, the VMM can't parse instructions
	 * in the TD. Instead, the guest uses the MMIO hypercall. For
	 * unmodified device drivers, #VE needs to be injected for MMIO, and
	 * the #VE handler in the TD converts the MMIO instruction into an
	 * MMIO hypercall.
	 *
	 * The SPTE value for MMIO needs to be set up so that #VE is injected
	 * into the TD instead of triggering EPT MISCONFIG.
	 * - RWX=0 so that EPT violation is triggered.
	 * - suppress #VE bit is cleared to inject #VE.
	 */
	kvm_mmu_set_mmio_spte_value(kvm, 0);

	/*
	 * TDX has its own limit of maximum vCPUs it can support for all
	 * TDX guests in addition to KVM_MAX_VCPUS. The TDX module reports
	 * this limit via the MAX_VCPU_PER_TD global metadata. In practice,
	 * it reflects the number of logical CPUs that ALL platforms that
	 * the TDX module supports can possibly have.
	 *
	 * Limit TDX guest's maximum vCPUs to the number of logical CPUs
	 * the platform has. Simply forwarding the MAX_VCPU_PER_TD to
	 * userspace would result in an unpredictable ABI.
	 */
	kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus());

	kvm_tdx->state = TD_STATE_UNINITIALIZED;

	return 0;
}

int tdx_vcpu_create(struct kvm_vcpu *vcpu)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	if (kvm_tdx->state != TD_STATE_INITIALIZED)
		return -EIO;

	/*
	 * TDX module mandates APICv, which requires an in-kernel local APIC.
	 * Disallow an in-kernel I/O APIC, because level-triggered interrupts
	 * and thus the I/O APIC as a whole can't be faithfully emulated in KVM.
	 */
	if (!irqchip_split(vcpu->kvm))
		return -EINVAL;

	fpstate_set_confidential(&vcpu->arch.guest_fpu);
	vcpu->arch.apic->guest_apic_protected = true;
	INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list);

	vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;

	vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
	vcpu->arch.cr0_guest_owned_bits = -1ul;
	vcpu->arch.cr4_guest_owned_bits = -1ul;

	/* KVM can't change TSC offset/multiplier as TDX module manages them. */
	vcpu->arch.guest_tsc_protected = true;
	vcpu->arch.tsc_offset = kvm_tdx->tsc_offset;
	vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
	vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
	vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier;

	vcpu->arch.guest_state_protected =
		!(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG);

	if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE)
		vcpu->arch.xfd_no_write_intercept = true;

	tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
	__pi_set_sn(&tdx->vt.pi_desc);

	tdx->state = VCPU_TD_STATE_UNINITIALIZED;

	return 0;
}

void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	vmx_vcpu_pi_load(vcpu, cpu);
	if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
		return;

	tdx_flush_vp_on_cpu(vcpu);

	KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm);
	local_irq_disable();
	/*
	 * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
	 * vcpu->cpu is read before tdx->cpu_list.
	 */
	smp_rmb();

	list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
	local_irq_enable();
}

bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
{
	/*
	 * KVM can't get the interrupt status of TDX guest and it assumes
	 * interrupt is always allowed unless TDX guest calls TDVMCALL with HLT,
	 * which passes the interrupt blocked flag.
	 */
	return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
	       !to_tdx(vcpu)->vp_enter_args.r12;
}

bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
	u64 vcpu_state_details;

	if (pi_has_pending_interrupt(vcpu))
		return true;

	/*
	 * Only check RVI pending for HALTED case with IRQ enabled.
	 * For non-HLT cases, KVM doesn't care about STI/SS shadows. And if the
	 * interrupt was pending before TD exit, then it _must_ be blocked,
	 * otherwise the interrupt would have been serviced at the instruction
	 * boundary.
	 */
	if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
	    to_tdx(vcpu)->vp_enter_args.r12)
		return false;

	vcpu_state_details =
		td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH);

	return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
}

/*
 * Compared to vmx_prepare_switch_to_guest(), there is not much to do
 * as SEAMCALL/SEAMRET calls take care of most of save and restore.
768 */ 769 void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 770 { 771 struct vcpu_vt *vt = to_vt(vcpu); 772 773 if (vt->guest_state_loaded) 774 return; 775 776 if (likely(is_64bit_mm(current->mm))) 777 vt->msr_host_kernel_gs_base = current->thread.gsbase; 778 else 779 vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); 780 781 vt->host_debugctlmsr = get_debugctlmsr(); 782 783 vt->guest_state_loaded = true; 784 } 785 786 struct tdx_uret_msr { 787 u32 msr; 788 unsigned int slot; 789 u64 defval; 790 }; 791 792 static struct tdx_uret_msr tdx_uret_msrs[] = { 793 {.msr = MSR_SYSCALL_MASK, .defval = 0x20200 }, 794 {.msr = MSR_STAR,}, 795 {.msr = MSR_LSTAR,}, 796 {.msr = MSR_TSC_AUX,}, 797 }; 798 799 static void tdx_user_return_msr_update_cache(void) 800 { 801 int i; 802 803 for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) 804 kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot, 805 tdx_uret_msrs[i].defval); 806 } 807 808 static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu) 809 { 810 struct vcpu_vt *vt = to_vt(vcpu); 811 struct vcpu_tdx *tdx = to_tdx(vcpu); 812 813 if (!vt->guest_state_loaded) 814 return; 815 816 ++vcpu->stat.host_state_reload; 817 wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base); 818 819 if (tdx->guest_entered) { 820 tdx_user_return_msr_update_cache(); 821 tdx->guest_entered = false; 822 } 823 824 vt->guest_state_loaded = false; 825 } 826 827 void tdx_vcpu_put(struct kvm_vcpu *vcpu) 828 { 829 vmx_vcpu_pi_put(vcpu); 830 tdx_prepare_switch_to_host(vcpu); 831 } 832 833 void tdx_vcpu_free(struct kvm_vcpu *vcpu) 834 { 835 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 836 struct vcpu_tdx *tdx = to_tdx(vcpu); 837 int i; 838 839 /* 840 * It is not possible to reclaim pages while hkid is assigned. It might 841 * be assigned if: 842 * 1. the TD VM is being destroyed but freeing hkid failed, in which 843 * case the pages are leaked 844 * 2. 
	 *    there is nothing to do anyway
	 */
	if (is_hkid_assigned(kvm_tdx))
		return;

	if (tdx->vp.tdcx_pages) {
		for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
			if (tdx->vp.tdcx_pages[i])
				tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]);
		}
		kfree(tdx->vp.tdcx_pages);
		tdx->vp.tdcx_pages = NULL;
	}
	if (tdx->vp.tdvpr_page) {
		tdx_reclaim_control_page(tdx->vp.tdvpr_page);
		tdx->vp.tdvpr_page = 0;
	}

	tdx->state = VCPU_TD_STATE_UNINITIALIZED;
}

int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
	if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED ||
		     to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
		return -EINVAL;

	return 1;
}

static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
{
	switch (tdvmcall_leaf(vcpu)) {
	case EXIT_REASON_CPUID:
	case EXIT_REASON_HLT:
	case EXIT_REASON_IO_INSTRUCTION:
	case EXIT_REASON_MSR_READ:
	case EXIT_REASON_MSR_WRITE:
		return tdvmcall_leaf(vcpu);
	case EXIT_REASON_EPT_VIOLATION:
		return EXIT_REASON_EPT_MISCONFIG;
	default:
		break;
	}

	return EXIT_REASON_TDCALL;
}

static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	u32 exit_reason;

	switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) {
	case TDX_SUCCESS:
	case TDX_NON_RECOVERABLE_VCPU:
	case TDX_NON_RECOVERABLE_TD:
	case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE:
	case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE:
		break;
	default:
		return -1u;
	}

	exit_reason = tdx->vp_enter_ret;

	switch (exit_reason) {
	case EXIT_REASON_TDCALL:
		if (tdvmcall_exit_type(vcpu))
			return EXIT_REASON_VMCALL;

		return tdcall_to_vmx_exit_reason(vcpu);
	case EXIT_REASON_EPT_MISCONFIG:
		/*
		 * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in
		 * non-instrumentable code with interrupts disabled.
		 */
		return -1u;
	default:
		break;
	}

	return exit_reason;
}

static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	struct vcpu_vt *vt = to_vt(vcpu);

	guest_state_enter_irqoff();

	tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);

	vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu);

	vt->exit_qualification = tdx->vp_enter_args.rcx;
	tdx->ext_exit_qualification = tdx->vp_enter_args.rdx;
	tdx->exit_gpa = tdx->vp_enter_args.r8;
	vt->exit_intr_info = tdx->vp_enter_args.r9;

	vmx_handle_nmi(vcpu);

	guest_state_exit_irqoff();
}

static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu)
{
	return vmx_get_exit_reason(vcpu).failed_vmentry &&
	       vmx_get_exit_reason(vcpu).full != -1u;
}

static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
	u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret;

	/*
	 * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation
	 * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER.
	 *
	 * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both
	 * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target
	 * vCPUs leaving fastpath so that interrupt can be enabled to ensure the
	 * IPIs can be delivered.
	 * Return EXIT_FASTPATH_EXIT_HANDLED instead of
	 * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the
	 * requester may be blocked endlessly.
	 */
	if (unlikely(tdx_operand_busy(vp_enter_ret)))
		return EXIT_FASTPATH_EXIT_HANDLED;

	return EXIT_FASTPATH_NONE;
}

#define TDX_REGS_AVAIL_SET	(BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
				 BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
				 BIT_ULL(VCPU_REGS_RAX) | \
				 BIT_ULL(VCPU_REGS_RBX) | \
				 BIT_ULL(VCPU_REGS_RCX) | \
				 BIT_ULL(VCPU_REGS_RDX) | \
				 BIT_ULL(VCPU_REGS_RBP) | \
				 BIT_ULL(VCPU_REGS_RSI) | \
				 BIT_ULL(VCPU_REGS_RDI) | \
				 BIT_ULL(VCPU_REGS_R8) | \
				 BIT_ULL(VCPU_REGS_R9) | \
				 BIT_ULL(VCPU_REGS_R10) | \
				 BIT_ULL(VCPU_REGS_R11) | \
				 BIT_ULL(VCPU_REGS_R12) | \
				 BIT_ULL(VCPU_REGS_R13) | \
				 BIT_ULL(VCPU_REGS_R14) | \
				 BIT_ULL(VCPU_REGS_R15))

static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);

	/*
	 * All TDX hosts support PKRU; but even if they didn't,
	 * vcpu->arch.host_pkru would be 0 and the wrpkru would be
	 * skipped.
	 */
	if (vcpu->arch.host_pkru != 0)
		wrpkru(vcpu->arch.host_pkru);

	if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
		xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);

	/*
	 * Likewise, even if a TDX host didn't support XSS, both arms of
	 * the comparison would be 0 and the wrmsrl would be skipped.
	 */
	if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
		wrmsrl(MSR_IA32_XSS, kvm_host.xss);
}

#define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
				DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
				DEBUGCTLMSR_FREEZE_IN_SMM)

fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	struct vcpu_vt *vt = to_vt(vcpu);

	/*
	 * force_immediate_exit requires vCPU entering for events injection,
	 * with an immediate exit to follow. But the TDX module doesn't
	 * guarantee entry; it's already possible for KVM to _think_ it
	 * completely entered the guest without actually having done so.
	 * Since KVM never needs to force an immediate exit for TDX, and can't
	 * do direct injection, just warn on force_immediate_exit.
	 */
	WARN_ON_ONCE(force_immediate_exit);

	/*
	 * Wait until retry of SEPT-zap-related SEAMCALL completes before
	 * allowing vCPU entry to avoid contention with tdh_vp_enter() and
	 * TDCALLs.
1042 */ 1043 if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap))) 1044 return EXIT_FASTPATH_EXIT_HANDLED; 1045 1046 trace_kvm_entry(vcpu, force_immediate_exit); 1047 1048 if (pi_test_on(&vt->pi_desc)) { 1049 apic->send_IPI_self(POSTED_INTR_VECTOR); 1050 1051 if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) & 1052 APIC_VECTOR_MASK, &vt->pi_desc)) 1053 kvm_wait_lapic_expire(vcpu); 1054 } 1055 1056 tdx_vcpu_enter_exit(vcpu); 1057 1058 if (vt->host_debugctlmsr & ~TDX_DEBUGCTL_PRESERVED) 1059 update_debugctlmsr(vt->host_debugctlmsr); 1060 1061 tdx_load_host_xsave_state(vcpu); 1062 tdx->guest_entered = true; 1063 1064 vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET; 1065 1066 if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) 1067 return EXIT_FASTPATH_NONE; 1068 1069 if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) 1070 return EXIT_FASTPATH_NONE; 1071 1072 if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) 1073 kvm_machine_check(); 1074 1075 trace_kvm_exit(vcpu, KVM_ISA_VMX); 1076 1077 if (unlikely(tdx_failed_vmentry(vcpu))) 1078 return EXIT_FASTPATH_NONE; 1079 1080 return tdx_exit_handlers_fastpath(vcpu); 1081 } 1082 1083 void tdx_inject_nmi(struct kvm_vcpu *vcpu) 1084 { 1085 ++vcpu->stat.nmi_injections; 1086 td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1); 1087 /* 1088 * From KVM's perspective, NMI injection is completed right after 1089 * writing to PEND_NMI. KVM doesn't care whether an NMI is injected by 1090 * the TDX module or not. 1091 */ 1092 vcpu->arch.nmi_injected = false; 1093 /* 1094 * TDX doesn't support KVM to request NMI window exit. If there is 1095 * still a pending vNMI, KVM is not able to inject it along with the 1096 * one pending in TDX module in a back-to-back way. Since the previous 1097 * vNMI is still pending in TDX module, i.e. it has not been delivered 1098 * to TDX guest yet, it's OK to collapse the pending vNMI into the 1099 * previous one. The guest is expected to handle all the NMI sources 1100 * when handling the first vNMI. 1101 */ 1102 vcpu->arch.nmi_pending = 0; 1103 } 1104 1105 static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu) 1106 { 1107 u32 intr_info = vmx_get_intr_info(vcpu); 1108 1109 /* 1110 * Machine checks are handled by handle_exception_irqoff(), or by 1111 * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on 1112 * VM-Entry. NMIs are handled by tdx_vcpu_enter_exit(). 1113 */ 1114 if (is_nmi(intr_info) || is_machine_check(intr_info)) 1115 return 1; 1116 1117 vcpu->run->exit_reason = KVM_EXIT_EXCEPTION; 1118 vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK; 1119 vcpu->run->ex.error_code = 0; 1120 1121 return 0; 1122 } 1123 1124 static int complete_hypercall_exit(struct kvm_vcpu *vcpu) 1125 { 1126 tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret); 1127 return 1; 1128 } 1129 1130 static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu) 1131 { 1132 kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10); 1133 kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11); 1134 kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12); 1135 kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13); 1136 kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14); 1137 1138 return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit); 1139 } 1140 1141 /* 1142 * Split into chunks and check interrupt pending between chunks. This allows 1143 * for timely injection of interrupts to prevent issues with guest lockup 1144 * detection. 
1145 */ 1146 #define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024) 1147 static void __tdx_map_gpa(struct vcpu_tdx *tdx); 1148 1149 static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu) 1150 { 1151 struct vcpu_tdx *tdx = to_tdx(vcpu); 1152 1153 if (vcpu->run->hypercall.ret) { 1154 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1155 tdx->vp_enter_args.r11 = tdx->map_gpa_next; 1156 return 1; 1157 } 1158 1159 tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN; 1160 if (tdx->map_gpa_next >= tdx->map_gpa_end) 1161 return 1; 1162 1163 /* 1164 * Stop processing the remaining part if there is a pending interrupt, 1165 * which could be qualified to deliver. Skip checking pending RVI for 1166 * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt(). 1167 */ 1168 if (kvm_vcpu_has_events(vcpu)) { 1169 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY); 1170 tdx->vp_enter_args.r11 = tdx->map_gpa_next; 1171 return 1; 1172 } 1173 1174 __tdx_map_gpa(tdx); 1175 return 0; 1176 } 1177 1178 static void __tdx_map_gpa(struct vcpu_tdx *tdx) 1179 { 1180 u64 gpa = tdx->map_gpa_next; 1181 u64 size = tdx->map_gpa_end - tdx->map_gpa_next; 1182 1183 if (size > TDX_MAP_GPA_MAX_LEN) 1184 size = TDX_MAP_GPA_MAX_LEN; 1185 1186 tdx->vcpu.run->exit_reason = KVM_EXIT_HYPERCALL; 1187 tdx->vcpu.run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; 1188 /* 1189 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) 1190 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that 1191 * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting 1192 * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. 1193 */ 1194 tdx->vcpu.run->hypercall.ret = 0; 1195 tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm)); 1196 tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE; 1197 tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ? 1198 KVM_MAP_GPA_RANGE_ENCRYPTED : 1199 KVM_MAP_GPA_RANGE_DECRYPTED; 1200 tdx->vcpu.run->hypercall.flags = KVM_EXIT_HYPERCALL_LONG_MODE; 1201 1202 tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa; 1203 } 1204 1205 static int tdx_map_gpa(struct kvm_vcpu *vcpu) 1206 { 1207 struct vcpu_tdx *tdx = to_tdx(vcpu); 1208 u64 gpa = tdx->vp_enter_args.r12; 1209 u64 size = tdx->vp_enter_args.r13; 1210 u64 ret; 1211 1212 /* 1213 * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires 1214 * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE 1215 * bit set. This is a base call so it should always be supported, but 1216 * KVM has no way to ensure that userspace implements the GHCI correctly. 1217 * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error 1218 * to the guest. 
1219 */ 1220 if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { 1221 ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; 1222 goto error; 1223 } 1224 1225 if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) || 1226 !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) || 1227 (vt_is_tdx_private_gpa(vcpu->kvm, gpa) != 1228 vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) { 1229 ret = TDVMCALL_STATUS_INVALID_OPERAND; 1230 goto error; 1231 } 1232 1233 if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) { 1234 ret = TDVMCALL_STATUS_ALIGN_ERROR; 1235 goto error; 1236 } 1237 1238 tdx->map_gpa_end = gpa + size; 1239 tdx->map_gpa_next = gpa; 1240 1241 __tdx_map_gpa(tdx); 1242 return 0; 1243 1244 error: 1245 tdvmcall_set_return_code(vcpu, ret); 1246 tdx->vp_enter_args.r11 = gpa; 1247 return 1; 1248 } 1249 1250 static int tdx_report_fatal_error(struct kvm_vcpu *vcpu) 1251 { 1252 struct vcpu_tdx *tdx = to_tdx(vcpu); 1253 u64 *regs = vcpu->run->system_event.data; 1254 u64 *module_regs = &tdx->vp_enter_args.r8; 1255 int index = VCPU_REGS_RAX; 1256 1257 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; 1258 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL; 1259 vcpu->run->system_event.ndata = 16; 1260 1261 /* Dump 16 general-purpose registers to userspace in ascending order. */ 1262 regs[index++] = tdx->vp_enter_ret; 1263 regs[index++] = tdx->vp_enter_args.rcx; 1264 regs[index++] = tdx->vp_enter_args.rdx; 1265 regs[index++] = tdx->vp_enter_args.rbx; 1266 regs[index++] = 0; 1267 regs[index++] = 0; 1268 regs[index++] = tdx->vp_enter_args.rsi; 1269 regs[index] = tdx->vp_enter_args.rdi; 1270 for (index = 0; index < 8; index++) 1271 regs[VCPU_REGS_R8 + index] = module_regs[index]; 1272 1273 return 0; 1274 } 1275 1276 static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu) 1277 { 1278 u32 eax, ebx, ecx, edx; 1279 struct vcpu_tdx *tdx = to_tdx(vcpu); 1280 1281 /* EAX and ECX for cpuid is stored in R12 and R13. 
	eax = tdx->vp_enter_args.r12;
	ecx = tdx->vp_enter_args.r13;

	kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);

	tdx->vp_enter_args.r12 = eax;
	tdx->vp_enter_args.r13 = ebx;
	tdx->vp_enter_args.r14 = ecx;
	tdx->vp_enter_args.r15 = edx;

	return 1;
}

static int tdx_complete_pio_out(struct kvm_vcpu *vcpu)
{
	vcpu->arch.pio.count = 0;
	return 1;
}

static int tdx_complete_pio_in(struct kvm_vcpu *vcpu)
{
	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
	unsigned long val = 0;
	int ret;

	ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size,
					 vcpu->arch.pio.port, &val, 1);

	WARN_ON_ONCE(!ret);

	tdvmcall_set_return_val(vcpu, val);

	return 1;
}

static int tdx_emulate_io(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
	unsigned long val = 0;
	unsigned int port;
	u64 size, write;
	int ret;

	++vcpu->stat.io_exits;

	size = tdx->vp_enter_args.r12;
	write = tdx->vp_enter_args.r13;
	port = tdx->vp_enter_args.r14;

	if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) {
		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
		return 1;
	}

	if (write) {
		val = tdx->vp_enter_args.r15;
		ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1);
	} else {
		ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1);
	}

	if (!ret)
		vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out :
							   tdx_complete_pio_in;
	else if (!write)
		tdvmcall_set_return_val(vcpu, val);

	return ret;
}

static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu)
{
	unsigned long val = 0;
	gpa_t gpa;
	int size;

	gpa = vcpu->mmio_fragments[0].gpa;
	size = vcpu->mmio_fragments[0].len;

	memcpy(&val, vcpu->run->mmio.data, size);
	tdvmcall_set_return_val(vcpu, val);
	trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
	return 1;
}

static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size,
				 unsigned long val)
{
	if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
		trace_kvm_fast_mmio(gpa);
		return 0;
	}

	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val);
	if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val))
		return -EOPNOTSUPP;

	return 0;
}

static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size)
{
	unsigned long val;

	if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val))
		return -EOPNOTSUPP;

	tdvmcall_set_return_val(vcpu, val);
	trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
	return 0;
}

static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	int size, write, r;
	unsigned long val;
	gpa_t gpa;

	size = tdx->vp_enter_args.r12;
	write = tdx->vp_enter_args.r13;
	gpa = tdx->vp_enter_args.r14;
	val = write ? tdx->vp_enter_args.r15 : 0;

	if (size != 1 && size != 2 && size != 4 && size != 8)
		goto error;
	if (write != 0 && write != 1)
		goto error;

	/*
	 * TDG.VP.VMCALL<MMIO> allows only shared GPAs; it makes no sense to
	 * do MMIO emulation for a private GPA.
	 */
	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) ||
	    vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))
		goto error;

	gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));

	if (write)
		r = tdx_mmio_write(vcpu, gpa, size, val);
	else
		r = tdx_mmio_read(vcpu, gpa, size);
	if (!r)
		/* Kernel completed device emulation. */
		return 1;

	/* Request the device emulation from the userspace device model. */
	vcpu->mmio_is_write = write;
	if (!write)
		vcpu->arch.complete_userspace_io = tdx_complete_mmio_read;

	vcpu->run->mmio.phys_addr = gpa;
	vcpu->run->mmio.len = size;
	vcpu->run->mmio.is_write = write;
	vcpu->run->exit_reason = KVM_EXIT_MMIO;

	if (write) {
		memcpy(vcpu->run->mmio.data, &val, size);
	} else {
		vcpu->mmio_fragments[0].gpa = gpa;
		vcpu->mmio_fragments[0].len = size;
		trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
	}
	return 0;

error:
	tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
	return 1;
}

static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret);

	/*
	 * For now, there is no TDVMCALL beyond GHCI base API supported by KVM
	 * directly without the support from userspace, just set the value
	 * returned from userspace.
	 */
	tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
	tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
	tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
	tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;

	return 1;
}

static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	switch (tdx->vp_enter_args.r12) {
	case 0:
		tdx->vp_enter_args.r11 = 0;
		tdx->vp_enter_args.r12 = 0;
		tdx->vp_enter_args.r13 = 0;
		tdx->vp_enter_args.r14 = 0;
		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS);
		return 1;
	case 1:
		vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12;
		vcpu->run->exit_reason = KVM_EXIT_TDX;
		vcpu->run->tdx.flags = 0;
		vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO;
		vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS;
		vcpu->run->tdx.get_tdvmcall_info.r11 = 0;
		vcpu->run->tdx.get_tdvmcall_info.r12 = 0;
		vcpu->run->tdx.get_tdvmcall_info.r13 = 0;
		vcpu->run->tdx.get_tdvmcall_info.r14 = 0;
		vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info;
		return 0;
	default:
		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
		return 1;
	}
}

static int tdx_complete_simple(struct kvm_vcpu *vcpu)
{
	tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret);
	return 1;
}

static int tdx_get_quote(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	u64 gpa = tdx->vp_enter_args.r12;
	u64 size = tdx->vp_enter_args.r13;

	/* The GPA of the buffer must have the shared bit set. */
	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
		return 1;
	}

	vcpu->run->exit_reason = KVM_EXIT_TDX;
	vcpu->run->tdx.flags = 0;
	vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
	vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
	vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
	vcpu->run->tdx.get_quote.size = size;

	vcpu->arch.complete_userspace_io = tdx_complete_simple;

	return 0;
}

static int handle_tdvmcall(struct kvm_vcpu *vcpu)
{
	switch (tdvmcall_leaf(vcpu)) {
	case TDVMCALL_MAP_GPA:
		return tdx_map_gpa(vcpu);
	case TDVMCALL_REPORT_FATAL_ERROR:
		return tdx_report_fatal_error(vcpu);
	case TDVMCALL_GET_TD_VM_CALL_INFO:
		return tdx_get_td_vm_call_info(vcpu);
	case TDVMCALL_GET_QUOTE:
		return tdx_get_quote(vcpu);
	default:
		break;
	}

	tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
	return 1;
}

void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
{
	u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 :
					    TDX_SHARED_BIT_PWL_4;

	if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm))
		return;

	td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
}

static void tdx_unpin(struct kvm *kvm, struct page *page)
{
	put_page(page);
}

static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
			    enum pg_level level, struct page *page)
{
	int tdx_level = pg_level_to_tdx_sept_level(level);
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	gpa_t gpa = gfn_to_gpa(gfn);
	u64 entry, level_state;
	u64 err;

	err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
	if (unlikely(tdx_operand_busy(err))) {
		tdx_unpin(kvm, page);
		return -EBUSY;
	}

	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
		tdx_unpin(kvm, page);
		return -EIO;
	}

	return 0;
}

/*
 * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the
 * callback tdx_gmem_post_populate() then maps pages into private memory
 * through the SEAMCALL TDH.MEM.PAGE.ADD(). The SEAMCALL also requires the
 * private EPT structures for the page to have been built before, which is
 * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that
 * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD().
 * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there
 * are no half-initialized shared EPT pages.
 */
static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
					  enum pg_level level, kvm_pfn_t pfn)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
		return -EINVAL;

	/* nr_premapped will be decreased when tdh_mem_page_add() is called. */
	atomic64_inc(&kvm_tdx->nr_premapped);
	return 0;
}

int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
			      enum pg_level level, kvm_pfn_t pfn)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	struct page *page = pfn_to_page(pfn);

	/* TODO: handle large pages. */
	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
		return -EINVAL;

	/*
	 * Because guest_memfd doesn't support page migration with
	 * a_ops->migrate_folio (yet), no callback is triggered for KVM on page
	 * migration. Until guest_memfd supports page migration, prevent page
	 * migration.
	 * TODO: Once guest_memfd introduces callback on page migration,
	 * implement it and remove get_page/put_page().
	 */
	get_page(page);

	/*
	 * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
	 * barrier in tdx_td_finalize().
	 */
	smp_rmb();
	if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
		return tdx_mem_page_aug(kvm, gfn, level, page);

	return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
}

static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
				      enum pg_level level, struct page *page)
{
	int tdx_level = pg_level_to_tdx_sept_level(level);
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	gpa_t gpa = gfn_to_gpa(gfn);
	u64 err, entry, level_state;

	/* TODO: handle large pages. */
	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
		return -EINVAL;

	if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
		return -EINVAL;

	/*
	 * When zapping a private page, the write lock is held, so there is no
	 * race with other vCPU SEPT operations. There can be a race with
	 * TDH.VP.ENTER (due to 0-step mitigation) and guest TDCALLs.
	 */
	err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
				  &level_state);

	if (unlikely(tdx_operand_busy(err))) {
		/*
		 * The second retry is expected to succeed after kicking off all
		 * other vCPUs and prevent them from invoking TDH.VP.ENTER.
		 */
		tdx_no_vcpus_enter_start(kvm);
		err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
					  &level_state);
		tdx_no_vcpus_enter_stop(kvm);
	}

	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state);
		return -EIO;
	}

	err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);

	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
		return -EIO;
	}
	tdx_clear_page(page);
	tdx_unpin(kvm, page);
	return 0;
}

int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
			      enum pg_level level, void *private_spt)
{
	int tdx_level = pg_level_to_tdx_sept_level(level);
	gpa_t gpa = gfn_to_gpa(gfn);
	struct page *page = virt_to_page(private_spt);
	u64 err, entry, level_state;

	err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry,
			       &level_state);
	if (unlikely(tdx_operand_busy(err)))
		return -EBUSY;

	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state);
		return -EIO;
	}

	return 0;
}

/*
 * Check if the error returned from a SEPT zap SEAMCALL is due to a page being
 * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() having been
 * called successfully.
 *
 * Since tdh_mem_sept_add() must have been invoked successfully before a
 * non-leaf entry is present in the mirrored page table, the SEPT ZAP related
 * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead
 * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the
 * SEPT.
 *
 * Further check if the returned entry from SEPT walking is with RWX permissions
 * to filter out anything unexpected.
 *
 * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from
 * level_state returned from a SEAMCALL error is the same as that passed into
 * the SEAMCALL.
 */
static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err,
					     u64 entry, int level)
{
	if (!err || kvm_tdx->state == TD_STATE_RUNNABLE)
		return false;

	if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))
		return false;

	if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK)))
		return false;

	return true;
}

static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
				     enum pg_level level, struct page *page)
{
	int tdx_level = pg_level_to_tdx_sept_level(level);
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level);
	u64 err, entry, level_state;

	/* For now large pages aren't supported yet. */
	WARN_ON_ONCE(level != PG_LEVEL_4K);

	err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);

	if (unlikely(tdx_operand_busy(err))) {
		/* After no vCPUs enter, the second retry is expected to succeed */
		tdx_no_vcpus_enter_start(kvm);
		err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
		tdx_no_vcpus_enter_stop(kvm);
	}
	if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
	    !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
		atomic64_dec(&kvm_tdx->nr_premapped);
		tdx_unpin(kvm, page);
		return 0;
	}

	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
		return -EIO;
	}
	return 1;
}

/*
 * Ensure shared and private EPTs are flushed on all vCPUs.
 * tdh_mem_track() is the only caller that increases TD epoch. An increase in
 * the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
 * running in guest mode with the value "N - 1".
 *
 * A successful execution of tdh_mem_track() ensures that vCPUs can only run in
 * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch
 * being increased to "N + 1".
 *
 * Kicking off all vCPUs after that further ensures that no vCPUs can run in
 * guest mode with TD epoch value "N", which unblocks the next tdh_mem_track()
 * (e.g. to increase TD epoch to "N + 2").
 *
 * The TDX module will flush the EPT on the next TD enter and make vCPUs run in
 * guest mode with TD epoch value "N + 1".
 *
 * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by
 * waiting for the empty IPI handler ack_kick().
 *
 * No action is required for the vCPUs being kicked off since the kicking off
 * occurs certainly after TD epoch increment and before the next
 * tdh_mem_track().
 */
static void tdx_track(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	u64 err;

	/* If the TD isn't finalized, it's before any vCPU runs. */
	if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
		return;

	lockdep_assert_held_write(&kvm->mmu_lock);

	err = tdh_mem_track(&kvm_tdx->td);
	if (unlikely(tdx_operand_busy(err))) {
		/* After no vCPUs enter, the second retry is expected to succeed */
		tdx_no_vcpus_enter_start(kvm);
		err = tdh_mem_track(&kvm_tdx->td);
		tdx_no_vcpus_enter_stop(kvm);
	}

	if (KVM_BUG_ON(err, kvm))
		pr_tdx_error(TDH_MEM_TRACK, err);

	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}

int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
			      enum pg_level level, void *private_spt)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	/*
	 * free_external_spt() is only called after the hkid is freed when the
	 * TD is tearing down.
	 * KVM doesn't (yet) zap page table pages in the mirror page table while
	 * the TD is active, though guest pages mapped in the mirror page table
	 * could be zapped while the TD is active, e.g. for shared <-> private
	 * conversion and slot move/deletion.
	 */
	if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
		return -EINVAL;

	/*
	 * The HKID assigned to this TD was already freed and cache was
	 * already flushed. We don't have to flush again.
	 */
	return tdx_reclaim_page(virt_to_page(private_spt));
}

int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
				 enum pg_level level, kvm_pfn_t pfn)
{
	struct page *page = pfn_to_page(pfn);
	int ret;

	/*
	 * HKID is released after all private pages have been removed, and set
	 * before any might be populated. Warn if zapping is attempted when
	 * there can't be anything populated in the private EPT.
	 */
	if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
		return -EINVAL;

	ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
	if (ret <= 0)
		return ret;

	/*
	 * TDX requires TLB tracking before dropping a private page. Do
	 * it here, although it is also done later.
	 */
	tdx_track(kvm);

	return tdx_sept_drop_private_spte(kvm, gfn, level, page);
}

void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
			   int trig_mode, int vector)
{
	struct kvm_vcpu *vcpu = apic->vcpu;
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	/* TDX supports only posted interrupts. No LAPIC emulation. */
*/ 1889 __vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector); 1890 1891 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); 1892 } 1893 1894 static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu) 1895 { 1896 u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK; 1897 u64 eq = vmx_get_exit_qual(vcpu); 1898 1899 if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION) 1900 return false; 1901 1902 return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN); 1903 } 1904 1905 static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu) 1906 { 1907 unsigned long exit_qual; 1908 gpa_t gpa = to_tdx(vcpu)->exit_gpa; 1909 bool local_retry = false; 1910 int ret; 1911 1912 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { 1913 if (tdx_is_sept_violation_unexpected_pending(vcpu)) { 1914 pr_warn("Guest access before accepting 0x%llx on vCPU %d\n", 1915 gpa, vcpu->vcpu_id); 1916 kvm_vm_dead(vcpu->kvm); 1917 return -EIO; 1918 } 1919 /* 1920 * Always treat SEPT violations as write faults. Ignore the 1921 * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations. 1922 * TD private pages are always RWX in the SEPT tables, 1923 * i.e. they're always mapped writable. Just as importantly, 1924 * treating SEPT violations as write faults is necessary to 1925 * avoid COW allocations, which will cause TDAUGPAGE failures 1926 * due to aliasing a single HPA to multiple GPAs. 1927 */ 1928 exit_qual = EPT_VIOLATION_ACC_WRITE; 1929 1930 /* Only private GPA triggers zero-step mitigation */ 1931 local_retry = true; 1932 } else { 1933 exit_qual = vmx_get_exit_qual(vcpu); 1934 /* 1935 * EPT violation due to instruction fetch should never be 1936 * triggered from shared memory in TDX guest. If such EPT 1937 * violation occurs, treat it as broken hardware. 1938 */ 1939 if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm)) 1940 return -EIO; 1941 } 1942 1943 trace_kvm_page_fault(vcpu, gpa, exit_qual); 1944 1945 /* 1946 * To minimize TDH.VP.ENTER invocations, retry locally for private GPA 1947 * mapping in TDX. 1948 * 1949 * KVM may return RET_PF_RETRY for private GPA due to 1950 * - contentions when atomically updating SPTEs of the mirror page table 1951 * - in-progress GFN invalidation or memslot removal. 1952 * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD, 1953 * caused by contentions with TDH.VP.ENTER (with zero-step mitigation) 1954 * or certain TDCALLs. 1955 * 1956 * If TDH.VP.ENTER is invoked more times than the threshold set by the 1957 * TDX module before KVM resolves the private GPA mapping, the TDX 1958 * module will activate zero-step mitigation during TDH.VP.ENTER. This 1959 * process acquires an SEPT tree lock in the TDX module, leading to 1960 * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD 1961 * operations on other vCPUs. 1962 * 1963 * Breaking out of local retries for kvm_vcpu_has_events() is for 1964 * interrupt injection. kvm_vcpu_has_events() should not see pending 1965 * events for TDX. Since KVM can't determine if IRQs (or NMIs) are 1966 * blocked by TDs, false positives are inevitable i.e., KVM may re-enter 1967 * the guest even if the IRQ/NMI can't be delivered. 1968 * 1969 * Note: even without breaking out of local retries, zero-step 1970 * mitigation may still occur due to 1971 * - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT, 1972 * - a single RIP causing EPT violations for more GFNs than the 1973 * threshold count. 
1974 * This is safe, as triggering zero-step mitigation only introduces 1975 * contentions to page installation SEAMCALLs on other vCPUs, which will 1976 * handle retries locally in their EPT violation handlers. 1977 */ 1978 while (1) { 1979 ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual); 1980 1981 if (ret != RET_PF_RETRY || !local_retry) 1982 break; 1983 1984 if (kvm_vcpu_has_events(vcpu) || signal_pending(current)) 1985 break; 1986 1987 if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) { 1988 ret = -EIO; 1989 break; 1990 } 1991 1992 cond_resched(); 1993 } 1994 return ret; 1995 } 1996 1997 int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) 1998 { 1999 if (err) { 2000 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 2001 return 1; 2002 } 2003 2004 if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ) 2005 tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu)); 2006 2007 return 1; 2008 } 2009 2010 2011 int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath) 2012 { 2013 struct vcpu_tdx *tdx = to_tdx(vcpu); 2014 u64 vp_enter_ret = tdx->vp_enter_ret; 2015 union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); 2016 2017 if (fastpath != EXIT_FASTPATH_NONE) 2018 return 1; 2019 2020 if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) { 2021 KVM_BUG_ON(1, vcpu->kvm); 2022 return -EIO; 2023 } 2024 2025 /* 2026 * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and 2027 * TDX_SEAMCALL_VMFAILINVALID. 2028 */ 2029 if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) { 2030 KVM_BUG_ON(!kvm_rebooting, vcpu->kvm); 2031 goto unhandled_exit; 2032 } 2033 2034 if (unlikely(tdx_failed_vmentry(vcpu))) { 2035 /* 2036 * If the guest state is protected, that means off-TD debug is 2037 * not enabled, TDX_NON_RECOVERABLE must be set. 
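		 *
		 * Userspace observes this as KVM_EXIT_FAIL_ENTRY, e.g. (an
		 * illustrative userspace check, not KVM code):
		 *
		 *	if (run->exit_reason == KVM_EXIT_FAIL_ENTRY)
		 *		errx(1, "entry failed, hw reason 0x%llx on cpu %d",
		 *		     run->fail_entry.hardware_entry_failure_reason,
		 *		     run->fail_entry.cpu);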
2038 */ 2039 WARN_ON_ONCE(vcpu->arch.guest_state_protected && 2040 !(vp_enter_ret & TDX_NON_RECOVERABLE)); 2041 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2042 vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full; 2043 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; 2044 return 0; 2045 } 2046 2047 if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) && 2048 exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) { 2049 kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret); 2050 goto unhandled_exit; 2051 } 2052 2053 WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT && 2054 (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS); 2055 2056 switch (exit_reason.basic) { 2057 case EXIT_REASON_TRIPLE_FAULT: 2058 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 2059 vcpu->mmio_needed = 0; 2060 return 0; 2061 case EXIT_REASON_EXCEPTION_NMI: 2062 return tdx_handle_exception_nmi(vcpu); 2063 case EXIT_REASON_EXTERNAL_INTERRUPT: 2064 ++vcpu->stat.irq_exits; 2065 return 1; 2066 case EXIT_REASON_CPUID: 2067 return tdx_emulate_cpuid(vcpu); 2068 case EXIT_REASON_HLT: 2069 return kvm_emulate_halt_noskip(vcpu); 2070 case EXIT_REASON_TDCALL: 2071 return handle_tdvmcall(vcpu); 2072 case EXIT_REASON_VMCALL: 2073 return tdx_emulate_vmcall(vcpu); 2074 case EXIT_REASON_IO_INSTRUCTION: 2075 return tdx_emulate_io(vcpu); 2076 case EXIT_REASON_MSR_READ: 2077 kvm_rcx_write(vcpu, tdx->vp_enter_args.r12); 2078 return kvm_emulate_rdmsr(vcpu); 2079 case EXIT_REASON_MSR_WRITE: 2080 kvm_rcx_write(vcpu, tdx->vp_enter_args.r12); 2081 kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u); 2082 kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32); 2083 return kvm_emulate_wrmsr(vcpu); 2084 case EXIT_REASON_EPT_MISCONFIG: 2085 return tdx_emulate_mmio(vcpu); 2086 case EXIT_REASON_EPT_VIOLATION: 2087 return tdx_handle_ept_violation(vcpu); 2088 case EXIT_REASON_OTHER_SMI: 2089 /* 2090 * Unlike VMX, SMI in SEAM non-root mode (i.e. when 2091 * TD guest vCPU is running) will cause VM exit to TDX module, 2092 * then SEAMRET to KVM. Once it exits to KVM, SMI is delivered 2093 * and handled by kernel handler right away. 2094 * 2095 * The Other SMI exit can also be caused by the SEAM non-root 2096 * machine check delivered via Machine Check System Management 2097 * Interrupt (MSMI), but it has already been handled by the 2098 * kernel machine check handler, i.e., the memory page has been 2099 * marked as poisoned and it won't be freed to the free list 2100 * when the TDX guest is terminated (the TDX module marks the 2101 * guest as dead and prevent it from further running when 2102 * machine check happens in SEAM non-root). 2103 * 2104 * - A MSMI will not reach here, it's handled as non_recoverable 2105 * case above. 2106 * - If it's not an MSMI, no need to do anything here. 
2107 */ 2108 return 1; 2109 default: 2110 break; 2111 } 2112 2113 unhandled_exit: 2114 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 2115 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 2116 vcpu->run->internal.ndata = 2; 2117 vcpu->run->internal.data[0] = vp_enter_ret; 2118 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; 2119 return 0; 2120 } 2121 2122 void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, 2123 u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) 2124 { 2125 struct vcpu_tdx *tdx = to_tdx(vcpu); 2126 2127 *reason = tdx->vt.exit_reason.full; 2128 if (*reason != -1u) { 2129 *info1 = vmx_get_exit_qual(vcpu); 2130 *info2 = tdx->ext_exit_qualification; 2131 *intr_info = vmx_get_intr_info(vcpu); 2132 } else { 2133 *info1 = 0; 2134 *info2 = 0; 2135 *intr_info = 0; 2136 } 2137 2138 *error_code = 0; 2139 } 2140 2141 bool tdx_has_emulated_msr(u32 index) 2142 { 2143 switch (index) { 2144 case MSR_IA32_UCODE_REV: 2145 case MSR_IA32_ARCH_CAPABILITIES: 2146 case MSR_IA32_POWER_CTL: 2147 case MSR_IA32_CR_PAT: 2148 case MSR_MTRRcap: 2149 case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: 2150 case MSR_MTRRdefType: 2151 case MSR_IA32_TSC_DEADLINE: 2152 case MSR_IA32_MISC_ENABLE: 2153 case MSR_PLATFORM_INFO: 2154 case MSR_MISC_FEATURES_ENABLES: 2155 case MSR_IA32_APICBASE: 2156 case MSR_EFER: 2157 case MSR_IA32_FEAT_CTL: 2158 case MSR_IA32_MCG_CAP: 2159 case MSR_IA32_MCG_STATUS: 2160 case MSR_IA32_MCG_CTL: 2161 case MSR_IA32_MCG_EXT_CTL: 2162 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: 2163 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: 2164 /* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */ 2165 case MSR_KVM_POLL_CONTROL: 2166 return true; 2167 case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: 2168 /* 2169 * x2APIC registers that are virtualized by the CPU can't be 2170 * emulated, KVM doesn't have access to the virtual APIC page. 2171 */ 2172 switch (index) { 2173 case X2APIC_MSR(APIC_TASKPRI): 2174 case X2APIC_MSR(APIC_PROCPRI): 2175 case X2APIC_MSR(APIC_EOI): 2176 case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR): 2177 case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR): 2178 case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR): 2179 return false; 2180 default: 2181 return true; 2182 } 2183 default: 2184 return false; 2185 } 2186 } 2187 2188 static bool tdx_is_read_only_msr(u32 index) 2189 { 2190 return index == MSR_IA32_APICBASE || index == MSR_EFER || 2191 index == MSR_IA32_FEAT_CTL; 2192 } 2193 2194 int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 2195 { 2196 switch (msr->index) { 2197 case MSR_IA32_FEAT_CTL: 2198 /* 2199 * MCE and MCA are advertised via cpuid. Guest kernel could 2200 * check if LMCE is enabled or not. 
2201 */ 2202 msr->data = FEAT_CTL_LOCKED; 2203 if (vcpu->arch.mcg_cap & MCG_LMCE_P) 2204 msr->data |= FEAT_CTL_LMCE_ENABLED; 2205 return 0; 2206 case MSR_IA32_MCG_EXT_CTL: 2207 if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) 2208 return 1; 2209 msr->data = vcpu->arch.mcg_ext_ctl; 2210 return 0; 2211 default: 2212 if (!tdx_has_emulated_msr(msr->index)) 2213 return 1; 2214 2215 return kvm_get_msr_common(vcpu, msr); 2216 } 2217 } 2218 2219 int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 2220 { 2221 switch (msr->index) { 2222 case MSR_IA32_MCG_EXT_CTL: 2223 if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) || 2224 (msr->data & ~MCG_EXT_CTL_LMCE_EN)) 2225 return 1; 2226 vcpu->arch.mcg_ext_ctl = msr->data; 2227 return 0; 2228 default: 2229 if (tdx_is_read_only_msr(msr->index)) 2230 return 1; 2231 2232 if (!tdx_has_emulated_msr(msr->index)) 2233 return 1; 2234 2235 return kvm_set_msr_common(vcpu, msr); 2236 } 2237 } 2238 2239 static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd) 2240 { 2241 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; 2242 struct kvm_tdx_capabilities __user *user_caps; 2243 struct kvm_tdx_capabilities *caps = NULL; 2244 int ret = 0; 2245 2246 /* flags is reserved for future use */ 2247 if (cmd->flags) 2248 return -EINVAL; 2249 2250 caps = kmalloc(sizeof(*caps) + 2251 sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config, 2252 GFP_KERNEL); 2253 if (!caps) 2254 return -ENOMEM; 2255 2256 user_caps = u64_to_user_ptr(cmd->data); 2257 if (copy_from_user(caps, user_caps, sizeof(*caps))) { 2258 ret = -EFAULT; 2259 goto out; 2260 } 2261 2262 if (caps->cpuid.nent < td_conf->num_cpuid_config) { 2263 ret = -E2BIG; 2264 goto out; 2265 } 2266 2267 ret = init_kvm_tdx_caps(td_conf, caps); 2268 if (ret) 2269 goto out; 2270 2271 if (copy_to_user(user_caps, caps, sizeof(*caps))) { 2272 ret = -EFAULT; 2273 goto out; 2274 } 2275 2276 if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries, 2277 caps->cpuid.nent * 2278 sizeof(caps->cpuid.entries[0]))) 2279 ret = -EFAULT; 2280 2281 out: 2282 /* kfree() accepts NULL. */ 2283 kfree(caps); 2284 return ret; 2285 } 2286 2287 /* 2288 * KVM reports guest physical address in CPUID.0x800000008.EAX[23:16], which is 2289 * similar to TDX's GPAW. Use this field as the interface for userspace to 2290 * configure the GPAW and EPT level for TDs. 2291 * 2292 * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level 2293 * 5, Value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always 2294 * supported. Value 52 is only supported when the platform supports 5 level 2295 * EPT. 
2296 */ 2297 static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid, 2298 struct td_params *td_params) 2299 { 2300 const struct kvm_cpuid_entry2 *entry; 2301 int guest_pa; 2302 2303 entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0); 2304 if (!entry) 2305 return -EINVAL; 2306 2307 guest_pa = tdx_get_guest_phys_addr_bits(entry->eax); 2308 2309 if (guest_pa != 48 && guest_pa != 52) 2310 return -EINVAL; 2311 2312 if (guest_pa == 52 && !cpu_has_vmx_ept_5levels()) 2313 return -EINVAL; 2314 2315 td_params->eptp_controls = VMX_EPTP_MT_WB; 2316 if (guest_pa == 52) { 2317 td_params->eptp_controls |= VMX_EPTP_PWL_5; 2318 td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW; 2319 } else { 2320 td_params->eptp_controls |= VMX_EPTP_PWL_4; 2321 } 2322 2323 return 0; 2324 } 2325 2326 static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid, 2327 struct td_params *td_params) 2328 { 2329 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; 2330 const struct kvm_cpuid_entry2 *entry; 2331 struct tdx_cpuid_value *value; 2332 int i, copy_cnt = 0; 2333 2334 /* 2335 * td_params.cpuid_values: The number and the order of cpuid_value must 2336 * be same to the one of struct tdsysinfo.{num_cpuid_config, cpuid_configs} 2337 * It's assumed that td_params was zeroed. 2338 */ 2339 for (i = 0; i < td_conf->num_cpuid_config; i++) { 2340 struct kvm_cpuid_entry2 tmp; 2341 2342 td_init_cpuid_entry2(&tmp, i); 2343 2344 entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 2345 tmp.function, tmp.index); 2346 if (!entry) 2347 continue; 2348 2349 if (tdx_unsupported_cpuid(entry)) 2350 return -EINVAL; 2351 2352 copy_cnt++; 2353 2354 value = &td_params->cpuid_values[i]; 2355 value->eax = entry->eax; 2356 value->ebx = entry->ebx; 2357 value->ecx = entry->ecx; 2358 value->edx = entry->edx; 2359 2360 /* 2361 * TDX module does not accept nonzero bits 16..23 for the 2362 * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls(). 2363 */ 2364 if (tmp.function == 0x80000008) 2365 value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0); 2366 } 2367 2368 /* 2369 * Rely on the TDX module to reject invalid configuration, but it can't 2370 * check of leafs that don't have a proper slot in td_params->cpuid_values 2371 * to stick then. So fail if there were entries that didn't get copied to 2372 * td_params. 
2373 */ 2374 if (copy_cnt != cpuid->nent) 2375 return -EINVAL; 2376 2377 return 0; 2378 } 2379 2380 static int setup_tdparams(struct kvm *kvm, struct td_params *td_params, 2381 struct kvm_tdx_init_vm *init_vm) 2382 { 2383 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; 2384 struct kvm_cpuid2 *cpuid = &init_vm->cpuid; 2385 int ret; 2386 2387 if (kvm->created_vcpus) 2388 return -EBUSY; 2389 2390 if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf)) 2391 return -EINVAL; 2392 2393 if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf)) 2394 return -EINVAL; 2395 2396 td_params->max_vcpus = kvm->max_vcpus; 2397 td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1; 2398 td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1; 2399 2400 td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD; 2401 td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz); 2402 2403 ret = setup_tdparams_eptp_controls(cpuid, td_params); 2404 if (ret) 2405 return ret; 2406 2407 ret = setup_tdparams_cpuids(cpuid, td_params); 2408 if (ret) 2409 return ret; 2410 2411 #define MEMCPY_SAME_SIZE(dst, src) \ 2412 do { \ 2413 BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \ 2414 memcpy((dst), (src), sizeof(dst)); \ 2415 } while (0) 2416 2417 MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid); 2418 MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner); 2419 MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig); 2420 2421 return 0; 2422 } 2423 2424 static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, 2425 u64 *seamcall_err) 2426 { 2427 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 2428 cpumask_var_t packages; 2429 struct page **tdcs_pages = NULL; 2430 struct page *tdr_page; 2431 int ret, i; 2432 u64 err, rcx; 2433 2434 *seamcall_err = 0; 2435 ret = tdx_guest_keyid_alloc(); 2436 if (ret < 0) 2437 return ret; 2438 kvm_tdx->hkid = ret; 2439 kvm_tdx->misc_cg = get_current_misc_cg(); 2440 ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); 2441 if (ret) 2442 goto free_hkid; 2443 2444 ret = -ENOMEM; 2445 2446 atomic_inc(&nr_configured_hkid); 2447 2448 tdr_page = alloc_page(GFP_KERNEL); 2449 if (!tdr_page) 2450 goto free_hkid; 2451 2452 kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE; 2453 /* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */ 2454 kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1; 2455 tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages), 2456 GFP_KERNEL | __GFP_ZERO); 2457 if (!tdcs_pages) 2458 goto free_tdr; 2459 2460 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { 2461 tdcs_pages[i] = alloc_page(GFP_KERNEL); 2462 if (!tdcs_pages[i]) 2463 goto free_tdcs; 2464 } 2465 2466 if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) 2467 goto free_tdcs; 2468 2469 cpus_read_lock(); 2470 2471 /* 2472 * Need at least one CPU of the package to be online in order to 2473 * program all packages for host key id. Check it. 2474 */ 2475 for_each_present_cpu(i) 2476 cpumask_set_cpu(topology_physical_package_id(i), packages); 2477 for_each_online_cpu(i) 2478 cpumask_clear_cpu(topology_physical_package_id(i), packages); 2479 if (!cpumask_empty(packages)) { 2480 ret = -EIO; 2481 /* 2482 * Because it's hard for human operator to figure out the 2483 * reason, warn it. 2484 */ 2485 #define MSG_ALLPKG "All packages need to have online CPU to create TD. 
Online CPU and retry.\n" 2486 pr_warn_ratelimited(MSG_ALLPKG); 2487 goto free_packages; 2488 } 2489 2490 /* 2491 * TDH.MNG.CREATE tries to grab the global TDX module and fails 2492 * with TDX_OPERAND_BUSY when it fails to grab. Take the global 2493 * lock to prevent it from failure. 2494 */ 2495 mutex_lock(&tdx_lock); 2496 kvm_tdx->td.tdr_page = tdr_page; 2497 err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid); 2498 mutex_unlock(&tdx_lock); 2499 2500 if (err == TDX_RND_NO_ENTROPY) { 2501 ret = -EAGAIN; 2502 goto free_packages; 2503 } 2504 2505 if (WARN_ON_ONCE(err)) { 2506 pr_tdx_error(TDH_MNG_CREATE, err); 2507 ret = -EIO; 2508 goto free_packages; 2509 } 2510 2511 for_each_online_cpu(i) { 2512 int pkg = topology_physical_package_id(i); 2513 2514 if (cpumask_test_and_set_cpu(pkg, packages)) 2515 continue; 2516 2517 /* 2518 * Program the memory controller in the package with an 2519 * encryption key associated to a TDX private host key id 2520 * assigned to this TDR. Concurrent operations on same memory 2521 * controller results in TDX_OPERAND_BUSY. No locking needed 2522 * beyond the cpus_read_lock() above as it serializes against 2523 * hotplug and the first online CPU of the package is always 2524 * used. We never have two CPUs in the same socket trying to 2525 * program the key. 2526 */ 2527 ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config, 2528 kvm_tdx, true); 2529 if (ret) 2530 break; 2531 } 2532 cpus_read_unlock(); 2533 free_cpumask_var(packages); 2534 if (ret) { 2535 i = 0; 2536 goto teardown; 2537 } 2538 2539 kvm_tdx->td.tdcs_pages = tdcs_pages; 2540 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { 2541 err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]); 2542 if (err == TDX_RND_NO_ENTROPY) { 2543 /* Here it's hard to allow userspace to retry. */ 2544 ret = -EAGAIN; 2545 goto teardown; 2546 } 2547 if (WARN_ON_ONCE(err)) { 2548 pr_tdx_error(TDH_MNG_ADDCX, err); 2549 ret = -EIO; 2550 goto teardown; 2551 } 2552 } 2553 2554 err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx); 2555 if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) { 2556 /* 2557 * Because a user gives operands, don't warn. 2558 * Return a hint to the user because it's sometimes hard for the 2559 * user to figure out which operand is invalid. SEAMCALL status 2560 * code includes which operand caused invalid operand error. 2561 */ 2562 *seamcall_err = err; 2563 ret = -EINVAL; 2564 goto teardown; 2565 } else if (WARN_ON_ONCE(err)) { 2566 pr_tdx_error_1(TDH_MNG_INIT, err, rcx); 2567 ret = -EIO; 2568 goto teardown; 2569 } 2570 2571 return 0; 2572 2573 /* 2574 * The sequence for freeing resources from a partially initialized TD 2575 * varies based on where in the initialization flow failure occurred. 2576 * Simply use the full teardown and destroy, which naturally play nice 2577 * with partial initialization. 
2578 */ 2579 teardown: 2580 /* Only free pages not yet added, so start at 'i' */ 2581 for (; i < kvm_tdx->td.tdcs_nr_pages; i++) { 2582 if (tdcs_pages[i]) { 2583 __free_page(tdcs_pages[i]); 2584 tdcs_pages[i] = NULL; 2585 } 2586 } 2587 if (!kvm_tdx->td.tdcs_pages) 2588 kfree(tdcs_pages); 2589 2590 tdx_mmu_release_hkid(kvm); 2591 tdx_reclaim_td_control_pages(kvm); 2592 2593 return ret; 2594 2595 free_packages: 2596 cpus_read_unlock(); 2597 free_cpumask_var(packages); 2598 2599 free_tdcs: 2600 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { 2601 if (tdcs_pages[i]) 2602 __free_page(tdcs_pages[i]); 2603 } 2604 kfree(tdcs_pages); 2605 kvm_tdx->td.tdcs_pages = NULL; 2606 2607 free_tdr: 2608 if (tdr_page) 2609 __free_page(tdr_page); 2610 kvm_tdx->td.tdr_page = 0; 2611 2612 free_hkid: 2613 tdx_hkid_free(kvm_tdx); 2614 2615 return ret; 2616 } 2617 2618 static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id, 2619 u64 *data) 2620 { 2621 u64 err; 2622 2623 err = tdh_mng_rd(&tdx->td, field_id, data); 2624 2625 return err; 2626 } 2627 2628 #define TDX_MD_UNREADABLE_LEAF_MASK GENMASK(30, 7) 2629 #define TDX_MD_UNREADABLE_SUBLEAF_MASK GENMASK(31, 7) 2630 2631 static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf, 2632 bool sub_leaf_set, int *entry_index, 2633 struct kvm_cpuid_entry2 *out) 2634 { 2635 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 2636 u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES; 2637 u64 ebx_eax, edx_ecx; 2638 u64 err = 0; 2639 2640 if (sub_leaf > 0b1111111) 2641 return -EINVAL; 2642 2643 if (*entry_index >= KVM_MAX_CPUID_ENTRIES) 2644 return -EINVAL; 2645 2646 if (leaf & TDX_MD_UNREADABLE_LEAF_MASK || 2647 sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK) 2648 return -EINVAL; 2649 2650 /* 2651 * bit 23:17, REVSERVED: reserved, must be 0; 2652 * bit 16, LEAF_31: leaf number bit 31; 2653 * bit 15:9, LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are 2654 * implicitly 0; 2655 * bit 8, SUBLEAF_NA: sub-leaf not applicable flag; 2656 * bit 7:1, SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1, 2657 * the SUBLEAF_6_0 is all-1. 2658 * sub-leaf bits 31:7 are implicitly 0; 2659 * bit 0, ELEMENT_I: Element index within field; 2660 */ 2661 field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16; 2662 field_id |= (leaf & 0x7f) << 9; 2663 if (sub_leaf_set) 2664 field_id |= (sub_leaf & 0x7f) << 1; 2665 else 2666 field_id |= 0x1fe; 2667 2668 err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax); 2669 if (err) //TODO check for specific errors 2670 goto err_out; 2671 2672 out->eax = (u32) ebx_eax; 2673 out->ebx = (u32) (ebx_eax >> 32); 2674 2675 field_id++; 2676 err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx); 2677 /* 2678 * It's weird that reading edx_ecx fails while reading ebx_eax 2679 * succeeded. 2680 */ 2681 if (WARN_ON_ONCE(err)) 2682 goto err_out; 2683 2684 out->ecx = (u32) edx_ecx; 2685 out->edx = (u32) (edx_ecx >> 32); 2686 2687 out->function = leaf; 2688 out->index = sub_leaf; 2689 out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0; 2690 2691 /* 2692 * Work around missing support on old TDX modules, fetch 2693 * guest maxpa from gfn_direct_bits. 
2694 */ 2695 if (leaf == 0x80000008) { 2696 gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm)); 2697 unsigned int g_maxpa = __ffs(gpa_bits) + 1; 2698 2699 out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa); 2700 } 2701 2702 (*entry_index)++; 2703 2704 return 0; 2705 2706 err_out: 2707 out->eax = 0; 2708 out->ebx = 0; 2709 out->ecx = 0; 2710 out->edx = 0; 2711 2712 return -EIO; 2713 } 2714 2715 static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd) 2716 { 2717 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 2718 struct kvm_tdx_init_vm *init_vm; 2719 struct td_params *td_params = NULL; 2720 int ret; 2721 2722 BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid)); 2723 BUILD_BUG_ON(sizeof(struct td_params) != 1024); 2724 2725 if (kvm_tdx->state != TD_STATE_UNINITIALIZED) 2726 return -EINVAL; 2727 2728 if (cmd->flags) 2729 return -EINVAL; 2730 2731 init_vm = kmalloc(sizeof(*init_vm) + 2732 sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES, 2733 GFP_KERNEL); 2734 if (!init_vm) 2735 return -ENOMEM; 2736 2737 if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) { 2738 ret = -EFAULT; 2739 goto out; 2740 } 2741 2742 if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) { 2743 ret = -E2BIG; 2744 goto out; 2745 } 2746 2747 if (copy_from_user(init_vm->cpuid.entries, 2748 u64_to_user_ptr(cmd->data) + sizeof(*init_vm), 2749 flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) { 2750 ret = -EFAULT; 2751 goto out; 2752 } 2753 2754 if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) { 2755 ret = -EINVAL; 2756 goto out; 2757 } 2758 2759 if (init_vm->cpuid.padding) { 2760 ret = -EINVAL; 2761 goto out; 2762 } 2763 2764 td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL); 2765 if (!td_params) { 2766 ret = -ENOMEM; 2767 goto out; 2768 } 2769 2770 ret = setup_tdparams(kvm, td_params, init_vm); 2771 if (ret) 2772 goto out; 2773 2774 ret = __tdx_td_init(kvm, td_params, &cmd->hw_error); 2775 if (ret) 2776 goto out; 2777 2778 kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET); 2779 kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER); 2780 kvm_tdx->attributes = td_params->attributes; 2781 kvm_tdx->xfam = td_params->xfam; 2782 2783 if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW) 2784 kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5; 2785 else 2786 kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4; 2787 2788 kvm_tdx->state = TD_STATE_INITIALIZED; 2789 out: 2790 /* kfree() accepts NULL. */ 2791 kfree(init_vm); 2792 kfree(td_params); 2793 2794 return ret; 2795 } 2796 2797 void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) 2798 { 2799 /* 2800 * flush_tlb_current() is invoked when the first time for the vcpu to 2801 * run or when root of shared EPT is invalidated. 2802 * KVM only needs to flush shared EPT because the TDX module handles TLB 2803 * invalidation for private EPT in tdh_vp_enter(); 2804 * 2805 * A single context invalidation for shared EPT can be performed here. 2806 * However, this single context invalidation requires the private EPTP 2807 * rather than the shared EPTP to flush shared EPT, as shared EPT uses 2808 * private EPTP as its ASID for TLB invalidation. 2809 * 2810 * To avoid reading back private EPTP, perform a global invalidation for 2811 * shared EPT instead to keep this function simple. 
2812 */ 2813 ept_sync_global(); 2814 } 2815 2816 void tdx_flush_tlb_all(struct kvm_vcpu *vcpu) 2817 { 2818 /* 2819 * TDX has called tdx_track() in tdx_sept_remove_private_spte() to 2820 * ensure that private EPT will be flushed on the next TD enter. No need 2821 * to call tdx_track() here again even when this callback is a result of 2822 * zapping private EPT. 2823 * 2824 * Due to the lack of the context to determine which EPT has been 2825 * affected by zapping, invoke invept() directly here for both shared 2826 * EPT and private EPT for simplicity, though it's not necessary for 2827 * private EPT. 2828 */ 2829 ept_sync_global(); 2830 } 2831 2832 static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd) 2833 { 2834 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 2835 2836 guard(mutex)(&kvm->slots_lock); 2837 2838 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) 2839 return -EINVAL; 2840 /* 2841 * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue 2842 * TDH.MEM.PAGE.ADD(). 2843 */ 2844 if (atomic64_read(&kvm_tdx->nr_premapped)) 2845 return -EINVAL; 2846 2847 cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td); 2848 if (tdx_operand_busy(cmd->hw_error)) 2849 return -EBUSY; 2850 if (KVM_BUG_ON(cmd->hw_error, kvm)) { 2851 pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error); 2852 return -EIO; 2853 } 2854 2855 kvm_tdx->state = TD_STATE_RUNNABLE; 2856 /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */ 2857 smp_wmb(); 2858 kvm->arch.pre_fault_allowed = true; 2859 return 0; 2860 } 2861 2862 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) 2863 { 2864 struct kvm_tdx_cmd tdx_cmd; 2865 int r; 2866 2867 if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd))) 2868 return -EFAULT; 2869 2870 /* 2871 * Userspace should never set hw_error. It is used to fill 2872 * hardware-defined error by the kernel. 2873 */ 2874 if (tdx_cmd.hw_error) 2875 return -EINVAL; 2876 2877 mutex_lock(&kvm->lock); 2878 2879 switch (tdx_cmd.id) { 2880 case KVM_TDX_CAPABILITIES: 2881 r = tdx_get_capabilities(&tdx_cmd); 2882 break; 2883 case KVM_TDX_INIT_VM: 2884 r = tdx_td_init(kvm, &tdx_cmd); 2885 break; 2886 case KVM_TDX_FINALIZE_VM: 2887 r = tdx_td_finalize(kvm, &tdx_cmd); 2888 break; 2889 default: 2890 r = -EINVAL; 2891 goto out; 2892 } 2893 2894 if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd))) 2895 r = -EFAULT; 2896 2897 out: 2898 mutex_unlock(&kvm->lock); 2899 return r; 2900 } 2901 2902 /* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. 
*/ 2903 static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) 2904 { 2905 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 2906 struct vcpu_tdx *tdx = to_tdx(vcpu); 2907 struct page *page; 2908 int ret, i; 2909 u64 err; 2910 2911 page = alloc_page(GFP_KERNEL); 2912 if (!page) 2913 return -ENOMEM; 2914 tdx->vp.tdvpr_page = page; 2915 2916 tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages), 2917 GFP_KERNEL); 2918 if (!tdx->vp.tdcx_pages) { 2919 ret = -ENOMEM; 2920 goto free_tdvpr; 2921 } 2922 2923 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { 2924 page = alloc_page(GFP_KERNEL); 2925 if (!page) { 2926 ret = -ENOMEM; 2927 goto free_tdcx; 2928 } 2929 tdx->vp.tdcx_pages[i] = page; 2930 } 2931 2932 err = tdh_vp_create(&kvm_tdx->td, &tdx->vp); 2933 if (KVM_BUG_ON(err, vcpu->kvm)) { 2934 ret = -EIO; 2935 pr_tdx_error(TDH_VP_CREATE, err); 2936 goto free_tdcx; 2937 } 2938 2939 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { 2940 err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]); 2941 if (KVM_BUG_ON(err, vcpu->kvm)) { 2942 pr_tdx_error(TDH_VP_ADDCX, err); 2943 /* 2944 * Pages already added are reclaimed by the vcpu_free 2945 * method, but the rest are freed here. 2946 */ 2947 for (; i < kvm_tdx->td.tdcx_nr_pages; i++) { 2948 __free_page(tdx->vp.tdcx_pages[i]); 2949 tdx->vp.tdcx_pages[i] = NULL; 2950 } 2951 return -EIO; 2952 } 2953 } 2954 2955 err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id); 2956 if (KVM_BUG_ON(err, vcpu->kvm)) { 2957 pr_tdx_error(TDH_VP_INIT, err); 2958 return -EIO; 2959 } 2960 2961 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 2962 2963 return 0; 2964 2965 free_tdcx: 2966 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { 2967 if (tdx->vp.tdcx_pages[i]) 2968 __free_page(tdx->vp.tdcx_pages[i]); 2969 tdx->vp.tdcx_pages[i] = NULL; 2970 } 2971 kfree(tdx->vp.tdcx_pages); 2972 tdx->vp.tdcx_pages = NULL; 2973 2974 free_tdvpr: 2975 if (tdx->vp.tdvpr_page) 2976 __free_page(tdx->vp.tdvpr_page); 2977 tdx->vp.tdvpr_page = 0; 2978 2979 return ret; 2980 } 2981 2982 /* Sometimes reads multipple subleafs. Return how many enties were written. */ 2983 static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index, 2984 struct kvm_cpuid_entry2 *output_e) 2985 { 2986 int sub_leaf = 0; 2987 int ret; 2988 2989 /* First try without a subleaf */ 2990 ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e); 2991 2992 /* If success, or invalid leaf, just give up */ 2993 if (ret != -EIO) 2994 return ret; 2995 2996 /* 2997 * If the try without a subleaf failed, try reading subleafs until 2998 * failure. The TDX module only supports 6 bits of subleaf index. 2999 */ 3000 while (1) { 3001 /* Keep reading subleafs until there is a failure. 
*/ 3002 if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e)) 3003 return !sub_leaf; 3004 3005 sub_leaf++; 3006 output_e++; 3007 } 3008 3009 return 0; 3010 } 3011 3012 static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) 3013 { 3014 struct kvm_cpuid2 __user *output, *td_cpuid; 3015 int r = 0, i = 0, leaf; 3016 u32 level; 3017 3018 output = u64_to_user_ptr(cmd->data); 3019 td_cpuid = kzalloc(sizeof(*td_cpuid) + 3020 sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES, 3021 GFP_KERNEL); 3022 if (!td_cpuid) 3023 return -ENOMEM; 3024 3025 if (copy_from_user(td_cpuid, output, sizeof(*output))) { 3026 r = -EFAULT; 3027 goto out; 3028 } 3029 3030 /* Read max CPUID for normal range */ 3031 if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) { 3032 r = -EIO; 3033 goto out; 3034 } 3035 level = td_cpuid->entries[0].eax; 3036 3037 for (leaf = 1; leaf <= level; leaf++) 3038 tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]); 3039 3040 /* Read max CPUID for extended range */ 3041 if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) { 3042 r = -EIO; 3043 goto out; 3044 } 3045 level = td_cpuid->entries[i - 1].eax; 3046 3047 for (leaf = 0x80000001; leaf <= level; leaf++) 3048 tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]); 3049 3050 if (td_cpuid->nent < i) 3051 r = -E2BIG; 3052 td_cpuid->nent = i; 3053 3054 if (copy_to_user(output, td_cpuid, sizeof(*output))) { 3055 r = -EFAULT; 3056 goto out; 3057 } 3058 3059 if (r == -E2BIG) 3060 goto out; 3061 3062 if (copy_to_user(output->entries, td_cpuid->entries, 3063 td_cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 3064 r = -EFAULT; 3065 3066 out: 3067 kfree(td_cpuid); 3068 3069 return r; 3070 } 3071 3072 static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) 3073 { 3074 u64 apic_base; 3075 struct vcpu_tdx *tdx = to_tdx(vcpu); 3076 int ret; 3077 3078 if (cmd->flags) 3079 return -EINVAL; 3080 3081 if (tdx->state != VCPU_TD_STATE_UNINITIALIZED) 3082 return -EINVAL; 3083 3084 /* 3085 * TDX requires X2APIC, userspace is responsible for configuring guest 3086 * CPUID accordingly. 3087 */ 3088 apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC | 3089 (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0); 3090 if (kvm_apic_set_base(vcpu, apic_base, true)) 3091 return -EINVAL; 3092 3093 ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data); 3094 if (ret) 3095 return ret; 3096 3097 td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR); 3098 td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc)); 3099 td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR); 3100 3101 tdx->state = VCPU_TD_STATE_INITIALIZED; 3102 3103 return 0; 3104 } 3105 3106 void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 3107 { 3108 /* 3109 * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all 3110 * INIT events. 3111 * 3112 * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as 3113 * userspace needs to define the vCPU model before KVM can initialize 3114 * vCPU state, e.g. to enable x2APIC. 
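	 *
	 * The expected userspace ordering is therefore (illustrative, not
	 * enforced here):
	 *
	 *	KVM_CREATE_VCPU
	 *	KVM_SET_CPUID2                            - define the vCPU model
	 *	KVM_MEMORY_ENCRYPT_OP(KVM_TDX_INIT_VCPU)  - allocate TDVPR/TDCX and init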
3115 */ 3116 WARN_ON_ONCE(init_event); 3117 } 3118 3119 struct tdx_gmem_post_populate_arg { 3120 struct kvm_vcpu *vcpu; 3121 __u32 flags; 3122 }; 3123 3124 static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, 3125 void __user *src, int order, void *_arg) 3126 { 3127 u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS; 3128 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 3129 struct tdx_gmem_post_populate_arg *arg = _arg; 3130 struct kvm_vcpu *vcpu = arg->vcpu; 3131 gpa_t gpa = gfn_to_gpa(gfn); 3132 u8 level = PG_LEVEL_4K; 3133 struct page *src_page; 3134 int ret, i; 3135 u64 err, entry, level_state; 3136 3137 /* 3138 * Get the source page if it has been faulted in. Return failure if the 3139 * source page has been swapped out or unmapped in primary memory. 3140 */ 3141 ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page); 3142 if (ret < 0) 3143 return ret; 3144 if (ret != 1) 3145 return -ENOMEM; 3146 3147 ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level); 3148 if (ret < 0) 3149 goto out; 3150 3151 /* 3152 * The private mem cannot be zapped after kvm_tdp_map_page() 3153 * because all paths are covered by slots_lock and the 3154 * filemap invalidate lock. Check that they are indeed enough. 3155 */ 3156 if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) { 3157 scoped_guard(read_lock, &kvm->mmu_lock) { 3158 if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) { 3159 ret = -EIO; 3160 goto out; 3161 } 3162 } 3163 } 3164 3165 ret = 0; 3166 err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn), 3167 src_page, &entry, &level_state); 3168 if (err) { 3169 ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO; 3170 goto out; 3171 } 3172 3173 if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) 3174 atomic64_dec(&kvm_tdx->nr_premapped); 3175 3176 if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) { 3177 for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) { 3178 err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, 3179 &level_state); 3180 if (err) { 3181 ret = -EIO; 3182 break; 3183 } 3184 } 3185 } 3186 3187 out: 3188 put_page(src_page); 3189 return ret; 3190 } 3191 3192 static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) 3193 { 3194 struct vcpu_tdx *tdx = to_tdx(vcpu); 3195 struct kvm *kvm = vcpu->kvm; 3196 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 3197 struct kvm_tdx_init_mem_region region; 3198 struct tdx_gmem_post_populate_arg arg; 3199 long gmem_ret; 3200 int ret; 3201 3202 if (tdx->state != VCPU_TD_STATE_INITIALIZED) 3203 return -EINVAL; 3204 3205 guard(mutex)(&kvm->slots_lock); 3206 3207 /* Once TD is finalized, the initial guest memory is fixed. 
*/ 3208 if (kvm_tdx->state == TD_STATE_RUNNABLE) 3209 return -EINVAL; 3210 3211 if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION) 3212 return -EINVAL; 3213 3214 if (copy_from_user(®ion, u64_to_user_ptr(cmd->data), sizeof(region))) 3215 return -EFAULT; 3216 3217 if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) || 3218 !region.nr_pages || 3219 region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa || 3220 !vt_is_tdx_private_gpa(kvm, region.gpa) || 3221 !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1)) 3222 return -EINVAL; 3223 3224 kvm_mmu_reload(vcpu); 3225 ret = 0; 3226 while (region.nr_pages) { 3227 if (signal_pending(current)) { 3228 ret = -EINTR; 3229 break; 3230 } 3231 3232 arg = (struct tdx_gmem_post_populate_arg) { 3233 .vcpu = vcpu, 3234 .flags = cmd->flags, 3235 }; 3236 gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa), 3237 u64_to_user_ptr(region.source_addr), 3238 1, tdx_gmem_post_populate, &arg); 3239 if (gmem_ret < 0) { 3240 ret = gmem_ret; 3241 break; 3242 } 3243 3244 if (gmem_ret != 1) { 3245 ret = -EIO; 3246 break; 3247 } 3248 3249 region.source_addr += PAGE_SIZE; 3250 region.gpa += PAGE_SIZE; 3251 region.nr_pages--; 3252 3253 cond_resched(); 3254 } 3255 3256 if (copy_to_user(u64_to_user_ptr(cmd->data), ®ion, sizeof(region))) 3257 ret = -EFAULT; 3258 return ret; 3259 } 3260 3261 int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) 3262 { 3263 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 3264 struct kvm_tdx_cmd cmd; 3265 int ret; 3266 3267 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) 3268 return -EINVAL; 3269 3270 if (copy_from_user(&cmd, argp, sizeof(cmd))) 3271 return -EFAULT; 3272 3273 if (cmd.hw_error) 3274 return -EINVAL; 3275 3276 switch (cmd.id) { 3277 case KVM_TDX_INIT_VCPU: 3278 ret = tdx_vcpu_init(vcpu, &cmd); 3279 break; 3280 case KVM_TDX_INIT_MEM_REGION: 3281 ret = tdx_vcpu_init_mem_region(vcpu, &cmd); 3282 break; 3283 case KVM_TDX_GET_CPUID: 3284 ret = tdx_vcpu_get_cpuid(vcpu, &cmd); 3285 break; 3286 default: 3287 ret = -EINVAL; 3288 break; 3289 } 3290 3291 return ret; 3292 } 3293 3294 int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) 3295 { 3296 return PG_LEVEL_4K; 3297 } 3298 3299 static int tdx_online_cpu(unsigned int cpu) 3300 { 3301 unsigned long flags; 3302 int r; 3303 3304 /* Sanity check CPU is already in post-VMXON */ 3305 WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE)); 3306 3307 local_irq_save(flags); 3308 r = tdx_cpu_enable(); 3309 local_irq_restore(flags); 3310 3311 return r; 3312 } 3313 3314 static int tdx_offline_cpu(unsigned int cpu) 3315 { 3316 int i; 3317 3318 /* No TD is running. Allow any cpu to be offline. */ 3319 if (!atomic_read(&nr_configured_hkid)) 3320 return 0; 3321 3322 /* 3323 * In order to reclaim TDX HKID, (i.e. when deleting guest TD), need to 3324 * call TDH.PHYMEM.PAGE.WBINVD on all packages to program all memory 3325 * controller with pconfig. If we have active TDX HKID, refuse to 3326 * offline the last online cpu. 3327 */ 3328 for_each_online_cpu(i) { 3329 /* 3330 * Found another online cpu on the same package. 3331 * Allow to offline. 3332 */ 3333 if (i != cpu && topology_physical_package_id(i) == 3334 topology_physical_package_id(cpu)) 3335 return 0; 3336 } 3337 3338 /* 3339 * This is the last cpu of this package. Don't offline it. 3340 * 3341 * Because it's hard for human operator to understand the 3342 * reason, warn it. 
3343 */ 3344 #define MSG_ALLPKG_ONLINE \ 3345 "TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n" 3346 pr_warn_ratelimited(MSG_ALLPKG_ONLINE); 3347 return -EBUSY; 3348 } 3349 3350 static void __do_tdx_cleanup(void) 3351 { 3352 /* 3353 * Once TDX module is initialized, it cannot be disabled and 3354 * re-initialized again w/o runtime update (which isn't 3355 * supported by kernel). Only need to remove the cpuhp here. 3356 * The TDX host core code tracks TDX status and can handle 3357 * 'multiple enabling' scenario. 3358 */ 3359 WARN_ON_ONCE(!tdx_cpuhp_state); 3360 cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state); 3361 tdx_cpuhp_state = 0; 3362 } 3363 3364 static void __tdx_cleanup(void) 3365 { 3366 cpus_read_lock(); 3367 __do_tdx_cleanup(); 3368 cpus_read_unlock(); 3369 } 3370 3371 static int __init __do_tdx_bringup(void) 3372 { 3373 int r; 3374 3375 /* 3376 * TDX-specific cpuhp callback to call tdx_cpu_enable() on all 3377 * online CPUs before calling tdx_enable(), and on any new 3378 * going-online CPU to make sure it is ready for TDX guest. 3379 */ 3380 r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN, 3381 "kvm/cpu/tdx:online", 3382 tdx_online_cpu, tdx_offline_cpu); 3383 if (r < 0) 3384 return r; 3385 3386 tdx_cpuhp_state = r; 3387 3388 r = tdx_enable(); 3389 if (r) 3390 __do_tdx_cleanup(); 3391 3392 return r; 3393 } 3394 3395 static int __init __tdx_bringup(void) 3396 { 3397 const struct tdx_sys_info_td_conf *td_conf; 3398 int r, i; 3399 3400 for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) { 3401 /* 3402 * Check if MSRs (tdx_uret_msrs) can be saved/restored 3403 * before returning to user space. 3404 * 3405 * this_cpu_ptr(user_return_msrs)->registered isn't checked 3406 * because the registration is done at vcpu runtime by 3407 * tdx_user_return_msr_update_cache(). 3408 */ 3409 tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr); 3410 if (tdx_uret_msrs[i].slot == -1) { 3411 /* If any MSR isn't supported, it is a KVM bug */ 3412 pr_err("MSR %x isn't included by kvm_find_user_return_msr\n", 3413 tdx_uret_msrs[i].msr); 3414 return -EIO; 3415 } 3416 } 3417 3418 /* 3419 * Enabling TDX requires enabling hardware virtualization first, 3420 * as making SEAMCALLs requires CPU being in post-VMXON state. 3421 */ 3422 r = kvm_enable_virtualization(); 3423 if (r) 3424 return r; 3425 3426 cpus_read_lock(); 3427 r = __do_tdx_bringup(); 3428 cpus_read_unlock(); 3429 3430 if (r) 3431 goto tdx_bringup_err; 3432 3433 /* Get TDX global information for later use */ 3434 tdx_sysinfo = tdx_get_sysinfo(); 3435 if (WARN_ON_ONCE(!tdx_sysinfo)) { 3436 r = -EINVAL; 3437 goto get_sysinfo_err; 3438 } 3439 3440 /* Check TDX module and KVM capabilities */ 3441 if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) || 3442 !tdx_get_supported_xfam(&tdx_sysinfo->td_conf)) 3443 goto get_sysinfo_err; 3444 3445 if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)) 3446 goto get_sysinfo_err; 3447 3448 /* 3449 * TDX has its own limit of maximum vCPUs it can support for all 3450 * TDX guests in addition to KVM_MAX_VCPUS. Userspace needs to 3451 * query TDX guest's maximum vCPUs by checking KVM_CAP_MAX_VCPU 3452 * extension on per-VM basis. 3453 * 3454 * TDX module reports such limit via the MAX_VCPU_PER_TD global 3455 * metadata. Different modules may report different values. 3456 * Some old module may also not support this metadata (in which 3457 * case this limit is U16_MAX). 
3458 * 3459 * In practice, the reported value reflects the maximum logical 3460 * CPUs that ALL the platforms that the module supports can 3461 * possibly have. 3462 * 3463 * Simply forwarding the MAX_VCPU_PER_TD to userspace could 3464 * result in an unpredictable ABI. KVM instead always advertise 3465 * the number of logical CPUs the platform has as the maximum 3466 * vCPUs for TDX guests. 3467 * 3468 * Make sure MAX_VCPU_PER_TD reported by TDX module is not 3469 * smaller than the number of logical CPUs, otherwise KVM will 3470 * report an unsupported value to userspace. 3471 * 3472 * Note, a platform with TDX enabled in the BIOS cannot support 3473 * physical CPU hotplug, and TDX requires the BIOS has marked 3474 * all logical CPUs in MADT table as enabled. Just use 3475 * num_present_cpus() for the number of logical CPUs. 3476 */ 3477 td_conf = &tdx_sysinfo->td_conf; 3478 if (td_conf->max_vcpus_per_td < num_present_cpus()) { 3479 pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n", 3480 td_conf->max_vcpus_per_td, num_present_cpus()); 3481 r = -EINVAL; 3482 goto get_sysinfo_err; 3483 } 3484 3485 if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) { 3486 r = -EINVAL; 3487 goto get_sysinfo_err; 3488 } 3489 3490 /* 3491 * Leave hardware virtualization enabled after TDX is enabled 3492 * successfully. TDX CPU hotplug depends on this. 3493 */ 3494 return 0; 3495 3496 get_sysinfo_err: 3497 __tdx_cleanup(); 3498 tdx_bringup_err: 3499 kvm_disable_virtualization(); 3500 return r; 3501 } 3502 3503 void tdx_cleanup(void) 3504 { 3505 if (enable_tdx) { 3506 misc_cg_set_capacity(MISC_CG_RES_TDX, 0); 3507 __tdx_cleanup(); 3508 kvm_disable_virtualization(); 3509 } 3510 } 3511 3512 int __init tdx_bringup(void) 3513 { 3514 int r, i; 3515 3516 /* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */ 3517 for_each_possible_cpu(i) 3518 INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i)); 3519 3520 if (!enable_tdx) 3521 return 0; 3522 3523 if (!enable_ept) { 3524 pr_err("EPT is required for TDX\n"); 3525 goto success_disable_tdx; 3526 } 3527 3528 if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) { 3529 pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n"); 3530 goto success_disable_tdx; 3531 } 3532 3533 if (!enable_apicv) { 3534 pr_err("APICv is required for TDX\n"); 3535 goto success_disable_tdx; 3536 } 3537 3538 if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) { 3539 pr_err("tdx: OSXSAVE is required for TDX\n"); 3540 goto success_disable_tdx; 3541 } 3542 3543 if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) { 3544 pr_err("tdx: MOVDIR64B is required for TDX\n"); 3545 goto success_disable_tdx; 3546 } 3547 3548 if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) { 3549 pr_err("Self-snoop is required for TDX\n"); 3550 goto success_disable_tdx; 3551 } 3552 3553 if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) { 3554 pr_err("tdx: no TDX private KeyIDs available\n"); 3555 goto success_disable_tdx; 3556 } 3557 3558 if (!enable_virt_at_load) { 3559 pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n"); 3560 goto success_disable_tdx; 3561 } 3562 3563 /* 3564 * Ideally KVM should probe whether TDX module has been loaded 3565 * first and then try to bring it up. But TDX needs to use SEAMCALL 3566 * to probe whether the module is loaded (there is no CPUID or MSR 3567 * for that), and making SEAMCALL requires enabling virtualization 3568 * first, just like the rest steps of bringing up TDX module. 
	 *
	 * So, for simplicity, do everything in __tdx_bringup(); the first
	 * SEAMCALL will return -ENODEV when the module is not loaded.  The
	 * only complication is making sure that initialization SEAMCALLs
	 * don't return TDX_SEAMCALL_VMFAILINVALID in other cases.
	 */
	r = __tdx_bringup();
	if (r) {
		/*
		 * Disable TDX, but don't fail the KVM module load, if the TDX
		 * module could not be loaded.  There is no need to print a
		 * "module is not loaded" message here because one was already
		 * printed when the first SEAMCALL failed.
		 */
		if (r == -ENODEV)
			goto success_disable_tdx;

		enable_tdx = 0;
	}

	return r;

success_disable_tdx:
	enable_tdx = 0;
	return 0;
}
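
/*
 * For reference, an illustrative (non-normative) userspace sequence driving
 * the ioctls implemented above; names follow the uAPI used in this file:
 *
 *	KVM_CREATE_VM with the TDX VM type
 *	KVM_MEMORY_ENCRYPT_OP(KVM_TDX_CAPABILITIES)    - query attrs/xfam/cpuid
 *	KVM_MEMORY_ENCRYPT_OP(KVM_TDX_INIT_VM)         - set td_params
 *	KVM_CREATE_VCPU, then KVM_SET_CPUID2
 *	KVM_MEMORY_ENCRYPT_OP(KVM_TDX_INIT_VCPU)       - per-vCPU init
 *	KVM_MEMORY_ENCRYPT_OP(KVM_TDX_INIT_MEM_REGION) - add initial memory
 *	KVM_MEMORY_ENCRYPT_OP(KVM_TDX_FINALIZE_VM)     - measure and finalize
 *	KVM_RUN
 */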