1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/cleanup.h> 3 #include <linux/cpu.h> 4 #include <asm/cpufeature.h> 5 #include <asm/fpu/xcr.h> 6 #include <linux/misc_cgroup.h> 7 #include <linux/mmu_context.h> 8 #include <asm/tdx.h> 9 #include "capabilities.h" 10 #include "mmu.h" 11 #include "x86_ops.h" 12 #include "lapic.h" 13 #include "tdx.h" 14 #include "vmx.h" 15 #include "mmu/spte.h" 16 #include "common.h" 17 #include "posted_intr.h" 18 #include "irq.h" 19 #include <trace/events/kvm.h> 20 #include "trace.h" 21 22 #pragma GCC poison to_vmx 23 24 #undef pr_fmt 25 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 26 27 #define pr_tdx_error(__fn, __err) \ 28 pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err) 29 30 #define __pr_tdx_error_N(__fn_str, __err, __fmt, ...) \ 31 pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__) 32 33 #define pr_tdx_error_1(__fn, __err, __rcx) \ 34 __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx) 35 36 #define pr_tdx_error_2(__fn, __err, __rcx, __rdx) \ 37 __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx) 38 39 #define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \ 40 __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8) 41 42 bool enable_tdx __ro_after_init; 43 module_param_named(tdx, enable_tdx, bool, 0444); 44 45 #define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51)) 46 #define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47)) 47 48 static enum cpuhp_state tdx_cpuhp_state; 49 50 static const struct tdx_sys_info *tdx_sysinfo; 51 52 void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err) 53 { 54 KVM_BUG_ON(1, tdx->vcpu.kvm); 55 pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err); 56 } 57 58 void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field, 59 u64 val, u64 err) 60 { 61 KVM_BUG_ON(1, tdx->vcpu.kvm); 62 pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err); 63 } 64 65 #define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE) 66 67 static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm) 68 { 69 return container_of(kvm, struct kvm_tdx, kvm); 70 } 71 72 static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu) 73 { 74 return container_of(vcpu, struct vcpu_tdx, vcpu); 75 } 76 77 static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf) 78 { 79 u64 val = KVM_SUPPORTED_TD_ATTRS; 80 81 if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1) 82 return 0; 83 84 val &= td_conf->attributes_fixed0; 85 86 return val; 87 } 88 89 static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf) 90 { 91 u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss; 92 93 if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1) 94 return 0; 95 96 val &= td_conf->xfam_fixed0; 97 98 return val; 99 } 100 101 static int tdx_get_guest_phys_addr_bits(const u32 eax) 102 { 103 return (eax & GENMASK(23, 16)) >> 16; 104 } 105 106 static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits) 107 { 108 return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16; 109 } 110 111 #define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM)) 112 113 static bool has_tsx(const struct kvm_cpuid_entry2 *entry) 114 { 115 return entry->function == 7 && entry->index == 0 && 116 (entry->ebx & TDX_FEATURE_TSX); 117 } 118 119 static void clear_tsx(struct kvm_cpuid_entry2 *entry) 120 { 121 entry->ebx &= 
~TDX_FEATURE_TSX; 122 } 123 124 static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry) 125 { 126 return entry->function == 7 && entry->index == 0 && 127 (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG)); 128 } 129 130 static void clear_waitpkg(struct kvm_cpuid_entry2 *entry) 131 { 132 entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG); 133 } 134 135 static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry) 136 { 137 if (has_tsx(entry)) 138 clear_tsx(entry); 139 140 if (has_waitpkg(entry)) 141 clear_waitpkg(entry); 142 } 143 144 static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry) 145 { 146 return has_tsx(entry) || has_waitpkg(entry); 147 } 148 149 #define KVM_TDX_CPUID_NO_SUBLEAF ((__u32)-1) 150 151 static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx) 152 { 153 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; 154 155 entry->function = (u32)td_conf->cpuid_config_leaves[idx]; 156 entry->index = td_conf->cpuid_config_leaves[idx] >> 32; 157 entry->eax = (u32)td_conf->cpuid_config_values[idx][0]; 158 entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32; 159 entry->ecx = (u32)td_conf->cpuid_config_values[idx][1]; 160 entry->edx = td_conf->cpuid_config_values[idx][1] >> 32; 161 162 if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF) 163 entry->index = 0; 164 165 /* 166 * The TDX module doesn't allow configuring the guest phys addr bits 167 * (EAX[23:16]). However, KVM uses it as an interface to the userspace 168 * to configure the GPAW. Report these bits as configurable. 169 */ 170 if (entry->function == 0x80000008) 171 entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff); 172 173 tdx_clear_unsupported_cpuid(entry); 174 } 175 176 #define TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT BIT(1) 177 178 static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf, 179 struct kvm_tdx_capabilities *caps) 180 { 181 int i; 182 183 caps->supported_attrs = tdx_get_supported_attrs(td_conf); 184 if (!caps->supported_attrs) 185 return -EIO; 186 187 caps->supported_xfam = tdx_get_supported_xfam(td_conf); 188 if (!caps->supported_xfam) 189 return -EIO; 190 191 caps->cpuid.nent = td_conf->num_cpuid_config; 192 193 caps->user_tdvmcallinfo_1_r11 = 194 TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT; 195 196 for (i = 0; i < td_conf->num_cpuid_config; i++) 197 td_init_cpuid_entry2(&caps->cpuid.entries[i], i); 198 199 return 0; 200 } 201 202 /* 203 * Some SEAMCALLs acquire the TDX module globally, and can fail with 204 * TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs. 205 */ 206 static DEFINE_MUTEX(tdx_lock); 207 208 static atomic_t nr_configured_hkid; 209 210 static bool tdx_operand_busy(u64 err) 211 { 212 return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY; 213 } 214 215 216 /* 217 * A per-CPU list of TD vCPUs associated with a given CPU. 218 * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU 219 * list. 220 * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of 221 * the old CPU during the IPI callback running on the old CPU, and then added 222 * to the per-CPU list of the new CPU. 223 * - When a TD is tearing down, all vCPUs are disassociated from their current 224 * running CPUs and removed from the per-CPU list during the IPI callback 225 * running on those CPUs. 226 * - When a CPU is brought down, traverse the per-CPU list to disassociate all 227 * associated TD vCPUs and remove them from the per-CPU list. 
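 *
 * The CPU-offline case is handled by tdx_disable_virtualization_cpu() below:
 * it walks this list with IRQs disabled and must use the _safe iterator,
 * because tdx_flush_vp() ends up in tdx_disassociate_vp(), which deletes the
 * entry being visited.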
228 */ 229 static DEFINE_PER_CPU(struct list_head, associated_tdvcpus); 230 231 static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu) 232 { 233 return to_tdx(vcpu)->vp_enter_args.r10; 234 } 235 236 static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu) 237 { 238 return to_tdx(vcpu)->vp_enter_args.r11; 239 } 240 241 static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu, 242 long val) 243 { 244 to_tdx(vcpu)->vp_enter_args.r10 = val; 245 } 246 247 static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu, 248 unsigned long val) 249 { 250 to_tdx(vcpu)->vp_enter_args.r11 = val; 251 } 252 253 static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx) 254 { 255 tdx_guest_keyid_free(kvm_tdx->hkid); 256 kvm_tdx->hkid = -1; 257 atomic_dec(&nr_configured_hkid); 258 misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); 259 put_misc_cg(kvm_tdx->misc_cg); 260 kvm_tdx->misc_cg = NULL; 261 } 262 263 static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx) 264 { 265 return kvm_tdx->hkid > 0; 266 } 267 268 static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu) 269 { 270 lockdep_assert_irqs_disabled(); 271 272 list_del(&to_tdx(vcpu)->cpu_list); 273 274 /* 275 * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1, 276 * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU 277 * to its list before it's deleted from this CPU's list. 278 */ 279 smp_wmb(); 280 281 vcpu->cpu = -1; 282 } 283 284 static void tdx_clear_page(struct page *page) 285 { 286 const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0)); 287 void *dest = page_to_virt(page); 288 unsigned long i; 289 290 /* 291 * The page could have been poisoned. MOVDIR64B also clears 292 * the poison bit so the kernel can safely use the page again. 293 */ 294 for (i = 0; i < PAGE_SIZE; i += 64) 295 movdir64b(dest + i, zero_page); 296 /* 297 * MOVDIR64B store uses WC buffer. Prevent following memory reads 298 * from seeing potentially poisoned cache. 299 */ 300 __mb(); 301 } 302 303 static void tdx_no_vcpus_enter_start(struct kvm *kvm) 304 { 305 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 306 307 lockdep_assert_held_write(&kvm->mmu_lock); 308 309 WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true); 310 311 kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); 312 } 313 314 static void tdx_no_vcpus_enter_stop(struct kvm *kvm) 315 { 316 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 317 318 lockdep_assert_held_write(&kvm->mmu_lock); 319 320 WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false); 321 } 322 323 /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */ 324 static int __tdx_reclaim_page(struct page *page) 325 { 326 u64 err, rcx, rdx, r8; 327 328 err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8); 329 330 /* 331 * No need to check for TDX_OPERAND_BUSY; all TD pages are freed 332 * before the HKID is released and control pages have also been 333 * released at this point, so there is no possibility of contention. 334 */ 335 if (WARN_ON_ONCE(err)) { 336 pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8); 337 return -EIO; 338 } 339 return 0; 340 } 341 342 static int tdx_reclaim_page(struct page *page) 343 { 344 int r; 345 346 r = __tdx_reclaim_page(page); 347 if (!r) 348 tdx_clear_page(page); 349 return r; 350 } 351 352 353 /* 354 * Reclaim the TD control page(s) which are crypto-protected by TDX guest's 355 * private KeyID. Assume the cache associated with the TDX private KeyID has 356 * been flushed. 
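 * In this file that flush is provided by tdx_mmu_release_hkid(), which runs
 * TDH.PHYMEM.CACHE.WB on every package and frees the HKID before any control
 * pages are reclaimed; both tdx_reclaim_td_control_pages() and tdx_vcpu_free()
 * bail out early while the HKID is still assigned.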
357 */ 358 static void tdx_reclaim_control_page(struct page *ctrl_page) 359 { 360 /* 361 * Leak the page if the kernel failed to reclaim the page. 362 * The kernel cannot use it safely anymore. 363 */ 364 if (tdx_reclaim_page(ctrl_page)) 365 return; 366 367 __free_page(ctrl_page); 368 } 369 370 struct tdx_flush_vp_arg { 371 struct kvm_vcpu *vcpu; 372 u64 err; 373 }; 374 375 static void tdx_flush_vp(void *_arg) 376 { 377 struct tdx_flush_vp_arg *arg = _arg; 378 struct kvm_vcpu *vcpu = arg->vcpu; 379 u64 err; 380 381 arg->err = 0; 382 lockdep_assert_irqs_disabled(); 383 384 /* Task migration can race with CPU offlining. */ 385 if (unlikely(vcpu->cpu != raw_smp_processor_id())) 386 return; 387 388 /* 389 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The 390 * list tracking still needs to be updated so that it's correct if/when 391 * the vCPU does get initialized. 392 */ 393 if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) { 394 /* 395 * No need to retry. TDX Resources needed for TDH.VP.FLUSH are: 396 * TDVPR as exclusive, TDR as shared, and TDCS as shared. This 397 * vp flush function is called when destructing vCPU/TD or vCPU 398 * migration. No other thread uses TDVPR in those cases. 399 */ 400 err = tdh_vp_flush(&to_tdx(vcpu)->vp); 401 if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) { 402 /* 403 * This function is called in IPI context. Do not use 404 * printk to avoid console semaphore. 405 * The caller prints out the error message, instead. 406 */ 407 if (err) 408 arg->err = err; 409 } 410 } 411 412 tdx_disassociate_vp(vcpu); 413 } 414 415 static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu) 416 { 417 struct tdx_flush_vp_arg arg = { 418 .vcpu = vcpu, 419 }; 420 int cpu = vcpu->cpu; 421 422 if (unlikely(cpu == -1)) 423 return; 424 425 smp_call_function_single(cpu, tdx_flush_vp, &arg, 1); 426 if (KVM_BUG_ON(arg.err, vcpu->kvm)) 427 pr_tdx_error(TDH_VP_FLUSH, arg.err); 428 } 429 430 void tdx_disable_virtualization_cpu(void) 431 { 432 int cpu = raw_smp_processor_id(); 433 struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu); 434 struct tdx_flush_vp_arg arg; 435 struct vcpu_tdx *tdx, *tmp; 436 unsigned long flags; 437 438 local_irq_save(flags); 439 /* Safe variant needed as tdx_disassociate_vp() deletes the entry. */ 440 list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) { 441 arg.vcpu = &tdx->vcpu; 442 tdx_flush_vp(&arg); 443 } 444 local_irq_restore(flags); 445 } 446 447 #define TDX_SEAMCALL_RETRIES 10000 448 449 static void smp_func_do_phymem_cache_wb(void *unused) 450 { 451 u64 err = 0; 452 bool resume; 453 int i; 454 455 /* 456 * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private 457 * KeyID on the package or core. The TDX module may not finish the 458 * cache flush but return TDX_INTERRUPTED_RESUMEABLE instead. The 459 * kernel should retry it until it returns success w/o rescheduling. 
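 * The loop below bounds that retry at TDX_SEAMCALL_RETRIES iterations and
 * passes resume = true on every pass after the first (resume = !!err), so the
 * write-back is continued from where it was interrupted instead of being
 * restarted.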
460 */ 461 for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) { 462 resume = !!err; 463 err = tdh_phymem_cache_wb(resume); 464 switch (err) { 465 case TDX_INTERRUPTED_RESUMABLE: 466 continue; 467 case TDX_NO_HKID_READY_TO_WBCACHE: 468 err = TDX_SUCCESS; /* Already done by other thread */ 469 fallthrough; 470 default: 471 goto out; 472 } 473 } 474 475 out: 476 if (WARN_ON_ONCE(err)) 477 pr_tdx_error(TDH_PHYMEM_CACHE_WB, err); 478 } 479 480 void tdx_mmu_release_hkid(struct kvm *kvm) 481 { 482 bool packages_allocated, targets_allocated; 483 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 484 cpumask_var_t packages, targets; 485 struct kvm_vcpu *vcpu; 486 unsigned long j; 487 int i; 488 u64 err; 489 490 if (!is_hkid_assigned(kvm_tdx)) 491 return; 492 493 packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL); 494 targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL); 495 cpus_read_lock(); 496 497 kvm_for_each_vcpu(j, vcpu, kvm) 498 tdx_flush_vp_on_cpu(vcpu); 499 500 /* 501 * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock 502 * and can fail with TDX_OPERAND_BUSY when it fails to get the lock. 503 * Multiple TDX guests can be destroyed simultaneously. Take the 504 * mutex to prevent it from getting error. 505 */ 506 mutex_lock(&tdx_lock); 507 508 /* 509 * Releasing HKID is in vm_destroy(). 510 * After the above flushing vps, there should be no more vCPU 511 * associations, as all vCPU fds have been released at this stage. 512 */ 513 err = tdh_mng_vpflushdone(&kvm_tdx->td); 514 if (err == TDX_FLUSHVP_NOT_DONE) 515 goto out; 516 if (KVM_BUG_ON(err, kvm)) { 517 pr_tdx_error(TDH_MNG_VPFLUSHDONE, err); 518 pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n", 519 kvm_tdx->hkid); 520 goto out; 521 } 522 523 for_each_online_cpu(i) { 524 if (packages_allocated && 525 cpumask_test_and_set_cpu(topology_physical_package_id(i), 526 packages)) 527 continue; 528 if (targets_allocated) 529 cpumask_set_cpu(i, targets); 530 } 531 if (targets_allocated) 532 on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true); 533 else 534 on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true); 535 /* 536 * In the case of error in smp_func_do_phymem_cache_wb(), the following 537 * tdh_mng_key_freeid() will fail. 538 */ 539 err = tdh_mng_key_freeid(&kvm_tdx->td); 540 if (KVM_BUG_ON(err, kvm)) { 541 pr_tdx_error(TDH_MNG_KEY_FREEID, err); 542 pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n", 543 kvm_tdx->hkid); 544 } else { 545 tdx_hkid_free(kvm_tdx); 546 } 547 548 out: 549 mutex_unlock(&tdx_lock); 550 cpus_read_unlock(); 551 free_cpumask_var(targets); 552 free_cpumask_var(packages); 553 } 554 555 static void tdx_reclaim_td_control_pages(struct kvm *kvm) 556 { 557 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 558 u64 err; 559 int i; 560 561 /* 562 * tdx_mmu_release_hkid() failed to reclaim HKID. Something went wrong 563 * heavily with TDX module. Give up freeing TD pages. As the function 564 * already warned, don't warn it again. 565 */ 566 if (is_hkid_assigned(kvm_tdx)) 567 return; 568 569 if (kvm_tdx->td.tdcs_pages) { 570 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { 571 if (!kvm_tdx->td.tdcs_pages[i]) 572 continue; 573 574 tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]); 575 } 576 kfree(kvm_tdx->td.tdcs_pages); 577 kvm_tdx->td.tdcs_pages = NULL; 578 } 579 580 if (!kvm_tdx->td.tdr_page) 581 return; 582 583 if (__tdx_reclaim_page(kvm_tdx->td.tdr_page)) 584 return; 585 586 /* 587 * Use a SEAMCALL to ask the TDX module to flush the cache based on the 588 * KeyID. 
TDX module may access TDR while operating on TD (Especially 589 * when it is reclaiming TDCS). 590 */ 591 err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td); 592 if (KVM_BUG_ON(err, kvm)) { 593 pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); 594 return; 595 } 596 tdx_clear_page(kvm_tdx->td.tdr_page); 597 598 __free_page(kvm_tdx->td.tdr_page); 599 kvm_tdx->td.tdr_page = NULL; 600 } 601 602 void tdx_vm_destroy(struct kvm *kvm) 603 { 604 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 605 606 tdx_reclaim_td_control_pages(kvm); 607 608 kvm_tdx->state = TD_STATE_UNINITIALIZED; 609 } 610 611 static int tdx_do_tdh_mng_key_config(void *param) 612 { 613 struct kvm_tdx *kvm_tdx = param; 614 u64 err; 615 616 /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */ 617 err = tdh_mng_key_config(&kvm_tdx->td); 618 619 if (KVM_BUG_ON(err, &kvm_tdx->kvm)) { 620 pr_tdx_error(TDH_MNG_KEY_CONFIG, err); 621 return -EIO; 622 } 623 624 return 0; 625 } 626 627 int tdx_vm_init(struct kvm *kvm) 628 { 629 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 630 631 kvm->arch.has_protected_state = true; 632 kvm->arch.has_private_mem = true; 633 kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT; 634 635 /* 636 * Because guest TD is protected, VMM can't parse the instruction in TD. 637 * Instead, guest uses MMIO hypercall. For unmodified device driver, 638 * #VE needs to be injected for MMIO and #VE handler in TD converts MMIO 639 * instruction into MMIO hypercall. 640 * 641 * SPTE value for MMIO needs to be setup so that #VE is injected into 642 * TD instead of triggering EPT MISCONFIG. 643 * - RWX=0 so that EPT violation is triggered. 644 * - suppress #VE bit is cleared to inject #VE. 645 */ 646 kvm_mmu_set_mmio_spte_value(kvm, 0); 647 648 /* 649 * TDX has its own limit of maximum vCPUs it can support for all 650 * TDX guests in addition to KVM_MAX_VCPUS. TDX module reports 651 * such limit via the MAX_VCPU_PER_TD global metadata. In 652 * practice, it reflects the number of logical CPUs that ALL 653 * platforms that the TDX module supports can possibly have. 654 * 655 * Limit TDX guest's maximum vCPUs to the number of logical CPUs 656 * the platform has. Simply forwarding the MAX_VCPU_PER_TD to 657 * userspace would result in an unpredictable ABI. 658 */ 659 kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus()); 660 661 kvm_tdx->state = TD_STATE_UNINITIALIZED; 662 663 return 0; 664 } 665 666 int tdx_vcpu_create(struct kvm_vcpu *vcpu) 667 { 668 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 669 struct vcpu_tdx *tdx = to_tdx(vcpu); 670 671 if (kvm_tdx->state != TD_STATE_INITIALIZED) 672 return -EIO; 673 674 /* 675 * TDX module mandates APICv, which requires an in-kernel local APIC. 676 * Disallow an in-kernel I/O APIC, because level-triggered interrupts 677 * and thus the I/O APIC as a whole can't be faithfully emulated in KVM. 678 */ 679 if (!irqchip_split(vcpu->kvm)) 680 return -EINVAL; 681 682 fpstate_set_confidential(&vcpu->arch.guest_fpu); 683 vcpu->arch.apic->guest_apic_protected = true; 684 INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list); 685 686 vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX; 687 688 vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH; 689 vcpu->arch.cr0_guest_owned_bits = -1ul; 690 vcpu->arch.cr4_guest_owned_bits = -1ul; 691 692 /* KVM can't change TSC offset/multiplier as TDX module manages them. 
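   The fields below simply mirror the TD-scope tsc_offset/tsc_multiplier
   cached in kvm_tdx, and guest_tsc_protected keeps the common x86 code from
   trying to rewrite the guest's TSC.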
*/ 693 vcpu->arch.guest_tsc_protected = true; 694 vcpu->arch.tsc_offset = kvm_tdx->tsc_offset; 695 vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset; 696 vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier; 697 vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier; 698 699 vcpu->arch.guest_state_protected = 700 !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG); 701 702 if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE) 703 vcpu->arch.xfd_no_write_intercept = true; 704 705 tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR; 706 __pi_set_sn(&tdx->vt.pi_desc); 707 708 tdx->state = VCPU_TD_STATE_UNINITIALIZED; 709 710 return 0; 711 } 712 713 void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 714 { 715 struct vcpu_tdx *tdx = to_tdx(vcpu); 716 717 vmx_vcpu_pi_load(vcpu, cpu); 718 if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm))) 719 return; 720 721 tdx_flush_vp_on_cpu(vcpu); 722 723 KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm); 724 local_irq_disable(); 725 /* 726 * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure 727 * vcpu->cpu is read before tdx->cpu_list. 728 */ 729 smp_rmb(); 730 731 list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu)); 732 local_irq_enable(); 733 } 734 735 bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu) 736 { 737 /* 738 * KVM can't get the interrupt status of TDX guest and it assumes 739 * interrupt is always allowed unless TDX guest calls TDVMCALL with HLT, 740 * which passes the interrupt blocked flag. 741 */ 742 return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT || 743 !to_tdx(vcpu)->vp_enter_args.r12; 744 } 745 746 bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu) 747 { 748 u64 vcpu_state_details; 749 750 if (pi_has_pending_interrupt(vcpu)) 751 return true; 752 753 /* 754 * Only check RVI pending for HALTED case with IRQ enabled. 755 * For non-HLT cases, KVM doesn't care about STI/SS shadows. And if the 756 * interrupt was pending before TD exit, then it _must_ be blocked, 757 * otherwise the interrupt would have been serviced at the instruction 758 * boundary. 759 */ 760 if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT || 761 to_tdx(vcpu)->vp_enter_args.r12) 762 return false; 763 764 vcpu_state_details = 765 td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH); 766 767 return tdx_vcpu_state_details_intr_pending(vcpu_state_details); 768 } 769 770 /* 771 * Compared to vmx_prepare_switch_to_guest(), there is not much to do 772 * as SEAMCALL/SEAMRET calls take care of most of save and restore. 
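 * Only MSR_KERNEL_GS_BASE and the host's DEBUGCTL value are snapshotted here;
 * the rest of the host state is saved and restored by the TDX module itself
 * across SEAMCALL/SEAMRET.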
773 */ 774 void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 775 { 776 struct vcpu_vt *vt = to_vt(vcpu); 777 778 if (vt->guest_state_loaded) 779 return; 780 781 if (likely(is_64bit_mm(current->mm))) 782 vt->msr_host_kernel_gs_base = current->thread.gsbase; 783 else 784 vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); 785 786 vt->host_debugctlmsr = get_debugctlmsr(); 787 788 vt->guest_state_loaded = true; 789 } 790 791 struct tdx_uret_msr { 792 u32 msr; 793 unsigned int slot; 794 u64 defval; 795 }; 796 797 static struct tdx_uret_msr tdx_uret_msrs[] = { 798 {.msr = MSR_SYSCALL_MASK, .defval = 0x20200 }, 799 {.msr = MSR_STAR,}, 800 {.msr = MSR_LSTAR,}, 801 {.msr = MSR_TSC_AUX,}, 802 }; 803 804 static void tdx_user_return_msr_update_cache(void) 805 { 806 int i; 807 808 for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) 809 kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot, 810 tdx_uret_msrs[i].defval); 811 } 812 813 static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu) 814 { 815 struct vcpu_vt *vt = to_vt(vcpu); 816 struct vcpu_tdx *tdx = to_tdx(vcpu); 817 818 if (!vt->guest_state_loaded) 819 return; 820 821 ++vcpu->stat.host_state_reload; 822 wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base); 823 824 if (tdx->guest_entered) { 825 tdx_user_return_msr_update_cache(); 826 tdx->guest_entered = false; 827 } 828 829 vt->guest_state_loaded = false; 830 } 831 832 void tdx_vcpu_put(struct kvm_vcpu *vcpu) 833 { 834 vmx_vcpu_pi_put(vcpu); 835 tdx_prepare_switch_to_host(vcpu); 836 } 837 838 void tdx_vcpu_free(struct kvm_vcpu *vcpu) 839 { 840 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 841 struct vcpu_tdx *tdx = to_tdx(vcpu); 842 int i; 843 844 /* 845 * It is not possible to reclaim pages while hkid is assigned. It might 846 * be assigned if: 847 * 1. the TD VM is being destroyed but freeing hkid failed, in which 848 * case the pages are leaked 849 * 2. 
TD VCPU creation failed and this on the error path, in which case 850 * there is nothing to do anyway 851 */ 852 if (is_hkid_assigned(kvm_tdx)) 853 return; 854 855 if (tdx->vp.tdcx_pages) { 856 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { 857 if (tdx->vp.tdcx_pages[i]) 858 tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]); 859 } 860 kfree(tdx->vp.tdcx_pages); 861 tdx->vp.tdcx_pages = NULL; 862 } 863 if (tdx->vp.tdvpr_page) { 864 tdx_reclaim_control_page(tdx->vp.tdvpr_page); 865 tdx->vp.tdvpr_page = 0; 866 } 867 868 tdx->state = VCPU_TD_STATE_UNINITIALIZED; 869 } 870 871 int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu) 872 { 873 if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED || 874 to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE)) 875 return -EINVAL; 876 877 return 1; 878 } 879 880 static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu) 881 { 882 switch (tdvmcall_leaf(vcpu)) { 883 case EXIT_REASON_CPUID: 884 case EXIT_REASON_HLT: 885 case EXIT_REASON_IO_INSTRUCTION: 886 case EXIT_REASON_MSR_READ: 887 case EXIT_REASON_MSR_WRITE: 888 return tdvmcall_leaf(vcpu); 889 case EXIT_REASON_EPT_VIOLATION: 890 return EXIT_REASON_EPT_MISCONFIG; 891 default: 892 break; 893 } 894 895 return EXIT_REASON_TDCALL; 896 } 897 898 static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu) 899 { 900 struct vcpu_tdx *tdx = to_tdx(vcpu); 901 u32 exit_reason; 902 903 switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) { 904 case TDX_SUCCESS: 905 case TDX_NON_RECOVERABLE_VCPU: 906 case TDX_NON_RECOVERABLE_TD: 907 case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE: 908 case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE: 909 break; 910 default: 911 return -1u; 912 } 913 914 exit_reason = tdx->vp_enter_ret; 915 916 switch (exit_reason) { 917 case EXIT_REASON_TDCALL: 918 if (tdvmcall_exit_type(vcpu)) 919 return EXIT_REASON_VMCALL; 920 921 return tdcall_to_vmx_exit_reason(vcpu); 922 case EXIT_REASON_EPT_MISCONFIG: 923 /* 924 * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in 925 * non-instrumentable code with interrupts disabled. 926 */ 927 return -1u; 928 default: 929 break; 930 } 931 932 return exit_reason; 933 } 934 935 static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu) 936 { 937 struct vcpu_tdx *tdx = to_tdx(vcpu); 938 struct vcpu_vt *vt = to_vt(vcpu); 939 940 guest_state_enter_irqoff(); 941 942 tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args); 943 944 vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu); 945 946 vt->exit_qualification = tdx->vp_enter_args.rcx; 947 tdx->ext_exit_qualification = tdx->vp_enter_args.rdx; 948 tdx->exit_gpa = tdx->vp_enter_args.r8; 949 vt->exit_intr_info = tdx->vp_enter_args.r9; 950 951 vmx_handle_nmi(vcpu); 952 953 guest_state_exit_irqoff(); 954 } 955 956 static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu) 957 { 958 return vmx_get_exit_reason(vcpu).failed_vmentry && 959 vmx_get_exit_reason(vcpu).full != -1u; 960 } 961 962 static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) 963 { 964 u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret; 965 966 /* 967 * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation 968 * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER. 969 * 970 * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both 971 * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target 972 * vCPUs leaving fastpath so that interrupt can be enabled to ensure the 973 * IPIs can be delivered. 
Return EXIT_FASTPATH_EXIT_HANDLED instead of 974 * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the 975 * requester may be blocked endlessly. 976 */ 977 if (unlikely(tdx_operand_busy(vp_enter_ret))) 978 return EXIT_FASTPATH_EXIT_HANDLED; 979 980 return EXIT_FASTPATH_NONE; 981 } 982 983 #define TDX_REGS_AVAIL_SET (BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \ 984 BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \ 985 BIT_ULL(VCPU_REGS_RAX) | \ 986 BIT_ULL(VCPU_REGS_RBX) | \ 987 BIT_ULL(VCPU_REGS_RCX) | \ 988 BIT_ULL(VCPU_REGS_RDX) | \ 989 BIT_ULL(VCPU_REGS_RBP) | \ 990 BIT_ULL(VCPU_REGS_RSI) | \ 991 BIT_ULL(VCPU_REGS_RDI) | \ 992 BIT_ULL(VCPU_REGS_R8) | \ 993 BIT_ULL(VCPU_REGS_R9) | \ 994 BIT_ULL(VCPU_REGS_R10) | \ 995 BIT_ULL(VCPU_REGS_R11) | \ 996 BIT_ULL(VCPU_REGS_R12) | \ 997 BIT_ULL(VCPU_REGS_R13) | \ 998 BIT_ULL(VCPU_REGS_R14) | \ 999 BIT_ULL(VCPU_REGS_R15)) 1000 1001 static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu) 1002 { 1003 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 1004 1005 /* 1006 * All TDX hosts support PKRU; but even if they didn't, 1007 * vcpu->arch.host_pkru would be 0 and the wrpkru would be 1008 * skipped. 1009 */ 1010 if (vcpu->arch.host_pkru != 0) 1011 wrpkru(vcpu->arch.host_pkru); 1012 1013 if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0)) 1014 xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0); 1015 1016 /* 1017 * Likewise, even if a TDX hosts didn't support XSS both arms of 1018 * the comparison would be 0 and the wrmsrl would be skipped. 1019 */ 1020 if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss)) 1021 wrmsrl(MSR_IA32_XSS, kvm_host.xss); 1022 } 1023 1024 #define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \ 1025 DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \ 1026 DEBUGCTLMSR_FREEZE_IN_SMM) 1027 1028 fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) 1029 { 1030 struct vcpu_tdx *tdx = to_tdx(vcpu); 1031 struct vcpu_vt *vt = to_vt(vcpu); 1032 1033 /* 1034 * force_immediate_exit requires vCPU entering for events injection with 1035 * an immediately exit followed. But The TDX module doesn't guarantee 1036 * entry, it's already possible for KVM to _think_ it completely entry 1037 * to the guest without actually having done so. 1038 * Since KVM never needs to force an immediate exit for TDX, and can't 1039 * do direct injection, just warn on force_immediate_exit. 1040 */ 1041 WARN_ON_ONCE(force_immediate_exit); 1042 1043 /* 1044 * Wait until retry of SEPT-zap-related SEAMCALL completes before 1045 * allowing vCPU entry to avoid contention with tdh_vp_enter() and 1046 * TDCALLs. 
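 * The flag is set and cleared by tdx_no_vcpus_enter_start()/_stop() around
 * the retried zap SEAMCALLs; returning EXIT_FASTPATH_EXIT_HANDLED skips
 * TDH.VP.ENTER for this iteration, and the flag is re-checked on the next
 * entry attempt.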
1047 */ 1048 if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap))) 1049 return EXIT_FASTPATH_EXIT_HANDLED; 1050 1051 trace_kvm_entry(vcpu, force_immediate_exit); 1052 1053 if (pi_test_on(&vt->pi_desc)) { 1054 apic->send_IPI_self(POSTED_INTR_VECTOR); 1055 1056 if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) & 1057 APIC_VECTOR_MASK, &vt->pi_desc)) 1058 kvm_wait_lapic_expire(vcpu); 1059 } 1060 1061 tdx_vcpu_enter_exit(vcpu); 1062 1063 if (vt->host_debugctlmsr & ~TDX_DEBUGCTL_PRESERVED) 1064 update_debugctlmsr(vt->host_debugctlmsr); 1065 1066 tdx_load_host_xsave_state(vcpu); 1067 tdx->guest_entered = true; 1068 1069 vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET; 1070 1071 if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) 1072 return EXIT_FASTPATH_NONE; 1073 1074 if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) 1075 return EXIT_FASTPATH_NONE; 1076 1077 if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) 1078 kvm_machine_check(); 1079 1080 trace_kvm_exit(vcpu, KVM_ISA_VMX); 1081 1082 if (unlikely(tdx_failed_vmentry(vcpu))) 1083 return EXIT_FASTPATH_NONE; 1084 1085 return tdx_exit_handlers_fastpath(vcpu); 1086 } 1087 1088 void tdx_inject_nmi(struct kvm_vcpu *vcpu) 1089 { 1090 ++vcpu->stat.nmi_injections; 1091 td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1); 1092 /* 1093 * From KVM's perspective, NMI injection is completed right after 1094 * writing to PEND_NMI. KVM doesn't care whether an NMI is injected by 1095 * the TDX module or not. 1096 */ 1097 vcpu->arch.nmi_injected = false; 1098 /* 1099 * TDX doesn't support KVM to request NMI window exit. If there is 1100 * still a pending vNMI, KVM is not able to inject it along with the 1101 * one pending in TDX module in a back-to-back way. Since the previous 1102 * vNMI is still pending in TDX module, i.e. it has not been delivered 1103 * to TDX guest yet, it's OK to collapse the pending vNMI into the 1104 * previous one. The guest is expected to handle all the NMI sources 1105 * when handling the first vNMI. 1106 */ 1107 vcpu->arch.nmi_pending = 0; 1108 } 1109 1110 static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu) 1111 { 1112 u32 intr_info = vmx_get_intr_info(vcpu); 1113 1114 /* 1115 * Machine checks are handled by handle_exception_irqoff(), or by 1116 * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on 1117 * VM-Entry. NMIs are handled by tdx_vcpu_enter_exit(). 1118 */ 1119 if (is_nmi(intr_info) || is_machine_check(intr_info)) 1120 return 1; 1121 1122 vcpu->run->exit_reason = KVM_EXIT_EXCEPTION; 1123 vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK; 1124 vcpu->run->ex.error_code = 0; 1125 1126 return 0; 1127 } 1128 1129 static int complete_hypercall_exit(struct kvm_vcpu *vcpu) 1130 { 1131 tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret); 1132 return 1; 1133 } 1134 1135 static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu) 1136 { 1137 kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10); 1138 kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11); 1139 kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12); 1140 kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13); 1141 kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14); 1142 1143 return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit); 1144 } 1145 1146 /* 1147 * Split into chunks and check interrupt pending between chunks. This allows 1148 * for timely injection of interrupts to prevent issues with guest lockup 1149 * detection. 
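 * TDX_MAP_GPA_MAX_LEN below sets the chunk size. tdx_complete_vmcall_map_gpa()
 * either kicks off __tdx_map_gpa() for the next chunk, or, if an interrupt is
 * pending, returns TDVMCALL_STATUS_RETRY with the resume GPA in R11 so the
 * guest can reissue the call for the remainder.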
1150 */ 1151 #define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024) 1152 static void __tdx_map_gpa(struct vcpu_tdx *tdx); 1153 1154 static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu) 1155 { 1156 struct vcpu_tdx *tdx = to_tdx(vcpu); 1157 1158 if (vcpu->run->hypercall.ret) { 1159 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1160 tdx->vp_enter_args.r11 = tdx->map_gpa_next; 1161 return 1; 1162 } 1163 1164 tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN; 1165 if (tdx->map_gpa_next >= tdx->map_gpa_end) 1166 return 1; 1167 1168 /* 1169 * Stop processing the remaining part if there is a pending interrupt, 1170 * which could be qualified to deliver. Skip checking pending RVI for 1171 * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt(). 1172 */ 1173 if (kvm_vcpu_has_events(vcpu)) { 1174 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY); 1175 tdx->vp_enter_args.r11 = tdx->map_gpa_next; 1176 return 1; 1177 } 1178 1179 __tdx_map_gpa(tdx); 1180 return 0; 1181 } 1182 1183 static void __tdx_map_gpa(struct vcpu_tdx *tdx) 1184 { 1185 u64 gpa = tdx->map_gpa_next; 1186 u64 size = tdx->map_gpa_end - tdx->map_gpa_next; 1187 1188 if (size > TDX_MAP_GPA_MAX_LEN) 1189 size = TDX_MAP_GPA_MAX_LEN; 1190 1191 tdx->vcpu.run->exit_reason = KVM_EXIT_HYPERCALL; 1192 tdx->vcpu.run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; 1193 /* 1194 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) 1195 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that 1196 * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting 1197 * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. 1198 */ 1199 tdx->vcpu.run->hypercall.ret = 0; 1200 tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm)); 1201 tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE; 1202 tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ? 1203 KVM_MAP_GPA_RANGE_ENCRYPTED : 1204 KVM_MAP_GPA_RANGE_DECRYPTED; 1205 tdx->vcpu.run->hypercall.flags = KVM_EXIT_HYPERCALL_LONG_MODE; 1206 1207 tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa; 1208 } 1209 1210 static int tdx_map_gpa(struct kvm_vcpu *vcpu) 1211 { 1212 struct vcpu_tdx *tdx = to_tdx(vcpu); 1213 u64 gpa = tdx->vp_enter_args.r12; 1214 u64 size = tdx->vp_enter_args.r13; 1215 u64 ret; 1216 1217 /* 1218 * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires 1219 * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE 1220 * bit set. This is a base call so it should always be supported, but 1221 * KVM has no way to ensure that userspace implements the GHCI correctly. 1222 * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error 1223 * to the guest. 
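 * When the exit is enabled, the conversion request is forwarded to userspace
 * as a KVM_EXIT_HYPERCALL/KVM_HC_MAP_GPA_RANGE exit, one chunk at a time, by
 * __tdx_map_gpa() below.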
1224 */ 1225 if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { 1226 ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; 1227 goto error; 1228 } 1229 1230 if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) || 1231 !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) || 1232 (vt_is_tdx_private_gpa(vcpu->kvm, gpa) != 1233 vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) { 1234 ret = TDVMCALL_STATUS_INVALID_OPERAND; 1235 goto error; 1236 } 1237 1238 if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) { 1239 ret = TDVMCALL_STATUS_ALIGN_ERROR; 1240 goto error; 1241 } 1242 1243 tdx->map_gpa_end = gpa + size; 1244 tdx->map_gpa_next = gpa; 1245 1246 __tdx_map_gpa(tdx); 1247 return 0; 1248 1249 error: 1250 tdvmcall_set_return_code(vcpu, ret); 1251 tdx->vp_enter_args.r11 = gpa; 1252 return 1; 1253 } 1254 1255 static int tdx_report_fatal_error(struct kvm_vcpu *vcpu) 1256 { 1257 struct vcpu_tdx *tdx = to_tdx(vcpu); 1258 u64 *regs = vcpu->run->system_event.data; 1259 u64 *module_regs = &tdx->vp_enter_args.r8; 1260 int index = VCPU_REGS_RAX; 1261 1262 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; 1263 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL; 1264 vcpu->run->system_event.ndata = 16; 1265 1266 /* Dump 16 general-purpose registers to userspace in ascending order. */ 1267 regs[index++] = tdx->vp_enter_ret; 1268 regs[index++] = tdx->vp_enter_args.rcx; 1269 regs[index++] = tdx->vp_enter_args.rdx; 1270 regs[index++] = tdx->vp_enter_args.rbx; 1271 regs[index++] = 0; 1272 regs[index++] = 0; 1273 regs[index++] = tdx->vp_enter_args.rsi; 1274 regs[index] = tdx->vp_enter_args.rdi; 1275 for (index = 0; index < 8; index++) 1276 regs[VCPU_REGS_R8 + index] = module_regs[index]; 1277 1278 return 0; 1279 } 1280 1281 static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu) 1282 { 1283 u32 eax, ebx, ecx, edx; 1284 struct vcpu_tdx *tdx = to_tdx(vcpu); 1285 1286 /* EAX and ECX for cpuid is stored in R12 and R13. 
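   The results are passed back to the guest in R12-R15 as EAX, EBX, ECX and
   EDX respectively, see the writes below.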
*/ 1287 eax = tdx->vp_enter_args.r12; 1288 ecx = tdx->vp_enter_args.r13; 1289 1290 kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false); 1291 1292 tdx->vp_enter_args.r12 = eax; 1293 tdx->vp_enter_args.r13 = ebx; 1294 tdx->vp_enter_args.r14 = ecx; 1295 tdx->vp_enter_args.r15 = edx; 1296 1297 return 1; 1298 } 1299 1300 static int tdx_complete_pio_out(struct kvm_vcpu *vcpu) 1301 { 1302 vcpu->arch.pio.count = 0; 1303 return 1; 1304 } 1305 1306 static int tdx_complete_pio_in(struct kvm_vcpu *vcpu) 1307 { 1308 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 1309 unsigned long val = 0; 1310 int ret; 1311 1312 ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size, 1313 vcpu->arch.pio.port, &val, 1); 1314 1315 WARN_ON_ONCE(!ret); 1316 1317 tdvmcall_set_return_val(vcpu, val); 1318 1319 return 1; 1320 } 1321 1322 static int tdx_emulate_io(struct kvm_vcpu *vcpu) 1323 { 1324 struct vcpu_tdx *tdx = to_tdx(vcpu); 1325 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 1326 unsigned long val = 0; 1327 unsigned int port; 1328 u64 size, write; 1329 int ret; 1330 1331 ++vcpu->stat.io_exits; 1332 1333 size = tdx->vp_enter_args.r12; 1334 write = tdx->vp_enter_args.r13; 1335 port = tdx->vp_enter_args.r14; 1336 1337 if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) { 1338 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1339 return 1; 1340 } 1341 1342 if (write) { 1343 val = tdx->vp_enter_args.r15; 1344 ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1); 1345 } else { 1346 ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1); 1347 } 1348 1349 if (!ret) 1350 vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out : 1351 tdx_complete_pio_in; 1352 else if (!write) 1353 tdvmcall_set_return_val(vcpu, val); 1354 1355 return ret; 1356 } 1357 1358 static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu) 1359 { 1360 unsigned long val = 0; 1361 gpa_t gpa; 1362 int size; 1363 1364 gpa = vcpu->mmio_fragments[0].gpa; 1365 size = vcpu->mmio_fragments[0].len; 1366 1367 memcpy(&val, vcpu->run->mmio.data, size); 1368 tdvmcall_set_return_val(vcpu, val); 1369 trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val); 1370 return 1; 1371 } 1372 1373 static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size, 1374 unsigned long val) 1375 { 1376 if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 1377 trace_kvm_fast_mmio(gpa); 1378 return 0; 1379 } 1380 1381 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val); 1382 if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val)) 1383 return -EOPNOTSUPP; 1384 1385 return 0; 1386 } 1387 1388 static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size) 1389 { 1390 unsigned long val; 1391 1392 if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val)) 1393 return -EOPNOTSUPP; 1394 1395 tdvmcall_set_return_val(vcpu, val); 1396 trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val); 1397 return 0; 1398 } 1399 1400 static int tdx_emulate_mmio(struct kvm_vcpu *vcpu) 1401 { 1402 struct vcpu_tdx *tdx = to_tdx(vcpu); 1403 int size, write, r; 1404 unsigned long val; 1405 gpa_t gpa; 1406 1407 size = tdx->vp_enter_args.r12; 1408 write = tdx->vp_enter_args.r13; 1409 gpa = tdx->vp_enter_args.r14; 1410 val = write ? 
tdx->vp_enter_args.r15 : 0; 1411 1412 if (size != 1 && size != 2 && size != 4 && size != 8) 1413 goto error; 1414 if (write != 0 && write != 1) 1415 goto error; 1416 1417 /* 1418 * TDG.VP.VMCALL<MMIO> allows only shared GPA, it makes no sense to 1419 * do MMIO emulation for private GPA. 1420 */ 1421 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) || 1422 vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1)) 1423 goto error; 1424 1425 gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm)); 1426 1427 if (write) 1428 r = tdx_mmio_write(vcpu, gpa, size, val); 1429 else 1430 r = tdx_mmio_read(vcpu, gpa, size); 1431 if (!r) 1432 /* Kernel completed device emulation. */ 1433 return 1; 1434 1435 /* Request the device emulation to userspace device model. */ 1436 vcpu->mmio_is_write = write; 1437 if (!write) 1438 vcpu->arch.complete_userspace_io = tdx_complete_mmio_read; 1439 1440 vcpu->run->mmio.phys_addr = gpa; 1441 vcpu->run->mmio.len = size; 1442 vcpu->run->mmio.is_write = write; 1443 vcpu->run->exit_reason = KVM_EXIT_MMIO; 1444 1445 if (write) { 1446 memcpy(vcpu->run->mmio.data, &val, size); 1447 } else { 1448 vcpu->mmio_fragments[0].gpa = gpa; 1449 vcpu->mmio_fragments[0].len = size; 1450 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL); 1451 } 1452 return 0; 1453 1454 error: 1455 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1456 return 1; 1457 } 1458 1459 static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu) 1460 { 1461 struct vcpu_tdx *tdx = to_tdx(vcpu); 1462 1463 tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret); 1464 1465 /* 1466 * For now, there is no TDVMCALL beyond GHCI base API supported by KVM 1467 * directly without the support from userspace, just set the value 1468 * returned from userspace. 1469 */ 1470 tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11; 1471 tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12; 1472 tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13; 1473 tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14; 1474 1475 return 1; 1476 } 1477 1478 static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu) 1479 { 1480 struct vcpu_tdx *tdx = to_tdx(vcpu); 1481 1482 switch (tdx->vp_enter_args.r12) { 1483 case 0: 1484 tdx->vp_enter_args.r11 = 0; 1485 tdx->vp_enter_args.r12 = 0; 1486 tdx->vp_enter_args.r13 = 0; 1487 tdx->vp_enter_args.r14 = 0; 1488 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS); 1489 return 1; 1490 case 1: 1491 vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12; 1492 vcpu->run->exit_reason = KVM_EXIT_TDX; 1493 vcpu->run->tdx.flags = 0; 1494 vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO; 1495 vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS; 1496 vcpu->run->tdx.get_tdvmcall_info.r11 = 0; 1497 vcpu->run->tdx.get_tdvmcall_info.r12 = 0; 1498 vcpu->run->tdx.get_tdvmcall_info.r13 = 0; 1499 vcpu->run->tdx.get_tdvmcall_info.r14 = 0; 1500 vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info; 1501 return 0; 1502 default: 1503 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1504 return 1; 1505 } 1506 } 1507 1508 static int tdx_complete_simple(struct kvm_vcpu *vcpu) 1509 { 1510 tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret); 1511 return 1; 1512 } 1513 1514 static int tdx_get_quote(struct kvm_vcpu *vcpu) 1515 { 1516 struct vcpu_tdx *tdx = to_tdx(vcpu); 1517 u64 gpa = tdx->vp_enter_args.r12; 1518 u64 size = tdx->vp_enter_args.r13; 1519 1520 /* The gpa of buffer must have shared bit set. 
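   Neither KVM nor userspace can read a private GPA, so such a request is
   rejected below with TDVMCALL_STATUS_INVALID_OPERAND.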
*/ 1521 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { 1522 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1523 return 1; 1524 } 1525 1526 vcpu->run->exit_reason = KVM_EXIT_TDX; 1527 vcpu->run->tdx.flags = 0; 1528 vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE; 1529 vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; 1530 vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm)); 1531 vcpu->run->tdx.get_quote.size = size; 1532 1533 vcpu->arch.complete_userspace_io = tdx_complete_simple; 1534 1535 return 0; 1536 } 1537 1538 static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu) 1539 { 1540 struct vcpu_tdx *tdx = to_tdx(vcpu); 1541 u64 vector = tdx->vp_enter_args.r12; 1542 1543 if (vector < 32 || vector > 255) { 1544 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1545 return 1; 1546 } 1547 1548 vcpu->run->exit_reason = KVM_EXIT_TDX; 1549 vcpu->run->tdx.flags = 0; 1550 vcpu->run->tdx.nr = TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT; 1551 vcpu->run->tdx.setup_event_notify.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; 1552 vcpu->run->tdx.setup_event_notify.vector = vector; 1553 1554 vcpu->arch.complete_userspace_io = tdx_complete_simple; 1555 1556 return 0; 1557 } 1558 1559 static int handle_tdvmcall(struct kvm_vcpu *vcpu) 1560 { 1561 switch (tdvmcall_leaf(vcpu)) { 1562 case TDVMCALL_MAP_GPA: 1563 return tdx_map_gpa(vcpu); 1564 case TDVMCALL_REPORT_FATAL_ERROR: 1565 return tdx_report_fatal_error(vcpu); 1566 case TDVMCALL_GET_TD_VM_CALL_INFO: 1567 return tdx_get_td_vm_call_info(vcpu); 1568 case TDVMCALL_GET_QUOTE: 1569 return tdx_get_quote(vcpu); 1570 case TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: 1571 return tdx_setup_event_notify_interrupt(vcpu); 1572 default: 1573 break; 1574 } 1575 1576 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED); 1577 return 1; 1578 } 1579 1580 void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level) 1581 { 1582 u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 : 1583 TDX_SHARED_BIT_PWL_4; 1584 1585 if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm)) 1586 return; 1587 1588 td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa); 1589 } 1590 1591 static void tdx_unpin(struct kvm *kvm, struct page *page) 1592 { 1593 put_page(page); 1594 } 1595 1596 static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, 1597 enum pg_level level, struct page *page) 1598 { 1599 int tdx_level = pg_level_to_tdx_sept_level(level); 1600 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1601 gpa_t gpa = gfn_to_gpa(gfn); 1602 u64 entry, level_state; 1603 u64 err; 1604 1605 err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state); 1606 if (unlikely(tdx_operand_busy(err))) { 1607 tdx_unpin(kvm, page); 1608 return -EBUSY; 1609 } 1610 1611 if (KVM_BUG_ON(err, kvm)) { 1612 pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state); 1613 tdx_unpin(kvm, page); 1614 return -EIO; 1615 } 1616 1617 return 0; 1618 } 1619 1620 /* 1621 * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the 1622 * callback tdx_gmem_post_populate() then maps pages into private memory. 1623 * through the a seamcall TDH.MEM.PAGE.ADD(). The SEAMCALL also requires the 1624 * private EPT structures for the page to have been built before, which is 1625 * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that 1626 * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD(). 
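 * The counter is incremented in tdx_mem_page_record_premap_cnt() and is
 * decremented either when tdh_mem_page_add() is eventually issued for the
 * page or when a still-premapped entry is zapped in
 * tdx_sept_zap_private_spte().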
1627 * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there 1628 * are no half-initialized shared EPT pages. 1629 */ 1630 static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn, 1631 enum pg_level level, kvm_pfn_t pfn) 1632 { 1633 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1634 1635 if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm)) 1636 return -EINVAL; 1637 1638 /* nr_premapped will be decreased when tdh_mem_page_add() is called. */ 1639 atomic64_inc(&kvm_tdx->nr_premapped); 1640 return 0; 1641 } 1642 1643 int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, 1644 enum pg_level level, kvm_pfn_t pfn) 1645 { 1646 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1647 struct page *page = pfn_to_page(pfn); 1648 1649 /* TODO: handle large pages. */ 1650 if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) 1651 return -EINVAL; 1652 1653 /* 1654 * Because guest_memfd doesn't support page migration with 1655 * a_ops->migrate_folio (yet), no callback is triggered for KVM on page 1656 * migration. Until guest_memfd supports page migration, prevent page 1657 * migration. 1658 * TODO: Once guest_memfd introduces callback on page migration, 1659 * implement it and remove get_page/put_page(). 1660 */ 1661 get_page(page); 1662 1663 /* 1664 * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching 1665 * barrier in tdx_td_finalize(). 1666 */ 1667 smp_rmb(); 1668 if (likely(kvm_tdx->state == TD_STATE_RUNNABLE)) 1669 return tdx_mem_page_aug(kvm, gfn, level, page); 1670 1671 return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn); 1672 } 1673 1674 static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn, 1675 enum pg_level level, struct page *page) 1676 { 1677 int tdx_level = pg_level_to_tdx_sept_level(level); 1678 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1679 gpa_t gpa = gfn_to_gpa(gfn); 1680 u64 err, entry, level_state; 1681 1682 /* TODO: handle large pages. */ 1683 if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) 1684 return -EINVAL; 1685 1686 if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm)) 1687 return -EINVAL; 1688 1689 /* 1690 * When zapping private page, write lock is held. So no race condition 1691 * with other vcpu sept operation. 1692 * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs. 1693 */ 1694 err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, 1695 &level_state); 1696 1697 if (unlikely(tdx_operand_busy(err))) { 1698 /* 1699 * The second retry is expected to succeed after kicking off all 1700 * other vCPUs and prevent them from invoking TDH.VP.ENTER. 
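 * tdx_no_vcpus_enter_start() sets wait_for_sept_zap and kicks every vCPU out
 * of the guest, so nothing can contend for the SEPT via TDH.VP.ENTER while
 * the SEAMCALL is retried; tdx_no_vcpus_enter_stop() lifts the block again.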
1701 */ 1702 tdx_no_vcpus_enter_start(kvm); 1703 err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, 1704 &level_state); 1705 tdx_no_vcpus_enter_stop(kvm); 1706 } 1707 1708 if (KVM_BUG_ON(err, kvm)) { 1709 pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state); 1710 return -EIO; 1711 } 1712 1713 err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page); 1714 1715 if (KVM_BUG_ON(err, kvm)) { 1716 pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); 1717 return -EIO; 1718 } 1719 tdx_clear_page(page); 1720 tdx_unpin(kvm, page); 1721 return 0; 1722 } 1723 1724 int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, 1725 enum pg_level level, void *private_spt) 1726 { 1727 int tdx_level = pg_level_to_tdx_sept_level(level); 1728 gpa_t gpa = gfn_to_gpa(gfn); 1729 struct page *page = virt_to_page(private_spt); 1730 u64 err, entry, level_state; 1731 1732 err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry, 1733 &level_state); 1734 if (unlikely(tdx_operand_busy(err))) 1735 return -EBUSY; 1736 1737 if (KVM_BUG_ON(err, kvm)) { 1738 pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state); 1739 return -EIO; 1740 } 1741 1742 return 0; 1743 } 1744 1745 /* 1746 * Check if the error returned from a SEPT zap SEAMCALL is due to that a page is 1747 * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called 1748 * successfully. 1749 * 1750 * Since tdh_mem_sept_add() must have been invoked successfully before a 1751 * non-leaf entry present in the mirrored page table, the SEPT ZAP related 1752 * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead 1753 * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the 1754 * SEPT. 1755 * 1756 * Further check if the returned entry from SEPT walking is with RWX permissions 1757 * to filter out anything unexpected. 1758 * 1759 * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from 1760 * level_state returned from a SEAMCALL error is the same as that passed into 1761 * the SEAMCALL. 1762 */ 1763 static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err, 1764 u64 entry, int level) 1765 { 1766 if (!err || kvm_tdx->state == TD_STATE_RUNNABLE) 1767 return false; 1768 1769 if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX)) 1770 return false; 1771 1772 if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK))) 1773 return false; 1774 1775 return true; 1776 } 1777 1778 static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn, 1779 enum pg_level level, struct page *page) 1780 { 1781 int tdx_level = pg_level_to_tdx_sept_level(level); 1782 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1783 gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level); 1784 u64 err, entry, level_state; 1785 1786 /* For now large page isn't supported yet. 
*/ 1787 WARN_ON_ONCE(level != PG_LEVEL_4K); 1788 1789 err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); 1790 1791 if (unlikely(tdx_operand_busy(err))) { 1792 /* After no vCPUs enter, the second retry is expected to succeed */ 1793 tdx_no_vcpus_enter_start(kvm); 1794 err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); 1795 tdx_no_vcpus_enter_stop(kvm); 1796 } 1797 if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) && 1798 !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) { 1799 atomic64_dec(&kvm_tdx->nr_premapped); 1800 tdx_unpin(kvm, page); 1801 return 0; 1802 } 1803 1804 if (KVM_BUG_ON(err, kvm)) { 1805 pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state); 1806 return -EIO; 1807 } 1808 return 1; 1809 } 1810 1811 /* 1812 * Ensure shared and private EPTs to be flushed on all vCPUs. 1813 * tdh_mem_track() is the only caller that increases TD epoch. An increase in 1814 * the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are 1815 * running in guest mode with the value "N - 1". 1816 * 1817 * A successful execution of tdh_mem_track() ensures that vCPUs can only run in 1818 * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch 1819 * being increased to "N + 1". 1820 * 1821 * Kicking off all vCPUs after that further results in no vCPUs can run in guest 1822 * mode with TD epoch value "N", which unblocks the next tdh_mem_track() (e.g. 1823 * to increase TD epoch to "N + 2"). 1824 * 1825 * TDX module will flush EPT on the next TD enter and make vCPUs to run in 1826 * guest mode with TD epoch value "N + 1". 1827 * 1828 * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by 1829 * waiting empty IPI handler ack_kick(). 1830 * 1831 * No action is required to the vCPUs being kicked off since the kicking off 1832 * occurs certainly after TD epoch increment and before the next 1833 * tdh_mem_track(). 1834 */ 1835 static void tdx_track(struct kvm *kvm) 1836 { 1837 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1838 u64 err; 1839 1840 /* If TD isn't finalized, it's before any vcpu running. */ 1841 if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE)) 1842 return; 1843 1844 lockdep_assert_held_write(&kvm->mmu_lock); 1845 1846 err = tdh_mem_track(&kvm_tdx->td); 1847 if (unlikely(tdx_operand_busy(err))) { 1848 /* After no vCPUs enter, the second retry is expected to succeed */ 1849 tdx_no_vcpus_enter_start(kvm); 1850 err = tdh_mem_track(&kvm_tdx->td); 1851 tdx_no_vcpus_enter_stop(kvm); 1852 } 1853 1854 if (KVM_BUG_ON(err, kvm)) 1855 pr_tdx_error(TDH_MEM_TRACK, err); 1856 1857 kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); 1858 } 1859 1860 int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, 1861 enum pg_level level, void *private_spt) 1862 { 1863 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1864 1865 /* 1866 * free_external_spt() is only called after hkid is freed when TD is 1867 * tearing down. 1868 * KVM doesn't (yet) zap page table pages in mirror page table while 1869 * TD is active, though guest pages mapped in mirror page table could be 1870 * zapped during TD is active, e.g. for shared <-> private conversion 1871 * and slot move/deletion. 1872 */ 1873 if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm)) 1874 return -EINVAL; 1875 1876 /* 1877 * The HKID assigned to this TD was already freed and cache was 1878 * already flushed. We don't have to flush again. 
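 * Hence tdx_reclaim_page() below only needs TDH.PHYMEM.PAGE.RECLAIM plus the
 * MOVDIR64B clear in tdx_clear_page(); no per-page WBINVD is required, unlike
 * the tdx_sept_drop_private_spte() path, which runs while the HKID is still
 * assigned.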
1879 */ 1880 return tdx_reclaim_page(virt_to_page(private_spt)); 1881 } 1882 1883 int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, 1884 enum pg_level level, kvm_pfn_t pfn) 1885 { 1886 struct page *page = pfn_to_page(pfn); 1887 int ret; 1888 1889 /* 1890 * HKID is released after all private pages have been removed, and set 1891 * before any might be populated. Warn if zapping is attempted when 1892 * there can't be anything populated in the private EPT. 1893 */ 1894 if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm)) 1895 return -EINVAL; 1896 1897 ret = tdx_sept_zap_private_spte(kvm, gfn, level, page); 1898 if (ret <= 0) 1899 return ret; 1900 1901 /* 1902 * TDX requires TLB tracking before dropping private page. Do 1903 * it here, although it is also done later. 1904 */ 1905 tdx_track(kvm); 1906 1907 return tdx_sept_drop_private_spte(kvm, gfn, level, page); 1908 } 1909 1910 void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, 1911 int trig_mode, int vector) 1912 { 1913 struct kvm_vcpu *vcpu = apic->vcpu; 1914 struct vcpu_tdx *tdx = to_tdx(vcpu); 1915 1916 /* TDX supports only posted interrupt. No lapic emulation. */ 1917 __vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector); 1918 1919 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); 1920 } 1921 1922 static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu) 1923 { 1924 u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK; 1925 u64 eq = vmx_get_exit_qual(vcpu); 1926 1927 if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION) 1928 return false; 1929 1930 return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN); 1931 } 1932 1933 static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu) 1934 { 1935 unsigned long exit_qual; 1936 gpa_t gpa = to_tdx(vcpu)->exit_gpa; 1937 bool local_retry = false; 1938 int ret; 1939 1940 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { 1941 if (tdx_is_sept_violation_unexpected_pending(vcpu)) { 1942 pr_warn("Guest access before accepting 0x%llx on vCPU %d\n", 1943 gpa, vcpu->vcpu_id); 1944 kvm_vm_dead(vcpu->kvm); 1945 return -EIO; 1946 } 1947 /* 1948 * Always treat SEPT violations as write faults. Ignore the 1949 * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations. 1950 * TD private pages are always RWX in the SEPT tables, 1951 * i.e. they're always mapped writable. Just as importantly, 1952 * treating SEPT violations as write faults is necessary to 1953 * avoid COW allocations, which will cause TDAUGPAGE failures 1954 * due to aliasing a single HPA to multiple GPAs. 1955 */ 1956 exit_qual = EPT_VIOLATION_ACC_WRITE; 1957 1958 /* Only private GPA triggers zero-step mitigation */ 1959 local_retry = true; 1960 } else { 1961 exit_qual = vmx_get_exit_qual(vcpu); 1962 /* 1963 * EPT violation due to instruction fetch should never be 1964 * triggered from shared memory in TDX guest. If such EPT 1965 * violation occurs, treat it as broken hardware. 1966 */ 1967 if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm)) 1968 return -EIO; 1969 } 1970 1971 trace_kvm_page_fault(vcpu, gpa, exit_qual); 1972 1973 /* 1974 * To minimize TDH.VP.ENTER invocations, retry locally for private GPA 1975 * mapping in TDX. 1976 * 1977 * KVM may return RET_PF_RETRY for private GPA due to 1978 * - contentions when atomically updating SPTEs of the mirror page table 1979 * - in-progress GFN invalidation or memslot removal. 
1980 * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD, 1981 * caused by contentions with TDH.VP.ENTER (with zero-step mitigation) 1982 * or certain TDCALLs. 1983 * 1984 * If TDH.VP.ENTER is invoked more times than the threshold set by the 1985 * TDX module before KVM resolves the private GPA mapping, the TDX 1986 * module will activate zero-step mitigation during TDH.VP.ENTER. This 1987 * process acquires an SEPT tree lock in the TDX module, leading to 1988 * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD 1989 * operations on other vCPUs. 1990 * 1991 * Breaking out of local retries for kvm_vcpu_has_events() is for 1992 * interrupt injection. kvm_vcpu_has_events() should not see pending 1993 * events for TDX. Since KVM can't determine if IRQs (or NMIs) are 1994 * blocked by TDs, false positives are inevitable i.e., KVM may re-enter 1995 * the guest even if the IRQ/NMI can't be delivered. 1996 * 1997 * Note: even without breaking out of local retries, zero-step 1998 * mitigation may still occur due to 1999 * - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT, 2000 * - a single RIP causing EPT violations for more GFNs than the 2001 * threshold count. 2002 * This is safe, as triggering zero-step mitigation only introduces 2003 * contentions to page installation SEAMCALLs on other vCPUs, which will 2004 * handle retries locally in their EPT violation handlers. 2005 */ 2006 while (1) { 2007 ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual); 2008 2009 if (ret != RET_PF_RETRY || !local_retry) 2010 break; 2011 2012 if (kvm_vcpu_has_events(vcpu) || signal_pending(current)) 2013 break; 2014 2015 if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) { 2016 ret = -EIO; 2017 break; 2018 } 2019 2020 cond_resched(); 2021 } 2022 return ret; 2023 } 2024 2025 int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) 2026 { 2027 if (err) { 2028 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 2029 return 1; 2030 } 2031 2032 if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ) 2033 tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu)); 2034 2035 return 1; 2036 } 2037 2038 2039 int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath) 2040 { 2041 struct vcpu_tdx *tdx = to_tdx(vcpu); 2042 u64 vp_enter_ret = tdx->vp_enter_ret; 2043 union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); 2044 2045 if (fastpath != EXIT_FASTPATH_NONE) 2046 return 1; 2047 2048 if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) { 2049 KVM_BUG_ON(1, vcpu->kvm); 2050 return -EIO; 2051 } 2052 2053 /* 2054 * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and 2055 * TDX_SEAMCALL_VMFAILINVALID. 2056 */ 2057 if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) { 2058 KVM_BUG_ON(!kvm_rebooting, vcpu->kvm); 2059 goto unhandled_exit; 2060 } 2061 2062 if (unlikely(tdx_failed_vmentry(vcpu))) { 2063 /* 2064 * If the guest state is protected, that means off-TD debug is 2065 * not enabled, TDX_NON_RECOVERABLE must be set. 
2066 */ 2067 WARN_ON_ONCE(vcpu->arch.guest_state_protected && 2068 !(vp_enter_ret & TDX_NON_RECOVERABLE)); 2069 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2070 vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full; 2071 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; 2072 return 0; 2073 } 2074 2075 if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) && 2076 exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) { 2077 kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret); 2078 goto unhandled_exit; 2079 } 2080 2081 WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT && 2082 (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS); 2083 2084 switch (exit_reason.basic) { 2085 case EXIT_REASON_TRIPLE_FAULT: 2086 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 2087 vcpu->mmio_needed = 0; 2088 return 0; 2089 case EXIT_REASON_EXCEPTION_NMI: 2090 return tdx_handle_exception_nmi(vcpu); 2091 case EXIT_REASON_EXTERNAL_INTERRUPT: 2092 ++vcpu->stat.irq_exits; 2093 return 1; 2094 case EXIT_REASON_CPUID: 2095 return tdx_emulate_cpuid(vcpu); 2096 case EXIT_REASON_HLT: 2097 return kvm_emulate_halt_noskip(vcpu); 2098 case EXIT_REASON_TDCALL: 2099 return handle_tdvmcall(vcpu); 2100 case EXIT_REASON_VMCALL: 2101 return tdx_emulate_vmcall(vcpu); 2102 case EXIT_REASON_IO_INSTRUCTION: 2103 return tdx_emulate_io(vcpu); 2104 case EXIT_REASON_MSR_READ: 2105 kvm_rcx_write(vcpu, tdx->vp_enter_args.r12); 2106 return kvm_emulate_rdmsr(vcpu); 2107 case EXIT_REASON_MSR_WRITE: 2108 kvm_rcx_write(vcpu, tdx->vp_enter_args.r12); 2109 kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u); 2110 kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32); 2111 return kvm_emulate_wrmsr(vcpu); 2112 case EXIT_REASON_EPT_MISCONFIG: 2113 return tdx_emulate_mmio(vcpu); 2114 case EXIT_REASON_EPT_VIOLATION: 2115 return tdx_handle_ept_violation(vcpu); 2116 case EXIT_REASON_OTHER_SMI: 2117 /* 2118 * Unlike VMX, SMI in SEAM non-root mode (i.e. when 2119 * TD guest vCPU is running) will cause VM exit to TDX module, 2120 * then SEAMRET to KVM. Once it exits to KVM, SMI is delivered 2121 * and handled by kernel handler right away. 2122 * 2123 * The Other SMI exit can also be caused by the SEAM non-root 2124 * machine check delivered via Machine Check System Management 2125 * Interrupt (MSMI), but it has already been handled by the 2126 * kernel machine check handler, i.e., the memory page has been 2127 * marked as poisoned and it won't be freed to the free list 2128 * when the TDX guest is terminated (the TDX module marks the 2129 * guest as dead and prevent it from further running when 2130 * machine check happens in SEAM non-root). 2131 * 2132 * - A MSMI will not reach here, it's handled as non_recoverable 2133 * case above. 2134 * - If it's not an MSMI, no need to do anything here. 
2135 */ 2136 return 1; 2137 default: 2138 break; 2139 } 2140 2141 unhandled_exit: 2142 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 2143 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 2144 vcpu->run->internal.ndata = 2; 2145 vcpu->run->internal.data[0] = vp_enter_ret; 2146 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; 2147 return 0; 2148 } 2149 2150 void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, 2151 u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) 2152 { 2153 struct vcpu_tdx *tdx = to_tdx(vcpu); 2154 2155 *reason = tdx->vt.exit_reason.full; 2156 if (*reason != -1u) { 2157 *info1 = vmx_get_exit_qual(vcpu); 2158 *info2 = tdx->ext_exit_qualification; 2159 *intr_info = vmx_get_intr_info(vcpu); 2160 } else { 2161 *info1 = 0; 2162 *info2 = 0; 2163 *intr_info = 0; 2164 } 2165 2166 *error_code = 0; 2167 } 2168 2169 bool tdx_has_emulated_msr(u32 index) 2170 { 2171 switch (index) { 2172 case MSR_IA32_UCODE_REV: 2173 case MSR_IA32_ARCH_CAPABILITIES: 2174 case MSR_IA32_POWER_CTL: 2175 case MSR_IA32_CR_PAT: 2176 case MSR_MTRRcap: 2177 case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: 2178 case MSR_MTRRdefType: 2179 case MSR_IA32_TSC_DEADLINE: 2180 case MSR_IA32_MISC_ENABLE: 2181 case MSR_PLATFORM_INFO: 2182 case MSR_MISC_FEATURES_ENABLES: 2183 case MSR_IA32_APICBASE: 2184 case MSR_EFER: 2185 case MSR_IA32_FEAT_CTL: 2186 case MSR_IA32_MCG_CAP: 2187 case MSR_IA32_MCG_STATUS: 2188 case MSR_IA32_MCG_CTL: 2189 case MSR_IA32_MCG_EXT_CTL: 2190 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: 2191 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: 2192 /* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */ 2193 case MSR_KVM_POLL_CONTROL: 2194 return true; 2195 case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: 2196 /* 2197 * x2APIC registers that are virtualized by the CPU can't be 2198 * emulated, KVM doesn't have access to the virtual APIC page. 2199 */ 2200 switch (index) { 2201 case X2APIC_MSR(APIC_TASKPRI): 2202 case X2APIC_MSR(APIC_PROCPRI): 2203 case X2APIC_MSR(APIC_EOI): 2204 case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR): 2205 case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR): 2206 case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR): 2207 return false; 2208 default: 2209 return true; 2210 } 2211 default: 2212 return false; 2213 } 2214 } 2215 2216 static bool tdx_is_read_only_msr(u32 index) 2217 { 2218 return index == MSR_IA32_APICBASE || index == MSR_EFER || 2219 index == MSR_IA32_FEAT_CTL; 2220 } 2221 2222 int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 2223 { 2224 switch (msr->index) { 2225 case MSR_IA32_FEAT_CTL: 2226 /* 2227 * MCE and MCA are advertised via cpuid. Guest kernel could 2228 * check if LMCE is enabled or not. 
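 * Report FEAT_CTL as locked, and advertise LMCE only when the vCPU's MCG
 * capabilities include MCG_LMCE_P.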
2229 */
2230 msr->data = FEAT_CTL_LOCKED;
2231 if (vcpu->arch.mcg_cap & MCG_LMCE_P)
2232 msr->data |= FEAT_CTL_LMCE_ENABLED;
2233 return 0;
2234 case MSR_IA32_MCG_EXT_CTL:
2235 if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
2236 return 1;
2237 msr->data = vcpu->arch.mcg_ext_ctl;
2238 return 0;
2239 default:
2240 if (!tdx_has_emulated_msr(msr->index))
2241 return 1;
2242
2243 return kvm_get_msr_common(vcpu, msr);
2244 }
2245 }
2246
2247 int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2248 {
2249 switch (msr->index) {
2250 case MSR_IA32_MCG_EXT_CTL:
2251 if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
2252 (msr->data & ~MCG_EXT_CTL_LMCE_EN))
2253 return 1;
2254 vcpu->arch.mcg_ext_ctl = msr->data;
2255 return 0;
2256 default:
2257 if (tdx_is_read_only_msr(msr->index))
2258 return 1;
2259
2260 if (!tdx_has_emulated_msr(msr->index))
2261 return 1;
2262
2263 return kvm_set_msr_common(vcpu, msr);
2264 }
2265 }
2266
2267 static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
2268 {
2269 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2270 struct kvm_tdx_capabilities __user *user_caps;
2271 struct kvm_tdx_capabilities *caps = NULL;
2272 u32 nr_user_entries;
2273 int ret = 0;
2274
2275 /* flags is reserved for future use */
2276 if (cmd->flags)
2277 return -EINVAL;
2278
2279 caps = kzalloc(sizeof(*caps) +
2280 sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config,
2281 GFP_KERNEL);
2282 if (!caps)
2283 return -ENOMEM;
2284
2285 user_caps = u64_to_user_ptr(cmd->data);
2286 if (get_user(nr_user_entries, &user_caps->cpuid.nent)) {
2287 ret = -EFAULT;
2288 goto out;
2289 }
2290
2291 if (nr_user_entries < td_conf->num_cpuid_config) {
2292 ret = -E2BIG;
2293 goto out;
2294 }
2295
2296 ret = init_kvm_tdx_caps(td_conf, caps);
2297 if (ret)
2298 goto out;
2299
2300 if (copy_to_user(user_caps, caps, sizeof(*caps))) {
2301 ret = -EFAULT;
2302 goto out;
2303 }
2304
2305 if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries,
2306 caps->cpuid.nent *
2307 sizeof(caps->cpuid.entries[0])))
2308 ret = -EFAULT;
2309
2310 out:
2311 /* kfree() accepts NULL. */
2312 kfree(caps);
2313 return ret;
2314 }
2315
2316 /*
2317 * KVM reports the guest physical address width in CPUID.0x80000008.EAX[23:16], which is
2318 * similar to TDX's GPAW. Use this field as the interface for userspace to
2319 * configure the GPAW and EPT level for TDs.
2320 *
2321 * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level
2322 * 5; value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always
2323 * supported. Value 52 is only supported when the platform supports 5-level
2324 * EPT.
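 * setup_tdparams_eptp_controls() below translates the value into the EPT
 * page-walk level and, for 52, the MAX_GPAW config flag.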
*/
2326 static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
2327 struct td_params *td_params)
2328 {
2329 const struct kvm_cpuid_entry2 *entry;
2330 int guest_pa;
2331
2332 entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0);
2333 if (!entry)
2334 return -EINVAL;
2335
2336 guest_pa = tdx_get_guest_phys_addr_bits(entry->eax);
2337
2338 if (guest_pa != 48 && guest_pa != 52)
2339 return -EINVAL;
2340
2341 if (guest_pa == 52 && !cpu_has_vmx_ept_5levels())
2342 return -EINVAL;
2343
2344 td_params->eptp_controls = VMX_EPTP_MT_WB;
2345 if (guest_pa == 52) {
2346 td_params->eptp_controls |= VMX_EPTP_PWL_5;
2347 td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW;
2348 } else {
2349 td_params->eptp_controls |= VMX_EPTP_PWL_4;
2350 }
2351
2352 return 0;
2353 }
2354
2355 static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
2356 struct td_params *td_params)
2357 {
2358 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2359 const struct kvm_cpuid_entry2 *entry;
2360 struct tdx_cpuid_value *value;
2361 int i, copy_cnt = 0;
2362
2363 /*
2364 * td_params.cpuid_values: The number and the order of cpuid_value must
2365 * be the same as those of struct tdsysinfo.{num_cpuid_config, cpuid_configs}.
2366 * It's assumed that td_params was zeroed.
2367 */
2368 for (i = 0; i < td_conf->num_cpuid_config; i++) {
2369 struct kvm_cpuid_entry2 tmp;
2370
2371 td_init_cpuid_entry2(&tmp, i);
2372
2373 entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
2374 tmp.function, tmp.index);
2375 if (!entry)
2376 continue;
2377
2378 if (tdx_unsupported_cpuid(entry))
2379 return -EINVAL;
2380
2381 copy_cnt++;
2382
2383 value = &td_params->cpuid_values[i];
2384 value->eax = entry->eax;
2385 value->ebx = entry->ebx;
2386 value->ecx = entry->ecx;
2387 value->edx = entry->edx;
2388
2389 /*
2390 * The TDX module does not accept nonzero bits 16..23 for
2391 * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls().
2392 */
2393 if (tmp.function == 0x80000008)
2394 value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0);
2395 }
2396
2397 /*
2398 * Rely on the TDX module to reject invalid configuration, but it can't
2399 * check for leaves that don't have a proper slot in td_params->cpuid_values
2400 * to stick them in. So fail if there were entries that didn't get copied to
2401 * td_params.
2402 */ 2403 if (copy_cnt != cpuid->nent) 2404 return -EINVAL; 2405 2406 return 0; 2407 } 2408 2409 static int setup_tdparams(struct kvm *kvm, struct td_params *td_params, 2410 struct kvm_tdx_init_vm *init_vm) 2411 { 2412 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; 2413 struct kvm_cpuid2 *cpuid = &init_vm->cpuid; 2414 int ret; 2415 2416 if (kvm->created_vcpus) 2417 return -EBUSY; 2418 2419 if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf)) 2420 return -EINVAL; 2421 2422 if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf)) 2423 return -EINVAL; 2424 2425 td_params->max_vcpus = kvm->max_vcpus; 2426 td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1; 2427 td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1; 2428 2429 td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD; 2430 td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz); 2431 2432 ret = setup_tdparams_eptp_controls(cpuid, td_params); 2433 if (ret) 2434 return ret; 2435 2436 ret = setup_tdparams_cpuids(cpuid, td_params); 2437 if (ret) 2438 return ret; 2439 2440 #define MEMCPY_SAME_SIZE(dst, src) \ 2441 do { \ 2442 BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \ 2443 memcpy((dst), (src), sizeof(dst)); \ 2444 } while (0) 2445 2446 MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid); 2447 MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner); 2448 MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig); 2449 2450 return 0; 2451 } 2452 2453 static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, 2454 u64 *seamcall_err) 2455 { 2456 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 2457 cpumask_var_t packages; 2458 struct page **tdcs_pages = NULL; 2459 struct page *tdr_page; 2460 int ret, i; 2461 u64 err, rcx; 2462 2463 *seamcall_err = 0; 2464 ret = tdx_guest_keyid_alloc(); 2465 if (ret < 0) 2466 return ret; 2467 kvm_tdx->hkid = ret; 2468 kvm_tdx->misc_cg = get_current_misc_cg(); 2469 ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); 2470 if (ret) 2471 goto free_hkid; 2472 2473 ret = -ENOMEM; 2474 2475 atomic_inc(&nr_configured_hkid); 2476 2477 tdr_page = alloc_page(GFP_KERNEL); 2478 if (!tdr_page) 2479 goto free_hkid; 2480 2481 kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE; 2482 /* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */ 2483 kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1; 2484 tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages), 2485 GFP_KERNEL | __GFP_ZERO); 2486 if (!tdcs_pages) 2487 goto free_tdr; 2488 2489 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { 2490 tdcs_pages[i] = alloc_page(GFP_KERNEL); 2491 if (!tdcs_pages[i]) 2492 goto free_tdcs; 2493 } 2494 2495 if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) 2496 goto free_tdcs; 2497 2498 cpus_read_lock(); 2499 2500 /* 2501 * Need at least one CPU of the package to be online in order to 2502 * program all packages for host key id. Check it. 2503 */ 2504 for_each_present_cpu(i) 2505 cpumask_set_cpu(topology_physical_package_id(i), packages); 2506 for_each_online_cpu(i) 2507 cpumask_clear_cpu(topology_physical_package_id(i), packages); 2508 if (!cpumask_empty(packages)) { 2509 ret = -EIO; 2510 /* 2511 * Because it's hard for human operator to figure out the 2512 * reason, warn it. 2513 */ 2514 #define MSG_ALLPKG "All packages need to have online CPU to create TD. 
Online CPU and retry.\n" 2515 pr_warn_ratelimited(MSG_ALLPKG); 2516 goto free_packages; 2517 } 2518 2519 /* 2520 * TDH.MNG.CREATE tries to grab the global TDX module and fails 2521 * with TDX_OPERAND_BUSY when it fails to grab. Take the global 2522 * lock to prevent it from failure. 2523 */ 2524 mutex_lock(&tdx_lock); 2525 kvm_tdx->td.tdr_page = tdr_page; 2526 err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid); 2527 mutex_unlock(&tdx_lock); 2528 2529 if (err == TDX_RND_NO_ENTROPY) { 2530 ret = -EAGAIN; 2531 goto free_packages; 2532 } 2533 2534 if (WARN_ON_ONCE(err)) { 2535 pr_tdx_error(TDH_MNG_CREATE, err); 2536 ret = -EIO; 2537 goto free_packages; 2538 } 2539 2540 for_each_online_cpu(i) { 2541 int pkg = topology_physical_package_id(i); 2542 2543 if (cpumask_test_and_set_cpu(pkg, packages)) 2544 continue; 2545 2546 /* 2547 * Program the memory controller in the package with an 2548 * encryption key associated to a TDX private host key id 2549 * assigned to this TDR. Concurrent operations on same memory 2550 * controller results in TDX_OPERAND_BUSY. No locking needed 2551 * beyond the cpus_read_lock() above as it serializes against 2552 * hotplug and the first online CPU of the package is always 2553 * used. We never have two CPUs in the same socket trying to 2554 * program the key. 2555 */ 2556 ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config, 2557 kvm_tdx, true); 2558 if (ret) 2559 break; 2560 } 2561 cpus_read_unlock(); 2562 free_cpumask_var(packages); 2563 if (ret) { 2564 i = 0; 2565 goto teardown; 2566 } 2567 2568 kvm_tdx->td.tdcs_pages = tdcs_pages; 2569 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { 2570 err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]); 2571 if (err == TDX_RND_NO_ENTROPY) { 2572 /* Here it's hard to allow userspace to retry. */ 2573 ret = -EAGAIN; 2574 goto teardown; 2575 } 2576 if (WARN_ON_ONCE(err)) { 2577 pr_tdx_error(TDH_MNG_ADDCX, err); 2578 ret = -EIO; 2579 goto teardown; 2580 } 2581 } 2582 2583 err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx); 2584 if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) { 2585 /* 2586 * Because a user gives operands, don't warn. 2587 * Return a hint to the user because it's sometimes hard for the 2588 * user to figure out which operand is invalid. SEAMCALL status 2589 * code includes which operand caused invalid operand error. 2590 */ 2591 *seamcall_err = err; 2592 ret = -EINVAL; 2593 goto teardown; 2594 } else if (WARN_ON_ONCE(err)) { 2595 pr_tdx_error_1(TDH_MNG_INIT, err, rcx); 2596 ret = -EIO; 2597 goto teardown; 2598 } 2599 2600 return 0; 2601 2602 /* 2603 * The sequence for freeing resources from a partially initialized TD 2604 * varies based on where in the initialization flow failure occurred. 2605 * Simply use the full teardown and destroy, which naturally play nice 2606 * with partial initialization. 
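 * Both tdx_mmu_release_hkid() and tdx_reclaim_td_control_pages() in the
 * teardown path below cope with a partially constructed TD.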
2607 */ 2608 teardown: 2609 /* Only free pages not yet added, so start at 'i' */ 2610 for (; i < kvm_tdx->td.tdcs_nr_pages; i++) { 2611 if (tdcs_pages[i]) { 2612 __free_page(tdcs_pages[i]); 2613 tdcs_pages[i] = NULL; 2614 } 2615 } 2616 if (!kvm_tdx->td.tdcs_pages) 2617 kfree(tdcs_pages); 2618 2619 tdx_mmu_release_hkid(kvm); 2620 tdx_reclaim_td_control_pages(kvm); 2621 2622 return ret; 2623 2624 free_packages: 2625 cpus_read_unlock(); 2626 free_cpumask_var(packages); 2627 2628 free_tdcs: 2629 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { 2630 if (tdcs_pages[i]) 2631 __free_page(tdcs_pages[i]); 2632 } 2633 kfree(tdcs_pages); 2634 kvm_tdx->td.tdcs_pages = NULL; 2635 2636 free_tdr: 2637 if (tdr_page) 2638 __free_page(tdr_page); 2639 kvm_tdx->td.tdr_page = 0; 2640 2641 free_hkid: 2642 tdx_hkid_free(kvm_tdx); 2643 2644 return ret; 2645 } 2646 2647 static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id, 2648 u64 *data) 2649 { 2650 u64 err; 2651 2652 err = tdh_mng_rd(&tdx->td, field_id, data); 2653 2654 return err; 2655 } 2656 2657 #define TDX_MD_UNREADABLE_LEAF_MASK GENMASK(30, 7) 2658 #define TDX_MD_UNREADABLE_SUBLEAF_MASK GENMASK(31, 7) 2659 2660 static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf, 2661 bool sub_leaf_set, int *entry_index, 2662 struct kvm_cpuid_entry2 *out) 2663 { 2664 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 2665 u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES; 2666 u64 ebx_eax, edx_ecx; 2667 u64 err = 0; 2668 2669 if (sub_leaf > 0b1111111) 2670 return -EINVAL; 2671 2672 if (*entry_index >= KVM_MAX_CPUID_ENTRIES) 2673 return -EINVAL; 2674 2675 if (leaf & TDX_MD_UNREADABLE_LEAF_MASK || 2676 sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK) 2677 return -EINVAL; 2678 2679 /* 2680 * bit 23:17, REVSERVED: reserved, must be 0; 2681 * bit 16, LEAF_31: leaf number bit 31; 2682 * bit 15:9, LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are 2683 * implicitly 0; 2684 * bit 8, SUBLEAF_NA: sub-leaf not applicable flag; 2685 * bit 7:1, SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1, 2686 * the SUBLEAF_6_0 is all-1. 2687 * sub-leaf bits 31:7 are implicitly 0; 2688 * bit 0, ELEMENT_I: Element index within field; 2689 */ 2690 field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16; 2691 field_id |= (leaf & 0x7f) << 9; 2692 if (sub_leaf_set) 2693 field_id |= (sub_leaf & 0x7f) << 1; 2694 else 2695 field_id |= 0x1fe; 2696 2697 err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax); 2698 if (err) //TODO check for specific errors 2699 goto err_out; 2700 2701 out->eax = (u32) ebx_eax; 2702 out->ebx = (u32) (ebx_eax >> 32); 2703 2704 field_id++; 2705 err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx); 2706 /* 2707 * It's weird that reading edx_ecx fails while reading ebx_eax 2708 * succeeded. 2709 */ 2710 if (WARN_ON_ONCE(err)) 2711 goto err_out; 2712 2713 out->ecx = (u32) edx_ecx; 2714 out->edx = (u32) (edx_ecx >> 32); 2715 2716 out->function = leaf; 2717 out->index = sub_leaf; 2718 out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0; 2719 2720 /* 2721 * Work around missing support on old TDX modules, fetch 2722 * guest maxpa from gfn_direct_bits. 
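 * KVM sets gfn_direct_bits to just the TDX shared bit (GPA bit 47 or 51, see
 * tdx_td_init()), so __ffs() + 1 below recovers a guest MAXPA of 48 or 52.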
2723 */ 2724 if (leaf == 0x80000008) { 2725 gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm)); 2726 unsigned int g_maxpa = __ffs(gpa_bits) + 1; 2727 2728 out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa); 2729 } 2730 2731 (*entry_index)++; 2732 2733 return 0; 2734 2735 err_out: 2736 out->eax = 0; 2737 out->ebx = 0; 2738 out->ecx = 0; 2739 out->edx = 0; 2740 2741 return -EIO; 2742 } 2743 2744 static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd) 2745 { 2746 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 2747 struct kvm_tdx_init_vm *init_vm; 2748 struct td_params *td_params = NULL; 2749 int ret; 2750 2751 BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid)); 2752 BUILD_BUG_ON(sizeof(struct td_params) != 1024); 2753 2754 if (kvm_tdx->state != TD_STATE_UNINITIALIZED) 2755 return -EINVAL; 2756 2757 if (cmd->flags) 2758 return -EINVAL; 2759 2760 init_vm = kmalloc(sizeof(*init_vm) + 2761 sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES, 2762 GFP_KERNEL); 2763 if (!init_vm) 2764 return -ENOMEM; 2765 2766 if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) { 2767 ret = -EFAULT; 2768 goto out; 2769 } 2770 2771 if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) { 2772 ret = -E2BIG; 2773 goto out; 2774 } 2775 2776 if (copy_from_user(init_vm->cpuid.entries, 2777 u64_to_user_ptr(cmd->data) + sizeof(*init_vm), 2778 flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) { 2779 ret = -EFAULT; 2780 goto out; 2781 } 2782 2783 if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) { 2784 ret = -EINVAL; 2785 goto out; 2786 } 2787 2788 if (init_vm->cpuid.padding) { 2789 ret = -EINVAL; 2790 goto out; 2791 } 2792 2793 td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL); 2794 if (!td_params) { 2795 ret = -ENOMEM; 2796 goto out; 2797 } 2798 2799 ret = setup_tdparams(kvm, td_params, init_vm); 2800 if (ret) 2801 goto out; 2802 2803 ret = __tdx_td_init(kvm, td_params, &cmd->hw_error); 2804 if (ret) 2805 goto out; 2806 2807 kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET); 2808 kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER); 2809 kvm_tdx->attributes = td_params->attributes; 2810 kvm_tdx->xfam = td_params->xfam; 2811 2812 if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW) 2813 kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5; 2814 else 2815 kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4; 2816 2817 kvm_tdx->state = TD_STATE_INITIALIZED; 2818 out: 2819 /* kfree() accepts NULL. */ 2820 kfree(init_vm); 2821 kfree(td_params); 2822 2823 return ret; 2824 } 2825 2826 void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) 2827 { 2828 /* 2829 * flush_tlb_current() is invoked when the first time for the vcpu to 2830 * run or when root of shared EPT is invalidated. 2831 * KVM only needs to flush shared EPT because the TDX module handles TLB 2832 * invalidation for private EPT in tdh_vp_enter(); 2833 * 2834 * A single context invalidation for shared EPT can be performed here. 2835 * However, this single context invalidation requires the private EPTP 2836 * rather than the shared EPTP to flush shared EPT, as shared EPT uses 2837 * private EPTP as its ASID for TLB invalidation. 2838 * 2839 * To avoid reading back private EPTP, perform a global invalidation for 2840 * shared EPT instead to keep this function simple. 
2841 */ 2842 ept_sync_global(); 2843 } 2844 2845 void tdx_flush_tlb_all(struct kvm_vcpu *vcpu) 2846 { 2847 /* 2848 * TDX has called tdx_track() in tdx_sept_remove_private_spte() to 2849 * ensure that private EPT will be flushed on the next TD enter. No need 2850 * to call tdx_track() here again even when this callback is a result of 2851 * zapping private EPT. 2852 * 2853 * Due to the lack of the context to determine which EPT has been 2854 * affected by zapping, invoke invept() directly here for both shared 2855 * EPT and private EPT for simplicity, though it's not necessary for 2856 * private EPT. 2857 */ 2858 ept_sync_global(); 2859 } 2860 2861 static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd) 2862 { 2863 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 2864 2865 guard(mutex)(&kvm->slots_lock); 2866 2867 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) 2868 return -EINVAL; 2869 /* 2870 * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue 2871 * TDH.MEM.PAGE.ADD(). 2872 */ 2873 if (atomic64_read(&kvm_tdx->nr_premapped)) 2874 return -EINVAL; 2875 2876 cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td); 2877 if (tdx_operand_busy(cmd->hw_error)) 2878 return -EBUSY; 2879 if (KVM_BUG_ON(cmd->hw_error, kvm)) { 2880 pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error); 2881 return -EIO; 2882 } 2883 2884 kvm_tdx->state = TD_STATE_RUNNABLE; 2885 /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */ 2886 smp_wmb(); 2887 kvm->arch.pre_fault_allowed = true; 2888 return 0; 2889 } 2890 2891 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) 2892 { 2893 struct kvm_tdx_cmd tdx_cmd; 2894 int r; 2895 2896 if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd))) 2897 return -EFAULT; 2898 2899 /* 2900 * Userspace should never set hw_error. It is used to fill 2901 * hardware-defined error by the kernel. 2902 */ 2903 if (tdx_cmd.hw_error) 2904 return -EINVAL; 2905 2906 mutex_lock(&kvm->lock); 2907 2908 switch (tdx_cmd.id) { 2909 case KVM_TDX_CAPABILITIES: 2910 r = tdx_get_capabilities(&tdx_cmd); 2911 break; 2912 case KVM_TDX_INIT_VM: 2913 r = tdx_td_init(kvm, &tdx_cmd); 2914 break; 2915 case KVM_TDX_FINALIZE_VM: 2916 r = tdx_td_finalize(kvm, &tdx_cmd); 2917 break; 2918 default: 2919 r = -EINVAL; 2920 goto out; 2921 } 2922 2923 if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd))) 2924 r = -EFAULT; 2925 2926 out: 2927 mutex_unlock(&kvm->lock); 2928 return r; 2929 } 2930 2931 /* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. 
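 * The value is forwarded verbatim as the RCX operand of TDH.VP.INIT via
 * tdh_vp_init() below.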
*/
2932 static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
2933 {
2934 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2935 struct vcpu_tdx *tdx = to_tdx(vcpu);
2936 struct page *page;
2937 int ret, i;
2938 u64 err;
2939
2940 page = alloc_page(GFP_KERNEL);
2941 if (!page)
2942 return -ENOMEM;
2943 tdx->vp.tdvpr_page = page;
2944
2945 tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
2946 GFP_KERNEL);
2947 if (!tdx->vp.tdcx_pages) {
2948 ret = -ENOMEM;
2949 goto free_tdvpr;
2950 }
2951
2952 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2953 page = alloc_page(GFP_KERNEL);
2954 if (!page) {
2955 ret = -ENOMEM;
2956 goto free_tdcx;
2957 }
2958 tdx->vp.tdcx_pages[i] = page;
2959 }
2960
2961 err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
2962 if (KVM_BUG_ON(err, vcpu->kvm)) {
2963 ret = -EIO;
2964 pr_tdx_error(TDH_VP_CREATE, err);
2965 goto free_tdcx;
2966 }
2967
2968 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2969 err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
2970 if (KVM_BUG_ON(err, vcpu->kvm)) {
2971 pr_tdx_error(TDH_VP_ADDCX, err);
2972 /*
2973 * Pages already added are reclaimed by the vcpu_free
2974 * method, but the rest are freed here.
2975 */
2976 for (; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2977 __free_page(tdx->vp.tdcx_pages[i]);
2978 tdx->vp.tdcx_pages[i] = NULL;
2979 }
2980 return -EIO;
2981 }
2982 }
2983
2984 err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
2985 if (KVM_BUG_ON(err, vcpu->kvm)) {
2986 pr_tdx_error(TDH_VP_INIT, err);
2987 return -EIO;
2988 }
2989
2990 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2991
2992 return 0;
2993
2994 free_tdcx:
2995 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2996 if (tdx->vp.tdcx_pages[i])
2997 __free_page(tdx->vp.tdcx_pages[i]);
2998 tdx->vp.tdcx_pages[i] = NULL;
2999 }
3000 kfree(tdx->vp.tdcx_pages);
3001 tdx->vp.tdcx_pages = NULL;
3002
3003 free_tdvpr:
3004 if (tdx->vp.tdvpr_page)
3005 __free_page(tdx->vp.tdvpr_page);
3006 tdx->vp.tdvpr_page = 0;
3007
3008 return ret;
3009 }
3010
3011 /* Sometimes reads multiple subleafs. Return how many entries were written. */
3012 static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index,
3013 struct kvm_cpuid_entry2 *output_e)
3014 {
3015 int sub_leaf = 0;
3016 int ret;
3017
3018 /* First try without a subleaf */
3019 ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e);
3020
3021 /* If success, or invalid leaf, just give up */
3022 if (ret != -EIO)
3023 return ret;
3024
3025 /*
3026 * If the attempt without a subleaf failed, try reading subleafs until
3027 * failure. The TDX module only supports 6 bits of subleaf index.
3028 */
3029 while (1) {
3030 /* Keep reading subleafs until there is a failure.
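 * A failure on the very first subleaf (sub_leaf == 0) is reported as a
 * failure for the whole leaf.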
*/ 3031 if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e)) 3032 return !sub_leaf; 3033 3034 sub_leaf++; 3035 output_e++; 3036 } 3037 3038 return 0; 3039 } 3040 3041 static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) 3042 { 3043 struct kvm_cpuid2 __user *output, *td_cpuid; 3044 int r = 0, i = 0, leaf; 3045 u32 level; 3046 3047 output = u64_to_user_ptr(cmd->data); 3048 td_cpuid = kzalloc(sizeof(*td_cpuid) + 3049 sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES, 3050 GFP_KERNEL); 3051 if (!td_cpuid) 3052 return -ENOMEM; 3053 3054 if (copy_from_user(td_cpuid, output, sizeof(*output))) { 3055 r = -EFAULT; 3056 goto out; 3057 } 3058 3059 /* Read max CPUID for normal range */ 3060 if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) { 3061 r = -EIO; 3062 goto out; 3063 } 3064 level = td_cpuid->entries[0].eax; 3065 3066 for (leaf = 1; leaf <= level; leaf++) 3067 tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]); 3068 3069 /* Read max CPUID for extended range */ 3070 if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) { 3071 r = -EIO; 3072 goto out; 3073 } 3074 level = td_cpuid->entries[i - 1].eax; 3075 3076 for (leaf = 0x80000001; leaf <= level; leaf++) 3077 tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]); 3078 3079 if (td_cpuid->nent < i) 3080 r = -E2BIG; 3081 td_cpuid->nent = i; 3082 3083 if (copy_to_user(output, td_cpuid, sizeof(*output))) { 3084 r = -EFAULT; 3085 goto out; 3086 } 3087 3088 if (r == -E2BIG) 3089 goto out; 3090 3091 if (copy_to_user(output->entries, td_cpuid->entries, 3092 td_cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 3093 r = -EFAULT; 3094 3095 out: 3096 kfree(td_cpuid); 3097 3098 return r; 3099 } 3100 3101 static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) 3102 { 3103 u64 apic_base; 3104 struct vcpu_tdx *tdx = to_tdx(vcpu); 3105 int ret; 3106 3107 if (cmd->flags) 3108 return -EINVAL; 3109 3110 if (tdx->state != VCPU_TD_STATE_UNINITIALIZED) 3111 return -EINVAL; 3112 3113 /* 3114 * TDX requires X2APIC, userspace is responsible for configuring guest 3115 * CPUID accordingly. 3116 */ 3117 apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC | 3118 (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0); 3119 if (kvm_apic_set_base(vcpu, apic_base, true)) 3120 return -EINVAL; 3121 3122 ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data); 3123 if (ret) 3124 return ret; 3125 3126 td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR); 3127 td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc)); 3128 td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR); 3129 3130 tdx->state = VCPU_TD_STATE_INITIALIZED; 3131 3132 return 0; 3133 } 3134 3135 void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 3136 { 3137 /* 3138 * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all 3139 * INIT events. 3140 * 3141 * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as 3142 * userspace needs to define the vCPU model before KVM can initialize 3143 * vCPU state, e.g. to enable x2APIC. 
3144 */ 3145 WARN_ON_ONCE(init_event); 3146 } 3147 3148 struct tdx_gmem_post_populate_arg { 3149 struct kvm_vcpu *vcpu; 3150 __u32 flags; 3151 }; 3152 3153 static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, 3154 void __user *src, int order, void *_arg) 3155 { 3156 u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS; 3157 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 3158 struct tdx_gmem_post_populate_arg *arg = _arg; 3159 struct kvm_vcpu *vcpu = arg->vcpu; 3160 gpa_t gpa = gfn_to_gpa(gfn); 3161 u8 level = PG_LEVEL_4K; 3162 struct page *src_page; 3163 int ret, i; 3164 u64 err, entry, level_state; 3165 3166 /* 3167 * Get the source page if it has been faulted in. Return failure if the 3168 * source page has been swapped out or unmapped in primary memory. 3169 */ 3170 ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page); 3171 if (ret < 0) 3172 return ret; 3173 if (ret != 1) 3174 return -ENOMEM; 3175 3176 ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level); 3177 if (ret < 0) 3178 goto out; 3179 3180 /* 3181 * The private mem cannot be zapped after kvm_tdp_map_page() 3182 * because all paths are covered by slots_lock and the 3183 * filemap invalidate lock. Check that they are indeed enough. 3184 */ 3185 if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) { 3186 scoped_guard(read_lock, &kvm->mmu_lock) { 3187 if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) { 3188 ret = -EIO; 3189 goto out; 3190 } 3191 } 3192 } 3193 3194 ret = 0; 3195 err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn), 3196 src_page, &entry, &level_state); 3197 if (err) { 3198 ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO; 3199 goto out; 3200 } 3201 3202 if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) 3203 atomic64_dec(&kvm_tdx->nr_premapped); 3204 3205 if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) { 3206 for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) { 3207 err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, 3208 &level_state); 3209 if (err) { 3210 ret = -EIO; 3211 break; 3212 } 3213 } 3214 } 3215 3216 out: 3217 put_page(src_page); 3218 return ret; 3219 } 3220 3221 static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) 3222 { 3223 struct vcpu_tdx *tdx = to_tdx(vcpu); 3224 struct kvm *kvm = vcpu->kvm; 3225 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 3226 struct kvm_tdx_init_mem_region region; 3227 struct tdx_gmem_post_populate_arg arg; 3228 long gmem_ret; 3229 int ret; 3230 3231 if (tdx->state != VCPU_TD_STATE_INITIALIZED) 3232 return -EINVAL; 3233 3234 guard(mutex)(&kvm->slots_lock); 3235 3236 /* Once TD is finalized, the initial guest memory is fixed. 
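 * KVM_TDX_INIT_MEM_REGION is therefore rejected below once the TD has reached
 * TD_STATE_RUNNABLE.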
*/ 3237 if (kvm_tdx->state == TD_STATE_RUNNABLE) 3238 return -EINVAL; 3239 3240 if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION) 3241 return -EINVAL; 3242 3243 if (copy_from_user(®ion, u64_to_user_ptr(cmd->data), sizeof(region))) 3244 return -EFAULT; 3245 3246 if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) || 3247 !region.nr_pages || 3248 region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa || 3249 !vt_is_tdx_private_gpa(kvm, region.gpa) || 3250 !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1)) 3251 return -EINVAL; 3252 3253 kvm_mmu_reload(vcpu); 3254 ret = 0; 3255 while (region.nr_pages) { 3256 if (signal_pending(current)) { 3257 ret = -EINTR; 3258 break; 3259 } 3260 3261 arg = (struct tdx_gmem_post_populate_arg) { 3262 .vcpu = vcpu, 3263 .flags = cmd->flags, 3264 }; 3265 gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa), 3266 u64_to_user_ptr(region.source_addr), 3267 1, tdx_gmem_post_populate, &arg); 3268 if (gmem_ret < 0) { 3269 ret = gmem_ret; 3270 break; 3271 } 3272 3273 if (gmem_ret != 1) { 3274 ret = -EIO; 3275 break; 3276 } 3277 3278 region.source_addr += PAGE_SIZE; 3279 region.gpa += PAGE_SIZE; 3280 region.nr_pages--; 3281 3282 cond_resched(); 3283 } 3284 3285 if (copy_to_user(u64_to_user_ptr(cmd->data), ®ion, sizeof(region))) 3286 ret = -EFAULT; 3287 return ret; 3288 } 3289 3290 int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) 3291 { 3292 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 3293 struct kvm_tdx_cmd cmd; 3294 int ret; 3295 3296 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) 3297 return -EINVAL; 3298 3299 if (copy_from_user(&cmd, argp, sizeof(cmd))) 3300 return -EFAULT; 3301 3302 if (cmd.hw_error) 3303 return -EINVAL; 3304 3305 switch (cmd.id) { 3306 case KVM_TDX_INIT_VCPU: 3307 ret = tdx_vcpu_init(vcpu, &cmd); 3308 break; 3309 case KVM_TDX_INIT_MEM_REGION: 3310 ret = tdx_vcpu_init_mem_region(vcpu, &cmd); 3311 break; 3312 case KVM_TDX_GET_CPUID: 3313 ret = tdx_vcpu_get_cpuid(vcpu, &cmd); 3314 break; 3315 default: 3316 ret = -EINVAL; 3317 break; 3318 } 3319 3320 return ret; 3321 } 3322 3323 int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) 3324 { 3325 return PG_LEVEL_4K; 3326 } 3327 3328 static int tdx_online_cpu(unsigned int cpu) 3329 { 3330 unsigned long flags; 3331 int r; 3332 3333 /* Sanity check CPU is already in post-VMXON */ 3334 WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE)); 3335 3336 local_irq_save(flags); 3337 r = tdx_cpu_enable(); 3338 local_irq_restore(flags); 3339 3340 return r; 3341 } 3342 3343 static int tdx_offline_cpu(unsigned int cpu) 3344 { 3345 int i; 3346 3347 /* No TD is running. Allow any cpu to be offline. */ 3348 if (!atomic_read(&nr_configured_hkid)) 3349 return 0; 3350 3351 /* 3352 * In order to reclaim TDX HKID, (i.e. when deleting guest TD), need to 3353 * call TDH.PHYMEM.PAGE.WBINVD on all packages to program all memory 3354 * controller with pconfig. If we have active TDX HKID, refuse to 3355 * offline the last online cpu. 3356 */ 3357 for_each_online_cpu(i) { 3358 /* 3359 * Found another online cpu on the same package. 3360 * Allow to offline. 3361 */ 3362 if (i != cpu && topology_physical_package_id(i) == 3363 topology_physical_package_id(cpu)) 3364 return 0; 3365 } 3366 3367 /* 3368 * This is the last cpu of this package. Don't offline it. 3369 * 3370 * Because it's hard for human operator to understand the 3371 * reason, warn it. 
3372 */ 3373 #define MSG_ALLPKG_ONLINE \ 3374 "TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n" 3375 pr_warn_ratelimited(MSG_ALLPKG_ONLINE); 3376 return -EBUSY; 3377 } 3378 3379 static void __do_tdx_cleanup(void) 3380 { 3381 /* 3382 * Once TDX module is initialized, it cannot be disabled and 3383 * re-initialized again w/o runtime update (which isn't 3384 * supported by kernel). Only need to remove the cpuhp here. 3385 * The TDX host core code tracks TDX status and can handle 3386 * 'multiple enabling' scenario. 3387 */ 3388 WARN_ON_ONCE(!tdx_cpuhp_state); 3389 cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state); 3390 tdx_cpuhp_state = 0; 3391 } 3392 3393 static void __tdx_cleanup(void) 3394 { 3395 cpus_read_lock(); 3396 __do_tdx_cleanup(); 3397 cpus_read_unlock(); 3398 } 3399 3400 static int __init __do_tdx_bringup(void) 3401 { 3402 int r; 3403 3404 /* 3405 * TDX-specific cpuhp callback to call tdx_cpu_enable() on all 3406 * online CPUs before calling tdx_enable(), and on any new 3407 * going-online CPU to make sure it is ready for TDX guest. 3408 */ 3409 r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN, 3410 "kvm/cpu/tdx:online", 3411 tdx_online_cpu, tdx_offline_cpu); 3412 if (r < 0) 3413 return r; 3414 3415 tdx_cpuhp_state = r; 3416 3417 r = tdx_enable(); 3418 if (r) 3419 __do_tdx_cleanup(); 3420 3421 return r; 3422 } 3423 3424 static int __init __tdx_bringup(void) 3425 { 3426 const struct tdx_sys_info_td_conf *td_conf; 3427 int r, i; 3428 3429 for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) { 3430 /* 3431 * Check if MSRs (tdx_uret_msrs) can be saved/restored 3432 * before returning to user space. 3433 * 3434 * this_cpu_ptr(user_return_msrs)->registered isn't checked 3435 * because the registration is done at vcpu runtime by 3436 * tdx_user_return_msr_update_cache(). 3437 */ 3438 tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr); 3439 if (tdx_uret_msrs[i].slot == -1) { 3440 /* If any MSR isn't supported, it is a KVM bug */ 3441 pr_err("MSR %x isn't included by kvm_find_user_return_msr\n", 3442 tdx_uret_msrs[i].msr); 3443 return -EIO; 3444 } 3445 } 3446 3447 /* 3448 * Enabling TDX requires enabling hardware virtualization first, 3449 * as making SEAMCALLs requires CPU being in post-VMXON state. 3450 */ 3451 r = kvm_enable_virtualization(); 3452 if (r) 3453 return r; 3454 3455 cpus_read_lock(); 3456 r = __do_tdx_bringup(); 3457 cpus_read_unlock(); 3458 3459 if (r) 3460 goto tdx_bringup_err; 3461 3462 /* Get TDX global information for later use */ 3463 tdx_sysinfo = tdx_get_sysinfo(); 3464 if (WARN_ON_ONCE(!tdx_sysinfo)) { 3465 r = -EINVAL; 3466 goto get_sysinfo_err; 3467 } 3468 3469 /* Check TDX module and KVM capabilities */ 3470 if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) || 3471 !tdx_get_supported_xfam(&tdx_sysinfo->td_conf)) 3472 goto get_sysinfo_err; 3473 3474 if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)) 3475 goto get_sysinfo_err; 3476 3477 /* 3478 * TDX has its own limit of maximum vCPUs it can support for all 3479 * TDX guests in addition to KVM_MAX_VCPUS. Userspace needs to 3480 * query TDX guest's maximum vCPUs by checking KVM_CAP_MAX_VCPU 3481 * extension on per-VM basis. 3482 * 3483 * TDX module reports such limit via the MAX_VCPU_PER_TD global 3484 * metadata. Different modules may report different values. 3485 * Some old module may also not support this metadata (in which 3486 * case this limit is U16_MAX). 
3487 * 3488 * In practice, the reported value reflects the maximum logical 3489 * CPUs that ALL the platforms that the module supports can 3490 * possibly have. 3491 * 3492 * Simply forwarding the MAX_VCPU_PER_TD to userspace could 3493 * result in an unpredictable ABI. KVM instead always advertise 3494 * the number of logical CPUs the platform has as the maximum 3495 * vCPUs for TDX guests. 3496 * 3497 * Make sure MAX_VCPU_PER_TD reported by TDX module is not 3498 * smaller than the number of logical CPUs, otherwise KVM will 3499 * report an unsupported value to userspace. 3500 * 3501 * Note, a platform with TDX enabled in the BIOS cannot support 3502 * physical CPU hotplug, and TDX requires the BIOS has marked 3503 * all logical CPUs in MADT table as enabled. Just use 3504 * num_present_cpus() for the number of logical CPUs. 3505 */ 3506 td_conf = &tdx_sysinfo->td_conf; 3507 if (td_conf->max_vcpus_per_td < num_present_cpus()) { 3508 pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n", 3509 td_conf->max_vcpus_per_td, num_present_cpus()); 3510 r = -EINVAL; 3511 goto get_sysinfo_err; 3512 } 3513 3514 if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) { 3515 r = -EINVAL; 3516 goto get_sysinfo_err; 3517 } 3518 3519 /* 3520 * Leave hardware virtualization enabled after TDX is enabled 3521 * successfully. TDX CPU hotplug depends on this. 3522 */ 3523 return 0; 3524 3525 get_sysinfo_err: 3526 __tdx_cleanup(); 3527 tdx_bringup_err: 3528 kvm_disable_virtualization(); 3529 return r; 3530 } 3531 3532 void tdx_cleanup(void) 3533 { 3534 if (enable_tdx) { 3535 misc_cg_set_capacity(MISC_CG_RES_TDX, 0); 3536 __tdx_cleanup(); 3537 kvm_disable_virtualization(); 3538 } 3539 } 3540 3541 int __init tdx_bringup(void) 3542 { 3543 int r, i; 3544 3545 /* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */ 3546 for_each_possible_cpu(i) 3547 INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i)); 3548 3549 if (!enable_tdx) 3550 return 0; 3551 3552 if (!enable_ept) { 3553 pr_err("EPT is required for TDX\n"); 3554 goto success_disable_tdx; 3555 } 3556 3557 if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) { 3558 pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n"); 3559 goto success_disable_tdx; 3560 } 3561 3562 if (!enable_apicv) { 3563 pr_err("APICv is required for TDX\n"); 3564 goto success_disable_tdx; 3565 } 3566 3567 if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) { 3568 pr_err("tdx: OSXSAVE is required for TDX\n"); 3569 goto success_disable_tdx; 3570 } 3571 3572 if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) { 3573 pr_err("tdx: MOVDIR64B is required for TDX\n"); 3574 goto success_disable_tdx; 3575 } 3576 3577 if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) { 3578 pr_err("Self-snoop is required for TDX\n"); 3579 goto success_disable_tdx; 3580 } 3581 3582 if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) { 3583 pr_err("tdx: no TDX private KeyIDs available\n"); 3584 goto success_disable_tdx; 3585 } 3586 3587 if (!enable_virt_at_load) { 3588 pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n"); 3589 goto success_disable_tdx; 3590 } 3591 3592 /* 3593 * Ideally KVM should probe whether TDX module has been loaded 3594 * first and then try to bring it up. But TDX needs to use SEAMCALL 3595 * to probe whether the module is loaded (there is no CPUID or MSR 3596 * for that), and making SEAMCALL requires enabling virtualization 3597 * first, just like the rest steps of bringing up TDX module. 
3598 * 3599 * So, for simplicity do everything in __tdx_bringup(); the first 3600 * SEAMCALL will return -ENODEV when the module is not loaded. The 3601 * only complication is having to make sure that initialization 3602 * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other 3603 * cases. 3604 */ 3605 r = __tdx_bringup(); 3606 if (r) { 3607 /* 3608 * Disable TDX only but don't fail to load module if 3609 * the TDX module could not be loaded. No need to print 3610 * message saying "module is not loaded" because it was 3611 * printed when the first SEAMCALL failed. 3612 */ 3613 if (r == -ENODEV) 3614 goto success_disable_tdx; 3615 3616 enable_tdx = 0; 3617 } 3618 3619 return r; 3620 3621 success_disable_tdx: 3622 enable_tdx = 0; 3623 return 0; 3624 } 3625