1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/cleanup.h> 3 #include <linux/cpu.h> 4 #include <asm/cpufeature.h> 5 #include <asm/fpu/xcr.h> 6 #include <linux/misc_cgroup.h> 7 #include <linux/mmu_context.h> 8 #include <asm/tdx.h> 9 #include "capabilities.h" 10 #include "mmu.h" 11 #include "x86_ops.h" 12 #include "lapic.h" 13 #include "tdx.h" 14 #include "vmx.h" 15 #include "mmu/spte.h" 16 #include "common.h" 17 #include "posted_intr.h" 18 #include "irq.h" 19 #include <trace/events/kvm.h> 20 #include "trace.h" 21 22 #pragma GCC poison to_vmx 23 24 #undef pr_fmt 25 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 26 27 #define pr_tdx_error(__fn, __err) \ 28 pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err) 29 30 #define __pr_tdx_error_N(__fn_str, __err, __fmt, ...) \ 31 pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__) 32 33 #define pr_tdx_error_1(__fn, __err, __rcx) \ 34 __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx) 35 36 #define pr_tdx_error_2(__fn, __err, __rcx, __rdx) \ 37 __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx) 38 39 #define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \ 40 __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8) 41 42 bool enable_tdx __ro_after_init; 43 module_param_named(tdx, enable_tdx, bool, 0444); 44 45 #define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51)) 46 #define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47)) 47 48 static enum cpuhp_state tdx_cpuhp_state; 49 50 static const struct tdx_sys_info *tdx_sysinfo; 51 52 void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err) 53 { 54 KVM_BUG_ON(1, tdx->vcpu.kvm); 55 pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err); 56 } 57 58 void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field, 59 u64 val, u64 err) 60 { 61 KVM_BUG_ON(1, tdx->vcpu.kvm); 62 pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err); 63 } 64 65 #define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE) 66 67 static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm) 68 { 69 return container_of(kvm, struct kvm_tdx, kvm); 70 } 71 72 static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu) 73 { 74 return container_of(vcpu, struct vcpu_tdx, vcpu); 75 } 76 77 static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf) 78 { 79 u64 val = KVM_SUPPORTED_TD_ATTRS; 80 81 if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1) 82 return 0; 83 84 val &= td_conf->attributes_fixed0; 85 86 return val; 87 } 88 89 static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf) 90 { 91 u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss; 92 93 if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1) 94 return 0; 95 96 val &= td_conf->xfam_fixed0; 97 98 return val; 99 } 100 101 static int tdx_get_guest_phys_addr_bits(const u32 eax) 102 { 103 return (eax & GENMASK(23, 16)) >> 16; 104 } 105 106 static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits) 107 { 108 return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16; 109 } 110 111 #define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM)) 112 113 static bool has_tsx(const struct kvm_cpuid_entry2 *entry) 114 { 115 return entry->function == 7 && entry->index == 0 && 116 (entry->ebx & TDX_FEATURE_TSX); 117 } 118 119 static void clear_tsx(struct kvm_cpuid_entry2 *entry) 120 { 121 entry->ebx &= 
~TDX_FEATURE_TSX; 122 } 123 124 static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry) 125 { 126 return entry->function == 7 && entry->index == 0 && 127 (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG)); 128 } 129 130 static void clear_waitpkg(struct kvm_cpuid_entry2 *entry) 131 { 132 entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG); 133 } 134 135 static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry) 136 { 137 if (has_tsx(entry)) 138 clear_tsx(entry); 139 140 if (has_waitpkg(entry)) 141 clear_waitpkg(entry); 142 } 143 144 static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry) 145 { 146 return has_tsx(entry) || has_waitpkg(entry); 147 } 148 149 #define KVM_TDX_CPUID_NO_SUBLEAF ((__u32)-1) 150 151 static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx) 152 { 153 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; 154 155 entry->function = (u32)td_conf->cpuid_config_leaves[idx]; 156 entry->index = td_conf->cpuid_config_leaves[idx] >> 32; 157 entry->eax = (u32)td_conf->cpuid_config_values[idx][0]; 158 entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32; 159 entry->ecx = (u32)td_conf->cpuid_config_values[idx][1]; 160 entry->edx = td_conf->cpuid_config_values[idx][1] >> 32; 161 162 if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF) 163 entry->index = 0; 164 165 /* 166 * The TDX module doesn't allow configuring the guest phys addr bits 167 * (EAX[23:16]). However, KVM uses it as an interface to the userspace 168 * to configure the GPAW. Report these bits as configurable. 169 */ 170 if (entry->function == 0x80000008) 171 entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff); 172 173 tdx_clear_unsupported_cpuid(entry); 174 } 175 176 #define TDVMCALLINFO_GET_QUOTE BIT(0) 177 #define TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT BIT(1) 178 179 static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf, 180 struct kvm_tdx_capabilities *caps) 181 { 182 int i; 183 184 caps->supported_attrs = tdx_get_supported_attrs(td_conf); 185 if (!caps->supported_attrs) 186 return -EIO; 187 188 caps->supported_xfam = tdx_get_supported_xfam(td_conf); 189 if (!caps->supported_xfam) 190 return -EIO; 191 192 caps->cpuid.nent = td_conf->num_cpuid_config; 193 194 caps->user_tdvmcallinfo_1_r11 = 195 TDVMCALLINFO_GET_QUOTE | 196 TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT; 197 198 for (i = 0; i < td_conf->num_cpuid_config; i++) 199 td_init_cpuid_entry2(&caps->cpuid.entries[i], i); 200 201 return 0; 202 } 203 204 /* 205 * Some SEAMCALLs acquire the TDX module globally, and can fail with 206 * TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs. 207 */ 208 static DEFINE_MUTEX(tdx_lock); 209 210 static atomic_t nr_configured_hkid; 211 212 static bool tdx_operand_busy(u64 err) 213 { 214 return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY; 215 } 216 217 218 /* 219 * A per-CPU list of TD vCPUs associated with a given CPU. 220 * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU 221 * list. 222 * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of 223 * the old CPU during the IPI callback running on the old CPU, and then added 224 * to the per-CPU list of the new CPU. 225 * - When a TD is tearing down, all vCPUs are disassociated from their current 226 * running CPUs and removed from the per-CPU list during the IPI callback 227 * running on those CPUs. 
228 * - When a CPU is brought down, traverse the per-CPU list to disassociate all 229 * associated TD vCPUs and remove them from the per-CPU list. 230 */ 231 static DEFINE_PER_CPU(struct list_head, associated_tdvcpus); 232 233 static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu) 234 { 235 return to_tdx(vcpu)->vp_enter_args.r10; 236 } 237 238 static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu) 239 { 240 return to_tdx(vcpu)->vp_enter_args.r11; 241 } 242 243 static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu, 244 long val) 245 { 246 to_tdx(vcpu)->vp_enter_args.r10 = val; 247 } 248 249 static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu, 250 unsigned long val) 251 { 252 to_tdx(vcpu)->vp_enter_args.r11 = val; 253 } 254 255 static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx) 256 { 257 tdx_guest_keyid_free(kvm_tdx->hkid); 258 kvm_tdx->hkid = -1; 259 atomic_dec(&nr_configured_hkid); 260 misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); 261 put_misc_cg(kvm_tdx->misc_cg); 262 kvm_tdx->misc_cg = NULL; 263 } 264 265 static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx) 266 { 267 return kvm_tdx->hkid > 0; 268 } 269 270 static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu) 271 { 272 lockdep_assert_irqs_disabled(); 273 274 list_del(&to_tdx(vcpu)->cpu_list); 275 276 /* 277 * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1, 278 * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU 279 * to its list before it's deleted from this CPU's list. 280 */ 281 smp_wmb(); 282 283 vcpu->cpu = -1; 284 } 285 286 static void tdx_clear_page(struct page *page) 287 { 288 const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0)); 289 void *dest = page_to_virt(page); 290 unsigned long i; 291 292 /* 293 * The page could have been poisoned. MOVDIR64B also clears 294 * the poison bit so the kernel can safely use the page again. 295 */ 296 for (i = 0; i < PAGE_SIZE; i += 64) 297 movdir64b(dest + i, zero_page); 298 /* 299 * MOVDIR64B store uses WC buffer. Prevent following memory reads 300 * from seeing potentially poisoned cache. 301 */ 302 __mb(); 303 } 304 305 static void tdx_no_vcpus_enter_start(struct kvm *kvm) 306 { 307 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 308 309 lockdep_assert_held_write(&kvm->mmu_lock); 310 311 WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true); 312 313 kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); 314 } 315 316 static void tdx_no_vcpus_enter_stop(struct kvm *kvm) 317 { 318 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 319 320 lockdep_assert_held_write(&kvm->mmu_lock); 321 322 WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false); 323 } 324 325 /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */ 326 static int __tdx_reclaim_page(struct page *page) 327 { 328 u64 err, rcx, rdx, r8; 329 330 err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8); 331 332 /* 333 * No need to check for TDX_OPERAND_BUSY; all TD pages are freed 334 * before the HKID is released and control pages have also been 335 * released at this point, so there is no possibility of contention. 
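	 * Any non-zero status is therefore unexpected and is reported via the
	 * WARN_ON_ONCE() and pr_tdx_error_3() below.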
336 */ 337 if (WARN_ON_ONCE(err)) { 338 pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8); 339 return -EIO; 340 } 341 return 0; 342 } 343 344 static int tdx_reclaim_page(struct page *page) 345 { 346 int r; 347 348 r = __tdx_reclaim_page(page); 349 if (!r) 350 tdx_clear_page(page); 351 return r; 352 } 353 354 355 /* 356 * Reclaim the TD control page(s) which are crypto-protected by TDX guest's 357 * private KeyID. Assume the cache associated with the TDX private KeyID has 358 * been flushed. 359 */ 360 static void tdx_reclaim_control_page(struct page *ctrl_page) 361 { 362 /* 363 * Leak the page if the kernel failed to reclaim the page. 364 * The kernel cannot use it safely anymore. 365 */ 366 if (tdx_reclaim_page(ctrl_page)) 367 return; 368 369 __free_page(ctrl_page); 370 } 371 372 struct tdx_flush_vp_arg { 373 struct kvm_vcpu *vcpu; 374 u64 err; 375 }; 376 377 static void tdx_flush_vp(void *_arg) 378 { 379 struct tdx_flush_vp_arg *arg = _arg; 380 struct kvm_vcpu *vcpu = arg->vcpu; 381 u64 err; 382 383 arg->err = 0; 384 lockdep_assert_irqs_disabled(); 385 386 /* Task migration can race with CPU offlining. */ 387 if (unlikely(vcpu->cpu != raw_smp_processor_id())) 388 return; 389 390 /* 391 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The 392 * list tracking still needs to be updated so that it's correct if/when 393 * the vCPU does get initialized. 394 */ 395 if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) { 396 /* 397 * No need to retry. TDX Resources needed for TDH.VP.FLUSH are: 398 * TDVPR as exclusive, TDR as shared, and TDCS as shared. This 399 * vp flush function is called when destructing vCPU/TD or vCPU 400 * migration. No other thread uses TDVPR in those cases. 401 */ 402 err = tdh_vp_flush(&to_tdx(vcpu)->vp); 403 if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) { 404 /* 405 * This function is called in IPI context. Do not use 406 * printk to avoid console semaphore. 407 * The caller prints out the error message, instead. 408 */ 409 if (err) 410 arg->err = err; 411 } 412 } 413 414 tdx_disassociate_vp(vcpu); 415 } 416 417 static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu) 418 { 419 struct tdx_flush_vp_arg arg = { 420 .vcpu = vcpu, 421 }; 422 int cpu = vcpu->cpu; 423 424 if (unlikely(cpu == -1)) 425 return; 426 427 smp_call_function_single(cpu, tdx_flush_vp, &arg, 1); 428 if (KVM_BUG_ON(arg.err, vcpu->kvm)) 429 pr_tdx_error(TDH_VP_FLUSH, arg.err); 430 } 431 432 void tdx_disable_virtualization_cpu(void) 433 { 434 int cpu = raw_smp_processor_id(); 435 struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu); 436 struct tdx_flush_vp_arg arg; 437 struct vcpu_tdx *tdx, *tmp; 438 unsigned long flags; 439 440 local_irq_save(flags); 441 /* Safe variant needed as tdx_disassociate_vp() deletes the entry. */ 442 list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) { 443 arg.vcpu = &tdx->vcpu; 444 tdx_flush_vp(&arg); 445 } 446 local_irq_restore(flags); 447 } 448 449 #define TDX_SEAMCALL_RETRIES 10000 450 451 static void smp_func_do_phymem_cache_wb(void *unused) 452 { 453 u64 err = 0; 454 bool resume; 455 int i; 456 457 /* 458 * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private 459 * KeyID on the package or core. The TDX module may not finish the 460 * cache flush but return TDX_INTERRUPTED_RESUMEABLE instead. The 461 * kernel should retry it until it returns success w/o rescheduling. 
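	 * The loop below bounds the retries at TDX_SEAMCALL_RETRIES and treats
	 * TDX_NO_HKID_READY_TO_WBCACHE as success, since that means another
	 * CPU already flushed the cache for this KeyID.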
462 */ 463 for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) { 464 resume = !!err; 465 err = tdh_phymem_cache_wb(resume); 466 switch (err) { 467 case TDX_INTERRUPTED_RESUMABLE: 468 continue; 469 case TDX_NO_HKID_READY_TO_WBCACHE: 470 err = TDX_SUCCESS; /* Already done by other thread */ 471 fallthrough; 472 default: 473 goto out; 474 } 475 } 476 477 out: 478 if (WARN_ON_ONCE(err)) 479 pr_tdx_error(TDH_PHYMEM_CACHE_WB, err); 480 } 481 482 void tdx_mmu_release_hkid(struct kvm *kvm) 483 { 484 bool packages_allocated, targets_allocated; 485 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 486 cpumask_var_t packages, targets; 487 struct kvm_vcpu *vcpu; 488 unsigned long j; 489 int i; 490 u64 err; 491 492 if (!is_hkid_assigned(kvm_tdx)) 493 return; 494 495 packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL); 496 targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL); 497 cpus_read_lock(); 498 499 kvm_for_each_vcpu(j, vcpu, kvm) 500 tdx_flush_vp_on_cpu(vcpu); 501 502 /* 503 * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock 504 * and can fail with TDX_OPERAND_BUSY when it fails to get the lock. 505 * Multiple TDX guests can be destroyed simultaneously. Take the 506 * mutex to prevent it from getting error. 507 */ 508 mutex_lock(&tdx_lock); 509 510 /* 511 * Releasing HKID is in vm_destroy(). 512 * After the above flushing vps, there should be no more vCPU 513 * associations, as all vCPU fds have been released at this stage. 514 */ 515 err = tdh_mng_vpflushdone(&kvm_tdx->td); 516 if (err == TDX_FLUSHVP_NOT_DONE) 517 goto out; 518 if (KVM_BUG_ON(err, kvm)) { 519 pr_tdx_error(TDH_MNG_VPFLUSHDONE, err); 520 pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n", 521 kvm_tdx->hkid); 522 goto out; 523 } 524 525 for_each_online_cpu(i) { 526 if (packages_allocated && 527 cpumask_test_and_set_cpu(topology_physical_package_id(i), 528 packages)) 529 continue; 530 if (targets_allocated) 531 cpumask_set_cpu(i, targets); 532 } 533 if (targets_allocated) 534 on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true); 535 else 536 on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true); 537 /* 538 * In the case of error in smp_func_do_phymem_cache_wb(), the following 539 * tdh_mng_key_freeid() will fail. 540 */ 541 err = tdh_mng_key_freeid(&kvm_tdx->td); 542 if (KVM_BUG_ON(err, kvm)) { 543 pr_tdx_error(TDH_MNG_KEY_FREEID, err); 544 pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n", 545 kvm_tdx->hkid); 546 } else { 547 tdx_hkid_free(kvm_tdx); 548 } 549 550 out: 551 mutex_unlock(&tdx_lock); 552 cpus_read_unlock(); 553 free_cpumask_var(targets); 554 free_cpumask_var(packages); 555 } 556 557 static void tdx_reclaim_td_control_pages(struct kvm *kvm) 558 { 559 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 560 u64 err; 561 int i; 562 563 /* 564 * tdx_mmu_release_hkid() failed to reclaim HKID. Something went wrong 565 * heavily with TDX module. Give up freeing TD pages. As the function 566 * already warned, don't warn it again. 567 */ 568 if (is_hkid_assigned(kvm_tdx)) 569 return; 570 571 if (kvm_tdx->td.tdcs_pages) { 572 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { 573 if (!kvm_tdx->td.tdcs_pages[i]) 574 continue; 575 576 tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]); 577 } 578 kfree(kvm_tdx->td.tdcs_pages); 579 kvm_tdx->td.tdcs_pages = NULL; 580 } 581 582 if (!kvm_tdx->td.tdr_page) 583 return; 584 585 if (__tdx_reclaim_page(kvm_tdx->td.tdr_page)) 586 return; 587 588 /* 589 * Use a SEAMCALL to ask the TDX module to flush the cache based on the 590 * KeyID. 
TDX module may access TDR while operating on TD (Especially 591 * when it is reclaiming TDCS). 592 */ 593 err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td); 594 if (KVM_BUG_ON(err, kvm)) { 595 pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); 596 return; 597 } 598 tdx_clear_page(kvm_tdx->td.tdr_page); 599 600 __free_page(kvm_tdx->td.tdr_page); 601 kvm_tdx->td.tdr_page = NULL; 602 } 603 604 void tdx_vm_destroy(struct kvm *kvm) 605 { 606 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 607 608 tdx_reclaim_td_control_pages(kvm); 609 610 kvm_tdx->state = TD_STATE_UNINITIALIZED; 611 } 612 613 static int tdx_do_tdh_mng_key_config(void *param) 614 { 615 struct kvm_tdx *kvm_tdx = param; 616 u64 err; 617 618 /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */ 619 err = tdh_mng_key_config(&kvm_tdx->td); 620 621 if (KVM_BUG_ON(err, &kvm_tdx->kvm)) { 622 pr_tdx_error(TDH_MNG_KEY_CONFIG, err); 623 return -EIO; 624 } 625 626 return 0; 627 } 628 629 int tdx_vm_init(struct kvm *kvm) 630 { 631 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 632 633 kvm->arch.has_protected_state = true; 634 kvm->arch.has_private_mem = true; 635 kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT; 636 637 /* 638 * Because guest TD is protected, VMM can't parse the instruction in TD. 639 * Instead, guest uses MMIO hypercall. For unmodified device driver, 640 * #VE needs to be injected for MMIO and #VE handler in TD converts MMIO 641 * instruction into MMIO hypercall. 642 * 643 * SPTE value for MMIO needs to be setup so that #VE is injected into 644 * TD instead of triggering EPT MISCONFIG. 645 * - RWX=0 so that EPT violation is triggered. 646 * - suppress #VE bit is cleared to inject #VE. 647 */ 648 kvm_mmu_set_mmio_spte_value(kvm, 0); 649 650 /* 651 * TDX has its own limit of maximum vCPUs it can support for all 652 * TDX guests in addition to KVM_MAX_VCPUS. TDX module reports 653 * such limit via the MAX_VCPU_PER_TD global metadata. In 654 * practice, it reflects the number of logical CPUs that ALL 655 * platforms that the TDX module supports can possibly have. 656 * 657 * Limit TDX guest's maximum vCPUs to the number of logical CPUs 658 * the platform has. Simply forwarding the MAX_VCPU_PER_TD to 659 * userspace would result in an unpredictable ABI. 660 */ 661 kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus()); 662 663 kvm_tdx->state = TD_STATE_UNINITIALIZED; 664 665 return 0; 666 } 667 668 int tdx_vcpu_create(struct kvm_vcpu *vcpu) 669 { 670 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 671 struct vcpu_tdx *tdx = to_tdx(vcpu); 672 673 if (kvm_tdx->state != TD_STATE_INITIALIZED) 674 return -EIO; 675 676 /* 677 * TDX module mandates APICv, which requires an in-kernel local APIC. 678 * Disallow an in-kernel I/O APIC, because level-triggered interrupts 679 * and thus the I/O APIC as a whole can't be faithfully emulated in KVM. 680 */ 681 if (!irqchip_split(vcpu->kvm)) 682 return -EINVAL; 683 684 fpstate_set_confidential(&vcpu->arch.guest_fpu); 685 vcpu->arch.apic->guest_apic_protected = true; 686 INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list); 687 688 vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX; 689 690 vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH; 691 vcpu->arch.cr0_guest_owned_bits = -1ul; 692 vcpu->arch.cr4_guest_owned_bits = -1ul; 693 694 /* KVM can't change TSC offset/multiplier as TDX module manages them. 
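	 * The TD-wide TSC offset and multiplier are simply mirrored into
	 * vcpu->arch below.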
*/ 695 vcpu->arch.guest_tsc_protected = true; 696 vcpu->arch.tsc_offset = kvm_tdx->tsc_offset; 697 vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset; 698 vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier; 699 vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier; 700 701 vcpu->arch.guest_state_protected = 702 !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG); 703 704 if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE) 705 vcpu->arch.xfd_no_write_intercept = true; 706 707 tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR; 708 __pi_set_sn(&tdx->vt.pi_desc); 709 710 tdx->state = VCPU_TD_STATE_UNINITIALIZED; 711 712 return 0; 713 } 714 715 void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 716 { 717 struct vcpu_tdx *tdx = to_tdx(vcpu); 718 719 vmx_vcpu_pi_load(vcpu, cpu); 720 if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm))) 721 return; 722 723 tdx_flush_vp_on_cpu(vcpu); 724 725 KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm); 726 local_irq_disable(); 727 /* 728 * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure 729 * vcpu->cpu is read before tdx->cpu_list. 730 */ 731 smp_rmb(); 732 733 list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu)); 734 local_irq_enable(); 735 } 736 737 bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu) 738 { 739 /* 740 * KVM can't get the interrupt status of TDX guest and it assumes 741 * interrupt is always allowed unless TDX guest calls TDVMCALL with HLT, 742 * which passes the interrupt blocked flag. 743 */ 744 return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT || 745 !to_tdx(vcpu)->vp_enter_args.r12; 746 } 747 748 bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu) 749 { 750 u64 vcpu_state_details; 751 752 if (pi_has_pending_interrupt(vcpu)) 753 return true; 754 755 /* 756 * Only check RVI pending for HALTED case with IRQ enabled. 757 * For non-HLT cases, KVM doesn't care about STI/SS shadows. And if the 758 * interrupt was pending before TD exit, then it _must_ be blocked, 759 * otherwise the interrupt would have been serviced at the instruction 760 * boundary. 761 */ 762 if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT || 763 to_tdx(vcpu)->vp_enter_args.r12) 764 return false; 765 766 vcpu_state_details = 767 td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH); 768 769 return tdx_vcpu_state_details_intr_pending(vcpu_state_details); 770 } 771 772 /* 773 * Compared to vmx_prepare_switch_to_guest(), there is not much to do 774 * as SEAMCALL/SEAMRET calls take care of most of save and restore. 
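 * Only MSR_KERNEL_GS_BASE and the host DEBUGCTL value are stashed here; they
 * are restored by tdx_prepare_switch_to_host() and tdx_vcpu_run() respectively.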
775 */ 776 void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 777 { 778 struct vcpu_vt *vt = to_vt(vcpu); 779 780 if (vt->guest_state_loaded) 781 return; 782 783 if (likely(is_64bit_mm(current->mm))) 784 vt->msr_host_kernel_gs_base = current->thread.gsbase; 785 else 786 vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); 787 788 vt->host_debugctlmsr = get_debugctlmsr(); 789 790 vt->guest_state_loaded = true; 791 } 792 793 struct tdx_uret_msr { 794 u32 msr; 795 unsigned int slot; 796 u64 defval; 797 }; 798 799 static struct tdx_uret_msr tdx_uret_msrs[] = { 800 {.msr = MSR_SYSCALL_MASK, .defval = 0x20200 }, 801 {.msr = MSR_STAR,}, 802 {.msr = MSR_LSTAR,}, 803 {.msr = MSR_TSC_AUX,}, 804 }; 805 806 static void tdx_user_return_msr_update_cache(void) 807 { 808 int i; 809 810 for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) 811 kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot, 812 tdx_uret_msrs[i].defval); 813 } 814 815 static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu) 816 { 817 struct vcpu_vt *vt = to_vt(vcpu); 818 struct vcpu_tdx *tdx = to_tdx(vcpu); 819 820 if (!vt->guest_state_loaded) 821 return; 822 823 ++vcpu->stat.host_state_reload; 824 wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base); 825 826 if (tdx->guest_entered) { 827 tdx_user_return_msr_update_cache(); 828 tdx->guest_entered = false; 829 } 830 831 vt->guest_state_loaded = false; 832 } 833 834 void tdx_vcpu_put(struct kvm_vcpu *vcpu) 835 { 836 vmx_vcpu_pi_put(vcpu); 837 tdx_prepare_switch_to_host(vcpu); 838 } 839 840 void tdx_vcpu_free(struct kvm_vcpu *vcpu) 841 { 842 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 843 struct vcpu_tdx *tdx = to_tdx(vcpu); 844 int i; 845 846 /* 847 * It is not possible to reclaim pages while hkid is assigned. It might 848 * be assigned if: 849 * 1. the TD VM is being destroyed but freeing hkid failed, in which 850 * case the pages are leaked 851 * 2. 
TD VCPU creation failed and this on the error path, in which case 852 * there is nothing to do anyway 853 */ 854 if (is_hkid_assigned(kvm_tdx)) 855 return; 856 857 if (tdx->vp.tdcx_pages) { 858 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { 859 if (tdx->vp.tdcx_pages[i]) 860 tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]); 861 } 862 kfree(tdx->vp.tdcx_pages); 863 tdx->vp.tdcx_pages = NULL; 864 } 865 if (tdx->vp.tdvpr_page) { 866 tdx_reclaim_control_page(tdx->vp.tdvpr_page); 867 tdx->vp.tdvpr_page = 0; 868 } 869 870 tdx->state = VCPU_TD_STATE_UNINITIALIZED; 871 } 872 873 int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu) 874 { 875 if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED || 876 to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE)) 877 return -EINVAL; 878 879 return 1; 880 } 881 882 static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu) 883 { 884 switch (tdvmcall_leaf(vcpu)) { 885 case EXIT_REASON_CPUID: 886 case EXIT_REASON_HLT: 887 case EXIT_REASON_IO_INSTRUCTION: 888 case EXIT_REASON_MSR_READ: 889 case EXIT_REASON_MSR_WRITE: 890 return tdvmcall_leaf(vcpu); 891 case EXIT_REASON_EPT_VIOLATION: 892 return EXIT_REASON_EPT_MISCONFIG; 893 default: 894 break; 895 } 896 897 return EXIT_REASON_TDCALL; 898 } 899 900 static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu) 901 { 902 struct vcpu_tdx *tdx = to_tdx(vcpu); 903 u32 exit_reason; 904 905 switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) { 906 case TDX_SUCCESS: 907 case TDX_NON_RECOVERABLE_VCPU: 908 case TDX_NON_RECOVERABLE_TD: 909 case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE: 910 case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE: 911 break; 912 default: 913 return -1u; 914 } 915 916 exit_reason = tdx->vp_enter_ret; 917 918 switch (exit_reason) { 919 case EXIT_REASON_TDCALL: 920 if (tdvmcall_exit_type(vcpu)) 921 return EXIT_REASON_VMCALL; 922 923 return tdcall_to_vmx_exit_reason(vcpu); 924 case EXIT_REASON_EPT_MISCONFIG: 925 /* 926 * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in 927 * non-instrumentable code with interrupts disabled. 928 */ 929 return -1u; 930 default: 931 break; 932 } 933 934 return exit_reason; 935 } 936 937 static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu) 938 { 939 struct vcpu_tdx *tdx = to_tdx(vcpu); 940 struct vcpu_vt *vt = to_vt(vcpu); 941 942 guest_state_enter_irqoff(); 943 944 tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args); 945 946 vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu); 947 948 vt->exit_qualification = tdx->vp_enter_args.rcx; 949 tdx->ext_exit_qualification = tdx->vp_enter_args.rdx; 950 tdx->exit_gpa = tdx->vp_enter_args.r8; 951 vt->exit_intr_info = tdx->vp_enter_args.r9; 952 953 vmx_handle_nmi(vcpu); 954 955 guest_state_exit_irqoff(); 956 } 957 958 static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu) 959 { 960 return vmx_get_exit_reason(vcpu).failed_vmentry && 961 vmx_get_exit_reason(vcpu).full != -1u; 962 } 963 964 static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) 965 { 966 u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret; 967 968 /* 969 * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation 970 * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER. 971 * 972 * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both 973 * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target 974 * vCPUs leaving fastpath so that interrupt can be enabled to ensure the 975 * IPIs can be delivered. 
Return EXIT_FASTPATH_EXIT_HANDLED instead of
	 * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the
	 * requester may be blocked endlessly.
	 */
	if (unlikely(tdx_operand_busy(vp_enter_ret)))
		return EXIT_FASTPATH_EXIT_HANDLED;

	return EXIT_FASTPATH_NONE;
}

#define TDX_REGS_AVAIL_SET	(BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
				 BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
				 BIT_ULL(VCPU_REGS_RAX) | \
				 BIT_ULL(VCPU_REGS_RBX) | \
				 BIT_ULL(VCPU_REGS_RCX) | \
				 BIT_ULL(VCPU_REGS_RDX) | \
				 BIT_ULL(VCPU_REGS_RBP) | \
				 BIT_ULL(VCPU_REGS_RSI) | \
				 BIT_ULL(VCPU_REGS_RDI) | \
				 BIT_ULL(VCPU_REGS_R8) | \
				 BIT_ULL(VCPU_REGS_R9) | \
				 BIT_ULL(VCPU_REGS_R10) | \
				 BIT_ULL(VCPU_REGS_R11) | \
				 BIT_ULL(VCPU_REGS_R12) | \
				 BIT_ULL(VCPU_REGS_R13) | \
				 BIT_ULL(VCPU_REGS_R14) | \
				 BIT_ULL(VCPU_REGS_R15))

static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);

	/*
	 * All TDX hosts support PKRU; but even if they didn't,
	 * vcpu->arch.host_pkru would be 0 and the wrpkru would be
	 * skipped.
	 */
	if (vcpu->arch.host_pkru != 0)
		wrpkru(vcpu->arch.host_pkru);

	if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
		xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);

	/*
	 * Likewise, even if a TDX host didn't support XSS, both arms of
	 * the comparison would be 0 and the wrmsrl would be skipped.
	 */
	if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
		wrmsrl(MSR_IA32_XSS, kvm_host.xss);
}

#define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
				DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
				DEBUGCTLMSR_FREEZE_IN_SMM)

fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	struct vcpu_vt *vt = to_vt(vcpu);

	/*
	 * force_immediate_exit requires a vCPU entry for event injection,
	 * with an immediate exit to follow. But the TDX module doesn't
	 * guarantee entry, i.e. it's possible for KVM to _think_ it completed
	 * entry to the guest without actually having done so.
	 * Since KVM never needs to force an immediate exit for TDX, and can't
	 * do direct injection, just warn on force_immediate_exit.
	 */
	WARN_ON_ONCE(force_immediate_exit);

	/*
	 * Wait until retry of SEPT-zap-related SEAMCALL completes before
	 * allowing vCPU entry to avoid contention with tdh_vp_enter() and
	 * TDCALLs.
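	 * The flag is set by tdx_no_vcpus_enter_start() and cleared by
	 * tdx_no_vcpus_enter_stop(), both with mmu_lock held for write.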
1049 */ 1050 if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap))) 1051 return EXIT_FASTPATH_EXIT_HANDLED; 1052 1053 trace_kvm_entry(vcpu, force_immediate_exit); 1054 1055 if (pi_test_on(&vt->pi_desc)) { 1056 apic->send_IPI_self(POSTED_INTR_VECTOR); 1057 1058 if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) & 1059 APIC_VECTOR_MASK, &vt->pi_desc)) 1060 kvm_wait_lapic_expire(vcpu); 1061 } 1062 1063 tdx_vcpu_enter_exit(vcpu); 1064 1065 if (vt->host_debugctlmsr & ~TDX_DEBUGCTL_PRESERVED) 1066 update_debugctlmsr(vt->host_debugctlmsr); 1067 1068 tdx_load_host_xsave_state(vcpu); 1069 tdx->guest_entered = true; 1070 1071 vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET; 1072 1073 if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) 1074 return EXIT_FASTPATH_NONE; 1075 1076 if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) 1077 return EXIT_FASTPATH_NONE; 1078 1079 if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) 1080 kvm_machine_check(); 1081 1082 trace_kvm_exit(vcpu, KVM_ISA_VMX); 1083 1084 if (unlikely(tdx_failed_vmentry(vcpu))) 1085 return EXIT_FASTPATH_NONE; 1086 1087 return tdx_exit_handlers_fastpath(vcpu); 1088 } 1089 1090 void tdx_inject_nmi(struct kvm_vcpu *vcpu) 1091 { 1092 ++vcpu->stat.nmi_injections; 1093 td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1); 1094 /* 1095 * From KVM's perspective, NMI injection is completed right after 1096 * writing to PEND_NMI. KVM doesn't care whether an NMI is injected by 1097 * the TDX module or not. 1098 */ 1099 vcpu->arch.nmi_injected = false; 1100 /* 1101 * TDX doesn't support KVM to request NMI window exit. If there is 1102 * still a pending vNMI, KVM is not able to inject it along with the 1103 * one pending in TDX module in a back-to-back way. Since the previous 1104 * vNMI is still pending in TDX module, i.e. it has not been delivered 1105 * to TDX guest yet, it's OK to collapse the pending vNMI into the 1106 * previous one. The guest is expected to handle all the NMI sources 1107 * when handling the first vNMI. 1108 */ 1109 vcpu->arch.nmi_pending = 0; 1110 } 1111 1112 static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu) 1113 { 1114 u32 intr_info = vmx_get_intr_info(vcpu); 1115 1116 /* 1117 * Machine checks are handled by handle_exception_irqoff(), or by 1118 * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on 1119 * VM-Entry. NMIs are handled by tdx_vcpu_enter_exit(). 1120 */ 1121 if (is_nmi(intr_info) || is_machine_check(intr_info)) 1122 return 1; 1123 1124 vcpu->run->exit_reason = KVM_EXIT_EXCEPTION; 1125 vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK; 1126 vcpu->run->ex.error_code = 0; 1127 1128 return 0; 1129 } 1130 1131 static int complete_hypercall_exit(struct kvm_vcpu *vcpu) 1132 { 1133 tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret); 1134 return 1; 1135 } 1136 1137 static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu) 1138 { 1139 kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10); 1140 kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11); 1141 kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12); 1142 kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13); 1143 kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14); 1144 1145 return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit); 1146 } 1147 1148 /* 1149 * Split into chunks and check interrupt pending between chunks. This allows 1150 * for timely injection of interrupts to prevent issues with guest lockup 1151 * detection. 
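 * Each chunk is at most TDX_MAP_GPA_MAX_LEN; tdx_complete_vmcall_map_gpa()
 * either continues with the next chunk or, if an interrupt is pending, asks
 * the guest to retry from map_gpa_next.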
1152 */ 1153 #define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024) 1154 static void __tdx_map_gpa(struct vcpu_tdx *tdx); 1155 1156 static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu) 1157 { 1158 struct vcpu_tdx *tdx = to_tdx(vcpu); 1159 1160 if (vcpu->run->hypercall.ret) { 1161 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1162 tdx->vp_enter_args.r11 = tdx->map_gpa_next; 1163 return 1; 1164 } 1165 1166 tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN; 1167 if (tdx->map_gpa_next >= tdx->map_gpa_end) 1168 return 1; 1169 1170 /* 1171 * Stop processing the remaining part if there is a pending interrupt, 1172 * which could be qualified to deliver. Skip checking pending RVI for 1173 * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt(). 1174 */ 1175 if (kvm_vcpu_has_events(vcpu)) { 1176 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY); 1177 tdx->vp_enter_args.r11 = tdx->map_gpa_next; 1178 return 1; 1179 } 1180 1181 __tdx_map_gpa(tdx); 1182 return 0; 1183 } 1184 1185 static void __tdx_map_gpa(struct vcpu_tdx *tdx) 1186 { 1187 u64 gpa = tdx->map_gpa_next; 1188 u64 size = tdx->map_gpa_end - tdx->map_gpa_next; 1189 1190 if (size > TDX_MAP_GPA_MAX_LEN) 1191 size = TDX_MAP_GPA_MAX_LEN; 1192 1193 tdx->vcpu.run->exit_reason = KVM_EXIT_HYPERCALL; 1194 tdx->vcpu.run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; 1195 /* 1196 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) 1197 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that 1198 * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting 1199 * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. 1200 */ 1201 tdx->vcpu.run->hypercall.ret = 0; 1202 tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm)); 1203 tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE; 1204 tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ? 1205 KVM_MAP_GPA_RANGE_ENCRYPTED : 1206 KVM_MAP_GPA_RANGE_DECRYPTED; 1207 tdx->vcpu.run->hypercall.flags = KVM_EXIT_HYPERCALL_LONG_MODE; 1208 1209 tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa; 1210 } 1211 1212 static int tdx_map_gpa(struct kvm_vcpu *vcpu) 1213 { 1214 struct vcpu_tdx *tdx = to_tdx(vcpu); 1215 u64 gpa = tdx->vp_enter_args.r12; 1216 u64 size = tdx->vp_enter_args.r13; 1217 u64 ret; 1218 1219 /* 1220 * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires 1221 * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE 1222 * bit set. This is a base call so it should always be supported, but 1223 * KVM has no way to ensure that userspace implements the GHCI correctly. 1224 * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error 1225 * to the guest. 
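	 * The failure is reported to the guest as
	 * TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED via the error path below.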
1226 */ 1227 if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { 1228 ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; 1229 goto error; 1230 } 1231 1232 if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) || 1233 !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) || 1234 (vt_is_tdx_private_gpa(vcpu->kvm, gpa) != 1235 vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) { 1236 ret = TDVMCALL_STATUS_INVALID_OPERAND; 1237 goto error; 1238 } 1239 1240 if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) { 1241 ret = TDVMCALL_STATUS_ALIGN_ERROR; 1242 goto error; 1243 } 1244 1245 tdx->map_gpa_end = gpa + size; 1246 tdx->map_gpa_next = gpa; 1247 1248 __tdx_map_gpa(tdx); 1249 return 0; 1250 1251 error: 1252 tdvmcall_set_return_code(vcpu, ret); 1253 tdx->vp_enter_args.r11 = gpa; 1254 return 1; 1255 } 1256 1257 static int tdx_report_fatal_error(struct kvm_vcpu *vcpu) 1258 { 1259 struct vcpu_tdx *tdx = to_tdx(vcpu); 1260 u64 *regs = vcpu->run->system_event.data; 1261 u64 *module_regs = &tdx->vp_enter_args.r8; 1262 int index = VCPU_REGS_RAX; 1263 1264 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; 1265 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL; 1266 vcpu->run->system_event.ndata = 16; 1267 1268 /* Dump 16 general-purpose registers to userspace in ascending order. */ 1269 regs[index++] = tdx->vp_enter_ret; 1270 regs[index++] = tdx->vp_enter_args.rcx; 1271 regs[index++] = tdx->vp_enter_args.rdx; 1272 regs[index++] = tdx->vp_enter_args.rbx; 1273 regs[index++] = 0; 1274 regs[index++] = 0; 1275 regs[index++] = tdx->vp_enter_args.rsi; 1276 regs[index] = tdx->vp_enter_args.rdi; 1277 for (index = 0; index < 8; index++) 1278 regs[VCPU_REGS_R8 + index] = module_regs[index]; 1279 1280 return 0; 1281 } 1282 1283 static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu) 1284 { 1285 u32 eax, ebx, ecx, edx; 1286 struct vcpu_tdx *tdx = to_tdx(vcpu); 1287 1288 /* EAX and ECX for cpuid is stored in R12 and R13. 
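	 * The resulting EAX/EBX/ECX/EDX are returned to the guest in R12-R15.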
*/ 1289 eax = tdx->vp_enter_args.r12; 1290 ecx = tdx->vp_enter_args.r13; 1291 1292 kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false); 1293 1294 tdx->vp_enter_args.r12 = eax; 1295 tdx->vp_enter_args.r13 = ebx; 1296 tdx->vp_enter_args.r14 = ecx; 1297 tdx->vp_enter_args.r15 = edx; 1298 1299 return 1; 1300 } 1301 1302 static int tdx_complete_pio_out(struct kvm_vcpu *vcpu) 1303 { 1304 vcpu->arch.pio.count = 0; 1305 return 1; 1306 } 1307 1308 static int tdx_complete_pio_in(struct kvm_vcpu *vcpu) 1309 { 1310 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 1311 unsigned long val = 0; 1312 int ret; 1313 1314 ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size, 1315 vcpu->arch.pio.port, &val, 1); 1316 1317 WARN_ON_ONCE(!ret); 1318 1319 tdvmcall_set_return_val(vcpu, val); 1320 1321 return 1; 1322 } 1323 1324 static int tdx_emulate_io(struct kvm_vcpu *vcpu) 1325 { 1326 struct vcpu_tdx *tdx = to_tdx(vcpu); 1327 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 1328 unsigned long val = 0; 1329 unsigned int port; 1330 u64 size, write; 1331 int ret; 1332 1333 ++vcpu->stat.io_exits; 1334 1335 size = tdx->vp_enter_args.r12; 1336 write = tdx->vp_enter_args.r13; 1337 port = tdx->vp_enter_args.r14; 1338 1339 if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) { 1340 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1341 return 1; 1342 } 1343 1344 if (write) { 1345 val = tdx->vp_enter_args.r15; 1346 ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1); 1347 } else { 1348 ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1); 1349 } 1350 1351 if (!ret) 1352 vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out : 1353 tdx_complete_pio_in; 1354 else if (!write) 1355 tdvmcall_set_return_val(vcpu, val); 1356 1357 return ret; 1358 } 1359 1360 static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu) 1361 { 1362 unsigned long val = 0; 1363 gpa_t gpa; 1364 int size; 1365 1366 gpa = vcpu->mmio_fragments[0].gpa; 1367 size = vcpu->mmio_fragments[0].len; 1368 1369 memcpy(&val, vcpu->run->mmio.data, size); 1370 tdvmcall_set_return_val(vcpu, val); 1371 trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val); 1372 return 1; 1373 } 1374 1375 static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size, 1376 unsigned long val) 1377 { 1378 if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 1379 trace_kvm_fast_mmio(gpa); 1380 return 0; 1381 } 1382 1383 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val); 1384 if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val)) 1385 return -EOPNOTSUPP; 1386 1387 return 0; 1388 } 1389 1390 static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size) 1391 { 1392 unsigned long val; 1393 1394 if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val)) 1395 return -EOPNOTSUPP; 1396 1397 tdvmcall_set_return_val(vcpu, val); 1398 trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val); 1399 return 0; 1400 } 1401 1402 static int tdx_emulate_mmio(struct kvm_vcpu *vcpu) 1403 { 1404 struct vcpu_tdx *tdx = to_tdx(vcpu); 1405 int size, write, r; 1406 unsigned long val; 1407 gpa_t gpa; 1408 1409 size = tdx->vp_enter_args.r12; 1410 write = tdx->vp_enter_args.r13; 1411 gpa = tdx->vp_enter_args.r14; 1412 val = write ? 
tdx->vp_enter_args.r15 : 0; 1413 1414 if (size != 1 && size != 2 && size != 4 && size != 8) 1415 goto error; 1416 if (write != 0 && write != 1) 1417 goto error; 1418 1419 /* 1420 * TDG.VP.VMCALL<MMIO> allows only shared GPA, it makes no sense to 1421 * do MMIO emulation for private GPA. 1422 */ 1423 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) || 1424 vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1)) 1425 goto error; 1426 1427 gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm)); 1428 1429 if (write) 1430 r = tdx_mmio_write(vcpu, gpa, size, val); 1431 else 1432 r = tdx_mmio_read(vcpu, gpa, size); 1433 if (!r) 1434 /* Kernel completed device emulation. */ 1435 return 1; 1436 1437 /* Request the device emulation to userspace device model. */ 1438 vcpu->mmio_is_write = write; 1439 if (!write) 1440 vcpu->arch.complete_userspace_io = tdx_complete_mmio_read; 1441 1442 vcpu->run->mmio.phys_addr = gpa; 1443 vcpu->run->mmio.len = size; 1444 vcpu->run->mmio.is_write = write; 1445 vcpu->run->exit_reason = KVM_EXIT_MMIO; 1446 1447 if (write) { 1448 memcpy(vcpu->run->mmio.data, &val, size); 1449 } else { 1450 vcpu->mmio_fragments[0].gpa = gpa; 1451 vcpu->mmio_fragments[0].len = size; 1452 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL); 1453 } 1454 return 0; 1455 1456 error: 1457 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1458 return 1; 1459 } 1460 1461 static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu) 1462 { 1463 struct vcpu_tdx *tdx = to_tdx(vcpu); 1464 1465 tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret); 1466 1467 /* 1468 * For now, there is no TDVMCALL beyond GHCI base API supported by KVM 1469 * directly without the support from userspace, just set the value 1470 * returned from userspace. 1471 */ 1472 tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11; 1473 tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12; 1474 tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13; 1475 tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14; 1476 1477 return 1; 1478 } 1479 1480 static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu) 1481 { 1482 struct vcpu_tdx *tdx = to_tdx(vcpu); 1483 1484 switch (tdx->vp_enter_args.r12) { 1485 case 0: 1486 tdx->vp_enter_args.r11 = 0; 1487 tdx->vp_enter_args.r12 = 0; 1488 tdx->vp_enter_args.r13 = 0; 1489 tdx->vp_enter_args.r14 = 0; 1490 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS); 1491 return 1; 1492 case 1: 1493 vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12; 1494 vcpu->run->exit_reason = KVM_EXIT_TDX; 1495 vcpu->run->tdx.flags = 0; 1496 vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO; 1497 vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS; 1498 vcpu->run->tdx.get_tdvmcall_info.r11 = 0; 1499 vcpu->run->tdx.get_tdvmcall_info.r12 = 0; 1500 vcpu->run->tdx.get_tdvmcall_info.r13 = 0; 1501 vcpu->run->tdx.get_tdvmcall_info.r14 = 0; 1502 vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info; 1503 return 0; 1504 default: 1505 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1506 return 1; 1507 } 1508 } 1509 1510 static int tdx_complete_simple(struct kvm_vcpu *vcpu) 1511 { 1512 tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret); 1513 return 1; 1514 } 1515 1516 static int tdx_get_quote(struct kvm_vcpu *vcpu) 1517 { 1518 struct vcpu_tdx *tdx = to_tdx(vcpu); 1519 u64 gpa = tdx->vp_enter_args.r12; 1520 u64 size = tdx->vp_enter_args.r13; 1521 1522 /* The gpa of buffer must have shared bit set. 
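	 * A private GPA is rejected with TDVMCALL_STATUS_INVALID_OPERAND; the
	 * shared bit is stripped before the buffer address is handed to
	 * userspace.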
	 */
	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
		return 1;
	}

	vcpu->run->exit_reason = KVM_EXIT_TDX;
	vcpu->run->tdx.flags = 0;
	vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
	vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
	vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
	vcpu->run->tdx.get_quote.size = size;

	vcpu->arch.complete_userspace_io = tdx_complete_simple;

	return 0;
}

static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	u64 vector = tdx->vp_enter_args.r12;

	if (vector < 32 || vector > 255) {
		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
		return 1;
	}

	vcpu->run->exit_reason = KVM_EXIT_TDX;
	vcpu->run->tdx.flags = 0;
	vcpu->run->tdx.nr = TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT;
	vcpu->run->tdx.setup_event_notify.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
	vcpu->run->tdx.setup_event_notify.vector = vector;

	vcpu->arch.complete_userspace_io = tdx_complete_simple;

	return 0;
}

static int handle_tdvmcall(struct kvm_vcpu *vcpu)
{
	switch (tdvmcall_leaf(vcpu)) {
	case TDVMCALL_MAP_GPA:
		return tdx_map_gpa(vcpu);
	case TDVMCALL_REPORT_FATAL_ERROR:
		return tdx_report_fatal_error(vcpu);
	case TDVMCALL_GET_TD_VM_CALL_INFO:
		return tdx_get_td_vm_call_info(vcpu);
	case TDVMCALL_GET_QUOTE:
		return tdx_get_quote(vcpu);
	case TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT:
		return tdx_setup_event_notify_interrupt(vcpu);
	default:
		break;
	}

	tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
	return 1;
}

void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
{
	u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 :
			 TDX_SHARED_BIT_PWL_4;

	if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm))
		return;

	td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
}

static void tdx_unpin(struct kvm *kvm, struct page *page)
{
	put_page(page);
}

static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
			    enum pg_level level, struct page *page)
{
	int tdx_level = pg_level_to_tdx_sept_level(level);
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	gpa_t gpa = gfn_to_gpa(gfn);
	u64 entry, level_state;
	u64 err;

	err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
	if (unlikely(tdx_operand_busy(err))) {
		tdx_unpin(kvm, page);
		return -EBUSY;
	}

	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
		tdx_unpin(kvm, page);
		return -EIO;
	}

	return 0;
}

/*
 * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the
 * callback tdx_gmem_post_populate() then maps pages into private memory
 * through the SEAMCALL TDH.MEM.PAGE.ADD(). The SEAMCALL also requires the
 * private EPT structures for the page to have been built beforehand, which is
 * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that
 * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD().
 * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there
 * are no half-initialized shared EPT pages.
 */
static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
					  enum pg_level level, kvm_pfn_t pfn)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
		return -EINVAL;

	/* nr_premapped will be decreased when tdh_mem_page_add() is called. */
	atomic64_inc(&kvm_tdx->nr_premapped);
	return 0;
}

int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
			      enum pg_level level, kvm_pfn_t pfn)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	struct page *page = pfn_to_page(pfn);

	/* TODO: handle large pages. */
	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
		return -EINVAL;

	/*
	 * Because guest_memfd doesn't support page migration with
	 * a_ops->migrate_folio (yet), no callback is triggered for KVM on page
	 * migration. Until guest_memfd supports page migration, prevent page
	 * migration.
	 * TODO: Once guest_memfd introduces a callback on page migration,
	 * implement it and remove get_page()/put_page().
	 */
	get_page(page);

	/*
	 * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
	 * barrier in tdx_td_finalize().
	 */
	smp_rmb();
	if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
		return tdx_mem_page_aug(kvm, gfn, level, page);

	return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
}

static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
				      enum pg_level level, struct page *page)
{
	int tdx_level = pg_level_to_tdx_sept_level(level);
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	gpa_t gpa = gfn_to_gpa(gfn);
	u64 err, entry, level_state;

	/* TODO: handle large pages. */
	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
		return -EINVAL;

	if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
		return -EINVAL;

	/*
	 * When zapping a private page, the mmu_lock is held for write, so
	 * there is no race with other vCPUs' S-EPT operations. Races with
	 * TDH.VP.ENTER (due to 0-step mitigation) and guest TDCALLs are still
	 * possible.
	 */
	err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
				  &level_state);

	if (unlikely(tdx_operand_busy(err))) {
		/*
		 * The second retry is expected to succeed after kicking off
		 * all other vCPUs and preventing them from invoking
		 * TDH.VP.ENTER.
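		 * tdx_no_vcpus_enter_start() forces all vCPUs out of the guest
		 * and blocks further TDH.VP.ENTER until
		 * tdx_no_vcpus_enter_stop().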
		 */
		tdx_no_vcpus_enter_start(kvm);
		err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
					  &level_state);
		tdx_no_vcpus_enter_stop(kvm);
	}

	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state);
		return -EIO;
	}

	err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);

	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
		return -EIO;
	}
	tdx_clear_page(page);
	tdx_unpin(kvm, page);
	return 0;
}

int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
			      enum pg_level level, void *private_spt)
{
	int tdx_level = pg_level_to_tdx_sept_level(level);
	gpa_t gpa = gfn_to_gpa(gfn);
	struct page *page = virt_to_page(private_spt);
	u64 err, entry, level_state;

	err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry,
			       &level_state);
	if (unlikely(tdx_operand_busy(err)))
		return -EBUSY;

	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state);
		return -EIO;
	}

	return 0;
}

/*
 * Check whether the error returned from a SEPT-zap SEAMCALL is due to a page
 * having been mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add()
 * having been called successfully.
 *
 * Since tdh_mem_sept_add() must have been invoked successfully before a
 * non-leaf entry can be present in the mirrored page table, the SEPT-zap
 * SEAMCALLs should not encounter TDX_EPT_WALK_FAILED. They should instead
 * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the
 * SEPT.
 *
 * Further check whether the entry returned from the SEPT walk has RWX
 * permissions, to filter out anything unexpected.
 *
 * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from
 * level_state returned from a SEAMCALL error is the same as that passed into
 * the SEAMCALL.
 */
static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err,
					     u64 entry, int level)
{
	if (!err || kvm_tdx->state == TD_STATE_RUNNABLE)
		return false;

	if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))
		return false;

	if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK)))
		return false;

	return true;
}

static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
				     enum pg_level level, struct page *page)
{
	int tdx_level = pg_level_to_tdx_sept_level(level);
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level);
	u64 err, entry, level_state;

	/* For now, large pages aren't supported yet.
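	 * The WARN_ON_ONCE() below flags any attempt to zap at a larger level.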
	 */
	WARN_ON_ONCE(level != PG_LEVEL_4K);

	err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);

	if (unlikely(tdx_operand_busy(err))) {
		/* With no vCPUs entering the guest, the second retry is expected to succeed. */
		tdx_no_vcpus_enter_start(kvm);
		err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
		tdx_no_vcpus_enter_stop(kvm);
	}
	if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
	    !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
		atomic64_dec(&kvm_tdx->nr_premapped);
		tdx_unpin(kvm, page);
		return 0;
	}

	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
		return -EIO;
	}
	return 1;
}

/*
 * Ensure the shared and private EPTs are flushed on all vCPUs.
 * tdh_mem_track() is the only caller that increases the TD epoch. An increase
 * in the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
 * running in guest mode with the value "N - 1".
 *
 * A successful execution of tdh_mem_track() ensures that vCPUs can only run in
 * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch
 * has been increased to "N + 1".
 *
 * Kicking all vCPUs out of the guest after that further ensures that no vCPU
 * can run in guest mode with TD epoch value "N", which unblocks the next
 * tdh_mem_track() (e.g. to increase the TD epoch to "N + 2").
 *
 * The TDX module will flush the EPT on the next TD enter, after which vCPUs
 * run in guest mode with TD epoch value "N + 1".
 *
 * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by
 * waiting for the empty IPI handler ack_kick().
 *
 * No action is required of the vCPUs being kicked out, since the kick is
 * guaranteed to happen after the TD epoch increment and before the next
 * tdh_mem_track().
 */
static void tdx_track(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	u64 err;

	/* If the TD isn't finalized, no vCPU has run yet. */
	if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
		return;

	lockdep_assert_held_write(&kvm->mmu_lock);

	err = tdh_mem_track(&kvm_tdx->td);
	if (unlikely(tdx_operand_busy(err))) {
		/* With no vCPUs entering the guest, the second retry is expected to succeed. */
		tdx_no_vcpus_enter_start(kvm);
		err = tdh_mem_track(&kvm_tdx->td);
		tdx_no_vcpus_enter_stop(kvm);
	}

	if (KVM_BUG_ON(err, kvm))
		pr_tdx_error(TDH_MEM_TRACK, err);

	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}

int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
			      enum pg_level level, void *private_spt)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	/*
	 * free_external_spt() is only called after the HKID has been freed,
	 * when the TD is being torn down.
	 * KVM doesn't (yet) zap page table pages in the mirror page table
	 * while the TD is active, though guest pages mapped in the mirror
	 * page table can be zapped while the TD is active, e.g. for
	 * shared <-> private conversion and slot move/deletion.
	 */
	if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
		return -EINVAL;

	/*
	 * The HKID assigned to this TD was already freed and the cache was
	 * already flushed. We don't have to flush again.
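	 * tdx_reclaim_page() below therefore only needs TDH.PHYMEM.PAGE.RECLAIM
	 * followed by tdx_clear_page().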
1881 */ 1882 return tdx_reclaim_page(virt_to_page(private_spt)); 1883 } 1884 1885 int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, 1886 enum pg_level level, kvm_pfn_t pfn) 1887 { 1888 struct page *page = pfn_to_page(pfn); 1889 int ret; 1890 1891 /* 1892 * HKID is released after all private pages have been removed, and set 1893 * before any might be populated. Warn if zapping is attempted when 1894 * there can't be anything populated in the private EPT. 1895 */ 1896 if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm)) 1897 return -EINVAL; 1898 1899 ret = tdx_sept_zap_private_spte(kvm, gfn, level, page); 1900 if (ret <= 0) 1901 return ret; 1902 1903 /* 1904 * TDX requires TLB tracking before dropping private page. Do 1905 * it here, although it is also done later. 1906 */ 1907 tdx_track(kvm); 1908 1909 return tdx_sept_drop_private_spte(kvm, gfn, level, page); 1910 } 1911 1912 void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, 1913 int trig_mode, int vector) 1914 { 1915 struct kvm_vcpu *vcpu = apic->vcpu; 1916 struct vcpu_tdx *tdx = to_tdx(vcpu); 1917 1918 /* TDX supports only posted interrupt. No lapic emulation. */ 1919 __vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector); 1920 1921 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); 1922 } 1923 1924 static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu) 1925 { 1926 u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK; 1927 u64 eq = vmx_get_exit_qual(vcpu); 1928 1929 if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION) 1930 return false; 1931 1932 return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN); 1933 } 1934 1935 static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu) 1936 { 1937 unsigned long exit_qual; 1938 gpa_t gpa = to_tdx(vcpu)->exit_gpa; 1939 bool local_retry = false; 1940 int ret; 1941 1942 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { 1943 if (tdx_is_sept_violation_unexpected_pending(vcpu)) { 1944 pr_warn("Guest access before accepting 0x%llx on vCPU %d\n", 1945 gpa, vcpu->vcpu_id); 1946 kvm_vm_dead(vcpu->kvm); 1947 return -EIO; 1948 } 1949 /* 1950 * Always treat SEPT violations as write faults. Ignore the 1951 * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations. 1952 * TD private pages are always RWX in the SEPT tables, 1953 * i.e. they're always mapped writable. Just as importantly, 1954 * treating SEPT violations as write faults is necessary to 1955 * avoid COW allocations, which will cause TDAUGPAGE failures 1956 * due to aliasing a single HPA to multiple GPAs. 1957 */ 1958 exit_qual = EPT_VIOLATION_ACC_WRITE; 1959 1960 /* Only private GPA triggers zero-step mitigation */ 1961 local_retry = true; 1962 } else { 1963 exit_qual = vmx_get_exit_qual(vcpu); 1964 /* 1965 * EPT violation due to instruction fetch should never be 1966 * triggered from shared memory in TDX guest. If such EPT 1967 * violation occurs, treat it as broken hardware. 1968 */ 1969 if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm)) 1970 return -EIO; 1971 } 1972 1973 trace_kvm_page_fault(vcpu, gpa, exit_qual); 1974 1975 /* 1976 * To minimize TDH.VP.ENTER invocations, retry locally for private GPA 1977 * mapping in TDX. 1978 * 1979 * KVM may return RET_PF_RETRY for private GPA due to 1980 * - contentions when atomically updating SPTEs of the mirror page table 1981 * - in-progress GFN invalidation or memslot removal. 
1982 * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD, 1983 * caused by contentions with TDH.VP.ENTER (with zero-step mitigation) 1984 * or certain TDCALLs. 1985 * 1986 * If TDH.VP.ENTER is invoked more times than the threshold set by the 1987 * TDX module before KVM resolves the private GPA mapping, the TDX 1988 * module will activate zero-step mitigation during TDH.VP.ENTER. This 1989 * process acquires an SEPT tree lock in the TDX module, leading to 1990 * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD 1991 * operations on other vCPUs. 1992 * 1993 * Breaking out of local retries for kvm_vcpu_has_events() is for 1994 * interrupt injection. kvm_vcpu_has_events() should not see pending 1995 * events for TDX. Since KVM can't determine if IRQs (or NMIs) are 1996 * blocked by TDs, false positives are inevitable i.e., KVM may re-enter 1997 * the guest even if the IRQ/NMI can't be delivered. 1998 * 1999 * Note: even without breaking out of local retries, zero-step 2000 * mitigation may still occur due to 2001 * - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT, 2002 * - a single RIP causing EPT violations for more GFNs than the 2003 * threshold count. 2004 * This is safe, as triggering zero-step mitigation only introduces 2005 * contentions to page installation SEAMCALLs on other vCPUs, which will 2006 * handle retries locally in their EPT violation handlers. 2007 */ 2008 while (1) { 2009 ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual); 2010 2011 if (ret != RET_PF_RETRY || !local_retry) 2012 break; 2013 2014 if (kvm_vcpu_has_events(vcpu) || signal_pending(current)) 2015 break; 2016 2017 if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) { 2018 ret = -EIO; 2019 break; 2020 } 2021 2022 cond_resched(); 2023 } 2024 return ret; 2025 } 2026 2027 int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) 2028 { 2029 if (err) { 2030 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 2031 return 1; 2032 } 2033 2034 if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ) 2035 tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu)); 2036 2037 return 1; 2038 } 2039 2040 2041 int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath) 2042 { 2043 struct vcpu_tdx *tdx = to_tdx(vcpu); 2044 u64 vp_enter_ret = tdx->vp_enter_ret; 2045 union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); 2046 2047 if (fastpath != EXIT_FASTPATH_NONE) 2048 return 1; 2049 2050 if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) { 2051 KVM_BUG_ON(1, vcpu->kvm); 2052 return -EIO; 2053 } 2054 2055 /* 2056 * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and 2057 * TDX_SEAMCALL_VMFAILINVALID. 2058 */ 2059 if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) { 2060 KVM_BUG_ON(!kvm_rebooting, vcpu->kvm); 2061 goto unhandled_exit; 2062 } 2063 2064 if (unlikely(tdx_failed_vmentry(vcpu))) { 2065 /* 2066 * If the guest state is protected, that means off-TD debug is 2067 * not enabled, TDX_NON_RECOVERABLE must be set. 
2068 */ 2069 WARN_ON_ONCE(vcpu->arch.guest_state_protected && 2070 !(vp_enter_ret & TDX_NON_RECOVERABLE)); 2071 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2072 vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full; 2073 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; 2074 return 0; 2075 } 2076 2077 if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) && 2078 exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) { 2079 kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret); 2080 goto unhandled_exit; 2081 } 2082 2083 WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT && 2084 (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS); 2085 2086 switch (exit_reason.basic) { 2087 case EXIT_REASON_TRIPLE_FAULT: 2088 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 2089 vcpu->mmio_needed = 0; 2090 return 0; 2091 case EXIT_REASON_EXCEPTION_NMI: 2092 return tdx_handle_exception_nmi(vcpu); 2093 case EXIT_REASON_EXTERNAL_INTERRUPT: 2094 ++vcpu->stat.irq_exits; 2095 return 1; 2096 case EXIT_REASON_CPUID: 2097 return tdx_emulate_cpuid(vcpu); 2098 case EXIT_REASON_HLT: 2099 return kvm_emulate_halt_noskip(vcpu); 2100 case EXIT_REASON_TDCALL: 2101 return handle_tdvmcall(vcpu); 2102 case EXIT_REASON_VMCALL: 2103 return tdx_emulate_vmcall(vcpu); 2104 case EXIT_REASON_IO_INSTRUCTION: 2105 return tdx_emulate_io(vcpu); 2106 case EXIT_REASON_MSR_READ: 2107 kvm_rcx_write(vcpu, tdx->vp_enter_args.r12); 2108 return kvm_emulate_rdmsr(vcpu); 2109 case EXIT_REASON_MSR_WRITE: 2110 kvm_rcx_write(vcpu, tdx->vp_enter_args.r12); 2111 kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u); 2112 kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32); 2113 return kvm_emulate_wrmsr(vcpu); 2114 case EXIT_REASON_EPT_MISCONFIG: 2115 return tdx_emulate_mmio(vcpu); 2116 case EXIT_REASON_EPT_VIOLATION: 2117 return tdx_handle_ept_violation(vcpu); 2118 case EXIT_REASON_OTHER_SMI: 2119 /* 2120 * Unlike VMX, an SMI in SEAM non-root mode (i.e. while a 2121 * TD guest vCPU is running) causes a VM exit to the TDX module, 2122 * followed by a SEAMRET to KVM. Once it exits to KVM, the SMI is 2123 * delivered and handled by the kernel handler right away. 2124 * 2125 * The Other SMI exit can also be caused by a SEAM non-root 2126 * machine check delivered via a Machine Check System Management 2127 * Interrupt (MSMI), but it has already been handled by the 2128 * kernel machine check handler, i.e., the memory page has been 2129 * marked as poisoned and won't be freed to the free list 2130 * when the TDX guest is terminated (the TDX module marks the 2131 * guest as dead and prevents it from running further when a 2132 * machine check happens in SEAM non-root). 2133 * 2134 * - An MSMI will not reach here; it's handled as the non_recoverable 2135 * case above. 2136 * - If it's not an MSMI, nothing needs to be done here.
2137 */ 2138 return 1; 2139 default: 2140 break; 2141 } 2142 2143 unhandled_exit: 2144 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 2145 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 2146 vcpu->run->internal.ndata = 2; 2147 vcpu->run->internal.data[0] = vp_enter_ret; 2148 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; 2149 return 0; 2150 } 2151 2152 void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, 2153 u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) 2154 { 2155 struct vcpu_tdx *tdx = to_tdx(vcpu); 2156 2157 *reason = tdx->vt.exit_reason.full; 2158 if (*reason != -1u) { 2159 *info1 = vmx_get_exit_qual(vcpu); 2160 *info2 = tdx->ext_exit_qualification; 2161 *intr_info = vmx_get_intr_info(vcpu); 2162 } else { 2163 *info1 = 0; 2164 *info2 = 0; 2165 *intr_info = 0; 2166 } 2167 2168 *error_code = 0; 2169 } 2170 2171 bool tdx_has_emulated_msr(u32 index) 2172 { 2173 switch (index) { 2174 case MSR_IA32_UCODE_REV: 2175 case MSR_IA32_ARCH_CAPABILITIES: 2176 case MSR_IA32_POWER_CTL: 2177 case MSR_IA32_CR_PAT: 2178 case MSR_MTRRcap: 2179 case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: 2180 case MSR_MTRRdefType: 2181 case MSR_IA32_TSC_DEADLINE: 2182 case MSR_IA32_MISC_ENABLE: 2183 case MSR_PLATFORM_INFO: 2184 case MSR_MISC_FEATURES_ENABLES: 2185 case MSR_IA32_APICBASE: 2186 case MSR_EFER: 2187 case MSR_IA32_FEAT_CTL: 2188 case MSR_IA32_MCG_CAP: 2189 case MSR_IA32_MCG_STATUS: 2190 case MSR_IA32_MCG_CTL: 2191 case MSR_IA32_MCG_EXT_CTL: 2192 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: 2193 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: 2194 /* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */ 2195 case MSR_KVM_POLL_CONTROL: 2196 return true; 2197 case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: 2198 /* 2199 * x2APIC registers that are virtualized by the CPU can't be 2200 * emulated, KVM doesn't have access to the virtual APIC page. 2201 */ 2202 switch (index) { 2203 case X2APIC_MSR(APIC_TASKPRI): 2204 case X2APIC_MSR(APIC_PROCPRI): 2205 case X2APIC_MSR(APIC_EOI): 2206 case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR): 2207 case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR): 2208 case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR): 2209 return false; 2210 default: 2211 return true; 2212 } 2213 default: 2214 return false; 2215 } 2216 } 2217 2218 static bool tdx_is_read_only_msr(u32 index) 2219 { 2220 return index == MSR_IA32_APICBASE || index == MSR_EFER || 2221 index == MSR_IA32_FEAT_CTL; 2222 } 2223 2224 int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 2225 { 2226 switch (msr->index) { 2227 case MSR_IA32_FEAT_CTL: 2228 /* 2229 * MCE and MCA are advertised via cpuid. Guest kernel could 2230 * check if LMCE is enabled or not. 
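 * Report FEAT_CTL as locked, with LMCE enabled only if MCG_LMCE_P is set in the vCPU's MCG_CAP.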
2231 */ 2232 msr->data = FEAT_CTL_LOCKED; 2233 if (vcpu->arch.mcg_cap & MCG_LMCE_P) 2234 msr->data |= FEAT_CTL_LMCE_ENABLED; 2235 return 0; 2236 case MSR_IA32_MCG_EXT_CTL: 2237 if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) 2238 return 1; 2239 msr->data = vcpu->arch.mcg_ext_ctl; 2240 return 0; 2241 default: 2242 if (!tdx_has_emulated_msr(msr->index)) 2243 return 1; 2244 2245 return kvm_get_msr_common(vcpu, msr); 2246 } 2247 } 2248 2249 int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 2250 { 2251 switch (msr->index) { 2252 case MSR_IA32_MCG_EXT_CTL: 2253 if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) || 2254 (msr->data & ~MCG_EXT_CTL_LMCE_EN)) 2255 return 1; 2256 vcpu->arch.mcg_ext_ctl = msr->data; 2257 return 0; 2258 default: 2259 if (tdx_is_read_only_msr(msr->index)) 2260 return 1; 2261 2262 if (!tdx_has_emulated_msr(msr->index)) 2263 return 1; 2264 2265 return kvm_set_msr_common(vcpu, msr); 2266 } 2267 } 2268 2269 static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd) 2270 { 2271 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; 2272 struct kvm_tdx_capabilities __user *user_caps; 2273 struct kvm_tdx_capabilities *caps = NULL; 2274 int ret = 0; 2275 2276 /* flags is reserved for future use */ 2277 if (cmd->flags) 2278 return -EINVAL; 2279 2280 caps = kmalloc(sizeof(*caps) + 2281 sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config, 2282 GFP_KERNEL); 2283 if (!caps) 2284 return -ENOMEM; 2285 2286 user_caps = u64_to_user_ptr(cmd->data); 2287 if (copy_from_user(caps, user_caps, sizeof(*caps))) { 2288 ret = -EFAULT; 2289 goto out; 2290 } 2291 2292 if (caps->cpuid.nent < td_conf->num_cpuid_config) { 2293 ret = -E2BIG; 2294 goto out; 2295 } 2296 2297 ret = init_kvm_tdx_caps(td_conf, caps); 2298 if (ret) 2299 goto out; 2300 2301 if (copy_to_user(user_caps, caps, sizeof(*caps))) { 2302 ret = -EFAULT; 2303 goto out; 2304 } 2305 2306 if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries, 2307 caps->cpuid.nent * 2308 sizeof(caps->cpuid.entries[0]))) 2309 ret = -EFAULT; 2310 2311 out: 2312 /* kfree() accepts NULL. */ 2313 kfree(caps); 2314 return ret; 2315 } 2316 2317 /* 2318 * KVM reports the guest physical address bits in CPUID.0x80000008.EAX[23:16], which is 2319 * similar to TDX's GPAW. Use this field as the interface for userspace to 2320 * configure the GPAW and EPT level for TDs. 2321 * 2322 * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level 2323 * 5; value 48 means GPAW-48 and EPT level 4. GPAW-48 is always 2324 * supported, while value 52 is only supported when the platform supports 5-level 2325 * EPT.
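 * For example, a userspace value of 52 in CPUID.0x80000008.EAX[23:16] results in TDX_CONFIG_FLAGS_MAX_GPAW being set and VMX_EPTP_PWL_5 in eptp_controls below, while 48 selects VMX_EPTP_PWL_4.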
2326 */ 2327 static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid, 2328 struct td_params *td_params) 2329 { 2330 const struct kvm_cpuid_entry2 *entry; 2331 int guest_pa; 2332 2333 entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0); 2334 if (!entry) 2335 return -EINVAL; 2336 2337 guest_pa = tdx_get_guest_phys_addr_bits(entry->eax); 2338 2339 if (guest_pa != 48 && guest_pa != 52) 2340 return -EINVAL; 2341 2342 if (guest_pa == 52 && !cpu_has_vmx_ept_5levels()) 2343 return -EINVAL; 2344 2345 td_params->eptp_controls = VMX_EPTP_MT_WB; 2346 if (guest_pa == 52) { 2347 td_params->eptp_controls |= VMX_EPTP_PWL_5; 2348 td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW; 2349 } else { 2350 td_params->eptp_controls |= VMX_EPTP_PWL_4; 2351 } 2352 2353 return 0; 2354 } 2355 2356 static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid, 2357 struct td_params *td_params) 2358 { 2359 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; 2360 const struct kvm_cpuid_entry2 *entry; 2361 struct tdx_cpuid_value *value; 2362 int i, copy_cnt = 0; 2363 2364 /* 2365 * td_params.cpuid_values: the number and order of cpuid_values must 2366 * match those of struct tdsysinfo.{num_cpuid_config, cpuid_configs}. 2367 * It's assumed that td_params was zeroed. 2368 */ 2369 for (i = 0; i < td_conf->num_cpuid_config; i++) { 2370 struct kvm_cpuid_entry2 tmp; 2371 2372 td_init_cpuid_entry2(&tmp, i); 2373 2374 entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 2375 tmp.function, tmp.index); 2376 if (!entry) 2377 continue; 2378 2379 if (tdx_unsupported_cpuid(entry)) 2380 return -EINVAL; 2381 2382 copy_cnt++; 2383 2384 value = &td_params->cpuid_values[i]; 2385 value->eax = entry->eax; 2386 value->ebx = entry->ebx; 2387 value->ecx = entry->ecx; 2388 value->edx = entry->edx; 2389 2390 /* 2391 * The TDX module does not accept nonzero bits 16..23 in 2392 * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls(). 2393 */ 2394 if (tmp.function == 0x80000008) 2395 value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0); 2396 } 2397 2398 /* 2399 * Rely on the TDX module to reject an invalid configuration, but it can't 2400 * check leafs that don't have a proper slot in td_params->cpuid_values 2401 * to stick them in. So fail if there were entries that didn't get copied to 2402 * td_params.
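 * Any userspace CPUID entry that doesn't match a configurable leaf leaves copy_cnt short of cpuid->nent, so the check below rejects the configuration with -EINVAL.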
2403 */ 2404 if (copy_cnt != cpuid->nent) 2405 return -EINVAL; 2406 2407 return 0; 2408 } 2409 2410 static int setup_tdparams(struct kvm *kvm, struct td_params *td_params, 2411 struct kvm_tdx_init_vm *init_vm) 2412 { 2413 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; 2414 struct kvm_cpuid2 *cpuid = &init_vm->cpuid; 2415 int ret; 2416 2417 if (kvm->created_vcpus) 2418 return -EBUSY; 2419 2420 if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf)) 2421 return -EINVAL; 2422 2423 if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf)) 2424 return -EINVAL; 2425 2426 td_params->max_vcpus = kvm->max_vcpus; 2427 td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1; 2428 td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1; 2429 2430 td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD; 2431 td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz); 2432 2433 ret = setup_tdparams_eptp_controls(cpuid, td_params); 2434 if (ret) 2435 return ret; 2436 2437 ret = setup_tdparams_cpuids(cpuid, td_params); 2438 if (ret) 2439 return ret; 2440 2441 #define MEMCPY_SAME_SIZE(dst, src) \ 2442 do { \ 2443 BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \ 2444 memcpy((dst), (src), sizeof(dst)); \ 2445 } while (0) 2446 2447 MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid); 2448 MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner); 2449 MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig); 2450 2451 return 0; 2452 } 2453 2454 static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, 2455 u64 *seamcall_err) 2456 { 2457 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 2458 cpumask_var_t packages; 2459 struct page **tdcs_pages = NULL; 2460 struct page *tdr_page; 2461 int ret, i; 2462 u64 err, rcx; 2463 2464 *seamcall_err = 0; 2465 ret = tdx_guest_keyid_alloc(); 2466 if (ret < 0) 2467 return ret; 2468 kvm_tdx->hkid = ret; 2469 kvm_tdx->misc_cg = get_current_misc_cg(); 2470 ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); 2471 if (ret) 2472 goto free_hkid; 2473 2474 ret = -ENOMEM; 2475 2476 atomic_inc(&nr_configured_hkid); 2477 2478 tdr_page = alloc_page(GFP_KERNEL); 2479 if (!tdr_page) 2480 goto free_hkid; 2481 2482 kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE; 2483 /* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */ 2484 kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1; 2485 tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages), 2486 GFP_KERNEL | __GFP_ZERO); 2487 if (!tdcs_pages) 2488 goto free_tdr; 2489 2490 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { 2491 tdcs_pages[i] = alloc_page(GFP_KERNEL); 2492 if (!tdcs_pages[i]) 2493 goto free_tdcs; 2494 } 2495 2496 if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) 2497 goto free_tdcs; 2498 2499 cpus_read_lock(); 2500 2501 /* 2502 * Need at least one CPU of the package to be online in order to 2503 * program all packages for host key id. Check it. 2504 */ 2505 for_each_present_cpu(i) 2506 cpumask_set_cpu(topology_physical_package_id(i), packages); 2507 for_each_online_cpu(i) 2508 cpumask_clear_cpu(topology_physical_package_id(i), packages); 2509 if (!cpumask_empty(packages)) { 2510 ret = -EIO; 2511 /* 2512 * Because it's hard for human operator to figure out the 2513 * reason, warn it. 2514 */ 2515 #define MSG_ALLPKG "All packages need to have online CPU to create TD. 
Online CPU and retry.\n" 2516 pr_warn_ratelimited(MSG_ALLPKG); 2517 goto free_packages; 2518 } 2519 2520 /* 2521 * TDH.MNG.CREATE tries to grab the global TDX module and fails 2522 * with TDX_OPERAND_BUSY when it fails to grab. Take the global 2523 * lock to prevent it from failure. 2524 */ 2525 mutex_lock(&tdx_lock); 2526 kvm_tdx->td.tdr_page = tdr_page; 2527 err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid); 2528 mutex_unlock(&tdx_lock); 2529 2530 if (err == TDX_RND_NO_ENTROPY) { 2531 ret = -EAGAIN; 2532 goto free_packages; 2533 } 2534 2535 if (WARN_ON_ONCE(err)) { 2536 pr_tdx_error(TDH_MNG_CREATE, err); 2537 ret = -EIO; 2538 goto free_packages; 2539 } 2540 2541 for_each_online_cpu(i) { 2542 int pkg = topology_physical_package_id(i); 2543 2544 if (cpumask_test_and_set_cpu(pkg, packages)) 2545 continue; 2546 2547 /* 2548 * Program the memory controller in the package with an 2549 * encryption key associated to a TDX private host key id 2550 * assigned to this TDR. Concurrent operations on same memory 2551 * controller results in TDX_OPERAND_BUSY. No locking needed 2552 * beyond the cpus_read_lock() above as it serializes against 2553 * hotplug and the first online CPU of the package is always 2554 * used. We never have two CPUs in the same socket trying to 2555 * program the key. 2556 */ 2557 ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config, 2558 kvm_tdx, true); 2559 if (ret) 2560 break; 2561 } 2562 cpus_read_unlock(); 2563 free_cpumask_var(packages); 2564 if (ret) { 2565 i = 0; 2566 goto teardown; 2567 } 2568 2569 kvm_tdx->td.tdcs_pages = tdcs_pages; 2570 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { 2571 err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]); 2572 if (err == TDX_RND_NO_ENTROPY) { 2573 /* Here it's hard to allow userspace to retry. */ 2574 ret = -EAGAIN; 2575 goto teardown; 2576 } 2577 if (WARN_ON_ONCE(err)) { 2578 pr_tdx_error(TDH_MNG_ADDCX, err); 2579 ret = -EIO; 2580 goto teardown; 2581 } 2582 } 2583 2584 err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx); 2585 if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) { 2586 /* 2587 * Because a user gives operands, don't warn. 2588 * Return a hint to the user because it's sometimes hard for the 2589 * user to figure out which operand is invalid. SEAMCALL status 2590 * code includes which operand caused invalid operand error. 2591 */ 2592 *seamcall_err = err; 2593 ret = -EINVAL; 2594 goto teardown; 2595 } else if (WARN_ON_ONCE(err)) { 2596 pr_tdx_error_1(TDH_MNG_INIT, err, rcx); 2597 ret = -EIO; 2598 goto teardown; 2599 } 2600 2601 return 0; 2602 2603 /* 2604 * The sequence for freeing resources from a partially initialized TD 2605 * varies based on where in the initialization flow failure occurred. 2606 * Simply use the full teardown and destroy, which naturally play nice 2607 * with partial initialization. 
2608 */ 2609 teardown: 2610 /* Only free pages not yet added, so start at 'i' */ 2611 for (; i < kvm_tdx->td.tdcs_nr_pages; i++) { 2612 if (tdcs_pages[i]) { 2613 __free_page(tdcs_pages[i]); 2614 tdcs_pages[i] = NULL; 2615 } 2616 } 2617 if (!kvm_tdx->td.tdcs_pages) 2618 kfree(tdcs_pages); 2619 2620 tdx_mmu_release_hkid(kvm); 2621 tdx_reclaim_td_control_pages(kvm); 2622 2623 return ret; 2624 2625 free_packages: 2626 cpus_read_unlock(); 2627 free_cpumask_var(packages); 2628 2629 free_tdcs: 2630 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { 2631 if (tdcs_pages[i]) 2632 __free_page(tdcs_pages[i]); 2633 } 2634 kfree(tdcs_pages); 2635 kvm_tdx->td.tdcs_pages = NULL; 2636 2637 free_tdr: 2638 if (tdr_page) 2639 __free_page(tdr_page); 2640 kvm_tdx->td.tdr_page = 0; 2641 2642 free_hkid: 2643 tdx_hkid_free(kvm_tdx); 2644 2645 return ret; 2646 } 2647 2648 static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id, 2649 u64 *data) 2650 { 2651 u64 err; 2652 2653 err = tdh_mng_rd(&tdx->td, field_id, data); 2654 2655 return err; 2656 } 2657 2658 #define TDX_MD_UNREADABLE_LEAF_MASK GENMASK(30, 7) 2659 #define TDX_MD_UNREADABLE_SUBLEAF_MASK GENMASK(31, 7) 2660 2661 static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf, 2662 bool sub_leaf_set, int *entry_index, 2663 struct kvm_cpuid_entry2 *out) 2664 { 2665 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 2666 u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES; 2667 u64 ebx_eax, edx_ecx; 2668 u64 err = 0; 2669 2670 if (sub_leaf > 0b1111111) 2671 return -EINVAL; 2672 2673 if (*entry_index >= KVM_MAX_CPUID_ENTRIES) 2674 return -EINVAL; 2675 2676 if (leaf & TDX_MD_UNREADABLE_LEAF_MASK || 2677 sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK) 2678 return -EINVAL; 2679 2680 /* 2681 * bit 23:17, REVSERVED: reserved, must be 0; 2682 * bit 16, LEAF_31: leaf number bit 31; 2683 * bit 15:9, LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are 2684 * implicitly 0; 2685 * bit 8, SUBLEAF_NA: sub-leaf not applicable flag; 2686 * bit 7:1, SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1, 2687 * the SUBLEAF_6_0 is all-1. 2688 * sub-leaf bits 31:7 are implicitly 0; 2689 * bit 0, ELEMENT_I: Element index within field; 2690 */ 2691 field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16; 2692 field_id |= (leaf & 0x7f) << 9; 2693 if (sub_leaf_set) 2694 field_id |= (sub_leaf & 0x7f) << 1; 2695 else 2696 field_id |= 0x1fe; 2697 2698 err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax); 2699 if (err) //TODO check for specific errors 2700 goto err_out; 2701 2702 out->eax = (u32) ebx_eax; 2703 out->ebx = (u32) (ebx_eax >> 32); 2704 2705 field_id++; 2706 err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx); 2707 /* 2708 * It's weird that reading edx_ecx fails while reading ebx_eax 2709 * succeeded. 2710 */ 2711 if (WARN_ON_ONCE(err)) 2712 goto err_out; 2713 2714 out->ecx = (u32) edx_ecx; 2715 out->edx = (u32) (edx_ecx >> 32); 2716 2717 out->function = leaf; 2718 out->index = sub_leaf; 2719 out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0; 2720 2721 /* 2722 * Work around missing support on old TDX modules, fetch 2723 * guest maxpa from gfn_direct_bits. 
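 * The direct (shared) bit is BIT(51) for GPAW-52 and BIT(47) for GPAW-48, so __ffs() of the GPA form plus one recovers the guest MAXPA that was configured at TD creation.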
2724 */ 2725 if (leaf == 0x80000008) { 2726 gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm)); 2727 unsigned int g_maxpa = __ffs(gpa_bits) + 1; 2728 2729 out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa); 2730 } 2731 2732 (*entry_index)++; 2733 2734 return 0; 2735 2736 err_out: 2737 out->eax = 0; 2738 out->ebx = 0; 2739 out->ecx = 0; 2740 out->edx = 0; 2741 2742 return -EIO; 2743 } 2744 2745 static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd) 2746 { 2747 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 2748 struct kvm_tdx_init_vm *init_vm; 2749 struct td_params *td_params = NULL; 2750 int ret; 2751 2752 BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid)); 2753 BUILD_BUG_ON(sizeof(struct td_params) != 1024); 2754 2755 if (kvm_tdx->state != TD_STATE_UNINITIALIZED) 2756 return -EINVAL; 2757 2758 if (cmd->flags) 2759 return -EINVAL; 2760 2761 init_vm = kmalloc(sizeof(*init_vm) + 2762 sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES, 2763 GFP_KERNEL); 2764 if (!init_vm) 2765 return -ENOMEM; 2766 2767 if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) { 2768 ret = -EFAULT; 2769 goto out; 2770 } 2771 2772 if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) { 2773 ret = -E2BIG; 2774 goto out; 2775 } 2776 2777 if (copy_from_user(init_vm->cpuid.entries, 2778 u64_to_user_ptr(cmd->data) + sizeof(*init_vm), 2779 flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) { 2780 ret = -EFAULT; 2781 goto out; 2782 } 2783 2784 if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) { 2785 ret = -EINVAL; 2786 goto out; 2787 } 2788 2789 if (init_vm->cpuid.padding) { 2790 ret = -EINVAL; 2791 goto out; 2792 } 2793 2794 td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL); 2795 if (!td_params) { 2796 ret = -ENOMEM; 2797 goto out; 2798 } 2799 2800 ret = setup_tdparams(kvm, td_params, init_vm); 2801 if (ret) 2802 goto out; 2803 2804 ret = __tdx_td_init(kvm, td_params, &cmd->hw_error); 2805 if (ret) 2806 goto out; 2807 2808 kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET); 2809 kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER); 2810 kvm_tdx->attributes = td_params->attributes; 2811 kvm_tdx->xfam = td_params->xfam; 2812 2813 if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW) 2814 kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5; 2815 else 2816 kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4; 2817 2818 kvm_tdx->state = TD_STATE_INITIALIZED; 2819 out: 2820 /* kfree() accepts NULL. */ 2821 kfree(init_vm); 2822 kfree(td_params); 2823 2824 return ret; 2825 } 2826 2827 void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) 2828 { 2829 /* 2830 * flush_tlb_current() is invoked when the first time for the vcpu to 2831 * run or when root of shared EPT is invalidated. 2832 * KVM only needs to flush shared EPT because the TDX module handles TLB 2833 * invalidation for private EPT in tdh_vp_enter(); 2834 * 2835 * A single context invalidation for shared EPT can be performed here. 2836 * However, this single context invalidation requires the private EPTP 2837 * rather than the shared EPTP to flush shared EPT, as shared EPT uses 2838 * private EPTP as its ASID for TLB invalidation. 2839 * 2840 * To avoid reading back private EPTP, perform a global invalidation for 2841 * shared EPT instead to keep this function simple. 
2842 */ 2843 ept_sync_global(); 2844 } 2845 2846 void tdx_flush_tlb_all(struct kvm_vcpu *vcpu) 2847 { 2848 /* 2849 * TDX has called tdx_track() in tdx_sept_remove_private_spte() to 2850 * ensure that private EPT will be flushed on the next TD enter. No need 2851 * to call tdx_track() here again even when this callback is a result of 2852 * zapping private EPT. 2853 * 2854 * Due to the lack of the context to determine which EPT has been 2855 * affected by zapping, invoke invept() directly here for both shared 2856 * EPT and private EPT for simplicity, though it's not necessary for 2857 * private EPT. 2858 */ 2859 ept_sync_global(); 2860 } 2861 2862 static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd) 2863 { 2864 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 2865 2866 guard(mutex)(&kvm->slots_lock); 2867 2868 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) 2869 return -EINVAL; 2870 /* 2871 * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue 2872 * TDH.MEM.PAGE.ADD(). 2873 */ 2874 if (atomic64_read(&kvm_tdx->nr_premapped)) 2875 return -EINVAL; 2876 2877 cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td); 2878 if (tdx_operand_busy(cmd->hw_error)) 2879 return -EBUSY; 2880 if (KVM_BUG_ON(cmd->hw_error, kvm)) { 2881 pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error); 2882 return -EIO; 2883 } 2884 2885 kvm_tdx->state = TD_STATE_RUNNABLE; 2886 /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */ 2887 smp_wmb(); 2888 kvm->arch.pre_fault_allowed = true; 2889 return 0; 2890 } 2891 2892 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) 2893 { 2894 struct kvm_tdx_cmd tdx_cmd; 2895 int r; 2896 2897 if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd))) 2898 return -EFAULT; 2899 2900 /* 2901 * Userspace should never set hw_error. It is used to fill 2902 * hardware-defined error by the kernel. 2903 */ 2904 if (tdx_cmd.hw_error) 2905 return -EINVAL; 2906 2907 mutex_lock(&kvm->lock); 2908 2909 switch (tdx_cmd.id) { 2910 case KVM_TDX_CAPABILITIES: 2911 r = tdx_get_capabilities(&tdx_cmd); 2912 break; 2913 case KVM_TDX_INIT_VM: 2914 r = tdx_td_init(kvm, &tdx_cmd); 2915 break; 2916 case KVM_TDX_FINALIZE_VM: 2917 r = tdx_td_finalize(kvm, &tdx_cmd); 2918 break; 2919 default: 2920 r = -EINVAL; 2921 goto out; 2922 } 2923 2924 if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd))) 2925 r = -EFAULT; 2926 2927 out: 2928 mutex_unlock(&kvm->lock); 2929 return r; 2930 } 2931 2932 /* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. 
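The value arrives here as KVM_TDX_INIT_VCPU's cmd->data and is passed to tdh_vp_init() below.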
*/ 2933 static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) 2934 { 2935 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 2936 struct vcpu_tdx *tdx = to_tdx(vcpu); 2937 struct page *page; 2938 int ret, i; 2939 u64 err; 2940 2941 page = alloc_page(GFP_KERNEL); 2942 if (!page) 2943 return -ENOMEM; 2944 tdx->vp.tdvpr_page = page; 2945 2946 tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages), 2947 GFP_KERNEL); 2948 if (!tdx->vp.tdcx_pages) { 2949 ret = -ENOMEM; 2950 goto free_tdvpr; 2951 } 2952 2953 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { 2954 page = alloc_page(GFP_KERNEL); 2955 if (!page) { 2956 ret = -ENOMEM; 2957 goto free_tdcx; 2958 } 2959 tdx->vp.tdcx_pages[i] = page; 2960 } 2961 2962 err = tdh_vp_create(&kvm_tdx->td, &tdx->vp); 2963 if (KVM_BUG_ON(err, vcpu->kvm)) { 2964 ret = -EIO; 2965 pr_tdx_error(TDH_VP_CREATE, err); 2966 goto free_tdcx; 2967 } 2968 2969 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { 2970 err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]); 2971 if (KVM_BUG_ON(err, vcpu->kvm)) { 2972 pr_tdx_error(TDH_VP_ADDCX, err); 2973 /* 2974 * Pages already added are reclaimed by the vcpu_free 2975 * method, but the rest are freed here. 2976 */ 2977 for (; i < kvm_tdx->td.tdcx_nr_pages; i++) { 2978 __free_page(tdx->vp.tdcx_pages[i]); 2979 tdx->vp.tdcx_pages[i] = NULL; 2980 } 2981 return -EIO; 2982 } 2983 } 2984 2985 err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id); 2986 if (KVM_BUG_ON(err, vcpu->kvm)) { 2987 pr_tdx_error(TDH_VP_INIT, err); 2988 return -EIO; 2989 } 2990 2991 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 2992 2993 return 0; 2994 2995 free_tdcx: 2996 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { 2997 if (tdx->vp.tdcx_pages[i]) 2998 __free_page(tdx->vp.tdcx_pages[i]); 2999 tdx->vp.tdcx_pages[i] = NULL; 3000 } 3001 kfree(tdx->vp.tdcx_pages); 3002 tdx->vp.tdcx_pages = NULL; 3003 3004 free_tdvpr: 3005 if (tdx->vp.tdvpr_page) 3006 __free_page(tdx->vp.tdvpr_page); 3007 tdx->vp.tdvpr_page = 0; 3008 3009 return ret; 3010 } 3011 3012 /* Sometimes reads multiple subleafs. Return how many entries were written. */ 3013 static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index, 3014 struct kvm_cpuid_entry2 *output_e) 3015 { 3016 int sub_leaf = 0; 3017 int ret; 3018 3019 /* First try without a subleaf */ 3020 ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e); 3021 3022 /* On success, or for an invalid leaf, just give up */ 3023 if (ret != -EIO) 3024 return ret; 3025 3026 /* 3027 * If the attempt without a subleaf failed, try reading subleafs until 3028 * one fails. The TDX module only supports 6 bits of subleaf index. 3029 */ 3030 while (1) { 3031 /* Keep reading subleafs until there is a failure.
*/ 3032 if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e)) 3033 return !sub_leaf; 3034 3035 sub_leaf++; 3036 output_e++; 3037 } 3038 3039 return 0; 3040 } 3041 3042 static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) 3043 { 3044 struct kvm_cpuid2 __user *output, *td_cpuid; 3045 int r = 0, i = 0, leaf; 3046 u32 level; 3047 3048 output = u64_to_user_ptr(cmd->data); 3049 td_cpuid = kzalloc(sizeof(*td_cpuid) + 3050 sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES, 3051 GFP_KERNEL); 3052 if (!td_cpuid) 3053 return -ENOMEM; 3054 3055 if (copy_from_user(td_cpuid, output, sizeof(*output))) { 3056 r = -EFAULT; 3057 goto out; 3058 } 3059 3060 /* Read max CPUID for normal range */ 3061 if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) { 3062 r = -EIO; 3063 goto out; 3064 } 3065 level = td_cpuid->entries[0].eax; 3066 3067 for (leaf = 1; leaf <= level; leaf++) 3068 tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]); 3069 3070 /* Read max CPUID for extended range */ 3071 if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) { 3072 r = -EIO; 3073 goto out; 3074 } 3075 level = td_cpuid->entries[i - 1].eax; 3076 3077 for (leaf = 0x80000001; leaf <= level; leaf++) 3078 tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]); 3079 3080 if (td_cpuid->nent < i) 3081 r = -E2BIG; 3082 td_cpuid->nent = i; 3083 3084 if (copy_to_user(output, td_cpuid, sizeof(*output))) { 3085 r = -EFAULT; 3086 goto out; 3087 } 3088 3089 if (r == -E2BIG) 3090 goto out; 3091 3092 if (copy_to_user(output->entries, td_cpuid->entries, 3093 td_cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 3094 r = -EFAULT; 3095 3096 out: 3097 kfree(td_cpuid); 3098 3099 return r; 3100 } 3101 3102 static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) 3103 { 3104 u64 apic_base; 3105 struct vcpu_tdx *tdx = to_tdx(vcpu); 3106 int ret; 3107 3108 if (cmd->flags) 3109 return -EINVAL; 3110 3111 if (tdx->state != VCPU_TD_STATE_UNINITIALIZED) 3112 return -EINVAL; 3113 3114 /* 3115 * TDX requires X2APIC, userspace is responsible for configuring guest 3116 * CPUID accordingly. 3117 */ 3118 apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC | 3119 (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0); 3120 if (kvm_apic_set_base(vcpu, apic_base, true)) 3121 return -EINVAL; 3122 3123 ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data); 3124 if (ret) 3125 return ret; 3126 3127 td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR); 3128 td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc)); 3129 td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR); 3130 3131 tdx->state = VCPU_TD_STATE_INITIALIZED; 3132 3133 return 0; 3134 } 3135 3136 void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 3137 { 3138 /* 3139 * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all 3140 * INIT events. 3141 * 3142 * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as 3143 * userspace needs to define the vCPU model before KVM can initialize 3144 * vCPU state, e.g. to enable x2APIC. 
3145 */ 3146 WARN_ON_ONCE(init_event); 3147 } 3148 3149 struct tdx_gmem_post_populate_arg { 3150 struct kvm_vcpu *vcpu; 3151 __u32 flags; 3152 }; 3153 3154 static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, 3155 void __user *src, int order, void *_arg) 3156 { 3157 u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS; 3158 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 3159 struct tdx_gmem_post_populate_arg *arg = _arg; 3160 struct kvm_vcpu *vcpu = arg->vcpu; 3161 gpa_t gpa = gfn_to_gpa(gfn); 3162 u8 level = PG_LEVEL_4K; 3163 struct page *src_page; 3164 int ret, i; 3165 u64 err, entry, level_state; 3166 3167 /* 3168 * Get the source page if it has been faulted in. Return failure if the 3169 * source page has been swapped out or unmapped in primary memory. 3170 */ 3171 ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page); 3172 if (ret < 0) 3173 return ret; 3174 if (ret != 1) 3175 return -ENOMEM; 3176 3177 ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level); 3178 if (ret < 0) 3179 goto out; 3180 3181 /* 3182 * The private mem cannot be zapped after kvm_tdp_map_page() 3183 * because all paths are covered by slots_lock and the 3184 * filemap invalidate lock. Check that they are indeed enough. 3185 */ 3186 if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) { 3187 scoped_guard(read_lock, &kvm->mmu_lock) { 3188 if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) { 3189 ret = -EIO; 3190 goto out; 3191 } 3192 } 3193 } 3194 3195 ret = 0; 3196 err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn), 3197 src_page, &entry, &level_state); 3198 if (err) { 3199 ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO; 3200 goto out; 3201 } 3202 3203 if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) 3204 atomic64_dec(&kvm_tdx->nr_premapped); 3205 3206 if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) { 3207 for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) { 3208 err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, 3209 &level_state); 3210 if (err) { 3211 ret = -EIO; 3212 break; 3213 } 3214 } 3215 } 3216 3217 out: 3218 put_page(src_page); 3219 return ret; 3220 } 3221 3222 static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) 3223 { 3224 struct vcpu_tdx *tdx = to_tdx(vcpu); 3225 struct kvm *kvm = vcpu->kvm; 3226 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 3227 struct kvm_tdx_init_mem_region region; 3228 struct tdx_gmem_post_populate_arg arg; 3229 long gmem_ret; 3230 int ret; 3231 3232 if (tdx->state != VCPU_TD_STATE_INITIALIZED) 3233 return -EINVAL; 3234 3235 guard(mutex)(&kvm->slots_lock); 3236 3237 /* Once TD is finalized, the initial guest memory is fixed. 
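New memory regions can therefore only be added before KVM_TDX_FINALIZE_VM; reject the ioctl once the TD is runnable.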
*/ 3238 if (kvm_tdx->state == TD_STATE_RUNNABLE) 3239 return -EINVAL; 3240 3241 if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION) 3242 return -EINVAL; 3243 3244 if (copy_from_user(®ion, u64_to_user_ptr(cmd->data), sizeof(region))) 3245 return -EFAULT; 3246 3247 if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) || 3248 !region.nr_pages || 3249 region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa || 3250 !vt_is_tdx_private_gpa(kvm, region.gpa) || 3251 !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1)) 3252 return -EINVAL; 3253 3254 kvm_mmu_reload(vcpu); 3255 ret = 0; 3256 while (region.nr_pages) { 3257 if (signal_pending(current)) { 3258 ret = -EINTR; 3259 break; 3260 } 3261 3262 arg = (struct tdx_gmem_post_populate_arg) { 3263 .vcpu = vcpu, 3264 .flags = cmd->flags, 3265 }; 3266 gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa), 3267 u64_to_user_ptr(region.source_addr), 3268 1, tdx_gmem_post_populate, &arg); 3269 if (gmem_ret < 0) { 3270 ret = gmem_ret; 3271 break; 3272 } 3273 3274 if (gmem_ret != 1) { 3275 ret = -EIO; 3276 break; 3277 } 3278 3279 region.source_addr += PAGE_SIZE; 3280 region.gpa += PAGE_SIZE; 3281 region.nr_pages--; 3282 3283 cond_resched(); 3284 } 3285 3286 if (copy_to_user(u64_to_user_ptr(cmd->data), ®ion, sizeof(region))) 3287 ret = -EFAULT; 3288 return ret; 3289 } 3290 3291 int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) 3292 { 3293 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 3294 struct kvm_tdx_cmd cmd; 3295 int ret; 3296 3297 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) 3298 return -EINVAL; 3299 3300 if (copy_from_user(&cmd, argp, sizeof(cmd))) 3301 return -EFAULT; 3302 3303 if (cmd.hw_error) 3304 return -EINVAL; 3305 3306 switch (cmd.id) { 3307 case KVM_TDX_INIT_VCPU: 3308 ret = tdx_vcpu_init(vcpu, &cmd); 3309 break; 3310 case KVM_TDX_INIT_MEM_REGION: 3311 ret = tdx_vcpu_init_mem_region(vcpu, &cmd); 3312 break; 3313 case KVM_TDX_GET_CPUID: 3314 ret = tdx_vcpu_get_cpuid(vcpu, &cmd); 3315 break; 3316 default: 3317 ret = -EINVAL; 3318 break; 3319 } 3320 3321 return ret; 3322 } 3323 3324 int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) 3325 { 3326 return PG_LEVEL_4K; 3327 } 3328 3329 static int tdx_online_cpu(unsigned int cpu) 3330 { 3331 unsigned long flags; 3332 int r; 3333 3334 /* Sanity check CPU is already in post-VMXON */ 3335 WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE)); 3336 3337 local_irq_save(flags); 3338 r = tdx_cpu_enable(); 3339 local_irq_restore(flags); 3340 3341 return r; 3342 } 3343 3344 static int tdx_offline_cpu(unsigned int cpu) 3345 { 3346 int i; 3347 3348 /* No TD is running. Allow any cpu to be offline. */ 3349 if (!atomic_read(&nr_configured_hkid)) 3350 return 0; 3351 3352 /* 3353 * In order to reclaim TDX HKID, (i.e. when deleting guest TD), need to 3354 * call TDH.PHYMEM.PAGE.WBINVD on all packages to program all memory 3355 * controller with pconfig. If we have active TDX HKID, refuse to 3356 * offline the last online cpu. 3357 */ 3358 for_each_online_cpu(i) { 3359 /* 3360 * Found another online cpu on the same package. 3361 * Allow to offline. 3362 */ 3363 if (i != cpu && topology_physical_package_id(i) == 3364 topology_physical_package_id(cpu)) 3365 return 0; 3366 } 3367 3368 /* 3369 * This is the last cpu of this package. Don't offline it. 3370 * 3371 * Because it's hard for human operator to understand the 3372 * reason, warn it. 
3373 */ 3374 #define MSG_ALLPKG_ONLINE \ 3375 "TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n" 3376 pr_warn_ratelimited(MSG_ALLPKG_ONLINE); 3377 return -EBUSY; 3378 } 3379 3380 static void __do_tdx_cleanup(void) 3381 { 3382 /* 3383 * Once TDX module is initialized, it cannot be disabled and 3384 * re-initialized again w/o runtime update (which isn't 3385 * supported by kernel). Only need to remove the cpuhp here. 3386 * The TDX host core code tracks TDX status and can handle 3387 * 'multiple enabling' scenario. 3388 */ 3389 WARN_ON_ONCE(!tdx_cpuhp_state); 3390 cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state); 3391 tdx_cpuhp_state = 0; 3392 } 3393 3394 static void __tdx_cleanup(void) 3395 { 3396 cpus_read_lock(); 3397 __do_tdx_cleanup(); 3398 cpus_read_unlock(); 3399 } 3400 3401 static int __init __do_tdx_bringup(void) 3402 { 3403 int r; 3404 3405 /* 3406 * TDX-specific cpuhp callback to call tdx_cpu_enable() on all 3407 * online CPUs before calling tdx_enable(), and on any new 3408 * going-online CPU to make sure it is ready for TDX guest. 3409 */ 3410 r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN, 3411 "kvm/cpu/tdx:online", 3412 tdx_online_cpu, tdx_offline_cpu); 3413 if (r < 0) 3414 return r; 3415 3416 tdx_cpuhp_state = r; 3417 3418 r = tdx_enable(); 3419 if (r) 3420 __do_tdx_cleanup(); 3421 3422 return r; 3423 } 3424 3425 static int __init __tdx_bringup(void) 3426 { 3427 const struct tdx_sys_info_td_conf *td_conf; 3428 int r, i; 3429 3430 for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) { 3431 /* 3432 * Check if MSRs (tdx_uret_msrs) can be saved/restored 3433 * before returning to user space. 3434 * 3435 * this_cpu_ptr(user_return_msrs)->registered isn't checked 3436 * because the registration is done at vcpu runtime by 3437 * tdx_user_return_msr_update_cache(). 3438 */ 3439 tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr); 3440 if (tdx_uret_msrs[i].slot == -1) { 3441 /* If any MSR isn't supported, it is a KVM bug */ 3442 pr_err("MSR %x isn't included by kvm_find_user_return_msr\n", 3443 tdx_uret_msrs[i].msr); 3444 return -EIO; 3445 } 3446 } 3447 3448 /* 3449 * Enabling TDX requires enabling hardware virtualization first, 3450 * as making SEAMCALLs requires CPU being in post-VMXON state. 3451 */ 3452 r = kvm_enable_virtualization(); 3453 if (r) 3454 return r; 3455 3456 cpus_read_lock(); 3457 r = __do_tdx_bringup(); 3458 cpus_read_unlock(); 3459 3460 if (r) 3461 goto tdx_bringup_err; 3462 3463 /* Get TDX global information for later use */ 3464 tdx_sysinfo = tdx_get_sysinfo(); 3465 if (WARN_ON_ONCE(!tdx_sysinfo)) { 3466 r = -EINVAL; 3467 goto get_sysinfo_err; 3468 } 3469 3470 /* Check TDX module and KVM capabilities */ 3471 if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) || 3472 !tdx_get_supported_xfam(&tdx_sysinfo->td_conf)) 3473 goto get_sysinfo_err; 3474 3475 if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)) 3476 goto get_sysinfo_err; 3477 3478 /* 3479 * TDX has its own limit of maximum vCPUs it can support for all 3480 * TDX guests in addition to KVM_MAX_VCPUS. Userspace needs to 3481 * query TDX guest's maximum vCPUs by checking KVM_CAP_MAX_VCPU 3482 * extension on per-VM basis. 3483 * 3484 * TDX module reports such limit via the MAX_VCPU_PER_TD global 3485 * metadata. Different modules may report different values. 3486 * Some old module may also not support this metadata (in which 3487 * case this limit is U16_MAX). 
3488 * 3489 * In practice, the reported value reflects the maximum logical 3490 * CPUs that ALL the platforms that the module supports can 3491 * possibly have. 3492 * 3493 * Simply forwarding the MAX_VCPU_PER_TD to userspace could 3494 * result in an unpredictable ABI. KVM instead always advertise 3495 * the number of logical CPUs the platform has as the maximum 3496 * vCPUs for TDX guests. 3497 * 3498 * Make sure MAX_VCPU_PER_TD reported by TDX module is not 3499 * smaller than the number of logical CPUs, otherwise KVM will 3500 * report an unsupported value to userspace. 3501 * 3502 * Note, a platform with TDX enabled in the BIOS cannot support 3503 * physical CPU hotplug, and TDX requires the BIOS has marked 3504 * all logical CPUs in MADT table as enabled. Just use 3505 * num_present_cpus() for the number of logical CPUs. 3506 */ 3507 td_conf = &tdx_sysinfo->td_conf; 3508 if (td_conf->max_vcpus_per_td < num_present_cpus()) { 3509 pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n", 3510 td_conf->max_vcpus_per_td, num_present_cpus()); 3511 r = -EINVAL; 3512 goto get_sysinfo_err; 3513 } 3514 3515 if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) { 3516 r = -EINVAL; 3517 goto get_sysinfo_err; 3518 } 3519 3520 /* 3521 * Leave hardware virtualization enabled after TDX is enabled 3522 * successfully. TDX CPU hotplug depends on this. 3523 */ 3524 return 0; 3525 3526 get_sysinfo_err: 3527 __tdx_cleanup(); 3528 tdx_bringup_err: 3529 kvm_disable_virtualization(); 3530 return r; 3531 } 3532 3533 void tdx_cleanup(void) 3534 { 3535 if (enable_tdx) { 3536 misc_cg_set_capacity(MISC_CG_RES_TDX, 0); 3537 __tdx_cleanup(); 3538 kvm_disable_virtualization(); 3539 } 3540 } 3541 3542 int __init tdx_bringup(void) 3543 { 3544 int r, i; 3545 3546 /* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */ 3547 for_each_possible_cpu(i) 3548 INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i)); 3549 3550 if (!enable_tdx) 3551 return 0; 3552 3553 if (!enable_ept) { 3554 pr_err("EPT is required for TDX\n"); 3555 goto success_disable_tdx; 3556 } 3557 3558 if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) { 3559 pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n"); 3560 goto success_disable_tdx; 3561 } 3562 3563 if (!enable_apicv) { 3564 pr_err("APICv is required for TDX\n"); 3565 goto success_disable_tdx; 3566 } 3567 3568 if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) { 3569 pr_err("tdx: OSXSAVE is required for TDX\n"); 3570 goto success_disable_tdx; 3571 } 3572 3573 if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) { 3574 pr_err("tdx: MOVDIR64B is required for TDX\n"); 3575 goto success_disable_tdx; 3576 } 3577 3578 if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) { 3579 pr_err("Self-snoop is required for TDX\n"); 3580 goto success_disable_tdx; 3581 } 3582 3583 if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) { 3584 pr_err("tdx: no TDX private KeyIDs available\n"); 3585 goto success_disable_tdx; 3586 } 3587 3588 if (!enable_virt_at_load) { 3589 pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n"); 3590 goto success_disable_tdx; 3591 } 3592 3593 /* 3594 * Ideally KVM should probe whether TDX module has been loaded 3595 * first and then try to bring it up. But TDX needs to use SEAMCALL 3596 * to probe whether the module is loaded (there is no CPUID or MSR 3597 * for that), and making SEAMCALL requires enabling virtualization 3598 * first, just like the rest steps of bringing up TDX module. 
* 3599 * 3600 So, for simplicity, do everything in __tdx_bringup(); the first 3601 * SEAMCALL will return -ENODEV when the module is not loaded. The 3602 * only complication is having to make sure that initialization 3603 * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other 3604 * cases. 3605 */ 3606 r = __tdx_bringup(); 3607 if (r) { 3608 /* 3609 * Disable TDX, but don't fail loading the KVM module, if 3610 * the TDX module could not be loaded. No need to print a 3611 * message saying "module is not loaded" because one was already 3612 * printed when the first SEAMCALL failed. 3613 */ 3614 if (r == -ENODEV) 3615 goto success_disable_tdx; 3616 3617 enable_tdx = 0; 3618 } 3619 3620 return r; 3621 3622 success_disable_tdx: 3623 enable_tdx = 0; 3624 return 0; 3625 } 3626