1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/cleanup.h> 3 #include <linux/cpu.h> 4 #include <asm/cpufeature.h> 5 #include <asm/fpu/xcr.h> 6 #include <linux/misc_cgroup.h> 7 #include <linux/mmu_context.h> 8 #include <asm/tdx.h> 9 #include "capabilities.h" 10 #include "mmu.h" 11 #include "x86_ops.h" 12 #include "lapic.h" 13 #include "tdx.h" 14 #include "vmx.h" 15 #include "mmu/spte.h" 16 #include "common.h" 17 #include "posted_intr.h" 18 #include "irq.h" 19 #include <trace/events/kvm.h> 20 #include "trace.h" 21 22 #pragma GCC poison to_vmx 23 24 #undef pr_fmt 25 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 26 27 #define __TDX_BUG_ON(__err, __f, __kvm, __fmt, __args...) \ 28 ({ \ 29 struct kvm *_kvm = (__kvm); \ 30 bool __ret = !!(__err); \ 31 \ 32 if (WARN_ON_ONCE(__ret && (!_kvm || !_kvm->vm_bugged))) { \ 33 if (_kvm) \ 34 kvm_vm_bugged(_kvm); \ 35 pr_err_ratelimited("SEAMCALL " __f " failed: 0x%llx" __fmt "\n",\ 36 __err, __args); \ 37 } \ 38 unlikely(__ret); \ 39 }) 40 41 #define TDX_BUG_ON(__err, __fn, __kvm) \ 42 __TDX_BUG_ON(__err, #__fn, __kvm, "%s", "") 43 44 #define TDX_BUG_ON_1(__err, __fn, a1, __kvm) \ 45 __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx", a1) 46 47 #define TDX_BUG_ON_2(__err, __fn, a1, a2, __kvm) \ 48 __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx", a1, a2) 49 50 #define TDX_BUG_ON_3(__err, __fn, a1, a2, a3, __kvm) \ 51 __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 ", 0x%llx, " #a3 " 0x%llx", \ 52 a1, a2, a3) 53 54 55 bool enable_tdx __ro_after_init; 56 module_param_named(tdx, enable_tdx, bool, 0444); 57 58 #define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51)) 59 #define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47)) 60 61 static enum cpuhp_state tdx_cpuhp_state; 62 63 static const struct tdx_sys_info *tdx_sysinfo; 64 65 void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err) 66 { 67 KVM_BUG_ON(1, tdx->vcpu.kvm); 68 pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, 
err); 69 } 70 71 void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field, 72 u64 val, u64 err) 73 { 74 KVM_BUG_ON(1, tdx->vcpu.kvm); 75 pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err); 76 } 77 78 #define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE) 79 80 static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm) 81 { 82 return container_of(kvm, struct kvm_tdx, kvm); 83 } 84 85 static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu) 86 { 87 return container_of(vcpu, struct vcpu_tdx, vcpu); 88 } 89 90 static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf) 91 { 92 u64 val = KVM_SUPPORTED_TD_ATTRS; 93 94 if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1) 95 return 0; 96 97 val &= td_conf->attributes_fixed0; 98 99 return val; 100 } 101 102 static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf) 103 { 104 u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss; 105 106 if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1) 107 return 0; 108 109 val &= td_conf->xfam_fixed0; 110 111 return val; 112 } 113 114 static int tdx_get_guest_phys_addr_bits(const u32 eax) 115 { 116 return (eax & GENMASK(23, 16)) >> 16; 117 } 118 119 static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits) 120 { 121 return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16; 122 } 123 124 #define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM)) 125 126 static bool has_tsx(const struct kvm_cpuid_entry2 *entry) 127 { 128 return entry->function == 7 && entry->index == 0 && 129 (entry->ebx & TDX_FEATURE_TSX); 130 } 131 132 static void clear_tsx(struct kvm_cpuid_entry2 *entry) 133 { 134 entry->ebx &= ~TDX_FEATURE_TSX; 135 } 136 137 static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry) 138 { 139 return entry->function == 7 && entry->index == 0 && 140 (entry->ecx & 
__feature_bit(X86_FEATURE_WAITPKG)); 141 } 142 143 static void clear_waitpkg(struct kvm_cpuid_entry2 *entry) 144 { 145 entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG); 146 } 147 148 static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry) 149 { 150 if (has_tsx(entry)) 151 clear_tsx(entry); 152 153 if (has_waitpkg(entry)) 154 clear_waitpkg(entry); 155 } 156 157 static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry) 158 { 159 return has_tsx(entry) || has_waitpkg(entry); 160 } 161 162 #define KVM_TDX_CPUID_NO_SUBLEAF ((__u32)-1) 163 164 static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx) 165 { 166 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; 167 168 entry->function = (u32)td_conf->cpuid_config_leaves[idx]; 169 entry->index = td_conf->cpuid_config_leaves[idx] >> 32; 170 entry->eax = (u32)td_conf->cpuid_config_values[idx][0]; 171 entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32; 172 entry->ecx = (u32)td_conf->cpuid_config_values[idx][1]; 173 entry->edx = td_conf->cpuid_config_values[idx][1] >> 32; 174 175 if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF) 176 entry->index = 0; 177 178 /* 179 * The TDX module doesn't allow configuring the guest phys addr bits 180 * (EAX[23:16]). However, KVM uses it as an interface to the userspace 181 * to configure the GPAW. Report these bits as configurable. 
182 */ 183 if (entry->function == 0x80000008) 184 entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff); 185 186 tdx_clear_unsupported_cpuid(entry); 187 } 188 189 #define TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT BIT(1) 190 191 static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf, 192 struct kvm_tdx_capabilities *caps) 193 { 194 int i; 195 196 caps->supported_attrs = tdx_get_supported_attrs(td_conf); 197 if (!caps->supported_attrs) 198 return -EIO; 199 200 caps->supported_xfam = tdx_get_supported_xfam(td_conf); 201 if (!caps->supported_xfam) 202 return -EIO; 203 204 caps->cpuid.nent = td_conf->num_cpuid_config; 205 206 caps->user_tdvmcallinfo_1_r11 = 207 TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT; 208 209 for (i = 0; i < td_conf->num_cpuid_config; i++) 210 td_init_cpuid_entry2(&caps->cpuid.entries[i], i); 211 212 return 0; 213 } 214 215 /* 216 * Some SEAMCALLs acquire the TDX module globally, and can fail with 217 * TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs. 218 */ 219 static DEFINE_MUTEX(tdx_lock); 220 221 static atomic_t nr_configured_hkid; 222 223 static bool tdx_operand_busy(u64 err) 224 { 225 return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY; 226 } 227 228 229 /* 230 * A per-CPU list of TD vCPUs associated with a given CPU. 231 * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU 232 * list. 233 * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of 234 * the old CPU during the IPI callback running on the old CPU, and then added 235 * to the per-CPU list of the new CPU. 236 * - When a TD is tearing down, all vCPUs are disassociated from their current 237 * running CPUs and removed from the per-CPU list during the IPI callback 238 * running on those CPUs. 239 * - When a CPU is brought down, traverse the per-CPU list to disassociate all 240 * associated TD vCPUs and remove them from the per-CPU list. 
 */
static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);

/* TDVMCALL exit type lives in guest R10 (0 == standard TDVMCALL). */
static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu)
{
	return to_tdx(vcpu)->vp_enter_args.r10;
}

/* TDVMCALL leaf (sub-function) number lives in guest R11. */
static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu)
{
	return to_tdx(vcpu)->vp_enter_args.r11;
}

/* Return code back to the guest goes in R10. */
static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu,
						     long val)
{
	to_tdx(vcpu)->vp_enter_args.r10 = val;
}

/* Return value back to the guest goes in R11. */
static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu,
						    unsigned long val)
{
	to_tdx(vcpu)->vp_enter_args.r11 = val;
}

/* Release the TD's private host key ID and undo the misc-cgroup charge. */
static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
{
	tdx_guest_keyid_free(kvm_tdx->hkid);
	kvm_tdx->hkid = -1;
	atomic_dec(&nr_configured_hkid);
	misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
	put_misc_cg(kvm_tdx->misc_cg);
	kvm_tdx->misc_cg = NULL;
}

static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
{
	return kvm_tdx->hkid > 0;
}

/*
 * Remove the vCPU from the per-CPU associated list and mark it unassociated.
 * Must run on the CPU the vCPU is currently associated with, IRQs disabled.
 */
static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
{
	lockdep_assert_irqs_disabled();

	list_del(&to_tdx(vcpu)->cpu_list);

	/*
	 * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
	 * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
	 * to its list before it's deleted from this CPU's list.
	 */
	smp_wmb();

	vcpu->cpu = -1;
}

/*
 * Execute a SEAMCALL related to removing/blocking S-EPT entries, with a single
 * retry (if necessary) after forcing vCPUs to exit and wait for the operation
 * to complete. All flows that remove/block S-EPT entries run with mmu_lock
 * held for write, i.e. are mutually exclusive with each other, but they aren't
 * mutually exclusive with running vCPUs, and so can fail with "operand busy"
 * if a vCPU acquires a relevant lock in the TDX-Module, e.g. when doing TDCALL.
 *
 * Note, the retry is guaranteed to succeed, absent KVM and/or TDX-Module bugs.
 */
#define tdh_do_no_vcpus(tdh_func, kvm, args...)				\
({									\
	struct kvm_tdx *__kvm_tdx = to_kvm_tdx(kvm);			\
	u64 __err;							\
									\
	lockdep_assert_held_write(&kvm->mmu_lock);			\
									\
	__err = tdh_func(args);						\
	if (unlikely(tdx_operand_busy(__err))) {			\
		WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, true);		\
		kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); \
									\
		__err = tdh_func(args);					\
									\
		WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, false);	\
	}								\
	__err;								\
})

/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
static int __tdx_reclaim_page(struct page *page)
{
	u64 err, rcx, rdx, r8;

	err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8);

	/*
	 * No need to check for TDX_OPERAND_BUSY; all TD pages are freed
	 * before the HKID is released and control pages have also been
	 * released at this point, so there is no possibility of contention.
	 */
	if (TDX_BUG_ON_3(err, TDH_PHYMEM_PAGE_RECLAIM, rcx, rdx, r8, NULL))
		return -EIO;

	return 0;
}

/* Reclaim a page and, on success, scrub it per the TDX erratum quirk. */
static int tdx_reclaim_page(struct page *page)
{
	int r;

	r = __tdx_reclaim_page(page);
	if (!r)
		tdx_quirk_reset_page(page);
	return r;
}


/*
 * Reclaim the TD control page(s) which are crypto-protected by TDX guest's
 * private KeyID. Assume the cache associated with the TDX private KeyID has
 * been flushed.
 */
static void tdx_reclaim_control_page(struct page *ctrl_page)
{
	/*
	 * Leak the page if the kernel failed to reclaim the page.
	 * The kernel cannot use it safely anymore.
	 */
	if (tdx_reclaim_page(ctrl_page))
		return;

	__free_page(ctrl_page);
}

/* IPI payload for tdx_flush_vp(); err reports the TDH.VP.FLUSH result. */
struct tdx_flush_vp_arg {
	struct kvm_vcpu *vcpu;
	u64 err;
};

/*
 * IPI callback: flush the vCPU's TD state from this CPU and disassociate it.
 * Runs with IRQs disabled on the CPU the vCPU was last run on.
 */
static void tdx_flush_vp(void *_arg)
{
	struct tdx_flush_vp_arg *arg = _arg;
	struct kvm_vcpu *vcpu = arg->vcpu;
	u64 err;

	arg->err = 0;
	lockdep_assert_irqs_disabled();

	/* Task migration can race with CPU offlining. */
	if (unlikely(vcpu->cpu != raw_smp_processor_id()))
		return;

	/*
	 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The
	 * list tracking still needs to be updated so that it's correct if/when
	 * the vCPU does get initialized.
	 */
	if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) {
		/*
		 * No need to retry. TDX Resources needed for TDH.VP.FLUSH are:
		 * TDVPR as exclusive, TDR as shared, and TDCS as shared. This
		 * vp flush function is called when destructing vCPU/TD or vCPU
		 * migration. No other thread uses TDVPR in those cases.
		 */
		err = tdh_vp_flush(&to_tdx(vcpu)->vp);
		if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) {
			/*
			 * This function is called in IPI context. Do not use
			 * printk to avoid console semaphore.
			 * The caller prints out the error message, instead.
			 */
			/* NOTE(review): inner check is redundant, err is known non-zero here. */
			if (err)
				arg->err = err;
		}
	}

	tdx_disassociate_vp(vcpu);
}

/* Flush @vcpu's TD state on its associated CPU via IPI (no-op if none). */
static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
{
	struct tdx_flush_vp_arg arg = {
		.vcpu = vcpu,
	};
	int cpu = vcpu->cpu;

	if (unlikely(cpu == -1))
		return;

	smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);

	TDX_BUG_ON(arg.err, TDH_VP_FLUSH, vcpu->kvm);
}

/* CPU-offline hook: disassociate every TD vCPU on this CPU, then flush caches. */
void tdx_disable_virtualization_cpu(void)
{
	int cpu = raw_smp_processor_id();
	struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
	struct tdx_flush_vp_arg arg;
	struct vcpu_tdx *tdx, *tmp;
	unsigned long flags;

	local_irq_save(flags);
	/* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
	list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) {
		arg.vcpu = &tdx->vcpu;
		tdx_flush_vp(&arg);
	}
	local_irq_restore(flags);

	/*
	 * Flush cache now if kexec is possible: this is necessary to avoid
	 * having dirty private memory cachelines when the new kernel boots,
	 * but WBINVD is a relatively expensive operation and doing it during
	 * kexec can exacerbate races in native_stop_other_cpus(). Do it
	 * now, since this is a safe moment and there is going to be no more
	 * TDX activity on this CPU from this point on.
	 */
	tdx_cpu_flush_cache_for_kexec();
}

#define TDX_SEAMCALL_RETRIES 10000

/* Per-CPU worker: write back caches for all TDX private KeyIDs. */
static void smp_func_do_phymem_cache_wb(void *unused)
{
	u64 err = 0;
	bool resume;
	int i;

	/*
	 * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private
	 * KeyID on the package or core. The TDX module may not finish the
	 * cache flush but return TDX_INTERRUPTED_RESUMABLE instead. The
	 * kernel should retry it until it returns success w/o rescheduling.
	 */
	for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
		resume = !!err;	/* resume the interrupted write-back on retry */
		err = tdh_phymem_cache_wb(resume);
		switch (err) {
		case TDX_INTERRUPTED_RESUMABLE:
			continue;
		case TDX_NO_HKID_READY_TO_WBCACHE:
			err = TDX_SUCCESS; /* Already done by other thread */
			fallthrough;
		default:
			goto out;
		}
	}

out:
	TDX_BUG_ON(err, TDH_PHYMEM_CACHE_WB, NULL);
}

/*
 * Release the TD's HKID: flush all vCPUs, write back caches on (at least) one
 * CPU per package, and free the KeyID via TDH.MNG.KEY.FREEID.  On failure the
 * HKID is leaked and TD pages can never be reclaimed.
 */
void tdx_mmu_release_hkid(struct kvm *kvm)
{
	bool packages_allocated, targets_allocated;
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	cpumask_var_t packages, targets;
	struct kvm_vcpu *vcpu;
	unsigned long j;
	int i;
	u64 err;

	if (!is_hkid_assigned(kvm_tdx))
		return;

	packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
	targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
	cpus_read_lock();

	kvm_for_each_vcpu(j, vcpu, kvm)
		tdx_flush_vp_on_cpu(vcpu);

	/*
	 * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock
	 * and can fail with TDX_OPERAND_BUSY when it fails to get the lock.
	 * Multiple TDX guests can be destroyed simultaneously. Take the
	 * mutex to prevent it from getting error.
	 */
	mutex_lock(&tdx_lock);

	/*
	 * Releasing HKID is in vm_destroy().
	 * After the above flushing vps, there should be no more vCPU
	 * associations, as all vCPU fds have been released at this stage.
	 */
	err = tdh_mng_vpflushdone(&kvm_tdx->td);
	if (err == TDX_FLUSHVP_NOT_DONE)
		goto out;
	if (TDX_BUG_ON(err, TDH_MNG_VPFLUSHDONE, kvm)) {
		pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
		       kvm_tdx->hkid);
		goto out;
	}

	/* One write-back per package suffices; fall back to all CPUs if OOM. */
	for_each_online_cpu(i) {
		if (packages_allocated &&
		    cpumask_test_and_set_cpu(topology_physical_package_id(i),
					     packages))
			continue;
		if (targets_allocated)
			cpumask_set_cpu(i, targets);
	}
	if (targets_allocated)
		on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true);
	else
		on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true);
	/*
	 * In the case of error in smp_func_do_phymem_cache_wb(), the following
	 * tdh_mng_key_freeid() will fail.
	 */
	err = tdh_mng_key_freeid(&kvm_tdx->td);
	if (TDX_BUG_ON(err, TDH_MNG_KEY_FREEID, kvm)) {
		pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
		       kvm_tdx->hkid);
	} else {
		tdx_hkid_free(kvm_tdx);
	}

out:
	mutex_unlock(&tdx_lock);
	cpus_read_unlock();
	free_cpumask_var(targets);
	free_cpumask_var(packages);
}

/* Reclaim and free the TDCS pages and the TDR page of a destroyed TD. */
static void tdx_reclaim_td_control_pages(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	u64 err;
	int i;

	/*
	 * tdx_mmu_release_hkid() failed to reclaim HKID. Something went wrong
	 * heavily with TDX module. Give up freeing TD pages. As the function
	 * already warned, don't warn it again.
	 */
	if (is_hkid_assigned(kvm_tdx))
		return;

	if (kvm_tdx->td.tdcs_pages) {
		for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
			if (!kvm_tdx->td.tdcs_pages[i])
				continue;

			tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]);
		}
		kfree(kvm_tdx->td.tdcs_pages);
		kvm_tdx->td.tdcs_pages = NULL;
	}

	if (!kvm_tdx->td.tdr_page)
		return;

	if (__tdx_reclaim_page(kvm_tdx->td.tdr_page))
		return;

	/*
	 * Use a SEAMCALL to ask the TDX module to flush the cache based on the
	 * KeyID. TDX module may access TDR while operating on TD (Especially
	 * when it is reclaiming TDCS).
	 */
	err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
	if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
		return;

	tdx_quirk_reset_page(kvm_tdx->td.tdr_page);

	__free_page(kvm_tdx->td.tdr_page);
	kvm_tdx->td.tdr_page = NULL;
}

void tdx_vm_destroy(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	tdx_reclaim_control_pages(kvm);

	kvm_tdx->state = TD_STATE_UNINITIALIZED;
}

/* Per-package worker for TDH.MNG.KEY.CONFIG during TD creation. */
static int tdx_do_tdh_mng_key_config(void *param)
{
	struct kvm_tdx *kvm_tdx = param;
	u64 err;

	/* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
	err = tdh_mng_key_config(&kvm_tdx->td);
	if (TDX_BUG_ON(err, TDH_MNG_KEY_CONFIG, &kvm_tdx->kvm))
		return -EIO;

	return 0;
}

int tdx_vm_init(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	kvm->arch.has_protected_state = true;
	/*
	 * TDX Module doesn't allow the hypervisor to modify the EOI-bitmap,
	 * i.e. all EOIs are accelerated and never trigger exits.
	 */
	kvm->arch.has_protected_eoi = true;
	kvm->arch.has_private_mem = true;
	kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;

	/*
	 * Because guest TD is protected, VMM can't parse the instruction in TD.
	 * Instead, guest uses MMIO hypercall. For unmodified device driver,
	 * #VE needs to be injected for MMIO and #VE handler in TD converts MMIO
	 * instruction into MMIO hypercall.
	 *
	 * SPTE value for MMIO needs to be setup so that #VE is injected into
	 * TD instead of triggering EPT MISCONFIG.
	 * - RWX=0 so that EPT violation is triggered.
	 * - suppress #VE bit is cleared to inject #VE.
	 */
	kvm_mmu_set_mmio_spte_value(kvm, 0);

	/*
	 * TDX has its own limit of maximum vCPUs it can support for all
	 * TDX guests in addition to KVM_MAX_VCPUS. TDX module reports
	 * such limit via the MAX_VCPU_PER_TD global metadata. In
	 * practice, it reflects the number of logical CPUs that ALL
	 * platforms that the TDX module supports can possibly have.
	 *
	 * Limit TDX guest's maximum vCPUs to the number of logical CPUs
	 * the platform has. Simply forwarding the MAX_VCPU_PER_TD to
	 * userspace would result in an unpredictable ABI.
	 */
	kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus());

	kvm_tdx->state = TD_STATE_UNINITIALIZED;

	return 0;
}

int tdx_vcpu_create(struct kvm_vcpu *vcpu)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	/* vCPUs may only be created once the TD itself is initialized. */
	if (kvm_tdx->state != TD_STATE_INITIALIZED)
		return -EIO;

	/*
	 * TDX module mandates APICv, which requires an in-kernel local APIC.
	 * Disallow an in-kernel I/O APIC, because level-triggered interrupts
	 * and thus the I/O APIC as a whole can't be faithfully emulated in KVM.
	 */
	if (!irqchip_split(vcpu->kvm))
		return -EINVAL;

	fpstate_set_confidential(&vcpu->arch.guest_fpu);
	vcpu->arch.apic->guest_apic_protected = true;
	INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list);

	vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;

	vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
	vcpu->arch.cr0_guest_owned_bits = -1ul;
	vcpu->arch.cr4_guest_owned_bits = -1ul;

	/* KVM can't change TSC offset/multiplier as TDX module manages them. */
	vcpu->arch.guest_tsc_protected = true;
	vcpu->arch.tsc_offset = kvm_tdx->tsc_offset;
	vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
	vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
	vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier;

	/* Guest state is accessible only for debug-attribute TDs. */
	vcpu->arch.guest_state_protected =
		!(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG);

	if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE)
		vcpu->arch.xfd_no_write_intercept = true;

	tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
	__pi_set_sn(&tdx->vt.pi_desc);

	tdx->state = VCPU_TD_STATE_UNINITIALIZED;

	return 0;
}

/* Associate the vCPU with @cpu, flushing state from the previous CPU first. */
void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	vmx_vcpu_pi_load(vcpu, cpu);
	if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
		return;

	tdx_flush_vp_on_cpu(vcpu);

	KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm);
	local_irq_disable();
	/*
	 * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
	 * vcpu->cpu is read before tdx->cpu_list.
	 */
	smp_rmb();

	list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
	local_irq_enable();
}

bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
{
	/*
	 * KVM can't get the interrupt status of TDX guest and it assumes
	 * interrupt is always allowed unless TDX guest calls TDVMCALL with HLT,
	 * which passes the interrupt blocked flag.
	 */
	return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
	       !to_tdx(vcpu)->vp_enter_args.r12;
}

static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
	u64 vcpu_state_details;

	if (pi_has_pending_interrupt(vcpu))
		return true;

	/*
	 * Only check RVI pending for HALTED case with IRQ enabled.
	 * For non-HLT cases, KVM doesn't care about STI/SS shadows. And if the
	 * interrupt was pending before TD exit, then it _must_ be blocked,
	 * otherwise the interrupt would have been serviced at the instruction
	 * boundary.
	 */
	if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
	    to_tdx(vcpu)->vp_enter_args.r12)
		return false;

	vcpu_state_details =
		td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH);

	return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
}

/* A user-return MSR the TDX-Module may clobber on TD-Exit. */
struct tdx_uret_msr {
	u32 msr;
	unsigned int slot;
	u64 defval;	/* value the TDX-Module writes on TD-Exit */
};

static struct tdx_uret_msr tdx_uret_msrs[] = {
	{.msr = MSR_SYSCALL_MASK, .defval = 0x20200 },
	{.msr = MSR_STAR,},
	{.msr = MSR_LSTAR,},
	{.msr = MSR_TSC_AUX,},
};

void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
	struct vcpu_vt *vt = to_vt(vcpu);
	int i;

	if (vt->guest_state_loaded)
		return;

	if (likely(is_64bit_mm(current->mm)))
		vt->msr_host_kernel_gs_base = current->thread.gsbase;
	else
		vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);

	vt->guest_state_loaded = true;

	/*
	 * Explicitly set user-return MSRs that are clobbered by the TDX-Module
	 * if VP.ENTER succeeds, i.e. on TD-Exit, with the values that would be
	 * written by the TDX-Module. Don't rely on the TDX-Module to actually
	 * clobber the MSRs, as the contract is poorly defined and not upheld.
	 * E.g. the TDX-Module will synthesize an EPT Violation without doing
	 * VM-Enter if it suspects a zero-step attack, and never "restore" VMM
	 * state.
	 */
	for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
		kvm_set_user_return_msr(tdx_uret_msrs[i].slot,
					tdx_uret_msrs[i].defval, -1ull);
}

/* Restore host MSR state clobbered while guest state was loaded. */
static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
{
	struct vcpu_vt *vt = to_vt(vcpu);

	if (!vt->guest_state_loaded)
		return;

	++vcpu->stat.host_state_reload;
	wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);

	vt->guest_state_loaded = false;
}

void tdx_vcpu_put(struct kvm_vcpu *vcpu)
{
	vmx_vcpu_pi_put(vcpu);
	tdx_prepare_switch_to_host(vcpu);
}

/*
 * Life cycles for a TD and a vCPU:
 * 1. KVM_CREATE_VM ioctl.
 *    TD state is TD_STATE_UNINITIALIZED.
 *    hkid is not assigned at this stage.
 * 2. KVM_TDX_INIT_VM ioctl.
 *    TD transitions to TD_STATE_INITIALIZED.
 *    hkid is assigned after this stage.
 * 3. KVM_CREATE_VCPU ioctl. (only when TD is TD_STATE_INITIALIZED).
 *    3.1 tdx_vcpu_create() transitions vCPU state to VCPU_TD_STATE_UNINITIALIZED.
 *    3.2 vcpu_load() and vcpu_put() in kvm_arch_vcpu_create().
 *    3.3 (conditional) if any error encountered after kvm_arch_vcpu_create()
 *        kvm_arch_vcpu_destroy() --> tdx_vcpu_free().
 * 4. KVM_TDX_INIT_VCPU ioctl.
 *    tdx_vcpu_init() transitions vCPU state to VCPU_TD_STATE_INITIALIZED.
 *    vCPU control structures are allocated at this stage.
 * 5. kvm_destroy_vm().
 *    5.1 tdx_mmu_release_hkid(): (1) tdh_vp_flush(), disassociates all vCPUs.
 *                                (2) puts hkid to !assigned state.
 *    5.2 kvm_destroy_vcpus() --> tdx_vcpu_free():
 *        transitions vCPU to VCPU_TD_STATE_UNINITIALIZED state.
 *    5.3 tdx_vm_destroy()
 *        transitions TD to TD_STATE_UNINITIALIZED state.
 *
 * tdx_vcpu_free() can be invoked only at 3.3 or 5.2.
 * - If at 3.3, hkid is still assigned, but the vCPU must be in
 *   VCPU_TD_STATE_UNINITIALIZED state.
 * - if at 5.2, hkid must be !assigned and all vCPUs must be in
 *   VCPU_TD_STATE_INITIALIZED state and have been dissociated.
 */
void tdx_vcpu_free(struct kvm_vcpu *vcpu)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	int i;

	/* Still associated with a CPU: case 3.3 above, just flush and bail. */
	if (vcpu->cpu != -1) {
		KVM_BUG_ON(tdx->state == VCPU_TD_STATE_INITIALIZED, vcpu->kvm);
		tdx_flush_vp_on_cpu(vcpu);
		return;
	}

	/*
	 * It is not possible to reclaim pages while hkid is assigned. It might
	 * be assigned if the TD VM is being destroyed but freeing hkid failed,
	 * in which case the pages are leaked.
	 */
	if (is_hkid_assigned(kvm_tdx))
		return;

	if (tdx->vp.tdcx_pages) {
		for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
			if (tdx->vp.tdcx_pages[i])
				tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]);
		}
		kfree(tdx->vp.tdcx_pages);
		tdx->vp.tdcx_pages = NULL;
	}
	if (tdx->vp.tdvpr_page) {
		tdx_reclaim_control_page(tdx->vp.tdvpr_page);
		tdx->vp.tdvpr_page = NULL;
		tdx->vp.tdvpr_pa = 0;
	}

	tdx->state = VCPU_TD_STATE_UNINITIALIZED;
}

/* Refuse to enter the guest unless both the TD and the vCPU are fully set up. */
int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
	if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED ||
		     to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
		return -EINVAL;

	return 1;
}

/* Map a TDVMCALL leaf to the synthetic VMX exit reason KVM handles it as. */
static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
{
	switch (tdvmcall_leaf(vcpu)) {
	case EXIT_REASON_CPUID:
	case EXIT_REASON_HLT:
	case EXIT_REASON_IO_INSTRUCTION:
	case EXIT_REASON_MSR_READ:
	case EXIT_REASON_MSR_WRITE:
		return tdvmcall_leaf(vcpu);
	case EXIT_REASON_EPT_VIOLATION:
		return EXIT_REASON_EPT_MISCONFIG;
	default:
		break;
	}

	return EXIT_REASON_TDCALL;
}

/*
 * Translate the TDH.VP.ENTER return value into a VMX exit reason, or -1u for
 * states that must be handled as a full (non-fastpath) exit.
 */
static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	u32 exit_reason;

	switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) {
	case TDX_SUCCESS:
	case TDX_NON_RECOVERABLE_VCPU:
	case TDX_NON_RECOVERABLE_TD:
	case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE:
	case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE:
		break;
	default:
		return -1u;
	}

	exit_reason = tdx->vp_enter_ret;

	switch (exit_reason) {
	case EXIT_REASON_TDCALL:
		if (tdvmcall_exit_type(vcpu))
			return EXIT_REASON_VMCALL;

		return tdcall_to_vmx_exit_reason(vcpu);
	case EXIT_REASON_EPT_MISCONFIG:
		/*
		 * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in
		 * non-instrumentable code with interrupts disabled.
		 */
		return -1u;
	default:
		break;
	}

	return exit_reason;
}

/* noinstr: runs between guest_state_enter/exit_irqoff, no instrumentation. */
static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	struct vcpu_vt *vt = to_vt(vcpu);

	guest_state_enter_irqoff();

	tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);

	vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu);

	/* Exit details are reported by the TDX-Module in the entry args. */
	vt->exit_qualification = tdx->vp_enter_args.rcx;
	tdx->ext_exit_qualification = tdx->vp_enter_args.rdx;
	tdx->exit_gpa = tdx->vp_enter_args.r8;
	vt->exit_intr_info = tdx->vp_enter_args.r9;

	vmx_handle_nmi(vcpu);

	guest_state_exit_irqoff();
}

static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu)
{
	return vmx_get_exit_reason(vcpu).failed_vmentry &&
	       vmx_get_exit_reason(vcpu).full != -1u;
}

static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
	u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret;

	/*
	 * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation
	 * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER.
	 *
	 * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both
	 * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target
	 * vCPUs leaving fastpath so that interrupt can be enabled to ensure the
	 * IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED instead of
	 * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the
	 * requester may be blocked endlessly.
	 */
	if (unlikely(tdx_operand_busy(vp_enter_ret)))
		return EXIT_FASTPATH_EXIT_HANDLED;

	return EXIT_FASTPATH_NONE;
}

/* Registers the TDX-Module makes available to KVM after TD-Exit. */
#define TDX_REGS_AVAIL_SET	(BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
				 BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
				 BIT_ULL(VCPU_REGS_RAX) | \
				 BIT_ULL(VCPU_REGS_RBX) | \
				 BIT_ULL(VCPU_REGS_RCX) | \
				 BIT_ULL(VCPU_REGS_RDX) | \
				 BIT_ULL(VCPU_REGS_RBP) | \
				 BIT_ULL(VCPU_REGS_RSI) | \
				 BIT_ULL(VCPU_REGS_RDI) | \
				 BIT_ULL(VCPU_REGS_R8) | \
				 BIT_ULL(VCPU_REGS_R9) | \
				 BIT_ULL(VCPU_REGS_R10) | \
				 BIT_ULL(VCPU_REGS_R11) | \
				 BIT_ULL(VCPU_REGS_R12) | \
				 BIT_ULL(VCPU_REGS_R13) | \
				 BIT_ULL(VCPU_REGS_R14) | \
				 BIT_ULL(VCPU_REGS_R15))

/* Restore host PKRU/XCR0/XSS after TD-Exit, which may have clobbered them. */
static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);

	/*
	 * All TDX hosts support PKRU; but even if they didn't,
	 * vcpu->arch.host_pkru would be 0 and the wrpkru would be
	 * skipped.
	 */
	if (vcpu->arch.host_pkru != 0)
		wrpkru(vcpu->arch.host_pkru);

	if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
		xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);

	/*
	 * Likewise, even if a TDX hosts didn't support XSS both arms of
	 * the comparison would be 0 and the wrmsrl would be skipped.
	 */
	if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
		wrmsrl(MSR_IA32_XSS, kvm_host.xss);
}

/* DEBUGCTL bits the TDX-Module preserves across TD entry/exit. */
#define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
				DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
				DEBUGCTLMSR_FREEZE_IN_SMM)

fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	struct vcpu_vt *vt = to_vt(vcpu);

	/*
	 * WARN if KVM wants to force an immediate exit, as the TDX module does
	 * not guarantee entry into the guest, i.e. it's possible for KVM to
	 * _think_ it completed entry to the guest and forced an immediate exit
	 * without actually having done so. Luckily, KVM never needs to force
	 * an immediate exit for TDX (KVM can't do direct event injection), so
	 * just WARN and continue on.
	 */
	WARN_ON_ONCE(run_flags);

	/*
	 * Wait until retry of SEPT-zap-related SEAMCALL completes before
	 * allowing vCPU entry to avoid contention with tdh_vp_enter() and
	 * TDCALLs.
	 */
	if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
		return EXIT_FASTPATH_EXIT_HANDLED;

	trace_kvm_entry(vcpu, run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT);

	/* Notify self of any pending posted interrupt before entering. */
	if (pi_test_on(&vt->pi_desc)) {
		apic->send_IPI_self(POSTED_INTR_VECTOR);

		if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) &
				APIC_VECTOR_MASK, &vt->pi_desc))
			kvm_wait_lapic_expire(vcpu);
	}

	tdx_vcpu_enter_exit(vcpu);

	/* Restore host DEBUGCTL bits the TDX-Module doesn't preserve. */
	if (vcpu->arch.host_debugctl & ~TDX_DEBUGCTL_PRESERVED)
		update_debugctlmsr(vcpu->arch.host_debugctl);

	tdx_load_host_xsave_state(vcpu);

	vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;

	if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG))
		return EXIT_FASTPATH_NONE;

	if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
		return EXIT_FASTPATH_NONE;

	trace_kvm_exit(vcpu, KVM_ISA_VMX);

	if (unlikely(tdx_failed_vmentry(vcpu)))
		return EXIT_FASTPATH_NONE;

	return tdx_exit_handlers_fastpath(vcpu);
}

void tdx_inject_nmi(struct kvm_vcpu *vcpu)
{
	++vcpu->stat.nmi_injections;
	td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1);
	/*
	 * From KVM's perspective, NMI injection is completed right after
	 * writing to PEND_NMI. KVM doesn't care whether an NMI is injected by
	 * the TDX module or not.
	 */
	vcpu->arch.nmi_injected = false;
	/*
	 * TDX doesn't support KVM to request NMI window exit. If there is
	 * still a pending vNMI, KVM is not able to inject it along with the
	 * one pending in TDX module in a back-to-back way. Since the previous
	 * vNMI is still pending in TDX module, i.e. it has not been delivered
	 * to TDX guest yet, it's OK to collapse the pending vNMI into the
	 * previous one. The guest is expected to handle all the NMI sources
	 * when handling the first vNMI.
1139 */ 1140 vcpu->arch.nmi_pending = 0; 1141 } 1142 1143 static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu) 1144 { 1145 u32 intr_info = vmx_get_intr_info(vcpu); 1146 1147 /* 1148 * Machine checks are handled by handle_exception_irqoff(), or by 1149 * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on 1150 * VM-Entry. NMIs are handled by tdx_vcpu_enter_exit(). 1151 */ 1152 if (is_nmi(intr_info) || is_machine_check(intr_info)) 1153 return 1; 1154 1155 vcpu->run->exit_reason = KVM_EXIT_EXCEPTION; 1156 vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK; 1157 vcpu->run->ex.error_code = 0; 1158 1159 return 0; 1160 } 1161 1162 static int complete_hypercall_exit(struct kvm_vcpu *vcpu) 1163 { 1164 tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret); 1165 return 1; 1166 } 1167 1168 static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu) 1169 { 1170 kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10); 1171 kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11); 1172 kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12); 1173 kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13); 1174 kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14); 1175 1176 return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit); 1177 } 1178 1179 /* 1180 * Split into chunks and check interrupt pending between chunks. This allows 1181 * for timely injection of interrupts to prevent issues with guest lockup 1182 * detection. 
1183 */ 1184 #define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024) 1185 static void __tdx_map_gpa(struct vcpu_tdx *tdx); 1186 1187 static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu) 1188 { 1189 struct vcpu_tdx *tdx = to_tdx(vcpu); 1190 1191 if (vcpu->run->hypercall.ret) { 1192 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1193 tdx->vp_enter_args.r11 = tdx->map_gpa_next; 1194 return 1; 1195 } 1196 1197 tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN; 1198 if (tdx->map_gpa_next >= tdx->map_gpa_end) 1199 return 1; 1200 1201 /* 1202 * Stop processing the remaining part if there is a pending interrupt, 1203 * which could be qualified to deliver. Skip checking pending RVI for 1204 * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt(). 1205 */ 1206 if (kvm_vcpu_has_events(vcpu)) { 1207 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY); 1208 tdx->vp_enter_args.r11 = tdx->map_gpa_next; 1209 return 1; 1210 } 1211 1212 __tdx_map_gpa(tdx); 1213 return 0; 1214 } 1215 1216 static void __tdx_map_gpa(struct vcpu_tdx *tdx) 1217 { 1218 u64 gpa = tdx->map_gpa_next; 1219 u64 size = tdx->map_gpa_end - tdx->map_gpa_next; 1220 1221 if (size > TDX_MAP_GPA_MAX_LEN) 1222 size = TDX_MAP_GPA_MAX_LEN; 1223 1224 tdx->vcpu.run->exit_reason = KVM_EXIT_HYPERCALL; 1225 tdx->vcpu.run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; 1226 /* 1227 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) 1228 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that 1229 * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting 1230 * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. 1231 */ 1232 tdx->vcpu.run->hypercall.ret = 0; 1233 tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm)); 1234 tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE; 1235 tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ? 
1236 KVM_MAP_GPA_RANGE_ENCRYPTED : 1237 KVM_MAP_GPA_RANGE_DECRYPTED; 1238 tdx->vcpu.run->hypercall.flags = KVM_EXIT_HYPERCALL_LONG_MODE; 1239 1240 tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa; 1241 } 1242 1243 static int tdx_map_gpa(struct kvm_vcpu *vcpu) 1244 { 1245 struct vcpu_tdx *tdx = to_tdx(vcpu); 1246 u64 gpa = tdx->vp_enter_args.r12; 1247 u64 size = tdx->vp_enter_args.r13; 1248 u64 ret; 1249 1250 /* 1251 * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires 1252 * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE 1253 * bit set. This is a base call so it should always be supported, but 1254 * KVM has no way to ensure that userspace implements the GHCI correctly. 1255 * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error 1256 * to the guest. 1257 */ 1258 if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { 1259 ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; 1260 goto error; 1261 } 1262 1263 if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) || 1264 !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) || 1265 (vt_is_tdx_private_gpa(vcpu->kvm, gpa) != 1266 vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) { 1267 ret = TDVMCALL_STATUS_INVALID_OPERAND; 1268 goto error; 1269 } 1270 1271 if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) { 1272 ret = TDVMCALL_STATUS_ALIGN_ERROR; 1273 goto error; 1274 } 1275 1276 tdx->map_gpa_end = gpa + size; 1277 tdx->map_gpa_next = gpa; 1278 1279 __tdx_map_gpa(tdx); 1280 return 0; 1281 1282 error: 1283 tdvmcall_set_return_code(vcpu, ret); 1284 tdx->vp_enter_args.r11 = gpa; 1285 return 1; 1286 } 1287 1288 static int tdx_report_fatal_error(struct kvm_vcpu *vcpu) 1289 { 1290 struct vcpu_tdx *tdx = to_tdx(vcpu); 1291 u64 *regs = vcpu->run->system_event.data; 1292 u64 *module_regs = &tdx->vp_enter_args.r8; 1293 int index = VCPU_REGS_RAX; 1294 1295 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; 1296 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL; 1297 
vcpu->run->system_event.ndata = 16; 1298 1299 /* Dump 16 general-purpose registers to userspace in ascending order. */ 1300 regs[index++] = tdx->vp_enter_ret; 1301 regs[index++] = tdx->vp_enter_args.rcx; 1302 regs[index++] = tdx->vp_enter_args.rdx; 1303 regs[index++] = tdx->vp_enter_args.rbx; 1304 regs[index++] = 0; 1305 regs[index++] = 0; 1306 regs[index++] = tdx->vp_enter_args.rsi; 1307 regs[index] = tdx->vp_enter_args.rdi; 1308 for (index = 0; index < 8; index++) 1309 regs[VCPU_REGS_R8 + index] = module_regs[index]; 1310 1311 return 0; 1312 } 1313 1314 static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu) 1315 { 1316 u32 eax, ebx, ecx, edx; 1317 struct vcpu_tdx *tdx = to_tdx(vcpu); 1318 1319 /* EAX and ECX for cpuid is stored in R12 and R13. */ 1320 eax = tdx->vp_enter_args.r12; 1321 ecx = tdx->vp_enter_args.r13; 1322 1323 kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false); 1324 1325 tdx->vp_enter_args.r12 = eax; 1326 tdx->vp_enter_args.r13 = ebx; 1327 tdx->vp_enter_args.r14 = ecx; 1328 tdx->vp_enter_args.r15 = edx; 1329 1330 return 1; 1331 } 1332 1333 static int tdx_complete_pio_out(struct kvm_vcpu *vcpu) 1334 { 1335 vcpu->arch.pio.count = 0; 1336 return 1; 1337 } 1338 1339 static int tdx_complete_pio_in(struct kvm_vcpu *vcpu) 1340 { 1341 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 1342 unsigned long val = 0; 1343 int ret; 1344 1345 ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size, 1346 vcpu->arch.pio.port, &val, 1); 1347 1348 WARN_ON_ONCE(!ret); 1349 1350 tdvmcall_set_return_val(vcpu, val); 1351 1352 return 1; 1353 } 1354 1355 static int tdx_emulate_io(struct kvm_vcpu *vcpu) 1356 { 1357 struct vcpu_tdx *tdx = to_tdx(vcpu); 1358 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 1359 unsigned long val = 0; 1360 unsigned int port; 1361 u64 size, write; 1362 int ret; 1363 1364 ++vcpu->stat.io_exits; 1365 1366 size = tdx->vp_enter_args.r12; 1367 write = tdx->vp_enter_args.r13; 1368 port = tdx->vp_enter_args.r14; 1369 1370 if ((write != 0 && 
write != 1) || (size != 1 && size != 2 && size != 4)) { 1371 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1372 return 1; 1373 } 1374 1375 if (write) { 1376 val = tdx->vp_enter_args.r15; 1377 ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1); 1378 } else { 1379 ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1); 1380 } 1381 1382 if (!ret) 1383 vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out : 1384 tdx_complete_pio_in; 1385 else if (!write) 1386 tdvmcall_set_return_val(vcpu, val); 1387 1388 return ret; 1389 } 1390 1391 static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu) 1392 { 1393 unsigned long val = 0; 1394 gpa_t gpa; 1395 int size; 1396 1397 gpa = vcpu->mmio_fragments[0].gpa; 1398 size = vcpu->mmio_fragments[0].len; 1399 1400 memcpy(&val, vcpu->run->mmio.data, size); 1401 tdvmcall_set_return_val(vcpu, val); 1402 trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val); 1403 return 1; 1404 } 1405 1406 static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size, 1407 unsigned long val) 1408 { 1409 if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 1410 trace_kvm_fast_mmio(gpa); 1411 return 0; 1412 } 1413 1414 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val); 1415 if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val)) 1416 return -EOPNOTSUPP; 1417 1418 return 0; 1419 } 1420 1421 static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size) 1422 { 1423 unsigned long val; 1424 1425 if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val)) 1426 return -EOPNOTSUPP; 1427 1428 tdvmcall_set_return_val(vcpu, val); 1429 trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val); 1430 return 0; 1431 } 1432 1433 static int tdx_emulate_mmio(struct kvm_vcpu *vcpu) 1434 { 1435 struct vcpu_tdx *tdx = to_tdx(vcpu); 1436 int size, write, r; 1437 unsigned long val; 1438 gpa_t gpa; 1439 1440 size = tdx->vp_enter_args.r12; 1441 write = tdx->vp_enter_args.r13; 1442 gpa = 
tdx->vp_enter_args.r14; 1443 val = write ? tdx->vp_enter_args.r15 : 0; 1444 1445 if (size != 1 && size != 2 && size != 4 && size != 8) 1446 goto error; 1447 if (write != 0 && write != 1) 1448 goto error; 1449 1450 /* 1451 * TDG.VP.VMCALL<MMIO> allows only shared GPA, it makes no sense to 1452 * do MMIO emulation for private GPA. 1453 */ 1454 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) || 1455 vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1)) 1456 goto error; 1457 1458 gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm)); 1459 1460 if (write) 1461 r = tdx_mmio_write(vcpu, gpa, size, val); 1462 else 1463 r = tdx_mmio_read(vcpu, gpa, size); 1464 if (!r) 1465 /* Kernel completed device emulation. */ 1466 return 1; 1467 1468 /* Request the device emulation to userspace device model. */ 1469 vcpu->mmio_is_write = write; 1470 if (!write) 1471 vcpu->arch.complete_userspace_io = tdx_complete_mmio_read; 1472 1473 vcpu->run->mmio.phys_addr = gpa; 1474 vcpu->run->mmio.len = size; 1475 vcpu->run->mmio.is_write = write; 1476 vcpu->run->exit_reason = KVM_EXIT_MMIO; 1477 1478 if (write) { 1479 memcpy(vcpu->run->mmio.data, &val, size); 1480 } else { 1481 vcpu->mmio_fragments[0].gpa = gpa; 1482 vcpu->mmio_fragments[0].len = size; 1483 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL); 1484 } 1485 return 0; 1486 1487 error: 1488 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1489 return 1; 1490 } 1491 1492 static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu) 1493 { 1494 struct vcpu_tdx *tdx = to_tdx(vcpu); 1495 1496 tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret); 1497 1498 /* 1499 * For now, there is no TDVMCALL beyond GHCI base API supported by KVM 1500 * directly without the support from userspace, just set the value 1501 * returned from userspace. 
1502 */ 1503 tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11; 1504 tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12; 1505 tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13; 1506 tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14; 1507 1508 return 1; 1509 } 1510 1511 static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu) 1512 { 1513 struct vcpu_tdx *tdx = to_tdx(vcpu); 1514 1515 switch (tdx->vp_enter_args.r12) { 1516 case 0: 1517 tdx->vp_enter_args.r11 = 0; 1518 tdx->vp_enter_args.r12 = 0; 1519 tdx->vp_enter_args.r13 = 0; 1520 tdx->vp_enter_args.r14 = 0; 1521 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS); 1522 return 1; 1523 case 1: 1524 vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12; 1525 vcpu->run->exit_reason = KVM_EXIT_TDX; 1526 vcpu->run->tdx.flags = 0; 1527 vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO; 1528 vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS; 1529 vcpu->run->tdx.get_tdvmcall_info.r11 = 0; 1530 vcpu->run->tdx.get_tdvmcall_info.r12 = 0; 1531 vcpu->run->tdx.get_tdvmcall_info.r13 = 0; 1532 vcpu->run->tdx.get_tdvmcall_info.r14 = 0; 1533 vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info; 1534 return 0; 1535 default: 1536 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1537 return 1; 1538 } 1539 } 1540 1541 static int tdx_complete_simple(struct kvm_vcpu *vcpu) 1542 { 1543 tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret); 1544 return 1; 1545 } 1546 1547 static int tdx_get_quote(struct kvm_vcpu *vcpu) 1548 { 1549 struct vcpu_tdx *tdx = to_tdx(vcpu); 1550 u64 gpa = tdx->vp_enter_args.r12; 1551 u64 size = tdx->vp_enter_args.r13; 1552 1553 /* The gpa of buffer must have shared bit set. 
*/ 1554 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { 1555 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1556 return 1; 1557 } 1558 1559 vcpu->run->exit_reason = KVM_EXIT_TDX; 1560 vcpu->run->tdx.flags = 0; 1561 vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE; 1562 vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; 1563 vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm)); 1564 vcpu->run->tdx.get_quote.size = size; 1565 1566 vcpu->arch.complete_userspace_io = tdx_complete_simple; 1567 1568 return 0; 1569 } 1570 1571 static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu) 1572 { 1573 struct vcpu_tdx *tdx = to_tdx(vcpu); 1574 u64 vector = tdx->vp_enter_args.r12; 1575 1576 if (vector < 32 || vector > 255) { 1577 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1578 return 1; 1579 } 1580 1581 vcpu->run->exit_reason = KVM_EXIT_TDX; 1582 vcpu->run->tdx.flags = 0; 1583 vcpu->run->tdx.nr = TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT; 1584 vcpu->run->tdx.setup_event_notify.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; 1585 vcpu->run->tdx.setup_event_notify.vector = vector; 1586 1587 vcpu->arch.complete_userspace_io = tdx_complete_simple; 1588 1589 return 0; 1590 } 1591 1592 static int handle_tdvmcall(struct kvm_vcpu *vcpu) 1593 { 1594 switch (tdvmcall_leaf(vcpu)) { 1595 case TDVMCALL_MAP_GPA: 1596 return tdx_map_gpa(vcpu); 1597 case TDVMCALL_REPORT_FATAL_ERROR: 1598 return tdx_report_fatal_error(vcpu); 1599 case TDVMCALL_GET_TD_VM_CALL_INFO: 1600 return tdx_get_td_vm_call_info(vcpu); 1601 case TDVMCALL_GET_QUOTE: 1602 return tdx_get_quote(vcpu); 1603 case TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: 1604 return tdx_setup_event_notify_interrupt(vcpu); 1605 default: 1606 break; 1607 } 1608 1609 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED); 1610 return 1; 1611 } 1612 1613 void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level) 1614 { 1615 u64 shared_bit = (pgd_level 
== 5) ? TDX_SHARED_BIT_PWL_5 : 1616 TDX_SHARED_BIT_PWL_4; 1617 1618 if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm)) 1619 return; 1620 1621 td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa); 1622 } 1623 1624 static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level, 1625 kvm_pfn_t pfn) 1626 { 1627 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1628 u64 err, entry, level_state; 1629 gpa_t gpa = gfn_to_gpa(gfn); 1630 1631 lockdep_assert_held(&kvm->slots_lock); 1632 1633 if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm) || 1634 KVM_BUG_ON(!kvm_tdx->page_add_src, kvm)) 1635 return -EIO; 1636 1637 err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn), 1638 kvm_tdx->page_add_src, &entry, &level_state); 1639 if (unlikely(tdx_operand_busy(err))) 1640 return -EBUSY; 1641 1642 if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_ADD, entry, level_state, kvm)) 1643 return -EIO; 1644 1645 return 0; 1646 } 1647 1648 static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, 1649 enum pg_level level, kvm_pfn_t pfn) 1650 { 1651 int tdx_level = pg_level_to_tdx_sept_level(level); 1652 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1653 struct page *page = pfn_to_page(pfn); 1654 gpa_t gpa = gfn_to_gpa(gfn); 1655 u64 entry, level_state; 1656 u64 err; 1657 1658 err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state); 1659 if (unlikely(tdx_operand_busy(err))) 1660 return -EBUSY; 1661 1662 if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_AUG, entry, level_state, kvm)) 1663 return -EIO; 1664 1665 return 0; 1666 } 1667 1668 static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, 1669 enum pg_level level, u64 mirror_spte) 1670 { 1671 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1672 kvm_pfn_t pfn = spte_to_pfn(mirror_spte); 1673 1674 /* TODO: handle large pages. 
*/ 1675 if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) 1676 return -EIO; 1677 1678 WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) || 1679 (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK); 1680 1681 /* 1682 * Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory() 1683 * before kvm_tdx->state. Userspace must not be allowed to pre-fault 1684 * arbitrary memory until the initial memory image is finalized. Pairs 1685 * with the smp_wmb() in tdx_td_finalize(). 1686 */ 1687 smp_rmb(); 1688 1689 /* 1690 * If the TD isn't finalized/runnable, then userspace is initializing 1691 * the VM image via KVM_TDX_INIT_MEM_REGION; ADD the page to the TD. 1692 */ 1693 if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE)) 1694 return tdx_mem_page_add(kvm, gfn, level, pfn); 1695 1696 return tdx_mem_page_aug(kvm, gfn, level, pfn); 1697 } 1698 1699 static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, 1700 enum pg_level level, void *private_spt) 1701 { 1702 int tdx_level = pg_level_to_tdx_sept_level(level); 1703 gpa_t gpa = gfn_to_gpa(gfn); 1704 struct page *page = virt_to_page(private_spt); 1705 u64 err, entry, level_state; 1706 1707 err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry, 1708 &level_state); 1709 if (unlikely(tdx_operand_busy(err))) 1710 return -EBUSY; 1711 1712 if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm)) 1713 return -EIO; 1714 1715 return 0; 1716 } 1717 1718 /* 1719 * Ensure shared and private EPTs to be flushed on all vCPUs. 1720 * tdh_mem_track() is the only caller that increases TD epoch. An increase in 1721 * the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are 1722 * running in guest mode with the value "N - 1". 1723 * 1724 * A successful execution of tdh_mem_track() ensures that vCPUs can only run in 1725 * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch 1726 * being increased to "N + 1". 
1727 * 1728 * Kicking off all vCPUs after that further results in no vCPUs can run in guest 1729 * mode with TD epoch value "N", which unblocks the next tdh_mem_track() (e.g. 1730 * to increase TD epoch to "N + 2"). 1731 * 1732 * TDX module will flush EPT on the next TD enter and make vCPUs to run in 1733 * guest mode with TD epoch value "N + 1". 1734 * 1735 * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by 1736 * waiting empty IPI handler ack_kick(). 1737 * 1738 * No action is required to the vCPUs being kicked off since the kicking off 1739 * occurs certainly after TD epoch increment and before the next 1740 * tdh_mem_track(). 1741 */ 1742 static void tdx_track(struct kvm *kvm) 1743 { 1744 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1745 u64 err; 1746 1747 /* If TD isn't finalized, it's before any vcpu running. */ 1748 if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE)) 1749 return; 1750 1751 /* 1752 * The full sequence of TDH.MEM.TRACK and forcing vCPUs out of guest 1753 * mode must be serialized, as TDH.MEM.TRACK will fail if the previous 1754 * tracking epoch hasn't completed. 1755 */ 1756 lockdep_assert_held_write(&kvm->mmu_lock); 1757 1758 err = tdh_do_no_vcpus(tdh_mem_track, kvm, &kvm_tdx->td); 1759 TDX_BUG_ON(err, TDH_MEM_TRACK, kvm); 1760 1761 kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); 1762 } 1763 1764 static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, 1765 enum pg_level level, void *private_spt) 1766 { 1767 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1768 1769 /* 1770 * free_external_spt() is only called after hkid is freed when TD is 1771 * tearing down. 1772 * KVM doesn't (yet) zap page table pages in mirror page table while 1773 * TD is active, though guest pages mapped in mirror page table could be 1774 * zapped during TD is active, e.g. for shared <-> private conversion 1775 * and slot move/deletion. 
1776 */ 1777 if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm)) 1778 return -EIO; 1779 1780 /* 1781 * The HKID assigned to this TD was already freed and cache was 1782 * already flushed. We don't have to flush again. 1783 */ 1784 return tdx_reclaim_page(virt_to_page(private_spt)); 1785 } 1786 1787 static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, 1788 enum pg_level level, u64 mirror_spte) 1789 { 1790 struct page *page = pfn_to_page(spte_to_pfn(mirror_spte)); 1791 int tdx_level = pg_level_to_tdx_sept_level(level); 1792 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1793 gpa_t gpa = gfn_to_gpa(gfn); 1794 u64 err, entry, level_state; 1795 1796 lockdep_assert_held_write(&kvm->mmu_lock); 1797 1798 /* 1799 * HKID is released after all private pages have been removed, and set 1800 * before any might be populated. Warn if zapping is attempted when 1801 * there can't be anything populated in the private EPT. 1802 */ 1803 if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm)) 1804 return; 1805 1806 /* TODO: handle large pages. */ 1807 if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) 1808 return; 1809 1810 err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa, 1811 tdx_level, &entry, &level_state); 1812 if (TDX_BUG_ON_2(err, TDH_MEM_RANGE_BLOCK, entry, level_state, kvm)) 1813 return; 1814 1815 /* 1816 * TDX requires TLB tracking before dropping private page. Do 1817 * it here, although it is also done later. 1818 */ 1819 tdx_track(kvm); 1820 1821 /* 1822 * When zapping private page, write lock is held. So no race condition 1823 * with other vcpu sept operation. 1824 * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs. 
1825 */ 1826 err = tdh_do_no_vcpus(tdh_mem_page_remove, kvm, &kvm_tdx->td, gpa, 1827 tdx_level, &entry, &level_state); 1828 if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_REMOVE, entry, level_state, kvm)) 1829 return; 1830 1831 err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page); 1832 if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm)) 1833 return; 1834 1835 tdx_quirk_reset_page(page); 1836 } 1837 1838 void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, 1839 int trig_mode, int vector) 1840 { 1841 struct kvm_vcpu *vcpu = apic->vcpu; 1842 struct vcpu_tdx *tdx = to_tdx(vcpu); 1843 1844 /* TDX supports only posted interrupt. No lapic emulation. */ 1845 __vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector); 1846 1847 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); 1848 } 1849 1850 static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu) 1851 { 1852 u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK; 1853 u64 eq = vmx_get_exit_qual(vcpu); 1854 1855 if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION) 1856 return false; 1857 1858 return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN); 1859 } 1860 1861 static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu) 1862 { 1863 unsigned long exit_qual; 1864 gpa_t gpa = to_tdx(vcpu)->exit_gpa; 1865 bool local_retry = false; 1866 int ret; 1867 1868 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { 1869 if (tdx_is_sept_violation_unexpected_pending(vcpu)) { 1870 pr_warn("Guest access before accepting 0x%llx on vCPU %d\n", 1871 gpa, vcpu->vcpu_id); 1872 kvm_vm_dead(vcpu->kvm); 1873 return -EIO; 1874 } 1875 /* 1876 * Always treat SEPT violations as write faults. Ignore the 1877 * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations. 1878 * TD private pages are always RWX in the SEPT tables, 1879 * i.e. they're always mapped writable. 
Just as importantly, 1880 * treating SEPT violations as write faults is necessary to 1881 * avoid COW allocations, which will cause TDAUGPAGE failures 1882 * due to aliasing a single HPA to multiple GPAs. 1883 */ 1884 exit_qual = EPT_VIOLATION_ACC_WRITE; 1885 1886 /* Only private GPA triggers zero-step mitigation */ 1887 local_retry = true; 1888 } else { 1889 exit_qual = vmx_get_exit_qual(vcpu); 1890 /* 1891 * EPT violation due to instruction fetch should never be 1892 * triggered from shared memory in TDX guest. If such EPT 1893 * violation occurs, treat it as broken hardware. 1894 */ 1895 if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm)) 1896 return -EIO; 1897 } 1898 1899 trace_kvm_page_fault(vcpu, gpa, exit_qual); 1900 1901 /* 1902 * To minimize TDH.VP.ENTER invocations, retry locally for private GPA 1903 * mapping in TDX. 1904 * 1905 * KVM may return RET_PF_RETRY for private GPA due to 1906 * - contentions when atomically updating SPTEs of the mirror page table 1907 * - in-progress GFN invalidation or memslot removal. 1908 * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD, 1909 * caused by contentions with TDH.VP.ENTER (with zero-step mitigation) 1910 * or certain TDCALLs. 1911 * 1912 * If TDH.VP.ENTER is invoked more times than the threshold set by the 1913 * TDX module before KVM resolves the private GPA mapping, the TDX 1914 * module will activate zero-step mitigation during TDH.VP.ENTER. This 1915 * process acquires an SEPT tree lock in the TDX module, leading to 1916 * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD 1917 * operations on other vCPUs. 1918 * 1919 * Breaking out of local retries for kvm_vcpu_has_events() is for 1920 * interrupt injection. kvm_vcpu_has_events() should not see pending 1921 * events for TDX. Since KVM can't determine if IRQs (or NMIs) are 1922 * blocked by TDs, false positives are inevitable i.e., KVM may re-enter 1923 * the guest even if the IRQ/NMI can't be delivered. 
1924 * 1925 * Note: even without breaking out of local retries, zero-step 1926 * mitigation may still occur due to 1927 * - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT, 1928 * - a single RIP causing EPT violations for more GFNs than the 1929 * threshold count. 1930 * This is safe, as triggering zero-step mitigation only introduces 1931 * contentions to page installation SEAMCALLs on other vCPUs, which will 1932 * handle retries locally in their EPT violation handlers. 1933 */ 1934 while (1) { 1935 struct kvm_memory_slot *slot; 1936 1937 ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual); 1938 1939 if (ret != RET_PF_RETRY || !local_retry) 1940 break; 1941 1942 if (kvm_vcpu_has_events(vcpu) || signal_pending(current)) 1943 break; 1944 1945 if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) { 1946 ret = -EIO; 1947 break; 1948 } 1949 1950 /* 1951 * Bail if the memslot is invalid, i.e. is being deleted, as 1952 * faulting in will never succeed and this task needs to drop 1953 * SRCU in order to let memslot deletion complete. 
	 */
		slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(gpa));
		if (slot && slot->flags & KVM_MEMSLOT_INVALID)
			break;

		cond_resched();
	}
	return ret;
}

/*
 * Completion callback for MSR accesses emulated on behalf of the TD.  On
 * error, report TDVMCALL_STATUS_INVALID_OPERAND back to the guest; on a
 * successful RDMSR, propagate the EDX:EAX result into the TDVMCALL return
 * value.  Always returns 1 (resume the guest).
 */
int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
{
	if (err) {
		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
		return 1;
	}

	if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ)
		tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu));

	return 1;
}

/*
 * Top-level TD-Exit handler, invoked after tdh_vp_enter() returns.  Filters
 * out SEAMCALL-level errors before dispatching on the VMX basic exit reason.
 * Returns 1 to re-enter the guest, 0 to exit to userspace, negative on
 * fatal error.
 */
int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	u64 vp_enter_ret = tdx->vp_enter_ret;
	union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);

	/* Exit was fully handled in the fastpath; just re-enter the guest. */
	if (fastpath != EXIT_FASTPATH_NONE)
		return 1;

	if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) {
		KVM_BUG_ON(1, vcpu->kvm);
		return -EIO;
	}

	/*
	 * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and
	 * TDX_SEAMCALL_VMFAILINVALID.
	 */
	if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) {
		KVM_BUG_ON(!kvm_rebooting, vcpu->kvm);
		goto unhandled_exit;
	}

	if (unlikely(tdx_failed_vmentry(vcpu))) {
		/*
		 * If the guest state is protected, that means off-TD debug is
		 * not enabled, TDX_NON_RECOVERABLE must be set.
		 */
		WARN_ON_ONCE(vcpu->arch.guest_state_protected &&
			     !(vp_enter_ret & TDX_NON_RECOVERABLE));
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full;
		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
		return 0;
	}

	if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) &&
	    exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) {
		kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret);
		goto unhandled_exit;
	}

	WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT &&
		     (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS);

	switch (exit_reason.basic) {
	case EXIT_REASON_TRIPLE_FAULT:
		vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
		vcpu->mmio_needed = 0;
		return 0;
	case EXIT_REASON_EXCEPTION_NMI:
		return tdx_handle_exception_nmi(vcpu);
	case EXIT_REASON_EXTERNAL_INTERRUPT:
		++vcpu->stat.irq_exits;
		return 1;
	case EXIT_REASON_CPUID:
		return tdx_emulate_cpuid(vcpu);
	case EXIT_REASON_HLT:
		return kvm_emulate_halt_noskip(vcpu);
	case EXIT_REASON_TDCALL:
		return handle_tdvmcall(vcpu);
	case EXIT_REASON_VMCALL:
		return tdx_emulate_vmcall(vcpu);
	case EXIT_REASON_IO_INSTRUCTION:
		return tdx_emulate_io(vcpu);
	case EXIT_REASON_MSR_READ:
		/* The TDVMCALL ABI passes the MSR index in r12; stage it in RCX. */
		kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
		return kvm_emulate_rdmsr(vcpu);
	case EXIT_REASON_MSR_WRITE:
		/* Index in r12, 64-bit value in r13, split into EDX:EAX. */
		kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
		kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u);
		kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32);
		return kvm_emulate_wrmsr(vcpu);
	case EXIT_REASON_EPT_MISCONFIG:
		return tdx_emulate_mmio(vcpu);
	case EXIT_REASON_EPT_VIOLATION:
		return tdx_handle_ept_violation(vcpu);
	case EXIT_REASON_OTHER_SMI:
		/*
		 * Unlike VMX, SMI in SEAM non-root mode (i.e. when
		 * TD guest vCPU is running) will cause VM exit to TDX module,
		 * then SEAMRET to KVM. Once it exits to KVM, SMI is delivered
		 * and handled by kernel handler right away.
		 *
		 * The Other SMI exit can also be caused by the SEAM non-root
		 * machine check delivered via Machine Check System Management
		 * Interrupt (MSMI), but it has already been handled by the
		 * kernel machine check handler, i.e., the memory page has been
		 * marked as poisoned and it won't be freed to the free list
		 * when the TDX guest is terminated (the TDX module marks the
		 * guest as dead and prevents it from further running when
		 * machine check happens in SEAM non-root).
		 *
		 * - A MSMI will not reach here, it's handled as non_recoverable
		 *   case above.
		 * - If it's not an MSMI, no need to do anything here.
		 */
		return 1;
	default:
		break;
	}

unhandled_exit:
	kvm_prepare_unexpected_reason_exit(vcpu, vp_enter_ret);
	return 0;
}

/*
 * Report exit details for tracing/userspace.  A full exit reason of -1u
 * indicates no valid exit info is available, in which case the qualification
 * and interrupt info are zeroed.
 */
void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
		       u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	*reason = tdx->vt.exit_reason.full;
	if (*reason != -1u) {
		*info1 = vmx_get_exit_qual(vcpu);
		*info2 = tdx->ext_exit_qualification;
		*intr_info = vmx_get_intr_info(vcpu);
	} else {
		*info1 = 0;
		*info2 = 0;
		*intr_info = 0;
	}

	*error_code = 0;
}

/* Whitelist of MSRs that KVM emulates for a TD guest. */
bool tdx_has_emulated_msr(u32 index)
{
	switch (index) {
	case MSR_IA32_UCODE_REV:
	case MSR_IA32_ARCH_CAPABILITIES:
	case MSR_IA32_POWER_CTL:
	case MSR_IA32_CR_PAT:
	case MSR_MTRRcap:
	case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
	case MSR_MTRRdefType:
	case MSR_IA32_TSC_DEADLINE:
	case MSR_IA32_MISC_ENABLE:
	case MSR_PLATFORM_INFO:
	case MSR_MISC_FEATURES_ENABLES:
	case MSR_IA32_APICBASE:
	case MSR_EFER:
	case MSR_IA32_FEAT_CTL:
	case MSR_IA32_MCG_CAP:
	case MSR_IA32_MCG_STATUS:
	case MSR_IA32_MCG_CTL:
	case MSR_IA32_MCG_EXT_CTL:
	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
		/* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */
	case MSR_KVM_POLL_CONTROL:
		return true;
	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
		/*
		 * x2APIC registers that are virtualized by the CPU can't be
		 * emulated, KVM doesn't have access to the virtual APIC page.
		 */
		switch (index) {
		case X2APIC_MSR(APIC_TASKPRI):
		case X2APIC_MSR(APIC_PROCPRI):
		case X2APIC_MSR(APIC_EOI):
		case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR):
		case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR):
		case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR):
			return false;
		default:
			return true;
		}
	default:
		return false;
	}
}

/* MSRs that the guest may read but never write (see tdx_set_msr()). */
static bool tdx_is_read_only_msr(u32 index)
{
	return index == MSR_IA32_APICBASE || index == MSR_EFER ||
	       index == MSR_IA32_FEAT_CTL;
}

/*
 * Read an emulated MSR.  Returns 0 on success, 1 to signal a #GP-style
 * failure to the common MSR code.
 */
int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
	switch (msr->index) {
	case MSR_IA32_FEAT_CTL:
		/*
		 * MCE and MCA are advertised via cpuid. Guest kernel could
		 * check if LMCE is enabled or not.
		 */
		msr->data = FEAT_CTL_LOCKED;
		if (vcpu->arch.mcg_cap & MCG_LMCE_P)
			msr->data |= FEAT_CTL_LMCE_ENABLED;
		return 0;
	case MSR_IA32_MCG_EXT_CTL:
		if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
			return 1;
		msr->data = vcpu->arch.mcg_ext_ctl;
		return 0;
	default:
		if (!tdx_has_emulated_msr(msr->index))
			return 1;

		return kvm_get_msr_common(vcpu, msr);
	}
}

/*
 * Write an emulated MSR.  Rejects read-only and non-emulated MSRs.
 * Returns 0 on success, 1 to signal failure to the common MSR code.
 */
int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
	switch (msr->index) {
	case MSR_IA32_MCG_EXT_CTL:
		if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
		    (msr->data & ~MCG_EXT_CTL_LMCE_EN))
			return 1;
		vcpu->arch.mcg_ext_ctl = msr->data;
		return 0;
	default:
		if (tdx_is_read_only_msr(msr->index))
			return 1;

		if (!tdx_has_emulated_msr(msr->index))
			return 1;

		return kvm_set_msr_common(vcpu, msr);
	}
}

/*
 * KVM_TDX_CAPABILITIES: report the TDX module's supported CPUID
 * configuration to userspace.  Fails with -E2BIG if the user buffer has
 * fewer CPUID entry slots than the module requires.
 */
static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
{
	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
	struct kvm_tdx_capabilities __user *user_caps;
	struct kvm_tdx_capabilities *caps = NULL;
	u32 nr_user_entries;
	int ret = 0;

	/* flags is reserved for future use */
	if (cmd->flags)
		return -EINVAL;

	user_caps = u64_to_user_ptr(cmd->data);
	if (get_user(nr_user_entries, &user_caps->cpuid.nent))
		return -EFAULT;

	if (nr_user_entries < td_conf->num_cpuid_config)
		return -E2BIG;

	caps = kzalloc_flex(*caps, cpuid.entries, td_conf->num_cpuid_config);
	if (!caps)
		return -ENOMEM;

	ret = init_kvm_tdx_caps(td_conf, caps);
	if (ret)
		goto out;

	if (copy_to_user(user_caps, caps, struct_size(caps, cpuid.entries,
						      caps->cpuid.nent))) {
		ret = -EFAULT;
		goto out;
	}

out:
	/* kfree() accepts NULL. */
	kfree(caps);
	return ret;
}

/*
 * KVM reports guest physical address in CPUID.0x800000008.EAX[23:16], which is
 * similar to TDX's GPAW. Use this field as the interface for userspace to
 * configure the GPAW and EPT level for TDs.
 *
 * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level
 * 5, Value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always
 * supported. Value 52 is only supported when the platform supports 5 level
 * EPT.
 */
static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
					struct td_params *td_params)
{
	const struct kvm_cpuid_entry2 *entry;
	int guest_pa;

	entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0);
	if (!entry)
		return -EINVAL;

	guest_pa = tdx_get_guest_phys_addr_bits(entry->eax);

	if (guest_pa != 48 && guest_pa != 52)
		return -EINVAL;

	if (guest_pa == 52 && !cpu_has_vmx_ept_5levels())
		return -EINVAL;

	td_params->eptp_controls = VMX_EPTP_MT_WB;
	if (guest_pa == 52) {
		td_params->eptp_controls |= VMX_EPTP_PWL_5;
		td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW;
	} else {
		td_params->eptp_controls |= VMX_EPTP_PWL_4;
	}

	return 0;
}

/*
 * Translate the userspace-provided CPUID entries into the TDX module's
 * td_params->cpuid_values layout.  Every user entry must land in a slot
 * defined by the module's CPUID config, otherwise -EINVAL.
 */
static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
				 struct td_params *td_params)
{
	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
	const struct kvm_cpuid_entry2 *entry;
	struct tdx_cpuid_value *value;
	int i, copy_cnt = 0;

	/*
	 * td_params.cpuid_values: The number and the order of cpuid_value must
	 * be same to the one of struct tdsysinfo.{num_cpuid_config, cpuid_configs}
	 * It's assumed that td_params was zeroed.
	 */
	for (i = 0; i < td_conf->num_cpuid_config; i++) {
		struct kvm_cpuid_entry2 tmp;

		td_init_cpuid_entry2(&tmp, i);

		entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
					      tmp.function, tmp.index);
		if (!entry)
			continue;

		if (tdx_unsupported_cpuid(entry))
			return -EINVAL;

		copy_cnt++;

		value = &td_params->cpuid_values[i];
		value->eax = entry->eax;
		value->ebx = entry->ebx;
		value->ecx = entry->ecx;
		value->edx = entry->edx;

		/*
		 * TDX module does not accept nonzero bits 16..23 for the
		 * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls().
		 */
		if (tmp.function == 0x80000008)
			value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0);
	}

	/*
	 * Rely on the TDX module to reject invalid configuration, but it can't
	 * check leafs that don't have a proper slot in td_params->cpuid_values
	 * to stick them in. So fail if there were entries that didn't get
	 * copied to td_params.
	 */
	if (copy_cnt != cpuid->nent)
		return -EINVAL;

	return 0;
}

/*
 * Build the TDH.MNG.INIT td_params structure from the userspace
 * KVM_TDX_INIT_VM payload.  Must run before any vCPU is created.
 */
static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
			  struct kvm_tdx_init_vm *init_vm)
{
	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
	struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
	int ret;

	if (kvm->created_vcpus)
		return -EBUSY;

	if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf))
		return -EINVAL;

	if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf))
		return -EINVAL;

	td_params->max_vcpus = kvm->max_vcpus;
	td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1;
	td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1;

	td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD;
	td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);

	ret = setup_tdparams_eptp_controls(cpuid, td_params);
	if (ret)
		return ret;

	ret = setup_tdparams_cpuids(cpuid, td_params);
	if (ret)
		return ret;

#define MEMCPY_SAME_SIZE(dst, src)				\
	do {							\
		BUILD_BUG_ON(sizeof(dst) != sizeof(src));	\
		memcpy((dst), (src), sizeof(dst));		\
	} while (0)

	MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
	MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
	MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);

	return 0;
}

/*
 * Create and initialize the TD control structures: allocate an HKID and
 * charge the misc cgroup, allocate TDR/TDCS pages, program the per-package
 * keys, then issue TDH.MNG.CREATE/ADDCX/INIT.  On TDH.MNG.INIT operand
 * errors, the raw SEAMCALL status is returned via @seamcall_err as a hint
 * to userspace.
 */
static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
			 u64 *seamcall_err)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	cpumask_var_t packages;
	struct page **tdcs_pages = NULL;
	struct page *tdr_page;
	int ret, i;
	u64 err, rcx;

	*seamcall_err = 0;
	ret = tdx_guest_keyid_alloc();
	if (ret < 0)
		return ret;
	kvm_tdx->hkid = ret;
	kvm_tdx->misc_cg = get_current_misc_cg();
	ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
	if (ret)
		goto free_hkid;

	ret = -ENOMEM;

	atomic_inc(&nr_configured_hkid);

	tdr_page = alloc_page(GFP_KERNEL);
	if (!tdr_page)
		goto free_hkid;

	kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
	/* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */
	kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1;
	tdcs_pages = kzalloc_objs(*kvm_tdx->td.tdcs_pages,
				  kvm_tdx->td.tdcs_nr_pages);
	if (!tdcs_pages)
		goto free_tdr;

	for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
		tdcs_pages[i] = alloc_page(GFP_KERNEL);
		if (!tdcs_pages[i])
			goto free_tdcs;
	}

	if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
		goto free_tdcs;

	cpus_read_lock();

	/*
	 * Need at least one CPU of the package to be online in order to
	 * program all packages for host key id. Check it.
	 */
	for_each_present_cpu(i)
		cpumask_set_cpu(topology_physical_package_id(i), packages);
	for_each_online_cpu(i)
		cpumask_clear_cpu(topology_physical_package_id(i), packages);
	if (!cpumask_empty(packages)) {
		ret = -EIO;
		/*
		 * Because it's hard for human operator to figure out the
		 * reason, warn it.
		 */
#define MSG_ALLPKG	"All packages need to have online CPU to create TD. Online CPU and retry.\n"
		pr_warn_ratelimited(MSG_ALLPKG);
		goto free_packages;
	}

	/*
	 * TDH.MNG.CREATE tries to grab the global TDX module and fails
	 * with TDX_OPERAND_BUSY when it fails to grab. Take the global
	 * lock to prevent it from failure.
	 */
	mutex_lock(&tdx_lock);
	kvm_tdx->td.tdr_page = tdr_page;
	err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid);
	mutex_unlock(&tdx_lock);

	if (err == TDX_RND_NO_ENTROPY) {
		ret = -EAGAIN;
		goto free_packages;
	}

	if (TDX_BUG_ON(err, TDH_MNG_CREATE, kvm)) {
		ret = -EIO;
		goto free_packages;
	}

	for_each_online_cpu(i) {
		int pkg = topology_physical_package_id(i);

		if (cpumask_test_and_set_cpu(pkg, packages))
			continue;

		/*
		 * Program the memory controller in the package with an
		 * encryption key associated to a TDX private host key id
		 * assigned to this TDR.  Concurrent operations on same memory
		 * controller results in TDX_OPERAND_BUSY. No locking needed
		 * beyond the cpus_read_lock() above as it serializes against
		 * hotplug and the first online CPU of the package is always
		 * used. We never have two CPUs in the same socket trying to
		 * program the key.
		 */
		ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
				      kvm_tdx, true);
		if (ret)
			break;
	}
	cpus_read_unlock();
	free_cpumask_var(packages);
	if (ret) {
		i = 0;
		goto teardown;
	}

	kvm_tdx->td.tdcs_pages = tdcs_pages;
	for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
		err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]);
		if (err == TDX_RND_NO_ENTROPY) {
			/* Here it's hard to allow userspace to retry. */
			ret = -EAGAIN;
			goto teardown;
		}
		if (TDX_BUG_ON(err, TDH_MNG_ADDCX, kvm)) {
			ret = -EIO;
			goto teardown;
		}
	}

	err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx);
	if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
		/*
		 * Because a user gives operands, don't warn.
		 * Return a hint to the user because it's sometimes hard for the
		 * user to figure out which operand is invalid.  SEAMCALL status
		 * code includes which operand caused invalid operand error.
		 */
		*seamcall_err = err;
		ret = -EINVAL;
		goto teardown;
	} else if (TDX_BUG_ON_1(err, TDH_MNG_INIT, rcx, kvm)) {
		ret = -EIO;
		goto teardown;
	}

	return 0;

	/*
	 * The sequence for freeing resources from a partially initialized TD
	 * varies based on where in the initialization flow failure occurred.
	 * Simply use the full teardown and destroy, which naturally play nice
	 * with partial initialization.
	 */
teardown:
	/* Only free pages not yet added, so start at 'i' */
	for (; i < kvm_tdx->td.tdcs_nr_pages; i++) {
		if (tdcs_pages[i]) {
			__free_page(tdcs_pages[i]);
			tdcs_pages[i] = NULL;
		}
	}
	if (!kvm_tdx->td.tdcs_pages)
		kfree(tdcs_pages);

	tdx_mmu_release_hkid(kvm);
	tdx_reclaim_td_control_pages(kvm);

	return ret;

free_packages:
	cpus_read_unlock();
	free_cpumask_var(packages);

free_tdcs:
	for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
		if (tdcs_pages[i])
			__free_page(tdcs_pages[i]);
	}
	kfree(tdcs_pages);
	kvm_tdx->td.tdcs_pages = NULL;

free_tdr:
	if (tdr_page)
		__free_page(tdr_page);
	kvm_tdx->td.tdr_page = NULL;

free_hkid:
	tdx_hkid_free(kvm_tdx);

	return ret;
}

/* Thin wrapper around TDH.MNG.RD to read one TD-scope metadata field. */
static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id,
				      u64 *data)
{
	u64 err;

	err = tdh_mng_rd(&tdx->td, field_id, data);

	return err;
}

#define TDX_MD_UNREADABLE_LEAF_MASK	GENMASK(30, 7)
#define TDX_MD_UNREADABLE_SUBLEAF_MASK	GENMASK(31, 7)

/*
 * Read one CPUID leaf/sub-leaf from the TD's metadata via TDH.MNG.RD and
 * fill @out.  Each leaf is stored as two 64-bit fields (EBX:EAX, EDX:ECX).
 * On success *entry_index is advanced; on failure @out's registers are
 * zeroed and -EIO is returned.
 */
static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
			  bool sub_leaf_set, int *entry_index,
			  struct kvm_cpuid_entry2 *out)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
	u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES;
	u64 ebx_eax, edx_ecx;
	u64 err = 0;

	if (sub_leaf > 0b1111111)
		return -EINVAL;

	if (*entry_index >= KVM_MAX_CPUID_ENTRIES)
		return -EINVAL;

	if (leaf & TDX_MD_UNREADABLE_LEAF_MASK ||
	    sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK)
		return -EINVAL;

	/*
	 * bit 23:17, RESERVED: reserved, must be 0;
	 * bit 16, LEAF_31: leaf number bit 31;
	 * bit 15:9, LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are
	 * implicitly 0;
	 * bit 8, SUBLEAF_NA: sub-leaf not applicable flag;
	 * bit 7:1, SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1,
	 * the SUBLEAF_6_0 is all-1.
	 * sub-leaf bits 31:7 are implicitly 0;
	 * bit 0, ELEMENT_I: Element index within field;
	 */
	field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16;
	field_id |= (leaf & 0x7f) << 9;
	if (sub_leaf_set)
		field_id |= (sub_leaf & 0x7f) << 1;
	else
		field_id |= 0x1fe;

	err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax);
	if (err) //TODO check for specific errors
		goto err_out;

	out->eax = (u32) ebx_eax;
	out->ebx = (u32) (ebx_eax >> 32);

	field_id++;
	err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx);
	/*
	 * It's weird that reading edx_ecx fails while reading ebx_eax
	 * succeeded.
	 */
	if (WARN_ON_ONCE(err))
		goto err_out;

	out->ecx = (u32) edx_ecx;
	out->edx = (u32) (edx_ecx >> 32);

	out->function = leaf;
	out->index = sub_leaf;
	out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0;

	/*
	 * Work around missing support on old TDX modules, fetch
	 * guest maxpa from gfn_direct_bits.
	 */
	if (leaf == 0x80000008) {
		gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
		unsigned int g_maxpa = __ffs(gpa_bits) + 1;

		out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa);
	}

	(*entry_index)++;

	return 0;

err_out:
	out->eax = 0;
	out->ebx = 0;
	out->ecx = 0;
	out->edx = 0;

	return -EIO;
}

/* Opaque handle for the scoped VM-state lock guard; holds the kvm pointer. */
typedef void *tdx_vm_state_guard_t;

/*
 * Take all locks needed to safely mutate TD-wide state: kvm->lock, every
 * vcpu->mutex, then kvm->slots_lock.  Fails with -EBUSY if vCPU creation
 * is in flight.  Returns @kvm on success or an ERR_PTR.
 */
static tdx_vm_state_guard_t tdx_acquire_vm_state_locks(struct kvm *kvm)
{
	int r;

	mutex_lock(&kvm->lock);

	if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) {
		r = -EBUSY;
		goto out_err;
	}

	r = kvm_lock_all_vcpus(kvm);
	if (r)
		goto out_err;

	/*
	 * Note the unintuitive ordering!  vcpu->mutex must be taken outside
	 * kvm->slots_lock!
	 */
	mutex_lock(&kvm->slots_lock);
	return kvm;

out_err:
	mutex_unlock(&kvm->lock);
	return ERR_PTR(r);
}

/* Drop the locks taken by tdx_acquire_vm_state_locks(), in reverse order. */
static void tdx_release_vm_state_locks(struct kvm *kvm)
{
	mutex_unlock(&kvm->slots_lock);
	kvm_unlock_all_vcpus(kvm);
	mutex_unlock(&kvm->lock);
}

DEFINE_CLASS(tdx_vm_state_guard, tdx_vm_state_guard_t,
	     if (!IS_ERR(_T)) tdx_release_vm_state_locks(_T),
	     tdx_acquire_vm_state_locks(kvm), struct kvm *kvm);

/*
 * KVM_TDX_INIT_VM: validate the userspace payload, build td_params, run
 * TDH.MNG.* initialization and latch the resulting TD configuration
 * (TSC offset/multiplier, attributes, xfam, GPAW) into kvm_tdx.
 */
static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
{
	struct kvm_tdx_init_vm __user *user_data = u64_to_user_ptr(cmd->data);
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	struct kvm_tdx_init_vm *init_vm;
	struct td_params *td_params = NULL;
	u32 nr_user_entries;
	int ret;

	BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
	BUILD_BUG_ON(sizeof(struct td_params) != 1024);

	if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
		return -EINVAL;

	if (cmd->flags)
		return -EINVAL;

	if (get_user(nr_user_entries, &user_data->cpuid.nent))
		return -EFAULT;

	if (nr_user_entries > KVM_MAX_CPUID_ENTRIES)
		return -E2BIG;

	init_vm = memdup_user(user_data,
			      struct_size(user_data, cpuid.entries, nr_user_entries));
	if (IS_ERR(init_vm))
		return PTR_ERR(init_vm);

	if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
		ret = -EINVAL;
		goto out;
	}

	if (init_vm->cpuid.padding) {
		ret = -EINVAL;
		goto out;
	}

	td_params = kzalloc_obj(struct td_params);
	if (!td_params) {
		ret = -ENOMEM;
		goto out;
	}

	ret = setup_tdparams(kvm, td_params, init_vm);
	if (ret)
		goto out;

	ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
	if (ret)
		goto out;

	kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
	kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER);
	kvm_tdx->attributes = td_params->attributes;
	kvm_tdx->xfam = td_params->xfam;

	if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW)
		kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5;
	else
		kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4;

	kvm_tdx->state = TD_STATE_INITIALIZED;
out:
	/* kfree() accepts NULL. */
	kfree(init_vm);
	kfree(td_params);

	return ret;
}

void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
	/*
	 * flush_tlb_current() is invoked when the first time for the vcpu to
	 * run or when root of shared EPT is invalidated.
	 * KVM only needs to flush shared EPT because the TDX module handles TLB
	 * invalidation for private EPT in tdh_vp_enter();
	 *
	 * A single context invalidation for shared EPT can be performed here.
	 * However, this single context invalidation requires the private EPTP
	 * rather than the shared EPTP to flush shared EPT, as shared EPT uses
	 * private EPTP as its ASID for TLB invalidation.
	 *
	 * To avoid reading back private EPTP, perform a global invalidation for
	 * shared EPT instead to keep this function simple.
	 */
	ept_sync_global();
}

void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
{
	/*
	 * TDX has called tdx_track() in tdx_sept_remove_private_spte() to
	 * ensure that private EPT will be flushed on the next TD enter. No need
	 * to call tdx_track() here again even when this callback is a result of
	 * zapping private EPT.
	 *
	 * Due to the lack of the context to determine which EPT has been
	 * affected by zapping, invoke invept() directly here for both shared
	 * EPT and private EPT for simplicity, though it's not necessary for
	 * private EPT.
	 */
	ept_sync_global();
}

/*
 * KVM_TDX_FINALIZE_VM: seal the TD measurement via TDH.MR.FINALIZE and mark
 * the TD runnable.  Publishes TD_STATE_RUNNABLE before pre_fault_allowed
 * (paired ordering enforced with smp_wmb()).
 */
static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
		return -EINVAL;

	cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
	if (tdx_operand_busy(cmd->hw_error))
		return -EBUSY;
	if (TDX_BUG_ON(cmd->hw_error, TDH_MR_FINALIZE, kvm))
		return -EIO;

	kvm_tdx->state = TD_STATE_RUNNABLE;
	/* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
	smp_wmb();
	kvm->arch.pre_fault_allowed = true;
	return 0;
}

/* Copy a struct kvm_tdx_cmd in from userspace and validate it. */
static int tdx_get_cmd(void __user *argp, struct kvm_tdx_cmd *cmd)
{
	if (copy_from_user(cmd, argp, sizeof(*cmd)))
		return -EFAULT;

	/*
	 * Userspace should never set hw_error. KVM writes hw_error to report
	 * hardware-defined error back to userspace.
	 */
	if (cmd->hw_error)
		return -EINVAL;

	return 0;
}

/*
 * Dispatcher for the KVM_MEMORY_ENCRYPT_OP VM ioctls.  All sub-commands
 * except KVM_TDX_CAPABILITIES run under the VM-state guard (kvm->lock,
 * all vcpu mutexes, slots_lock).
 */
int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
{
	struct kvm_tdx_cmd tdx_cmd;
	int r;

	r = tdx_get_cmd(argp, &tdx_cmd);
	if (r)
		return r;

	if (tdx_cmd.id == KVM_TDX_CAPABILITIES)
		return tdx_get_capabilities(&tdx_cmd);

	CLASS(tdx_vm_state_guard, guard)(kvm);
	if (IS_ERR(guard))
		return PTR_ERR(guard);

	switch (tdx_cmd.id) {
	case KVM_TDX_INIT_VM:
		r = tdx_td_init(kvm, &tdx_cmd);
		break;
	case KVM_TDX_FINALIZE_VM:
		r = tdx_td_finalize(kvm, &tdx_cmd);
		break;
	default:
		return -EINVAL;
	}

	if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
		return -EFAULT;

	return r;
}

/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */
static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	struct page *page;
	int ret, i;
	u64 err;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;
	tdx->vp.tdvpr_page = page;

	/*
	 * page_to_phys() does not work in 'noinstr' code, like guest
	 * entry via tdh_vp_enter(). Precalculate and store it instead
	 * of doing it at runtime later.
	 */
	tdx->vp.tdvpr_pa = page_to_phys(tdx->vp.tdvpr_page);

	tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
				     GFP_KERNEL);
	if (!tdx->vp.tdcx_pages) {
		ret = -ENOMEM;
		goto free_tdvpr;
	}

	for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
		page = alloc_page(GFP_KERNEL);
		if (!page) {
			ret = -ENOMEM;
			goto free_tdcx;
		}
		tdx->vp.tdcx_pages[i] = page;
	}

	err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
	if (TDX_BUG_ON(err, TDH_VP_CREATE, vcpu->kvm)) {
		ret = -EIO;
		goto free_tdcx;
	}

	for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
		err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
		if (TDX_BUG_ON(err, TDH_VP_ADDCX, vcpu->kvm)) {
			/*
			 * Pages already added are reclaimed by the vcpu_free
			 * method, but the rest are freed here.
			 */
			for (; i < kvm_tdx->td.tdcx_nr_pages; i++) {
				__free_page(tdx->vp.tdcx_pages[i]);
				tdx->vp.tdcx_pages[i] = NULL;
			}
			return -EIO;
		}
	}

	/*
	 * tdh_vp_init() can take an exclusive lock of the TDR resource inside
	 * the TDX-Module.  The TDR resource is also taken as shared in several
	 * no-fail MMU paths, which could return TDX_OPERAND_BUSY on contention
	 * (TDX-Module locks are try-lock implementations with no slow path).
	 * Take mmu_lock for write to reflect the nature of the lock taken by
	 * the TDX-Module, and to ensure the no-fail MMU paths succeed, e.g. if
	 * a concurrent PUNCH_HOLE on guest_memfd triggers removal of SPTEs.
	 */
	scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
		err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
		if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm))
			return -EIO;
	}

	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;

	return 0;

free_tdcx:
	for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
		if (tdx->vp.tdcx_pages[i])
			__free_page(tdx->vp.tdcx_pages[i]);
		tdx->vp.tdcx_pages[i] = NULL;
	}
	kfree(tdx->vp.tdcx_pages);
	tdx->vp.tdcx_pages = NULL;

free_tdvpr:
	if (tdx->vp.tdvpr_page)
		__free_page(tdx->vp.tdvpr_page);
	tdx->vp.tdvpr_page = NULL;
	tdx->vp.tdvpr_pa = 0;

	return ret;
}

/* Sometimes reads multiple subleafs. Return how many entries were written. */
static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index,
				   struct kvm_cpuid_entry2 *output_e)
{
	int sub_leaf = 0;
	int ret;

	/* First try without a subleaf */
	ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e);

	/* If success, or invalid leaf, just give up */
	if (ret != -EIO)
		return ret;

	/*
	 * If the try without a subleaf failed, try reading subleafs until
	 * failure. The TDX module only supports 6 bits of subleaf index.
	 */
	while (1) {
		/* Keep reading subleafs until there is a failure. */
		if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e))
			return !sub_leaf;

		sub_leaf++;
		output_e++;
	}

	return 0;
}

/*
 * KVM_TDX_GET_CPUID: dump the TD's effective CPUID values (as recorded by
 * the TDX module) into the userspace-provided kvm_cpuid2 buffer.  Walks the
 * basic range (leaf 0..max) then the extended range (0x80000000..max).
 */
static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
{
	struct kvm_cpuid2 __user *output;
	struct kvm_cpuid2 *td_cpuid;
	int r = 0, i = 0, leaf;
	u32 level;

	output = u64_to_user_ptr(cmd->data);
	td_cpuid = kzalloc(sizeof(*td_cpuid) +
			   sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES,
			   GFP_KERNEL);
	if (!td_cpuid)
		return -ENOMEM;

	if (copy_from_user(td_cpuid, output, sizeof(*output))) {
		r = -EFAULT;
		goto out;
	}

	/* Read max CPUID for normal range */
	if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) {
		r = -EIO;
		goto out;
	}
	level = td_cpuid->entries[0].eax;

	for (leaf = 1; leaf <= level; leaf++)
		tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);

	/* Read max CPUID for extended range */
	if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) {
		r = -EIO;
		goto out;
	}
	level = td_cpuid->entries[i - 1].eax;

	for (leaf = 0x80000001; leaf <= level; leaf++)
		tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);

	if (td_cpuid->nent < i)
		r = -E2BIG;
	td_cpuid->nent = i;

	if (copy_to_user(output, td_cpuid, sizeof(*output))) {
		r = -EFAULT;
		goto out;
	}

	if (r == -E2BIG)
		goto out;

	if (copy_to_user(output->entries, td_cpuid->entries,
			 td_cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
		r = -EFAULT;

out:
	kfree(td_cpuid);

	return r;
}

/*
 * KVM_TDX_INIT_VCPU: force the x2APIC base, create/initialize the TDVPS
 * (TDVPR + TDCX pages) via TDH.VP.*, and enable posted interrupts in the
 * TD VMCS.
 */
static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
{
	u64 apic_base;
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	int ret;

	if (cmd->flags)
		return -EINVAL;

	if (tdx->state != VCPU_TD_STATE_UNINITIALIZED)
		return -EINVAL;

	/*
	 * TDX requires X2APIC, userspace is responsible for configuring guest
	 * CPUID accordingly.
	 */
	apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
		    (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0);
	if (kvm_apic_set_base(vcpu, apic_base, true))
		return -EINVAL;

	ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);
	if (ret)
		return ret;

	td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR);
	td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc));
	td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR);

	tdx->state = VCPU_TD_STATE_INITIALIZED;

	return 0;
}

void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
	/*
	 * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all
	 * INIT events.
	 *
	 * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as
	 * userspace needs to define the vCPU model before KVM can initialize
	 * vCPU state, e.g. to enable x2APIC.
	 */
	WARN_ON_ONCE(init_event);
}

/* Per-call context handed from tdx_vcpu_init_mem_region() to the callback. */
struct tdx_gmem_post_populate_arg {
	struct kvm_vcpu *vcpu;
	__u32 flags;
};

/*
 * kvm_gmem_populate() callback: map one private page into the S-EPT with
 * its initial contents (via kvm_tdx->page_add_src), and optionally extend
 * the TD measurement over it with TDH.MR.EXTEND.
 */
static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
				  struct page *src_page, void *_arg)
{
	struct tdx_gmem_post_populate_arg *arg = _arg;
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	u64 err, entry, level_state;
	gpa_t gpa = gfn_to_gpa(gfn);
	int ret, i;

	if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm))
		return -EIO;

	if (!src_page)
		return -EOPNOTSUPP;

	kvm_tdx->page_add_src = src_page;
	ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
	kvm_tdx->page_add_src = NULL;

	if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION))
		return ret;

	/*
	 * Note, MR.EXTEND can fail if the S-EPT mapping is somehow removed
	 * between mapping the pfn and now, but slots_lock prevents memslot
	 * updates, filemap_invalidate_lock() prevents guest_memfd updates,
	 * mmu_notifier events can't reach S-EPT entries, and KVM's internal
	 * zapping flows are mutually exclusive with S-EPT mappings.
	 */
	for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
		err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, &level_state);
		if (TDX_BUG_ON_2(err, TDH_MR_EXTEND, entry, level_state, kvm))
			return -EIO;
	}

	return 0;
}

/*
 * KVM_TDX_INIT_MEM_REGION: populate a range of private guest memory from a
 * userspace source buffer, one page per kvm_gmem_populate() call, writing
 * back progress so the operation is restartable after -EINTR.
 */
static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	struct kvm *kvm = vcpu->kvm;
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	struct kvm_tdx_init_mem_region region;
	struct tdx_gmem_post_populate_arg arg;
	long gmem_ret;
	int ret;

	if (tdx->state != VCPU_TD_STATE_INITIALIZED)
		return -EINVAL;

	/* Once TD is finalized, the initial guest memory is fixed. */
	if (kvm_tdx->state == TD_STATE_RUNNABLE)
		return -EINVAL;

	if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
		return -EINVAL;

	if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region)))
		return -EFAULT;

	if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) ||
	    !region.nr_pages ||
	    region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
	    !vt_is_tdx_private_gpa(kvm, region.gpa) ||
	    !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
		return -EINVAL;

	ret = 0;
	while (region.nr_pages) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		arg = (struct tdx_gmem_post_populate_arg) {
			.vcpu = vcpu,
			.flags = cmd->flags,
		};
		gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
					     u64_to_user_ptr(region.source_addr),
					     1, tdx_gmem_post_populate, &arg);
		if (gmem_ret < 0) {
			ret = gmem_ret;
			break;
		}

		if (gmem_ret != 1) {
			ret = -EIO;
			break;
		}

		region.source_addr += PAGE_SIZE;
		region.gpa += PAGE_SIZE;
		region.nr_pages--;

		cond_resched();
	}

	if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region)))
		ret = -EFAULT;
	return ret;
}

int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	struct kvm_tdx_cmd cmd;
	int r;

	r = tdx_get_cmd(argp, &cmd);
	if (r)
		return r;

	CLASS(tdx_vm_state_guard, guard)(kvm);
	if (IS_ERR(guard))
		return PTR_ERR(guard);

	if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
		return -EINVAL;

	vcpu_load(vcpu);

	switch (cmd.id) {
	case KVM_TDX_INIT_MEM_REGION:
		r = tdx_vcpu_init_mem_region(vcpu, &cmd);
		break;
	case KVM_TDX_INIT_VCPU:
		r
= tdx_vcpu_init(vcpu, &cmd); 3249 break; 3250 default: 3251 r = -ENOIOCTLCMD; 3252 break; 3253 } 3254 3255 vcpu_put(vcpu); 3256 3257 return r; 3258 } 3259 3260 int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) 3261 { 3262 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 3263 struct kvm_tdx_cmd cmd; 3264 int ret; 3265 3266 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) 3267 return -EINVAL; 3268 3269 ret = tdx_get_cmd(argp, &cmd); 3270 if (ret) 3271 return ret; 3272 3273 switch (cmd.id) { 3274 case KVM_TDX_GET_CPUID: 3275 ret = tdx_vcpu_get_cpuid(vcpu, &cmd); 3276 break; 3277 default: 3278 ret = -EINVAL; 3279 break; 3280 } 3281 3282 return ret; 3283 } 3284 3285 int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private) 3286 { 3287 if (!is_private) 3288 return 0; 3289 3290 return PG_LEVEL_4K; 3291 } 3292 3293 static int tdx_online_cpu(unsigned int cpu) 3294 { 3295 unsigned long flags; 3296 int r; 3297 3298 /* Sanity check CPU is already in post-VMXON */ 3299 WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE)); 3300 3301 local_irq_save(flags); 3302 r = tdx_cpu_enable(); 3303 local_irq_restore(flags); 3304 3305 return r; 3306 } 3307 3308 static int tdx_offline_cpu(unsigned int cpu) 3309 { 3310 int i; 3311 3312 /* No TD is running. Allow any cpu to be offline. */ 3313 if (!atomic_read(&nr_configured_hkid)) 3314 return 0; 3315 3316 /* 3317 * In order to reclaim TDX HKID, (i.e. when deleting guest TD), need to 3318 * call TDH.PHYMEM.PAGE.WBINVD on all packages to program all memory 3319 * controller with pconfig. If we have active TDX HKID, refuse to 3320 * offline the last online cpu. 3321 */ 3322 for_each_online_cpu(i) { 3323 /* 3324 * Found another online cpu on the same package. 3325 * Allow to offline. 3326 */ 3327 if (i != cpu && topology_physical_package_id(i) == 3328 topology_physical_package_id(cpu)) 3329 return 0; 3330 } 3331 3332 /* 3333 * This is the last cpu of this package. Don't offline it. 
3334 * 3335 * Because it's hard for human operator to understand the 3336 * reason, warn it. 3337 */ 3338 #define MSG_ALLPKG_ONLINE \ 3339 "TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n" 3340 pr_warn_ratelimited(MSG_ALLPKG_ONLINE); 3341 return -EBUSY; 3342 } 3343 3344 static void __do_tdx_cleanup(void) 3345 { 3346 /* 3347 * Once TDX module is initialized, it cannot be disabled and 3348 * re-initialized again w/o runtime update (which isn't 3349 * supported by kernel). Only need to remove the cpuhp here. 3350 * The TDX host core code tracks TDX status and can handle 3351 * 'multiple enabling' scenario. 3352 */ 3353 WARN_ON_ONCE(!tdx_cpuhp_state); 3354 cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state); 3355 tdx_cpuhp_state = 0; 3356 } 3357 3358 static void __tdx_cleanup(void) 3359 { 3360 cpus_read_lock(); 3361 __do_tdx_cleanup(); 3362 cpus_read_unlock(); 3363 } 3364 3365 static int __init __do_tdx_bringup(void) 3366 { 3367 int r; 3368 3369 /* 3370 * TDX-specific cpuhp callback to call tdx_cpu_enable() on all 3371 * online CPUs before calling tdx_enable(), and on any new 3372 * going-online CPU to make sure it is ready for TDX guest. 3373 */ 3374 r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN, 3375 "kvm/cpu/tdx:online", 3376 tdx_online_cpu, tdx_offline_cpu); 3377 if (r < 0) 3378 return r; 3379 3380 tdx_cpuhp_state = r; 3381 3382 r = tdx_enable(); 3383 if (r) 3384 __do_tdx_cleanup(); 3385 3386 return r; 3387 } 3388 3389 static int __init __tdx_bringup(void) 3390 { 3391 const struct tdx_sys_info_td_conf *td_conf; 3392 int r, i; 3393 3394 for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) { 3395 /* 3396 * Check if MSRs (tdx_uret_msrs) can be saved/restored 3397 * before returning to user space. 
3398 */ 3399 tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr); 3400 if (tdx_uret_msrs[i].slot == -1) { 3401 /* If any MSR isn't supported, it is a KVM bug */ 3402 pr_err("MSR %x isn't included by kvm_find_user_return_msr\n", 3403 tdx_uret_msrs[i].msr); 3404 return -EIO; 3405 } 3406 } 3407 3408 /* 3409 * Enabling TDX requires enabling hardware virtualization first, 3410 * as making SEAMCALLs requires CPU being in post-VMXON state. 3411 */ 3412 r = kvm_enable_virtualization(); 3413 if (r) 3414 return r; 3415 3416 cpus_read_lock(); 3417 r = __do_tdx_bringup(); 3418 cpus_read_unlock(); 3419 3420 if (r) 3421 goto tdx_bringup_err; 3422 3423 r = -EINVAL; 3424 /* Get TDX global information for later use */ 3425 tdx_sysinfo = tdx_get_sysinfo(); 3426 if (WARN_ON_ONCE(!tdx_sysinfo)) 3427 goto get_sysinfo_err; 3428 3429 /* Check TDX module and KVM capabilities */ 3430 if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) || 3431 !tdx_get_supported_xfam(&tdx_sysinfo->td_conf)) 3432 goto get_sysinfo_err; 3433 3434 if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)) 3435 goto get_sysinfo_err; 3436 3437 /* 3438 * TDX has its own limit of maximum vCPUs it can support for all 3439 * TDX guests in addition to KVM_MAX_VCPUS. Userspace needs to 3440 * query TDX guest's maximum vCPUs by checking KVM_CAP_MAX_VCPU 3441 * extension on per-VM basis. 3442 * 3443 * TDX module reports such limit via the MAX_VCPU_PER_TD global 3444 * metadata. Different modules may report different values. 3445 * Some old module may also not support this metadata (in which 3446 * case this limit is U16_MAX). 3447 * 3448 * In practice, the reported value reflects the maximum logical 3449 * CPUs that ALL the platforms that the module supports can 3450 * possibly have. 3451 * 3452 * Simply forwarding the MAX_VCPU_PER_TD to userspace could 3453 * result in an unpredictable ABI. 
KVM instead always advertise 3454 * the number of logical CPUs the platform has as the maximum 3455 * vCPUs for TDX guests. 3456 * 3457 * Make sure MAX_VCPU_PER_TD reported by TDX module is not 3458 * smaller than the number of logical CPUs, otherwise KVM will 3459 * report an unsupported value to userspace. 3460 * 3461 * Note, a platform with TDX enabled in the BIOS cannot support 3462 * physical CPU hotplug, and TDX requires the BIOS has marked 3463 * all logical CPUs in MADT table as enabled. Just use 3464 * num_present_cpus() for the number of logical CPUs. 3465 */ 3466 td_conf = &tdx_sysinfo->td_conf; 3467 if (td_conf->max_vcpus_per_td < num_present_cpus()) { 3468 pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n", 3469 td_conf->max_vcpus_per_td, num_present_cpus()); 3470 goto get_sysinfo_err; 3471 } 3472 3473 if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) 3474 goto get_sysinfo_err; 3475 3476 /* 3477 * Leave hardware virtualization enabled after TDX is enabled 3478 * successfully. TDX CPU hotplug depends on this. 3479 */ 3480 return 0; 3481 3482 get_sysinfo_err: 3483 __tdx_cleanup(); 3484 tdx_bringup_err: 3485 kvm_disable_virtualization(); 3486 return r; 3487 } 3488 3489 void tdx_cleanup(void) 3490 { 3491 if (enable_tdx) { 3492 misc_cg_set_capacity(MISC_CG_RES_TDX, 0); 3493 __tdx_cleanup(); 3494 kvm_disable_virtualization(); 3495 } 3496 } 3497 3498 int __init tdx_bringup(void) 3499 { 3500 int r, i; 3501 3502 /* tdx_disable_virtualization_cpu() uses associated_tdvcpus. 
*/ 3503 for_each_possible_cpu(i) 3504 INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i)); 3505 3506 if (!enable_tdx) 3507 return 0; 3508 3509 if (!enable_ept) { 3510 pr_err("EPT is required for TDX\n"); 3511 goto success_disable_tdx; 3512 } 3513 3514 if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) { 3515 pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n"); 3516 goto success_disable_tdx; 3517 } 3518 3519 if (!enable_apicv) { 3520 pr_err("APICv is required for TDX\n"); 3521 goto success_disable_tdx; 3522 } 3523 3524 if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) { 3525 pr_err("tdx: OSXSAVE is required for TDX\n"); 3526 goto success_disable_tdx; 3527 } 3528 3529 if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) { 3530 pr_err("tdx: MOVDIR64B is required for TDX\n"); 3531 goto success_disable_tdx; 3532 } 3533 3534 if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) { 3535 pr_err("Self-snoop is required for TDX\n"); 3536 goto success_disable_tdx; 3537 } 3538 3539 if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) { 3540 pr_err("tdx: no TDX private KeyIDs available\n"); 3541 goto success_disable_tdx; 3542 } 3543 3544 if (!enable_virt_at_load) { 3545 pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n"); 3546 goto success_disable_tdx; 3547 } 3548 3549 /* 3550 * Ideally KVM should probe whether TDX module has been loaded 3551 * first and then try to bring it up. But TDX needs to use SEAMCALL 3552 * to probe whether the module is loaded (there is no CPUID or MSR 3553 * for that), and making SEAMCALL requires enabling virtualization 3554 * first, just like the rest steps of bringing up TDX module. 3555 * 3556 * So, for simplicity do everything in __tdx_bringup(); the first 3557 * SEAMCALL will return -ENODEV when the module is not loaded. The 3558 * only complication is having to make sure that initialization 3559 * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other 3560 * cases. 
3561 */ 3562 r = __tdx_bringup(); 3563 if (r) { 3564 /* 3565 * Disable TDX only but don't fail to load module if the TDX 3566 * module could not be loaded. No need to print message saying 3567 * "module is not loaded" because it was printed when the first 3568 * SEAMCALL failed. Don't bother unwinding the S-EPT hooks or 3569 * vm_size, as kvm_x86_ops have already been finalized (and are 3570 * intentionally not exported). The S-EPT code is unreachable, 3571 * and allocating a few more bytes per VM in a should-be-rare 3572 * failure scenario is a non-issue. 3573 */ 3574 if (r == -ENODEV) 3575 goto success_disable_tdx; 3576 3577 enable_tdx = 0; 3578 } 3579 3580 return r; 3581 3582 success_disable_tdx: 3583 enable_tdx = 0; 3584 return 0; 3585 } 3586 3587 void __init tdx_hardware_setup(void) 3588 { 3589 KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx); 3590 3591 /* 3592 * Note, if the TDX module can't be loaded, KVM TDX support will be 3593 * disabled but KVM will continue loading (see tdx_bringup()). 3594 */ 3595 vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx)); 3596 3597 vt_x86_ops.link_external_spt = tdx_sept_link_private_spt; 3598 vt_x86_ops.set_external_spte = tdx_sept_set_private_spte; 3599 vt_x86_ops.free_external_spt = tdx_sept_free_private_spt; 3600 vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte; 3601 vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt; 3602 } 3603