1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/cleanup.h> 3 #include <linux/cpu.h> 4 #include <asm/cpufeature.h> 5 #include <asm/fpu/xcr.h> 6 #include <linux/misc_cgroup.h> 7 #include <linux/mmu_context.h> 8 #include <asm/tdx.h> 9 #include <asm/virt.h> 10 #include "capabilities.h" 11 #include "mmu.h" 12 #include "x86_ops.h" 13 #include "lapic.h" 14 #include "tdx.h" 15 #include "vmx.h" 16 #include "mmu/spte.h" 17 #include "common.h" 18 #include "posted_intr.h" 19 #include "irq.h" 20 #include <trace/events/kvm.h> 21 #include "trace.h" 22 23 #pragma GCC poison to_vmx 24 25 #undef pr_fmt 26 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 27 28 #define __TDX_BUG_ON(__err, __f, __kvm, __fmt, __args...) \ 29 ({ \ 30 struct kvm *_kvm = (__kvm); \ 31 bool __ret = !!(__err); \ 32 \ 33 if (WARN_ON_ONCE(__ret && (!_kvm || !_kvm->vm_bugged))) { \ 34 if (_kvm) \ 35 kvm_vm_bugged(_kvm); \ 36 pr_err_ratelimited("SEAMCALL " __f " failed: 0x%llx" __fmt "\n",\ 37 __err, __args); \ 38 } \ 39 unlikely(__ret); \ 40 }) 41 42 #define TDX_BUG_ON(__err, __fn, __kvm) \ 43 __TDX_BUG_ON(__err, #__fn, __kvm, "%s", "") 44 45 #define TDX_BUG_ON_1(__err, __fn, a1, __kvm) \ 46 __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx", a1) 47 48 #define TDX_BUG_ON_2(__err, __fn, a1, a2, __kvm) \ 49 __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx", a1, a2) 50 51 #define TDX_BUG_ON_3(__err, __fn, a1, a2, a3, __kvm) \ 52 __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 ", 0x%llx, " #a3 " 0x%llx", \ 53 a1, a2, a3) 54 55 56 bool enable_tdx __ro_after_init; 57 module_param_named(tdx, enable_tdx, bool, 0444); 58 59 #define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51)) 60 #define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47)) 61 62 static const struct tdx_sys_info *tdx_sysinfo; 63 64 void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err) 65 { 66 KVM_BUG_ON(1, tdx->vcpu.kvm); 67 pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err); 68 } 69 70 void 
tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field, 71 u64 val, u64 err) 72 { 73 KVM_BUG_ON(1, tdx->vcpu.kvm); 74 pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err); 75 } 76 77 #define KVM_SUPPORTED_TDX_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE) 78 79 static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm) 80 { 81 return container_of(kvm, struct kvm_tdx, kvm); 82 } 83 84 static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu) 85 { 86 return container_of(vcpu, struct vcpu_tdx, vcpu); 87 } 88 89 static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf) 90 { 91 u64 val = KVM_SUPPORTED_TDX_TD_ATTRS; 92 93 if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1) 94 return 0; 95 96 val &= td_conf->attributes_fixed0; 97 98 return val; 99 } 100 101 static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf) 102 { 103 u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss; 104 105 if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1) 106 return 0; 107 108 val &= td_conf->xfam_fixed0; 109 110 return val; 111 } 112 113 static int tdx_get_guest_phys_addr_bits(const u32 eax) 114 { 115 return (eax & GENMASK(23, 16)) >> 16; 116 } 117 118 static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits) 119 { 120 return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16; 121 } 122 123 #define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM)) 124 125 static bool has_tsx(const struct kvm_cpuid_entry2 *entry) 126 { 127 return entry->function == 7 && entry->index == 0 && 128 (entry->ebx & TDX_FEATURE_TSX); 129 } 130 131 static void clear_tsx(struct kvm_cpuid_entry2 *entry) 132 { 133 entry->ebx &= ~TDX_FEATURE_TSX; 134 } 135 136 static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry) 137 { 138 return entry->function == 7 && entry->index == 0 && 139 (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG)); 140 } 141 
142 static void clear_waitpkg(struct kvm_cpuid_entry2 *entry) 143 { 144 entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG); 145 } 146 147 static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry) 148 { 149 if (has_tsx(entry)) 150 clear_tsx(entry); 151 152 if (has_waitpkg(entry)) 153 clear_waitpkg(entry); 154 } 155 156 static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry) 157 { 158 return has_tsx(entry) || has_waitpkg(entry); 159 } 160 161 #define KVM_TDX_CPUID_NO_SUBLEAF ((__u32)-1) 162 163 static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx) 164 { 165 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; 166 167 entry->function = (u32)td_conf->cpuid_config_leaves[idx]; 168 entry->index = td_conf->cpuid_config_leaves[idx] >> 32; 169 entry->eax = (u32)td_conf->cpuid_config_values[idx][0]; 170 entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32; 171 entry->ecx = (u32)td_conf->cpuid_config_values[idx][1]; 172 entry->edx = td_conf->cpuid_config_values[idx][1] >> 32; 173 174 if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF) 175 entry->index = 0; 176 177 /* 178 * The TDX module doesn't allow configuring the guest phys addr bits 179 * (EAX[23:16]). However, KVM uses it as an interface to the userspace 180 * to configure the GPAW. Report these bits as configurable. 
181 */ 182 if (entry->function == 0x80000008) 183 entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff); 184 185 tdx_clear_unsupported_cpuid(entry); 186 } 187 188 #define TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT BIT(1) 189 190 static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf, 191 struct kvm_tdx_capabilities *caps) 192 { 193 int i; 194 195 caps->supported_attrs = tdx_get_supported_attrs(td_conf); 196 if (!caps->supported_attrs) 197 return -EIO; 198 199 caps->supported_xfam = tdx_get_supported_xfam(td_conf); 200 if (!caps->supported_xfam) 201 return -EIO; 202 203 caps->cpuid.nent = td_conf->num_cpuid_config; 204 205 caps->user_tdvmcallinfo_1_r11 = 206 TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT; 207 208 for (i = 0; i < td_conf->num_cpuid_config; i++) 209 td_init_cpuid_entry2(&caps->cpuid.entries[i], i); 210 211 return 0; 212 } 213 214 /* 215 * Some SEAMCALLs acquire the TDX module globally, and can fail with 216 * TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs. 217 */ 218 static DEFINE_MUTEX(tdx_lock); 219 220 static bool tdx_operand_busy(u64 err) 221 { 222 return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY; 223 } 224 225 226 /* 227 * A per-CPU list of TD vCPUs associated with a given CPU. 228 * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU 229 * list. 230 * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of 231 * the old CPU during the IPI callback running on the old CPU, and then added 232 * to the per-CPU list of the new CPU. 233 * - When a TD is tearing down, all vCPUs are disassociated from their current 234 * running CPUs and removed from the per-CPU list during the IPI callback 235 * running on those CPUs. 236 * - When a CPU is brought down, traverse the per-CPU list to disassociate all 237 * associated TD vCPUs and remove them from the per-CPU list. 
238 */ 239 static DEFINE_PER_CPU(struct list_head, associated_tdvcpus); 240 241 static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu) 242 { 243 return to_tdx(vcpu)->vp_enter_args.r10; 244 } 245 246 static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu) 247 { 248 return to_tdx(vcpu)->vp_enter_args.r11; 249 } 250 251 static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu, 252 long val) 253 { 254 to_tdx(vcpu)->vp_enter_args.r10 = val; 255 } 256 257 static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu, 258 unsigned long val) 259 { 260 to_tdx(vcpu)->vp_enter_args.r11 = val; 261 } 262 263 static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx) 264 { 265 tdx_guest_keyid_free(kvm_tdx->hkid); 266 kvm_tdx->hkid = -1; 267 misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); 268 put_misc_cg(kvm_tdx->misc_cg); 269 kvm_tdx->misc_cg = NULL; 270 } 271 272 static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx) 273 { 274 return kvm_tdx->hkid > 0; 275 } 276 277 static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu) 278 { 279 lockdep_assert_irqs_disabled(); 280 281 list_del(&to_tdx(vcpu)->cpu_list); 282 283 /* 284 * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1, 285 * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU 286 * to its list before it's deleted from this CPU's list. 287 */ 288 smp_wmb(); 289 290 vcpu->cpu = -1; 291 } 292 293 /* 294 * Execute a SEAMCALL related to removing/blocking S-EPT entries, with a single 295 * retry (if necessary) after forcing vCPUs to exit and wait for the operation 296 * to complete. All flows that remove/block S-EPT entries run with mmu_lock 297 * held for write, i.e. are mutually exclusive with each other, but they aren't 298 * mutually exclusive with running vCPUs, and so can fail with "operand busy" 299 * if a vCPU acquires a relevant lock in the TDX-Module, e.g. when doing TDCALL. 
300 * 301 * Note, the retry is guaranteed to succeed, absent KVM and/or TDX-Module bugs. 302 */ 303 #define tdh_do_no_vcpus(tdh_func, kvm, args...) \ 304 ({ \ 305 struct kvm_tdx *__kvm_tdx = to_kvm_tdx(kvm); \ 306 u64 __err; \ 307 \ 308 lockdep_assert_held_write(&kvm->mmu_lock); \ 309 \ 310 __err = tdh_func(args); \ 311 if (unlikely(tdx_operand_busy(__err))) { \ 312 WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, true); \ 313 kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); \ 314 \ 315 __err = tdh_func(args); \ 316 \ 317 WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, false); \ 318 } \ 319 __err; \ 320 }) 321 322 /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */ 323 static int __tdx_reclaim_page(struct page *page) 324 { 325 u64 err, rcx, rdx, r8; 326 327 err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8); 328 329 /* 330 * No need to check for TDX_OPERAND_BUSY; all TD pages are freed 331 * before the HKID is released and control pages have also been 332 * released at this point, so there is no possibility of contention. 333 */ 334 if (TDX_BUG_ON_3(err, TDH_PHYMEM_PAGE_RECLAIM, rcx, rdx, r8, NULL)) 335 return -EIO; 336 337 return 0; 338 } 339 340 static int tdx_reclaim_page(struct page *page) 341 { 342 int r; 343 344 r = __tdx_reclaim_page(page); 345 if (!r) 346 tdx_quirk_reset_page(page); 347 return r; 348 } 349 350 351 /* 352 * Reclaim the TD control page(s) which are crypto-protected by TDX guest's 353 * private KeyID. Assume the cache associated with the TDX private KeyID has 354 * been flushed. 355 */ 356 static void tdx_reclaim_control_page(struct page *ctrl_page) 357 { 358 /* 359 * Leak the page if the kernel failed to reclaim the page. 360 * The kernel cannot use it safely anymore. 
361 */ 362 if (tdx_reclaim_page(ctrl_page)) 363 return; 364 365 __free_page(ctrl_page); 366 } 367 368 struct tdx_flush_vp_arg { 369 struct kvm_vcpu *vcpu; 370 u64 err; 371 }; 372 373 static void tdx_flush_vp(void *_arg) 374 { 375 struct tdx_flush_vp_arg *arg = _arg; 376 struct kvm_vcpu *vcpu = arg->vcpu; 377 u64 err; 378 379 arg->err = 0; 380 lockdep_assert_irqs_disabled(); 381 382 /* Task migration can race with CPU offlining. */ 383 if (unlikely(vcpu->cpu != raw_smp_processor_id())) 384 return; 385 386 /* 387 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The 388 * list tracking still needs to be updated so that it's correct if/when 389 * the vCPU does get initialized. 390 */ 391 if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) { 392 /* 393 * No need to retry. TDX Resources needed for TDH.VP.FLUSH are: 394 * TDVPR as exclusive, TDR as shared, and TDCS as shared. This 395 * vp flush function is called when destructing vCPU/TD or vCPU 396 * migration. No other thread uses TDVPR in those cases. 397 */ 398 err = tdh_vp_flush(&to_tdx(vcpu)->vp); 399 if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) { 400 /* 401 * This function is called in IPI context. Do not use 402 * printk to avoid console semaphore. 403 * The caller prints out the error message, instead. 
404 */ 405 if (err) 406 arg->err = err; 407 } 408 } 409 410 tdx_disassociate_vp(vcpu); 411 } 412 413 static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu) 414 { 415 struct tdx_flush_vp_arg arg = { 416 .vcpu = vcpu, 417 }; 418 int cpu = vcpu->cpu; 419 420 if (unlikely(cpu == -1)) 421 return; 422 423 smp_call_function_single(cpu, tdx_flush_vp, &arg, 1); 424 425 TDX_BUG_ON(arg.err, TDH_VP_FLUSH, vcpu->kvm); 426 } 427 428 void tdx_disable_virtualization_cpu(void) 429 { 430 int cpu = raw_smp_processor_id(); 431 struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu); 432 struct tdx_flush_vp_arg arg; 433 struct vcpu_tdx *tdx, *tmp; 434 unsigned long flags; 435 436 local_irq_save(flags); 437 /* Safe variant needed as tdx_disassociate_vp() deletes the entry. */ 438 list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) { 439 arg.vcpu = &tdx->vcpu; 440 tdx_flush_vp(&arg); 441 } 442 local_irq_restore(flags); 443 444 /* 445 * Flush cache now if kexec is possible: this is necessary to avoid 446 * having dirty private memory cachelines when the new kernel boots, 447 * but WBINVD is a relatively expensive operation and doing it during 448 * kexec can exacerbate races in native_stop_other_cpus(). Do it 449 * now, since this is a safe moment and there is going to be no more 450 * TDX activity on this CPU from this point on. 451 */ 452 tdx_cpu_flush_cache_for_kexec(); 453 } 454 455 #define TDX_SEAMCALL_RETRIES 10000 456 457 static void smp_func_do_phymem_cache_wb(void *unused) 458 { 459 u64 err = 0; 460 bool resume; 461 int i; 462 463 /* 464 * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private 465 * KeyID on the package or core. The TDX module may not finish the 466 * cache flush but return TDX_INTERRUPTED_RESUMEABLE instead. The 467 * kernel should retry it until it returns success w/o rescheduling. 
468 */ 469 for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) { 470 resume = !!err; 471 err = tdh_phymem_cache_wb(resume); 472 switch (err) { 473 case TDX_INTERRUPTED_RESUMABLE: 474 continue; 475 case TDX_NO_HKID_READY_TO_WBCACHE: 476 err = TDX_SUCCESS; /* Already done by other thread */ 477 fallthrough; 478 default: 479 goto out; 480 } 481 } 482 483 out: 484 TDX_BUG_ON(err, TDH_PHYMEM_CACHE_WB, NULL); 485 } 486 487 void tdx_mmu_release_hkid(struct kvm *kvm) 488 { 489 bool packages_allocated, targets_allocated; 490 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 491 cpumask_var_t packages, targets; 492 struct kvm_vcpu *vcpu; 493 unsigned long j; 494 int i; 495 u64 err; 496 497 if (!is_hkid_assigned(kvm_tdx)) 498 return; 499 500 packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL); 501 targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL); 502 cpus_read_lock(); 503 504 kvm_for_each_vcpu(j, vcpu, kvm) 505 tdx_flush_vp_on_cpu(vcpu); 506 507 /* 508 * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock 509 * and can fail with TDX_OPERAND_BUSY when it fails to get the lock. 510 * Multiple TDX guests can be destroyed simultaneously. Take the 511 * mutex to prevent it from getting error. 512 */ 513 mutex_lock(&tdx_lock); 514 515 /* 516 * Releasing HKID is in vm_destroy(). 517 * After the above flushing vps, there should be no more vCPU 518 * associations, as all vCPU fds have been released at this stage. 519 */ 520 err = tdh_mng_vpflushdone(&kvm_tdx->td); 521 if (err == TDX_FLUSHVP_NOT_DONE) 522 goto out; 523 if (TDX_BUG_ON(err, TDH_MNG_VPFLUSHDONE, kvm)) { 524 pr_err("tdh_mng_vpflushdone() failed. 
HKID %d is leaked.\n", 525 kvm_tdx->hkid); 526 goto out; 527 } 528 529 for_each_online_cpu(i) { 530 if (packages_allocated && 531 cpumask_test_and_set_cpu(topology_physical_package_id(i), 532 packages)) 533 continue; 534 if (targets_allocated) 535 cpumask_set_cpu(i, targets); 536 } 537 if (targets_allocated) 538 on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true); 539 else 540 on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true); 541 /* 542 * In the case of error in smp_func_do_phymem_cache_wb(), the following 543 * tdh_mng_key_freeid() will fail. 544 */ 545 err = tdh_mng_key_freeid(&kvm_tdx->td); 546 if (TDX_BUG_ON(err, TDH_MNG_KEY_FREEID, kvm)) { 547 pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n", 548 kvm_tdx->hkid); 549 } else { 550 tdx_hkid_free(kvm_tdx); 551 } 552 553 out: 554 mutex_unlock(&tdx_lock); 555 cpus_read_unlock(); 556 free_cpumask_var(targets); 557 free_cpumask_var(packages); 558 } 559 560 static void tdx_reclaim_td_control_pages(struct kvm *kvm) 561 { 562 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 563 u64 err; 564 int i; 565 566 /* 567 * tdx_mmu_release_hkid() failed to reclaim HKID. Something went wrong 568 * heavily with TDX module. Give up freeing TD pages. As the function 569 * already warned, don't warn it again. 570 */ 571 if (is_hkid_assigned(kvm_tdx)) 572 return; 573 574 if (kvm_tdx->td.tdcs_pages) { 575 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { 576 if (!kvm_tdx->td.tdcs_pages[i]) 577 continue; 578 579 tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]); 580 } 581 kfree(kvm_tdx->td.tdcs_pages); 582 kvm_tdx->td.tdcs_pages = NULL; 583 } 584 585 if (!kvm_tdx->td.tdr_page) 586 return; 587 588 if (__tdx_reclaim_page(kvm_tdx->td.tdr_page)) 589 return; 590 591 /* 592 * Use a SEAMCALL to ask the TDX module to flush the cache based on the 593 * KeyID. TDX module may access TDR while operating on TD (Especially 594 * when it is reclaiming TDCS). 
595 */ 596 err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td); 597 if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm)) 598 return; 599 600 tdx_quirk_reset_page(kvm_tdx->td.tdr_page); 601 602 __free_page(kvm_tdx->td.tdr_page); 603 kvm_tdx->td.tdr_page = NULL; 604 } 605 606 void tdx_vm_destroy(struct kvm *kvm) 607 { 608 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 609 610 tdx_reclaim_td_control_pages(kvm); 611 612 kvm_tdx->state = TD_STATE_UNINITIALIZED; 613 } 614 615 static int tdx_do_tdh_mng_key_config(void *param) 616 { 617 struct kvm_tdx *kvm_tdx = param; 618 u64 err; 619 620 /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */ 621 err = tdh_mng_key_config(&kvm_tdx->td); 622 if (TDX_BUG_ON(err, TDH_MNG_KEY_CONFIG, &kvm_tdx->kvm)) 623 return -EIO; 624 625 return 0; 626 } 627 628 int tdx_vm_init(struct kvm *kvm) 629 { 630 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 631 632 kvm->arch.has_protected_state = true; 633 /* 634 * TDX Module doesn't allow the hypervisor to modify the EOI-bitmap, 635 * i.e. all EOIs are accelerated and never trigger exits. 636 */ 637 kvm->arch.has_protected_eoi = true; 638 kvm->arch.has_private_mem = true; 639 kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT; 640 641 /* 642 * Because guest TD is protected, VMM can't parse the instruction in TD. 643 * Instead, guest uses MMIO hypercall. For unmodified device driver, 644 * #VE needs to be injected for MMIO and #VE handler in TD converts MMIO 645 * instruction into MMIO hypercall. 646 * 647 * SPTE value for MMIO needs to be setup so that #VE is injected into 648 * TD instead of triggering EPT MISCONFIG. 649 * - RWX=0 so that EPT violation is triggered. 650 * - suppress #VE bit is cleared to inject #VE. 651 */ 652 kvm_mmu_set_mmio_spte_value(kvm, 0); 653 654 /* 655 * TDX has its own limit of maximum vCPUs it can support for all 656 * TDX guests in addition to KVM_MAX_VCPUS. TDX module reports 657 * such limit via the MAX_VCPU_PER_TD global metadata. 
In 658 * practice, it reflects the number of logical CPUs that ALL 659 * platforms that the TDX module supports can possibly have. 660 * 661 * Limit TDX guest's maximum vCPUs to the number of logical CPUs 662 * the platform has. Simply forwarding the MAX_VCPU_PER_TD to 663 * userspace would result in an unpredictable ABI. 664 */ 665 kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus()); 666 667 kvm_tdx->state = TD_STATE_UNINITIALIZED; 668 669 return 0; 670 } 671 672 int tdx_vcpu_create(struct kvm_vcpu *vcpu) 673 { 674 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 675 struct vcpu_tdx *tdx = to_tdx(vcpu); 676 677 if (kvm_tdx->state != TD_STATE_INITIALIZED) 678 return -EIO; 679 680 /* 681 * TDX module mandates APICv, which requires an in-kernel local APIC. 682 * Disallow an in-kernel I/O APIC, because level-triggered interrupts 683 * and thus the I/O APIC as a whole can't be faithfully emulated in KVM. 684 */ 685 if (!irqchip_split(vcpu->kvm)) 686 return -EINVAL; 687 688 fpstate_set_confidential(&vcpu->arch.guest_fpu); 689 vcpu->arch.apic->guest_apic_protected = true; 690 INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list); 691 692 vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX; 693 694 vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH; 695 vcpu->arch.cr0_guest_owned_bits = -1ul; 696 vcpu->arch.cr4_guest_owned_bits = -1ul; 697 698 /* KVM can't change TSC offset/multiplier as TDX module manages them. 
*/ 699 vcpu->arch.guest_tsc_protected = true; 700 vcpu->arch.tsc_offset = kvm_tdx->tsc_offset; 701 vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset; 702 vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier; 703 vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier; 704 705 vcpu->arch.guest_state_protected = 706 !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG); 707 708 if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE) 709 vcpu->arch.xfd_no_write_intercept = true; 710 711 tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR; 712 __pi_set_sn(&tdx->vt.pi_desc); 713 714 tdx->state = VCPU_TD_STATE_UNINITIALIZED; 715 716 return 0; 717 } 718 719 void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 720 { 721 struct vcpu_tdx *tdx = to_tdx(vcpu); 722 723 vmx_vcpu_pi_load(vcpu, cpu); 724 if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm))) 725 return; 726 727 tdx_flush_vp_on_cpu(vcpu); 728 729 KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm); 730 local_irq_disable(); 731 /* 732 * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure 733 * vcpu->cpu is read before tdx->cpu_list. 734 */ 735 smp_rmb(); 736 737 list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu)); 738 local_irq_enable(); 739 } 740 741 bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu) 742 { 743 /* 744 * KVM can't get the interrupt status of TDX guest and it assumes 745 * interrupt is always allowed unless TDX guest calls TDVMCALL with HLT, 746 * which passes the interrupt blocked flag. 747 */ 748 return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT || 749 !to_tdx(vcpu)->vp_enter_args.r12; 750 } 751 752 static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu) 753 { 754 u64 vcpu_state_details; 755 756 if (pi_has_pending_interrupt(vcpu)) 757 return true; 758 759 /* 760 * Only check RVI pending for HALTED case with IRQ enabled. 761 * For non-HLT cases, KVM doesn't care about STI/SS shadows. 
And if the 762 * interrupt was pending before TD exit, then it _must_ be blocked, 763 * otherwise the interrupt would have been serviced at the instruction 764 * boundary. 765 */ 766 if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT || 767 to_tdx(vcpu)->vp_enter_args.r12) 768 return false; 769 770 vcpu_state_details = 771 td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH); 772 773 return tdx_vcpu_state_details_intr_pending(vcpu_state_details); 774 } 775 776 struct tdx_uret_msr { 777 u32 msr; 778 unsigned int slot; 779 u64 defval; 780 }; 781 782 static struct tdx_uret_msr tdx_uret_msrs[] = { 783 {.msr = MSR_SYSCALL_MASK, .defval = 0x20200 }, 784 {.msr = MSR_STAR,}, 785 {.msr = MSR_LSTAR,}, 786 {.msr = MSR_TSC_AUX,}, 787 }; 788 789 void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 790 { 791 struct vcpu_vt *vt = to_vt(vcpu); 792 int i; 793 794 if (vt->guest_state_loaded) 795 return; 796 797 if (likely(is_64bit_mm(current->mm))) 798 vt->msr_host_kernel_gs_base = current->thread.gsbase; 799 else 800 vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); 801 802 vt->guest_state_loaded = true; 803 804 /* 805 * Explicitly set user-return MSRs that are clobbered by the TDX-Module 806 * if VP.ENTER succeeds, i.e. on TD-Exit, with the values that would be 807 * written by the TDX-Module. Don't rely on the TDX-Module to actually 808 * clobber the MSRs, as the contract is poorly defined and not upheld. 809 * E.g. the TDX-Module will synthesize an EPT Violation without doing 810 * VM-Enter if it suspects a zero-step attack, and never "restore" VMM 811 * state. 
812 */ 813 for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) 814 kvm_set_user_return_msr(tdx_uret_msrs[i].slot, 815 tdx_uret_msrs[i].defval, -1ull); 816 } 817 818 static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu) 819 { 820 struct vcpu_vt *vt = to_vt(vcpu); 821 822 if (!vt->guest_state_loaded) 823 return; 824 825 ++vcpu->stat.host_state_reload; 826 wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base); 827 828 vt->guest_state_loaded = false; 829 } 830 831 void tdx_vcpu_put(struct kvm_vcpu *vcpu) 832 { 833 vmx_vcpu_pi_put(vcpu); 834 tdx_prepare_switch_to_host(vcpu); 835 } 836 837 /* 838 * Life cycles for a TD and a vCPU: 839 * 1. KVM_CREATE_VM ioctl. 840 * TD state is TD_STATE_UNINITIALIZED. 841 * hkid is not assigned at this stage. 842 * 2. KVM_TDX_INIT_VM ioctl. 843 * TD transitions to TD_STATE_INITIALIZED. 844 * hkid is assigned after this stage. 845 * 3. KVM_CREATE_VCPU ioctl. (only when TD is TD_STATE_INITIALIZED). 846 * 3.1 tdx_vcpu_create() transitions vCPU state to VCPU_TD_STATE_UNINITIALIZED. 847 * 3.2 vcpu_load() and vcpu_put() in kvm_arch_vcpu_create(). 848 * 3.3 (conditional) if any error encountered after kvm_arch_vcpu_create() 849 * kvm_arch_vcpu_destroy() --> tdx_vcpu_free(). 850 * 4. KVM_TDX_INIT_VCPU ioctl. 851 * tdx_vcpu_init() transitions vCPU state to VCPU_TD_STATE_INITIALIZED. 852 * vCPU control structures are allocated at this stage. 853 * 5. kvm_destroy_vm(). 854 * 5.1 tdx_mmu_release_hkid(): (1) tdh_vp_flush(), disassociates all vCPUs. 855 * (2) puts hkid to !assigned state. 856 * 5.2 kvm_destroy_vcpus() --> tdx_vcpu_free(): 857 * transitions vCPU to VCPU_TD_STATE_UNINITIALIZED state. 858 * 5.3 tdx_vm_destroy() 859 * transitions TD to TD_STATE_UNINITIALIZED state. 860 * 861 * tdx_vcpu_free() can be invoked only at 3.3 or 5.2. 862 * - If at 3.3, hkid is still assigned, but the vCPU must be in 863 * VCPU_TD_STATE_UNINITIALIZED state. 
864 * - if at 5.2, hkid must be !assigned and all vCPUs must be in 865 * VCPU_TD_STATE_INITIALIZED state and have been dissociated. 866 */ 867 void tdx_vcpu_free(struct kvm_vcpu *vcpu) 868 { 869 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 870 struct vcpu_tdx *tdx = to_tdx(vcpu); 871 int i; 872 873 if (vcpu->cpu != -1) { 874 KVM_BUG_ON(tdx->state == VCPU_TD_STATE_INITIALIZED, vcpu->kvm); 875 tdx_flush_vp_on_cpu(vcpu); 876 return; 877 } 878 879 /* 880 * It is not possible to reclaim pages while hkid is assigned. It might 881 * be assigned if the TD VM is being destroyed but freeing hkid failed, 882 * in which case the pages are leaked. 883 */ 884 if (is_hkid_assigned(kvm_tdx)) 885 return; 886 887 if (tdx->vp.tdcx_pages) { 888 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { 889 if (tdx->vp.tdcx_pages[i]) 890 tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]); 891 } 892 kfree(tdx->vp.tdcx_pages); 893 tdx->vp.tdcx_pages = NULL; 894 } 895 if (tdx->vp.tdvpr_page) { 896 tdx_reclaim_control_page(tdx->vp.tdvpr_page); 897 tdx->vp.tdvpr_page = NULL; 898 tdx->vp.tdvpr_pa = 0; 899 } 900 901 tdx->state = VCPU_TD_STATE_UNINITIALIZED; 902 } 903 904 int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu) 905 { 906 if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED || 907 to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE)) 908 return -EINVAL; 909 910 return 1; 911 } 912 913 static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu) 914 { 915 switch (tdvmcall_leaf(vcpu)) { 916 case EXIT_REASON_CPUID: 917 case EXIT_REASON_HLT: 918 case EXIT_REASON_IO_INSTRUCTION: 919 case EXIT_REASON_MSR_READ: 920 case EXIT_REASON_MSR_WRITE: 921 return tdvmcall_leaf(vcpu); 922 case EXIT_REASON_EPT_VIOLATION: 923 return EXIT_REASON_EPT_MISCONFIG; 924 default: 925 break; 926 } 927 928 return EXIT_REASON_TDCALL; 929 } 930 931 static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu) 932 { 933 struct vcpu_tdx *tdx = to_tdx(vcpu); 934 u32 exit_reason; 935 936 switch 
(tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) {
	case TDX_SUCCESS:
	case TDX_NON_RECOVERABLE_VCPU:
	case TDX_NON_RECOVERABLE_TD:
	case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE:
	case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE:
		break;
	default:
		return -1u;
	}

	exit_reason = tdx->vp_enter_ret;

	switch (exit_reason) {
	case EXIT_REASON_TDCALL:
		if (tdvmcall_exit_type(vcpu))
			return EXIT_REASON_VMCALL;

		return tdcall_to_vmx_exit_reason(vcpu);
	case EXIT_REASON_EPT_MISCONFIG:
		/*
		 * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in
		 * non-instrumentable code with interrupts disabled.
		 */
		return -1u;
	default:
		break;
	}

	return exit_reason;
}

/*
 * Enter the TD with TDH.VP.ENTER and latch the resulting exit state.
 *
 * The SEAMCALL is issued between guest_state_enter_irqoff() and
 * guest_state_exit_irqoff(). Its return value is saved in
 * tdx->vp_enter_ret and converted to a VMX-style exit reason. The exit
 * qualification, extended exit qualification, exit GPA and interrupt info
 * are then copied out of the RCX, RDX, R8 and R9 slots of vp_enter_args,
 * and vmx_handle_nmi() is called before the guest-state section is left.
 */
static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	struct vcpu_vt *vt = to_vt(vcpu);

	guest_state_enter_irqoff();

	tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);

	vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu);

	vt->exit_qualification = tdx->vp_enter_args.rcx;
	tdx->ext_exit_qualification = tdx->vp_enter_args.rdx;
	tdx->exit_gpa = tdx->vp_enter_args.r8;
	vt->exit_intr_info = tdx->vp_enter_args.r9;

	vmx_handle_nmi(vcpu);

	guest_state_exit_irqoff();
}

/*
 * Return true if the cached exit reason has failed_vmentry set and the full
 * exit reason is not the special value -1u.
 */
static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu)
{
	return vmx_get_exit_reason(vcpu).failed_vmentry &&
	       vmx_get_exit_reason(vcpu).full != -1u;
}

/*
 * Pick the fastpath result for the last TDH.VP.ENTER: returns
 * EXIT_FASTPATH_EXIT_HANDLED when the SEAMCALL reported an operand-busy
 * status, EXIT_FASTPATH_NONE otherwise.
 */
static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
	u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret;

	/*
	 * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation
	 * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER.
	 *
	 * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both
	 * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target
	 * vCPUs leaving fastpath so that interrupt can be enabled to ensure the
	 * IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED instead of
	 * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the
	 * requester may be blocked endlessly.
	 */
	if (unlikely(tdx_operand_busy(vp_enter_ret)))
		return EXIT_FASTPATH_EXIT_HANDLED;

	return EXIT_FASTPATH_NONE;
}

/*
 * Register-availability bits that tdx_vcpu_run() keeps set in
 * vcpu->arch.regs_avail after a TD exit: the two exit-info pseudo registers
 * plus the listed general-purpose registers. Every other bit is cleared.
 */
#define TDX_REGS_AVAIL_SET	(BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
				 BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
				 BIT_ULL(VCPU_REGS_RAX) | \
				 BIT_ULL(VCPU_REGS_RBX) | \
				 BIT_ULL(VCPU_REGS_RCX) | \
				 BIT_ULL(VCPU_REGS_RDX) | \
				 BIT_ULL(VCPU_REGS_RBP) | \
				 BIT_ULL(VCPU_REGS_RSI) | \
				 BIT_ULL(VCPU_REGS_RDI) | \
				 BIT_ULL(VCPU_REGS_R8) | \
				 BIT_ULL(VCPU_REGS_R9) | \
				 BIT_ULL(VCPU_REGS_R10) | \
				 BIT_ULL(VCPU_REGS_R11) | \
				 BIT_ULL(VCPU_REGS_R12) | \
				 BIT_ULL(VCPU_REGS_R13) | \
				 BIT_ULL(VCPU_REGS_R14) | \
				 BIT_ULL(VCPU_REGS_R15))

/*
 * Restore the host's PKRU, XCR0 and IA32_XSS after a TD exit. Each write is
 * skipped when the comparison below shows it is unnecessary.
 */
static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);

	/*
	 * All TDX hosts support PKRU; but even if they didn't,
	 * vcpu->arch.host_pkru would be 0 and the wrpkru would be
	 * skipped.
	 */
	if (vcpu->arch.host_pkru != 0)
		wrpkru(vcpu->arch.host_pkru);

	if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
		xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);

	/*
	 * Likewise, even if a TDX host didn't support XSS both arms of
	 * the comparison would be 0 and the wrmsrl would be skipped.
	 */
	if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
		wrmsrl(MSR_IA32_XSS, kvm_host.xss);
}

/*
 * DEBUGCTL bits that do not by themselves trigger a DEBUGCTL rewrite in
 * tdx_vcpu_run(): the MSR is rewritten only if the host value has a bit set
 * outside this mask.
 * NOTE(review): presumably these are the bits left intact across
 * TDH.VP.ENTER - confirm against the TDX module specification.
 */
#define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
				DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
				DEBUGCTLMSR_FREEZE_IN_SMM)

/*
 * Run the vCPU once and return the fastpath disposition.
 *
 * @run_flags is expected to be zero (see the WARN below). While
 * wait_for_sept_zap is set, EXIT_FASTPATH_EXIT_HANDLED is returned without
 * entering the guest. After the entry attempt the host DEBUGCTL and
 * xsave-related state are restored and regs_avail is masked with
 * TDX_REGS_AVAIL_SET. EXIT_FASTPATH_NONE is returned for an EPT-misconfig
 * return value, a TDX SW error or a failed VM-Entry; otherwise the result of
 * tdx_exit_handlers_fastpath() is returned.
 */
fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	struct vcpu_vt *vt = to_vt(vcpu);

	/*
	 * WARN if KVM wants to force an immediate exit, as the TDX module does
	 * not guarantee entry into the guest, i.e. it's possible for KVM to
	 * _think_ it completed entry to the guest and forced an immediate exit
	 * without actually having done so. Luckily, KVM never needs to force
	 * an immediate exit for TDX (KVM can't do direct event injection), so
	 * just WARN and continue on.
	 */
	WARN_ON_ONCE(run_flags);

	/*
	 * Wait until retry of SEPT-zap-related SEAMCALL completes before
	 * allowing vCPU entry to avoid contention with tdh_vp_enter() and
	 * TDCALLs.
	 */
	if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
		return EXIT_FASTPATH_EXIT_HANDLED;

	trace_kvm_entry(vcpu, run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT);

	/*
	 * A posted interrupt is outstanding: send the posted-interrupt vector
	 * to this CPU, and wait for the LAPIC timer expiry if the vector
	 * programmed in LVTT is among the posted requests.
	 */
	if (pi_test_on(&vt->pi_desc)) {
		apic->send_IPI_self(POSTED_INTR_VECTOR);

		if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) &
			       APIC_VECTOR_MASK, &vt->pi_desc))
			kvm_wait_lapic_expire(vcpu);
	}

	tdx_vcpu_enter_exit(vcpu);

	if (vcpu->arch.host_debugctl & ~TDX_DEBUGCTL_PRESERVED)
		update_debugctlmsr(vcpu->arch.host_debugctl);

	tdx_load_host_xsave_state(vcpu);

	vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;

	/* These two cases return before the kvm_exit tracepoint is emitted. */
	if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG))
		return EXIT_FASTPATH_NONE;

	if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
		return EXIT_FASTPATH_NONE;

	trace_kvm_exit(vcpu, KVM_ISA_VMX);

	if (unlikely(tdx_failed_vmentry(vcpu)))
		return EXIT_FASTPATH_NONE;

	return tdx_exit_handlers_fastpath(vcpu);
}

/*
 * Request delivery of an NMI to the TD vCPU by writing 1 to
 * TD_VCPU_PEND_NMI, then clear KVM's own injected/pending NMI state.
 */
void tdx_inject_nmi(struct kvm_vcpu *vcpu)
{
	++vcpu->stat.nmi_injections;
	td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1);
	/*
	 * From KVM's perspective, NMI injection is completed right after
	 * writing to PEND_NMI. KVM doesn't care whether an NMI is injected by
	 * the TDX module or not.
	 */
	vcpu->arch.nmi_injected = false;
	/*
	 * TDX doesn't support KVM to request NMI window exit. If there is
	 * still a pending vNMI, KVM is not able to inject it along with the
	 * one pending in TDX module in a back-to-back way. Since the previous
	 * vNMI is still pending in TDX module, i.e. it has not been delivered
	 * to TDX guest yet, it's OK to collapse the pending vNMI into the
	 * previous one. The guest is expected to handle all the NMI sources
	 * when handling the first vNMI.
	 */
	vcpu->arch.nmi_pending = 0;
}

/*
 * Handle an EXIT_REASON_EXCEPTION_NMI exit.
 *
 * Returns 1 for NMIs and machine checks, which are dealt with elsewhere (see
 * the comment below). For any other vector a KVM_EXIT_EXCEPTION exit is
 * filled in with the vector and an error code of 0, and 0 is returned.
 */
static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu)
{
	u32 intr_info = vmx_get_intr_info(vcpu);

	/*
	 * Machine checks are handled by handle_exception_irqoff(), or by
	 * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on
	 * VM-Entry. NMIs are handled by tdx_vcpu_enter_exit().
	 */
	if (is_nmi(intr_info) || is_machine_check(intr_info))
		return 1;

	vcpu->run->exit_reason = KVM_EXIT_EXCEPTION;
	vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
	vcpu->run->ex.error_code = 0;

	return 0;
}

/*
 * Completion callback passed to __kvm_emulate_hypercall(): copy
 * vcpu->run->hypercall.ret into the TDVMCALL return code and return 1.
 */
static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
{
	tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret);
	return 1;
}

/*
 * Emulate a KVM hypercall requested by the TD: load RAX, RBX, RCX, RDX and
 * RSI from the R10-R14 slots of vp_enter_args, then hand off to
 * __kvm_emulate_hypercall() with complete_hypercall_exit() as completion.
 */
static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu)
{
	kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10);
	kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11);
	kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12);
	kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13);
	kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14);

	return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit);
}

/*
 * Split into chunks and check interrupt pending between chunks. This allows
 * for timely injection of interrupts to prevent issues with guest lockup
 * detection.
*/
#define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024)
static void __tdx_map_gpa(struct vcpu_tdx *tdx);

/*
 * Completion callback run after userspace handled one KVM_HC_MAP_GPA_RANGE
 * chunk.
 *
 * If userspace reported a non-zero hypercall.ret, fail the TDVMCALL with
 * TDVMCALL_STATUS_INVALID_OPERAND and report the current chunk start in R11.
 * Otherwise advance map_gpa_next by one chunk; finish (return 1) when the
 * whole range is done, return TDVMCALL_STATUS_RETRY with the next GPA in R11
 * when events are pending, or set up the next chunk and return 0.
 */
static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	if (vcpu->run->hypercall.ret) {
		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
		tdx->vp_enter_args.r11 = tdx->map_gpa_next;
		return 1;
	}

	tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN;
	if (tdx->map_gpa_next >= tdx->map_gpa_end)
		return 1;

	/*
	 * Stop processing the remaining part if there is a pending interrupt,
	 * which could be qualified to deliver. Skip checking pending RVI for
	 * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt().
	 */
	if (kvm_vcpu_has_events(vcpu)) {
		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
		tdx->vp_enter_args.r11 = tdx->map_gpa_next;
		return 1;
	}

	__tdx_map_gpa(tdx);
	return 0;
}

/*
 * Fill in a KVM_EXIT_HYPERCALL / KVM_HC_MAP_GPA_RANGE exit for the next
 * chunk of [map_gpa_next, map_gpa_end), capped at TDX_MAP_GPA_MAX_LEN bytes.
 * args[0] is the GPA with the direct (shared) bits cleared, args[1] the
 * chunk size in pages and args[2] the encrypted/decrypted attribute derived
 * from whether the GPA is private. tdx_complete_vmcall_map_gpa() is
 * installed as the completion callback.
 */
static void __tdx_map_gpa(struct vcpu_tdx *tdx)
{
	u64 gpa = tdx->map_gpa_next;
	u64 size = tdx->map_gpa_end - tdx->map_gpa_next;

	if (size > TDX_MAP_GPA_MAX_LEN)
		size = TDX_MAP_GPA_MAX_LEN;

	tdx->vcpu.run->exit_reason = KVM_EXIT_HYPERCALL;
	tdx->vcpu.run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
	/*
	 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
	 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
	 * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting
	 * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU.
	 */
	tdx->vcpu.run->hypercall.ret = 0;
	tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
	tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE;
	tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ?
					   KVM_MAP_GPA_RANGE_ENCRYPTED :
					   KVM_MAP_GPA_RANGE_DECRYPTED;
	tdx->vcpu.run->hypercall.flags = KVM_EXIT_HYPERCALL_LONG_MODE;

	tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa;
}

/*
 * Handle TDVMCALL_MAP_GPA. The GPA is taken from R12 and the size from R13.
 *
 * On success the range is recorded in map_gpa_next/map_gpa_end, the first
 * chunk is forwarded to userspace and 0 is returned. On any validation
 * failure the status is stored as the TDVMCALL return code, the GPA is
 * reported back in R11 and 1 is returned.
 */
static int tdx_map_gpa(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	u64 gpa = tdx->vp_enter_args.r12;
	u64 size = tdx->vp_enter_args.r13;
	u64 ret;

	/*
	 * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires
	 * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE
	 * bit set. This is a base call so it should always be supported, but
	 * KVM has no way to ensure that userspace implements the GHCI correctly.
	 * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error
	 * to the guest.
	 */
	if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
		ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
		goto error;
	}

	/*
	 * Reject empty or wrapping ranges, illegal first/last GPAs, and
	 * ranges whose first and last byte differ in private/shared type.
	 */
	if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) ||
	    !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) ||
	    (vt_is_tdx_private_gpa(vcpu->kvm, gpa) !=
	     vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) {
		ret = TDVMCALL_STATUS_INVALID_OPERAND;
		goto error;
	}

	if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) {
		ret = TDVMCALL_STATUS_ALIGN_ERROR;
		goto error;
	}

	tdx->map_gpa_end = gpa + size;
	tdx->map_gpa_next = gpa;

	__tdx_map_gpa(tdx);
	return 0;

error:
	tdvmcall_set_return_code(vcpu, ret);
	tdx->vp_enter_args.r11 = gpa;
	return 1;
}

/*
 * Handle TDVMCALL_REPORT_FATAL_ERROR: report a
 * KVM_SYSTEM_EVENT_TDX_FATAL system event carrying 16 register values.
 *
 * Slots RAX..RDI of system_event.data are filled from vp_enter_ret and the
 * rcx/rdx/rbx/rsi/rdi fields of vp_enter_args, with the two slots between
 * RBX and RSI written as 0. Slots R8..R15 are copied from the eight
 * consecutive u64 values starting at vp_enter_args.r8. Always returns 0.
 */
static int tdx_report_fatal_error(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	u64 *regs = vcpu->run->system_event.data;
	u64 *module_regs = &tdx->vp_enter_args.r8;
	int index = VCPU_REGS_RAX;

	vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
	vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL;
	vcpu->run->system_event.ndata = 16;

	/* Dump 16 general-purpose registers to userspace in ascending order. */
	regs[index++] = tdx->vp_enter_ret;
	regs[index++] = tdx->vp_enter_args.rcx;
	regs[index++] = tdx->vp_enter_args.rdx;
	regs[index++] = tdx->vp_enter_args.rbx;
	regs[index++] = 0;
	regs[index++] = 0;
	regs[index++] = tdx->vp_enter_args.rsi;
	regs[index] = tdx->vp_enter_args.rdi;
	for (index = 0; index < 8; index++)
		regs[VCPU_REGS_R8 + index] = module_regs[index];

	return 0;
}

/*
 * Emulate CPUID for the TD: the leaf and sub-leaf come from R12 and R13, and
 * the EAX/EBX/ECX/EDX results of kvm_cpuid() are returned in R12-R15.
 * Always returns 1.
 */
static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu)
{
	u32 eax, ebx, ecx, edx;
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	/* EAX and ECX for cpuid is stored in R12 and R13. */
	eax = tdx->vp_enter_args.r12;
	ecx = tdx->vp_enter_args.r13;

	kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);

	tdx->vp_enter_args.r12 = eax;
	tdx->vp_enter_args.r13 = ebx;
	tdx->vp_enter_args.r14 = ecx;
	tdx->vp_enter_args.r15 = edx;

	return 1;
}

/*
 * Completion callback for a port write that went to userspace: reset the
 * pending PIO count and return 1.
 */
static int tdx_complete_pio_out(struct kvm_vcpu *vcpu)
{
	vcpu->arch.pio.count = 0;
	return 1;
}

/*
 * Completion callback for a port read that went to userspace: re-issue
 * pio_in_emulated() with the recorded size and port to fetch the value, warn
 * if that call reports 0, and hand the value back to the guest as the
 * TDVMCALL return value.
 */
static int tdx_complete_pio_in(struct kvm_vcpu *vcpu)
{
	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
	unsigned long val = 0;
	int ret;

	ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size,
					 vcpu->arch.pio.port, &val, 1);

	WARN_ON_ONCE(!ret);

	tdvmcall_set_return_val(vcpu, val);

	return 1;
}

/*
 * Emulate a port I/O request from the TD. R12 holds the access size, R13 the
 * direction (0 = read, 1 = write), R14 the port and R15 the value to write.
 *
 * Invalid size/direction yields TDVMCALL_STATUS_INVALID_OPERAND. When the
 * emulated access returns 0, a completion callback is installed for the
 * pending request; otherwise a read's value is returned to the guest
 * directly. The emulation result is returned to the caller.
 */
static int tdx_emulate_io(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
	unsigned long val = 0;
	unsigned int port;
	u64 size, write;
	int ret;

	++vcpu->stat.io_exits;

	size = tdx->vp_enter_args.r12;
	write = tdx->vp_enter_args.r13;
	port = tdx->vp_enter_args.r14;

	if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) {
		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
		return 1;
	}

	if (write) {
		val = tdx->vp_enter_args.r15;
		ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1);
	} else {
		ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1);
	}

	if (!ret)
		vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out :
							   tdx_complete_pio_in;
	else if (!write)
		tdvmcall_set_return_val(vcpu, val);

	return ret;
}

/*
 * Completion callback for an MMIO read that went to userspace: copy the
 * data from vcpu->run->mmio.data (length taken from mmio_fragments[0]) into
 * the TDVMCALL return value and emit the MMIO read tracepoint.
 */
static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu)
{
	unsigned long val = 0;
	gpa_t gpa;
	int size;

	gpa = vcpu->mmio_fragments[0].gpa;
	size = vcpu->mmio_fragments[0].len;

	memcpy(&val, vcpu->run->mmio.data, size);
	tdvmcall_set_return_val(vcpu, val);
	trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
	return 1;
}

/*
 * Try to complete an MMIO write in the kernel: first as a zero-length write
 * on KVM_FAST_MMIO_BUS, then as a normal write on KVM_MMIO_BUS. Returns 0 if
 * either bus accepted it, -EOPNOTSUPP otherwise.
 */
static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size,
				 unsigned long val)
{
	if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
		trace_kvm_fast_mmio(gpa);
		return 0;
	}

	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val);
	if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val))
		return -EOPNOTSUPP;

	return 0;
}

/*
 * Try to complete an MMIO read in the kernel via KVM_MMIO_BUS. On success
 * the value is stored as the TDVMCALL return value and 0 is returned;
 * otherwise -EOPNOTSUPP is returned.
 */
static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size)
{
	unsigned long val;

	if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val))
		return -EOPNOTSUPP;

	tdvmcall_set_return_val(vcpu, val);
	trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
	return 0;
}

/*
 * Emulate an MMIO request from the TD. R12 holds the access size, R13 the
 * direction (0 = read, 1 = write), R14 the GPA and R15 the value to write.
 *
 * Returns 1 when the request was rejected (INVALID_OPERAND) or completed in
 * the kernel, and 0 after preparing an MMIO exit for userspace.
 */
static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	int size, write, r;
	unsigned long val;
	gpa_t gpa;

	size = tdx->vp_enter_args.r12;
	write = tdx->vp_enter_args.r13;
	gpa = tdx->vp_enter_args.r14;
	val = write ? tdx->vp_enter_args.r15 : 0;

	if (size != 1 && size != 2 && size != 4 && size != 8)
		goto error;
	if (write != 0 && write != 1)
		goto error;

	/*
	 * TDG.VP.VMCALL<MMIO> allows only shared GPA, it makes no sense to
	 * do MMIO emulation for private GPA.
	 */
	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) ||
	    vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))
		goto error;

	/* Strip the direct (shared) bits before looking up the device. */
	gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));

	if (write)
		r = tdx_mmio_write(vcpu, gpa, size, val);
	else
		r = tdx_mmio_read(vcpu, gpa, size);
	if (!r)
		/* Kernel completed device emulation. */
		return 1;

	/* Request the device emulation to userspace device model. */
	vcpu->mmio_is_write = write;

	__kvm_prepare_emulated_mmio_exit(vcpu, gpa, size, &val, write);

	if (!write) {
		vcpu->arch.complete_userspace_io = tdx_complete_mmio_read;
		vcpu->mmio_fragments[0].gpa = gpa;
		vcpu->mmio_fragments[0].len = size;
		trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
	}
	return 0;

error:
	tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
	return 1;
}

/*
 * Completion callback for TDVMCALL_GET_TD_VM_CALL_INFO leaf 1: copy the
 * status and the R11-R14 values supplied by userspace in
 * vcpu->run->tdx.get_tdvmcall_info back to the guest.
 */
static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret);

	/*
	 * For now, there is no TDVMCALL beyond GHCI base API supported by KVM
	 * directly without the support from userspace, just set the value
	 * returned from userspace.
	 */
	tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
	tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
	tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
	tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;

	return 1;
}

/*
 * Handle TDVMCALL_GET_TD_VM_CALL_INFO; the leaf is selected by R12.
 *
 * Leaf 0 is answered in the kernel with R11-R14 zeroed and a success status.
 * Leaf 1 is forwarded to userspace as a KVM_EXIT_TDX exit with the reply
 * fields pre-initialized to success/zero. Any other leaf is rejected with
 * TDVMCALL_STATUS_INVALID_OPERAND.
 */
static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	switch (tdx->vp_enter_args.r12) {
	case 0:
		tdx->vp_enter_args.r11 = 0;
		tdx->vp_enter_args.r12 = 0;
		tdx->vp_enter_args.r13 = 0;
		tdx->vp_enter_args.r14 = 0;
		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS);
		return 1;
	case 1:
		vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12;
		vcpu->run->exit_reason = KVM_EXIT_TDX;
		vcpu->run->tdx.flags = 0;
		vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO;
		vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS;
		vcpu->run->tdx.get_tdvmcall_info.r11 = 0;
		vcpu->run->tdx.get_tdvmcall_info.r12 = 0;
		vcpu->run->tdx.get_tdvmcall_info.r13 = 0;
		vcpu->run->tdx.get_tdvmcall_info.r14 = 0;
		vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info;
		return 0;
	default:
		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
		return 1;
	}
}

/*
 * Completion callback shared by the TDVMCALLs that only need a status back
 * from userspace: copy vcpu->run->tdx.unknown.ret into the TDVMCALL return
 * code.
 */
static int tdx_complete_simple(struct kvm_vcpu *vcpu)
{
	tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret);
	return 1;
}

/*
 * Handle TDVMCALL_GET_QUOTE: the buffer GPA is in R12 and its size in R13.
 * A private GPA is rejected with TDVMCALL_STATUS_INVALID_OPERAND; otherwise
 * the request is forwarded to userspace as a KVM_EXIT_TDX exit, with the
 * direct (shared) bits stripped from the GPA and the status preset to
 * TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED.
 */
static int tdx_get_quote(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	u64 gpa = tdx->vp_enter_args.r12;
	u64 size = tdx->vp_enter_args.r13;

	/* The gpa of buffer must have shared bit set. */
	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
		return 1;
	}

	vcpu->run->exit_reason = KVM_EXIT_TDX;
	vcpu->run->tdx.flags = 0;
	vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
	vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
	vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
	vcpu->run->tdx.get_quote.size = size;

	vcpu->arch.complete_userspace_io = tdx_complete_simple;

	return 0;
}

/*
 * Handle TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: the vector is in R12 and
 * must be in the range 32..255, otherwise TDVMCALL_STATUS_INVALID_OPERAND is
 * returned to the guest. Valid requests are forwarded to userspace as a
 * KVM_EXIT_TDX exit with the status preset to
 * TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED.
 */
static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	u64 vector = tdx->vp_enter_args.r12;

	if (vector < 32 || vector > 255) {
		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
		return 1;
	}

	vcpu->run->exit_reason = KVM_EXIT_TDX;
	vcpu->run->tdx.flags = 0;
	vcpu->run->tdx.nr = TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT;
	vcpu->run->tdx.setup_event_notify.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
	vcpu->run->tdx.setup_event_notify.vector = vector;

	vcpu->arch.complete_userspace_io = tdx_complete_simple;

	return 0;
}

/*
 * Dispatch a TDVMCALL by leaf. Leaves without a handler here are answered
 * with TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED and 1 is returned.
 */
static int handle_tdvmcall(struct kvm_vcpu *vcpu)
{
	switch (tdvmcall_leaf(vcpu)) {
	case TDVMCALL_MAP_GPA:
		return tdx_map_gpa(vcpu);
	case TDVMCALL_REPORT_FATAL_ERROR:
		return tdx_report_fatal_error(vcpu);
	case TDVMCALL_GET_TD_VM_CALL_INFO:
		return tdx_get_td_vm_call_info(vcpu);
	case TDVMCALL_GET_QUOTE:
		return tdx_get_quote(vcpu);
	case TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT:
		return tdx_setup_event_notify_interrupt(vcpu);
	default:
		break;
	}

	tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
	return 1;
}

/*
 * Write @root_hpa to the SHARED_EPT_POINTER field of the TD vCPU.
 *
 * Before writing, verify that the shared-bit GFN implied by @pgd_level
 * (TDX_SHARED_BIT_PWL_5 for 5-level, TDX_SHARED_BIT_PWL_4 otherwise) equals
 * kvm_gfn_direct_bits(); on mismatch the VM is marked bugged and nothing is
 * written.
 */
void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
{
	u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 :
			  TDX_SHARED_BIT_PWL_4;

	if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm))
		return;

	td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
}

/*
 * Add a page to the TD with TDH.MEM.PAGE.ADD, using kvm_tdx->page_add_src as
 * the source page. Requires kvm->slots_lock.
 *
 * Returns 0 on success, -EBUSY if the SEAMCALL reported operand-busy, and
 * -EIO if pre-faulting is already allowed, no source page is set, or the
 * SEAMCALL failed for another reason (the VM is marked bugged in those
 * cases). @level is not used by this function.
 */
static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level,
			    kvm_pfn_t pfn)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	u64 err, entry, level_state;
	gpa_t gpa = gfn_to_gpa(gfn);

	lockdep_assert_held(&kvm->slots_lock);

	if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm) ||
	    KVM_BUG_ON(!kvm_tdx->page_add_src, kvm))
		return -EIO;

	err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
			       kvm_tdx->page_add_src, &entry, &level_state);
	if (unlikely(tdx_operand_busy(err)))
		return -EBUSY;

	if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_ADD, entry, level_state, kvm))
		return -EIO;

	return 0;
}

/*
 * Map a page into the TD with TDH.MEM.PAGE.AUG at the given level.
 *
 * Returns 0 on success, -EBUSY if the SEAMCALL reported operand-busy, and
 * -EIO (with the VM marked bugged) on any other SEAMCALL error.
 */
static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
			    enum pg_level level, kvm_pfn_t pfn)
{
	int tdx_level = pg_level_to_tdx_sept_level(level);
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	struct page *page = pfn_to_page(pfn);
	gpa_t gpa = gfn_to_gpa(gfn);
	u64 entry, level_state;
	u64 err;

	err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
	if (unlikely(tdx_operand_busy(err)))
		return -EBUSY;

	if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_AUG, entry, level_state, kvm))
		return -EIO;

	return 0;
}

/*
 * Install a private mapping for @gfn from the mirror SPTE @mirror_spte.
 *
 * Only 4K mappings are supported; other levels mark the VM bugged and return
 * -EIO. The page is ADDed while the TD is not yet runnable and AUGed once it
 * is; the return value is that of the chosen helper.
 */
static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
				     enum pg_level level, u64 mirror_spte)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	kvm_pfn_t pfn = spte_to_pfn(mirror_spte);

	/* TODO: handle large pages. */
	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
		return -EIO;

	/* The mirror SPTE is expected to be present with full RWX access. */
	WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) ||
		     (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);

	/*
	 * Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory()
	 * before kvm_tdx->state. Userspace must not be allowed to pre-fault
	 * arbitrary memory until the initial memory image is finalized. Pairs
	 * with the smp_wmb() in tdx_td_finalize().
	 */
	smp_rmb();

	/*
	 * If the TD isn't finalized/runnable, then userspace is initializing
	 * the VM image via KVM_TDX_INIT_MEM_REGION; ADD the page to the TD.
	 */
	if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
		return tdx_mem_page_add(kvm, gfn, level, pfn);

	return tdx_mem_page_aug(kvm, gfn, level, pfn);
}

/*
 * Link the page-table page @private_spt into the TD's secure EPT at @gfn and
 * @level with TDH.MEM.SEPT.ADD.
 *
 * Returns 0 on success, -EBUSY if the SEAMCALL reported operand-busy, and
 * -EIO (with the VM marked bugged) on any other SEAMCALL error.
 */
static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
				     enum pg_level level, void *private_spt)
{
	int tdx_level = pg_level_to_tdx_sept_level(level);
	gpa_t gpa = gfn_to_gpa(gfn);
	struct page *page = virt_to_page(private_spt);
	u64 err, entry, level_state;

	err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry,
			       &level_state);
	if (unlikely(tdx_operand_busy(err)))
		return -EBUSY;

	if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm))
		return -EIO;

	return 0;
}

/*
 * Ensure shared and private EPTs to be flushed on all vCPUs.
 * tdh_mem_track() is the only caller that increases TD epoch. An increase in
 * the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
 * running in guest mode with the value "N - 1".
 *
 * A successful execution of tdh_mem_track() ensures that vCPUs can only run in
 * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch
 * being increased to "N + 1".
1717 * 1718 * Kicking off all vCPUs after that further results in no vCPUs can run in guest 1719 * mode with TD epoch value "N", which unblocks the next tdh_mem_track() (e.g. 1720 * to increase TD epoch to "N + 2"). 1721 * 1722 * TDX module will flush EPT on the next TD enter and make vCPUs to run in 1723 * guest mode with TD epoch value "N + 1". 1724 * 1725 * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by 1726 * waiting empty IPI handler ack_kick(). 1727 * 1728 * No action is required to the vCPUs being kicked off since the kicking off 1729 * occurs certainly after TD epoch increment and before the next 1730 * tdh_mem_track(). 1731 */ 1732 static void tdx_track(struct kvm *kvm) 1733 { 1734 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1735 u64 err; 1736 1737 /* If TD isn't finalized, it's before any vcpu running. */ 1738 if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE)) 1739 return; 1740 1741 /* 1742 * The full sequence of TDH.MEM.TRACK and forcing vCPUs out of guest 1743 * mode must be serialized, as TDH.MEM.TRACK will fail if the previous 1744 * tracking epoch hasn't completed. 1745 */ 1746 lockdep_assert_held_write(&kvm->mmu_lock); 1747 1748 err = tdh_do_no_vcpus(tdh_mem_track, kvm, &kvm_tdx->td); 1749 TDX_BUG_ON(err, TDH_MEM_TRACK, kvm); 1750 1751 kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); 1752 } 1753 1754 static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, 1755 enum pg_level level, void *private_spt) 1756 { 1757 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1758 1759 /* 1760 * free_external_spt() is only called after hkid is freed when TD is 1761 * tearing down. 1762 * KVM doesn't (yet) zap page table pages in mirror page table while 1763 * TD is active, though guest pages mapped in mirror page table could be 1764 * zapped during TD is active, e.g. for shared <-> private conversion 1765 * and slot move/deletion. 
1766 */ 1767 if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm)) 1768 return -EIO; 1769 1770 /* 1771 * The HKID assigned to this TD was already freed and cache was 1772 * already flushed. We don't have to flush again. 1773 */ 1774 return tdx_reclaim_page(virt_to_page(private_spt)); 1775 } 1776 1777 static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, 1778 enum pg_level level, u64 mirror_spte) 1779 { 1780 struct page *page = pfn_to_page(spte_to_pfn(mirror_spte)); 1781 int tdx_level = pg_level_to_tdx_sept_level(level); 1782 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1783 gpa_t gpa = gfn_to_gpa(gfn); 1784 u64 err, entry, level_state; 1785 1786 lockdep_assert_held_write(&kvm->mmu_lock); 1787 1788 /* 1789 * HKID is released after all private pages have been removed, and set 1790 * before any might be populated. Warn if zapping is attempted when 1791 * there can't be anything populated in the private EPT. 1792 */ 1793 if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm)) 1794 return; 1795 1796 /* TODO: handle large pages. */ 1797 if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) 1798 return; 1799 1800 err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa, 1801 tdx_level, &entry, &level_state); 1802 if (TDX_BUG_ON_2(err, TDH_MEM_RANGE_BLOCK, entry, level_state, kvm)) 1803 return; 1804 1805 /* 1806 * TDX requires TLB tracking before dropping private page. Do 1807 * it here, although it is also done later. 1808 */ 1809 tdx_track(kvm); 1810 1811 /* 1812 * When zapping private page, write lock is held. So no race condition 1813 * with other vcpu sept operation. 1814 * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs. 
1815 */ 1816 err = tdh_do_no_vcpus(tdh_mem_page_remove, kvm, &kvm_tdx->td, gpa, 1817 tdx_level, &entry, &level_state); 1818 if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_REMOVE, entry, level_state, kvm)) 1819 return; 1820 1821 err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page); 1822 if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm)) 1823 return; 1824 1825 tdx_quirk_reset_page(page); 1826 } 1827 1828 void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, 1829 int trig_mode, int vector) 1830 { 1831 struct kvm_vcpu *vcpu = apic->vcpu; 1832 struct vcpu_tdx *tdx = to_tdx(vcpu); 1833 1834 /* TDX supports only posted interrupt. No lapic emulation. */ 1835 __vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector); 1836 1837 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); 1838 } 1839 1840 static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu) 1841 { 1842 u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK; 1843 u64 eq = vmx_get_exit_qual(vcpu); 1844 1845 if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION) 1846 return false; 1847 1848 return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN); 1849 } 1850 1851 static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu) 1852 { 1853 unsigned long exit_qual; 1854 gpa_t gpa = to_tdx(vcpu)->exit_gpa; 1855 bool local_retry = false; 1856 int ret; 1857 1858 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { 1859 if (tdx_is_sept_violation_unexpected_pending(vcpu)) { 1860 pr_warn("Guest access before accepting 0x%llx on vCPU %d\n", 1861 gpa, vcpu->vcpu_id); 1862 kvm_vm_dead(vcpu->kvm); 1863 return -EIO; 1864 } 1865 /* 1866 * Always treat SEPT violations as write faults. Ignore the 1867 * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations. 1868 * TD private pages are always RWX in the SEPT tables, 1869 * i.e. they're always mapped writable. 
Just as importantly, 1870 * treating SEPT violations as write faults is necessary to 1871 * avoid COW allocations, which will cause TDAUGPAGE failures 1872 * due to aliasing a single HPA to multiple GPAs. 1873 */ 1874 exit_qual = EPT_VIOLATION_ACC_WRITE; 1875 1876 /* Only private GPA triggers zero-step mitigation */ 1877 local_retry = true; 1878 } else { 1879 exit_qual = vmx_get_exit_qual(vcpu); 1880 /* 1881 * EPT violation due to instruction fetch should never be 1882 * triggered from shared memory in TDX guest. If such EPT 1883 * violation occurs, treat it as broken hardware. 1884 */ 1885 if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm)) 1886 return -EIO; 1887 } 1888 1889 trace_kvm_page_fault(vcpu, gpa, exit_qual); 1890 1891 /* 1892 * To minimize TDH.VP.ENTER invocations, retry locally for private GPA 1893 * mapping in TDX. 1894 * 1895 * KVM may return RET_PF_RETRY for private GPA due to 1896 * - contentions when atomically updating SPTEs of the mirror page table 1897 * - in-progress GFN invalidation or memslot removal. 1898 * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD, 1899 * caused by contentions with TDH.VP.ENTER (with zero-step mitigation) 1900 * or certain TDCALLs. 1901 * 1902 * If TDH.VP.ENTER is invoked more times than the threshold set by the 1903 * TDX module before KVM resolves the private GPA mapping, the TDX 1904 * module will activate zero-step mitigation during TDH.VP.ENTER. This 1905 * process acquires an SEPT tree lock in the TDX module, leading to 1906 * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD 1907 * operations on other vCPUs. 1908 * 1909 * Breaking out of local retries for kvm_vcpu_has_events() is for 1910 * interrupt injection. kvm_vcpu_has_events() should not see pending 1911 * events for TDX. Since KVM can't determine if IRQs (or NMIs) are 1912 * blocked by TDs, false positives are inevitable i.e., KVM may re-enter 1913 * the guest even if the IRQ/NMI can't be delivered. 
1914 * 1915 * Note: even without breaking out of local retries, zero-step 1916 * mitigation may still occur due to 1917 * - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT, 1918 * - a single RIP causing EPT violations for more GFNs than the 1919 * threshold count. 1920 * This is safe, as triggering zero-step mitigation only introduces 1921 * contentions to page installation SEAMCALLs on other vCPUs, which will 1922 * handle retries locally in their EPT violation handlers. 1923 */ 1924 while (1) { 1925 struct kvm_memory_slot *slot; 1926 1927 ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual); 1928 1929 if (ret != RET_PF_RETRY || !local_retry) 1930 break; 1931 1932 if (kvm_vcpu_has_events(vcpu) || signal_pending(current)) 1933 break; 1934 1935 if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) { 1936 ret = -EIO; 1937 break; 1938 } 1939 1940 /* 1941 * Bail if the memslot is invalid, i.e. is being deleted, as 1942 * faulting in will never succeed and this task needs to drop 1943 * SRCU in order to let memslot deletion complete. 
1944 */ 1945 slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(gpa)); 1946 if (slot && slot->flags & KVM_MEMSLOT_INVALID) 1947 break; 1948 1949 cond_resched(); 1950 } 1951 return ret; 1952 } 1953 1954 int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) 1955 { 1956 if (err) { 1957 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); 1958 return 1; 1959 } 1960 1961 if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ) 1962 tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu)); 1963 1964 return 1; 1965 } 1966 1967 1968 int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath) 1969 { 1970 struct vcpu_tdx *tdx = to_tdx(vcpu); 1971 u64 vp_enter_ret = tdx->vp_enter_ret; 1972 union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); 1973 1974 if (fastpath != EXIT_FASTPATH_NONE) 1975 return 1; 1976 1977 if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) { 1978 KVM_BUG_ON(1, vcpu->kvm); 1979 return -EIO; 1980 } 1981 1982 /* 1983 * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and 1984 * TDX_SEAMCALL_VMFAILINVALID. 1985 */ 1986 if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) { 1987 KVM_BUG_ON(!virt_rebooting, vcpu->kvm); 1988 goto unhandled_exit; 1989 } 1990 1991 if (unlikely(tdx_failed_vmentry(vcpu))) { 1992 /* 1993 * If the guest state is protected, that means off-TD debug is 1994 * not enabled, TDX_NON_RECOVERABLE must be set. 
1995 */ 1996 WARN_ON_ONCE(vcpu->arch.guest_state_protected && 1997 !(vp_enter_ret & TDX_NON_RECOVERABLE)); 1998 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 1999 vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full; 2000 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; 2001 return 0; 2002 } 2003 2004 if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) && 2005 exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) { 2006 kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret); 2007 goto unhandled_exit; 2008 } 2009 2010 WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT && 2011 (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS); 2012 2013 switch (exit_reason.basic) { 2014 case EXIT_REASON_TRIPLE_FAULT: 2015 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 2016 vcpu->mmio_needed = 0; 2017 return 0; 2018 case EXIT_REASON_EXCEPTION_NMI: 2019 return tdx_handle_exception_nmi(vcpu); 2020 case EXIT_REASON_EXTERNAL_INTERRUPT: 2021 ++vcpu->stat.irq_exits; 2022 return 1; 2023 case EXIT_REASON_CPUID: 2024 return tdx_emulate_cpuid(vcpu); 2025 case EXIT_REASON_HLT: 2026 return kvm_emulate_halt_noskip(vcpu); 2027 case EXIT_REASON_TDCALL: 2028 return handle_tdvmcall(vcpu); 2029 case EXIT_REASON_VMCALL: 2030 return tdx_emulate_vmcall(vcpu); 2031 case EXIT_REASON_IO_INSTRUCTION: 2032 return tdx_emulate_io(vcpu); 2033 case EXIT_REASON_MSR_READ: 2034 kvm_rcx_write(vcpu, tdx->vp_enter_args.r12); 2035 return kvm_emulate_rdmsr(vcpu); 2036 case EXIT_REASON_MSR_WRITE: 2037 kvm_rcx_write(vcpu, tdx->vp_enter_args.r12); 2038 kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u); 2039 kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32); 2040 return kvm_emulate_wrmsr(vcpu); 2041 case EXIT_REASON_EPT_MISCONFIG: 2042 return tdx_emulate_mmio(vcpu); 2043 case EXIT_REASON_EPT_VIOLATION: 2044 return tdx_handle_ept_violation(vcpu); 2045 case EXIT_REASON_OTHER_SMI: 2046 /* 2047 * Unlike VMX, SMI in SEAM non-root mode (i.e. 
when 2048 * TD guest vCPU is running) will cause VM exit to TDX module, 2049 * then SEAMRET to KVM. Once it exits to KVM, SMI is delivered 2050 * and handled by kernel handler right away. 2051 * 2052 * The Other SMI exit can also be caused by the SEAM non-root 2053 * machine check delivered via Machine Check System Management 2054 * Interrupt (MSMI), but it has already been handled by the 2055 * kernel machine check handler, i.e., the memory page has been 2056 * marked as poisoned and it won't be freed to the free list 2057 * when the TDX guest is terminated (the TDX module marks the 2058 * guest as dead and prevents it from running further when a 2059 * machine check happens in SEAM non-root). 2060 * 2061 * - An MSMI will not reach here, it's handled as the non_recoverable 2062 * case above. 2063 * - If it's not an MSMI, no need to do anything here. 2064 */ 2065 return 1; 2066 default: 2067 break; 2068 } 2069 2070 unhandled_exit: 2071 kvm_prepare_unexpected_reason_exit(vcpu, vp_enter_ret); 2072 return 0; 2073 } 2074 /* Report exit information: *reason is the full exit reason; when it is not -1u, *info1, *info2 and *intr_info receive the exit qualification, the extended exit qualification and the interruption info, otherwise they are zeroed. *error_code is always set to 0. */ 2075 void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, 2076 u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) 2077 { 2078 struct vcpu_tdx *tdx = to_tdx(vcpu); 2079 2080 *reason = tdx->vt.exit_reason.full; 2081 if (*reason != -1u) { 2082 *info1 = vmx_get_exit_qual(vcpu); 2083 *info2 = tdx->ext_exit_qualification; 2084 *intr_info = vmx_get_intr_info(vcpu); 2085 } else { 2086 *info1 = 0; 2087 *info2 = 0; 2088 *intr_info = 0; 2089 } 2090 2091 *error_code = 0; 2092 } 2093 2094 bool tdx_has_emulated_msr(u32 index) 2095 { 2096 switch (index) { 2097 case MSR_IA32_UCODE_REV: 2098 case MSR_IA32_ARCH_CAPABILITIES: 2099 case MSR_IA32_POWER_CTL: 2100 case MSR_IA32_CR_PAT: 2101 case MSR_MTRRcap: 2102 case MTRRphysBase_MSR(0) ...
MSR_MTRRfix4K_F8000: 2103 case MSR_MTRRdefType: 2104 case MSR_IA32_TSC_DEADLINE: 2105 case MSR_IA32_MISC_ENABLE: 2106 case MSR_PLATFORM_INFO: 2107 case MSR_MISC_FEATURES_ENABLES: 2108 case MSR_IA32_APICBASE: 2109 case MSR_EFER: 2110 case MSR_IA32_FEAT_CTL: 2111 case MSR_IA32_MCG_CAP: 2112 case MSR_IA32_MCG_STATUS: 2113 case MSR_IA32_MCG_CTL: 2114 case MSR_IA32_MCG_EXT_CTL: 2115 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: 2116 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: 2117 /* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */ 2118 case MSR_KVM_POLL_CONTROL: 2119 return true; 2120 case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: 2121 /* 2122 * x2APIC registers that are virtualized by the CPU can't be 2123 * emulated, KVM doesn't have access to the virtual APIC page. 2124 */ 2125 switch (index) { 2126 case X2APIC_MSR(APIC_TASKPRI): 2127 case X2APIC_MSR(APIC_PROCPRI): 2128 case X2APIC_MSR(APIC_EOI): 2129 case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR): 2130 case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR): 2131 case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR): 2132 return false; 2133 default: 2134 return true; 2135 } 2136 default: 2137 return false; 2138 } 2139 } 2140 2141 static bool tdx_is_read_only_msr(u32 index) 2142 { 2143 return index == MSR_IA32_APICBASE || index == MSR_EFER || 2144 index == MSR_IA32_FEAT_CTL; 2145 } 2146 2147 int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 2148 { 2149 switch (msr->index) { 2150 case MSR_IA32_FEAT_CTL: 2151 /* 2152 * MCE and MCA are advertised via cpuid. Guest kernel could 2153 * check if LMCE is enabled or not. 
2154 */ 2155 msr->data = FEAT_CTL_LOCKED; 2156 if (vcpu->arch.mcg_cap & MCG_LMCE_P) 2157 msr->data |= FEAT_CTL_LMCE_ENABLED; 2158 return 0; 2159 case MSR_IA32_MCG_EXT_CTL: 2160 if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) 2161 return 1; 2162 msr->data = vcpu->arch.mcg_ext_ctl; 2163 return 0; 2164 default: 2165 if (!tdx_has_emulated_msr(msr->index)) 2166 return 1; 2167 2168 return kvm_get_msr_common(vcpu, msr); 2169 } 2170 } 2171 2172 int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 2173 { 2174 switch (msr->index) { 2175 case MSR_IA32_MCG_EXT_CTL: 2176 if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) || 2177 (msr->data & ~MCG_EXT_CTL_LMCE_EN)) 2178 return 1; 2179 vcpu->arch.mcg_ext_ctl = msr->data; 2180 return 0; 2181 default: 2182 if (tdx_is_read_only_msr(msr->index)) 2183 return 1; 2184 2185 if (!tdx_has_emulated_msr(msr->index)) 2186 return 1; 2187 2188 return kvm_set_msr_common(vcpu, msr); 2189 } 2190 } 2191 2192 static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd) 2193 { 2194 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; 2195 struct kvm_tdx_capabilities __user *user_caps; 2196 struct kvm_tdx_capabilities *caps = NULL; 2197 u32 nr_user_entries; 2198 int ret = 0; 2199 2200 /* flags is reserved for future use */ 2201 if (cmd->flags) 2202 return -EINVAL; 2203 2204 user_caps = u64_to_user_ptr(cmd->data); 2205 if (get_user(nr_user_entries, &user_caps->cpuid.nent)) 2206 return -EFAULT; 2207 2208 if (nr_user_entries < td_conf->num_cpuid_config) 2209 return -E2BIG; 2210 2211 caps = kzalloc_flex(*caps, cpuid.entries, td_conf->num_cpuid_config); 2212 if (!caps) 2213 return -ENOMEM; 2214 2215 ret = init_kvm_tdx_caps(td_conf, caps); 2216 if (ret) 2217 goto out; 2218 2219 if (copy_to_user(user_caps, caps, struct_size(caps, cpuid.entries, 2220 caps->cpuid.nent))) { 2221 ret = -EFAULT; 2222 goto out; 2223 } 2224 2225 out: 2226 /* kfree() accepts NULL. 
*/ 2227 kfree(caps); 2228 return ret; 2229 } 2230 2231 /* 2232 * KVM reports guest physical address bits in CPUID.0x80000008.EAX[23:16], which is 2233 * similar to TDX's GPAW. Use this field as the interface for userspace to 2234 * configure the GPAW and EPT level for TDs. 2235 * 2236 * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level 2237 * 5, value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always 2238 * supported. Value 52 is only supported when the platform supports 5 level 2239 * EPT. 2240 */ 2241 static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid, 2242 struct td_params *td_params) 2243 { 2244 const struct kvm_cpuid_entry2 *entry; 2245 int guest_pa; 2246 2247 entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0); 2248 if (!entry) 2249 return -EINVAL; 2250 2251 guest_pa = tdx_get_guest_phys_addr_bits(entry->eax); 2252 2253 if (guest_pa != 48 && guest_pa != 52) 2254 return -EINVAL; 2255 2256 if (guest_pa == 52 && !cpu_has_vmx_ept_5levels()) 2257 return -EINVAL; 2258 2259 td_params->eptp_controls = VMX_EPTP_MT_WB; 2260 if (guest_pa == 52) { 2261 td_params->eptp_controls |= VMX_EPTP_PWL_5; 2262 td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW; 2263 } else { 2264 td_params->eptp_controls |= VMX_EPTP_PWL_4; 2265 } 2266 2267 return 0; 2268 } 2269 2270 static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid, 2271 struct td_params *td_params) 2272 { 2273 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; 2274 const struct kvm_cpuid_entry2 *entry; 2275 struct tdx_cpuid_value *value; 2276 int i, copy_cnt = 0; 2277 2278 /* 2279 * td_params.cpuid_values: The number and the order of cpuid_value must 2280 * be the same as those of struct tdsysinfo.{num_cpuid_config, cpuid_configs}. 2281 * It's assumed that td_params was zeroed.
2282 */ 2283 for (i = 0; i < td_conf->num_cpuid_config; i++) { 2284 struct kvm_cpuid_entry2 tmp; 2285 2286 td_init_cpuid_entry2(&tmp, i); 2287 2288 entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 2289 tmp.function, tmp.index); 2290 if (!entry) 2291 continue; 2292 2293 if (tdx_unsupported_cpuid(entry)) 2294 return -EINVAL; 2295 2296 copy_cnt++; 2297 2298 value = &td_params->cpuid_values[i]; 2299 value->eax = entry->eax; 2300 value->ebx = entry->ebx; 2301 value->ecx = entry->ecx; 2302 value->edx = entry->edx; 2303 2304 /* 2305 * TDX module does not accept nonzero bits 16..23 for the 2306 * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls(). 2307 */ 2308 if (tmp.function == 0x80000008) 2309 value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0); 2310 } 2311 2312 /* 2313 * Rely on the TDX module to reject invalid configuration, but it can't 2314 * check leaves that don't have a proper slot in td_params->cpuid_values 2315 * to put them in. So fail if there were entries that didn't get copied to 2316 * td_params.
2317 */ 2318 if (copy_cnt != cpuid->nent) 2319 return -EINVAL; 2320 2321 return 0; 2322 } 2323 2324 static int setup_tdparams(struct kvm *kvm, struct td_params *td_params, 2325 struct kvm_tdx_init_vm *init_vm) 2326 { 2327 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; 2328 struct kvm_cpuid2 *cpuid = &init_vm->cpuid; 2329 int ret; 2330 2331 if (kvm->created_vcpus) 2332 return -EBUSY; 2333 2334 if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf)) 2335 return -EINVAL; 2336 2337 if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf)) 2338 return -EINVAL; 2339 2340 td_params->max_vcpus = kvm->max_vcpus; 2341 td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1; 2342 td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1; 2343 2344 td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD; 2345 td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz); 2346 2347 ret = setup_tdparams_eptp_controls(cpuid, td_params); 2348 if (ret) 2349 return ret; 2350 2351 ret = setup_tdparams_cpuids(cpuid, td_params); 2352 if (ret) 2353 return ret; 2354 2355 #define MEMCPY_SAME_SIZE(dst, src) \ 2356 do { \ 2357 BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \ 2358 memcpy((dst), (src), sizeof(dst)); \ 2359 } while (0) 2360 2361 MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid); 2362 MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner); 2363 MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig); 2364 2365 return 0; 2366 } 2367 2368 static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, 2369 u64 *seamcall_err) 2370 { 2371 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 2372 cpumask_var_t packages; 2373 struct page **tdcs_pages = NULL; 2374 struct page *tdr_page; 2375 int ret, i; 2376 u64 err, rcx; 2377 2378 *seamcall_err = 0; 2379 ret = tdx_guest_keyid_alloc(); 2380 if (ret < 0) 2381 return ret; 2382 kvm_tdx->hkid = ret; 2383 kvm_tdx->misc_cg = get_current_misc_cg(); 2384 ret = 
misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); 2385 if (ret) 2386 goto free_hkid; 2387 2388 ret = -ENOMEM; 2389 2390 tdr_page = alloc_page(GFP_KERNEL); 2391 if (!tdr_page) 2392 goto free_hkid; 2393 2394 kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE; 2395 /* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */ 2396 kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1; 2397 tdcs_pages = kzalloc_objs(*kvm_tdx->td.tdcs_pages, 2398 kvm_tdx->td.tdcs_nr_pages); 2399 if (!tdcs_pages) 2400 goto free_tdr; 2401 2402 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { 2403 tdcs_pages[i] = alloc_page(GFP_KERNEL); 2404 if (!tdcs_pages[i]) 2405 goto free_tdcs; 2406 } 2407 2408 if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) 2409 goto free_tdcs; 2410 2411 cpus_read_lock(); 2412 2413 /* 2414 * Need at least one CPU of the package to be online in order to 2415 * program all packages for host key id. Check it. 2416 */ 2417 for_each_present_cpu(i) 2418 cpumask_set_cpu(topology_physical_package_id(i), packages); 2419 for_each_online_cpu(i) 2420 cpumask_clear_cpu(topology_physical_package_id(i), packages); 2421 if (!cpumask_empty(packages)) { 2422 ret = -EIO; 2423 /* 2424 * Because it's hard for human operator to figure out the 2425 * reason, warn it. 2426 */ 2427 #define MSG_ALLPKG "All packages need to have online CPU to create TD. Online CPU and retry.\n" 2428 pr_warn_ratelimited(MSG_ALLPKG); 2429 goto free_packages; 2430 } 2431 2432 /* 2433 * TDH.MNG.CREATE tries to grab the global TDX module and fails 2434 * with TDX_OPERAND_BUSY when it fails to grab. Take the global 2435 * lock to prevent it from failure. 
2436 */ 2437 mutex_lock(&tdx_lock); 2438 kvm_tdx->td.tdr_page = tdr_page; 2439 err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid); 2440 mutex_unlock(&tdx_lock); 2441 2442 if (err == TDX_RND_NO_ENTROPY) { 2443 ret = -EAGAIN; 2444 goto free_packages; 2445 } 2446 2447 if (TDX_BUG_ON(err, TDH_MNG_CREATE, kvm)) { 2448 ret = -EIO; 2449 goto free_packages; 2450 } 2451 2452 for_each_online_cpu(i) { 2453 int pkg = topology_physical_package_id(i); 2454 2455 if (cpumask_test_and_set_cpu(pkg, packages)) 2456 continue; 2457 2458 /* 2459 * Program the memory controller in the package with an 2460 * encryption key associated to a TDX private host key id 2461 * assigned to this TDR. Concurrent operations on same memory 2462 * controller results in TDX_OPERAND_BUSY. No locking needed 2463 * beyond the cpus_read_lock() above as it serializes against 2464 * hotplug and the first online CPU of the package is always 2465 * used. We never have two CPUs in the same socket trying to 2466 * program the key. 2467 */ 2468 ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config, 2469 kvm_tdx, true); 2470 if (ret) 2471 break; 2472 } 2473 cpus_read_unlock(); 2474 free_cpumask_var(packages); 2475 if (ret) { 2476 i = 0; 2477 goto teardown; 2478 } 2479 2480 kvm_tdx->td.tdcs_pages = tdcs_pages; 2481 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { 2482 err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]); 2483 if (err == TDX_RND_NO_ENTROPY) { 2484 /* Here it's hard to allow userspace to retry. */ 2485 ret = -EAGAIN; 2486 goto teardown; 2487 } 2488 if (TDX_BUG_ON(err, TDH_MNG_ADDCX, kvm)) { 2489 ret = -EIO; 2490 goto teardown; 2491 } 2492 } 2493 2494 err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx); 2495 if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) { 2496 /* 2497 * Because a user gives operands, don't warn. 2498 * Return a hint to the user because it's sometimes hard for the 2499 * user to figure out which operand is invalid. 
SEAMCALL status 2500 * code includes which operand caused invalid operand error. 2501 */ 2502 *seamcall_err = err; 2503 ret = -EINVAL; 2504 goto teardown; 2505 } else if (TDX_BUG_ON_1(err, TDH_MNG_INIT, rcx, kvm)) { 2506 ret = -EIO; 2507 goto teardown; 2508 } 2509 2510 return 0; 2511 2512 /* 2513 * The sequence for freeing resources from a partially initialized TD 2514 * varies based on where in the initialization flow failure occurred. 2515 * Simply use the full teardown and destroy, which naturally play nice 2516 * with partial initialization. 2517 */ 2518 teardown: 2519 /* Only free pages not yet added, so start at 'i' */ 2520 for (; i < kvm_tdx->td.tdcs_nr_pages; i++) { 2521 if (tdcs_pages[i]) { 2522 __free_page(tdcs_pages[i]); 2523 tdcs_pages[i] = NULL; 2524 } 2525 } 2526 if (!kvm_tdx->td.tdcs_pages) 2527 kfree(tdcs_pages); 2528 2529 tdx_mmu_release_hkid(kvm); 2530 tdx_reclaim_td_control_pages(kvm); 2531 2532 return ret; 2533 2534 free_packages: 2535 cpus_read_unlock(); 2536 free_cpumask_var(packages); 2537 2538 free_tdcs: 2539 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { 2540 if (tdcs_pages[i]) 2541 __free_page(tdcs_pages[i]); 2542 } 2543 kfree(tdcs_pages); 2544 kvm_tdx->td.tdcs_pages = NULL; 2545 2546 free_tdr: 2547 if (tdr_page) 2548 __free_page(tdr_page); 2549 kvm_tdx->td.tdr_page = NULL; 2550 2551 free_hkid: 2552 tdx_hkid_free(kvm_tdx); 2553 2554 return ret; 2555 } 2556 2557 static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id, 2558 u64 *data) 2559 { 2560 u64 err; 2561 2562 err = tdh_mng_rd(&tdx->td, field_id, data); 2563 2564 return err; 2565 } 2566 2567 #define TDX_MD_UNREADABLE_LEAF_MASK GENMASK(30, 7) 2568 #define TDX_MD_UNREADABLE_SUBLEAF_MASK GENMASK(31, 7) 2569 2570 static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf, 2571 bool sub_leaf_set, int *entry_index, 2572 struct kvm_cpuid_entry2 *out) 2573 { 2574 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 2575 u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES; 
2576 u64 ebx_eax, edx_ecx; 2577 u64 err = 0; 2578 2579 if (sub_leaf > 0b1111111) 2580 return -EINVAL; 2581 2582 if (*entry_index >= KVM_MAX_CPUID_ENTRIES) 2583 return -EINVAL; 2584 2585 if (leaf & TDX_MD_UNREADABLE_LEAF_MASK || 2586 sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK) 2587 return -EINVAL; 2588 2589 /* 2590 * bit 23:17, RESERVED: reserved, must be 0; 2591 * bit 16, LEAF_31: leaf number bit 31; 2592 * bit 15:9, LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are 2593 * implicitly 0; 2594 * bit 8, SUBLEAF_NA: sub-leaf not applicable flag; 2595 * bit 7:1, SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1, 2596 * the SUBLEAF_6_0 is all-1. 2597 * sub-leaf bits 31:7 are implicitly 0; 2598 * bit 0, ELEMENT_I: Element index within field; 2599 */ 2600 field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16; 2601 field_id |= (leaf & 0x7f) << 9; 2602 if (sub_leaf_set) 2603 field_id |= (sub_leaf & 0x7f) << 1; 2604 else 2605 field_id |= 0x1fe; 2606 2607 err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax); 2608 if (err) //TODO check for specific errors 2609 goto err_out; 2610 2611 out->eax = (u32) ebx_eax; 2612 out->ebx = (u32) (ebx_eax >> 32); 2613 2614 field_id++; 2615 err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx); 2616 /* 2617 * It's weird that reading edx_ecx fails while reading ebx_eax 2618 * succeeded. 2619 */ 2620 if (WARN_ON_ONCE(err)) 2621 goto err_out; 2622 2623 out->ecx = (u32) edx_ecx; 2624 out->edx = (u32) (edx_ecx >> 32); 2625 2626 out->function = leaf; 2627 out->index = sub_leaf; 2628 out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0; 2629 2630 /* 2631 * Work around missing support on old TDX modules, fetch 2632 * guest maxpa from gfn_direct_bits.
2633 */ 2634 if (leaf == 0x80000008) { 2635 gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm)); 2636 unsigned int g_maxpa = __ffs(gpa_bits) + 1; 2637 2638 out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa); 2639 } 2640 2641 (*entry_index)++; 2642 2643 return 0; 2644 2645 err_out: 2646 out->eax = 0; 2647 out->ebx = 0; 2648 out->ecx = 0; 2649 out->edx = 0; 2650 2651 return -EIO; 2652 } 2653 2654 typedef void *tdx_vm_state_guard_t; 2655 2656 static tdx_vm_state_guard_t tdx_acquire_vm_state_locks(struct kvm *kvm) 2657 { 2658 int r; 2659 2660 mutex_lock(&kvm->lock); 2661 2662 if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) { 2663 r = -EBUSY; 2664 goto out_err; 2665 } 2666 2667 r = kvm_lock_all_vcpus(kvm); 2668 if (r) 2669 goto out_err; 2670 2671 /* 2672 * Note the unintuitive ordering! vcpu->mutex must be taken outside 2673 * kvm->slots_lock! 2674 */ 2675 mutex_lock(&kvm->slots_lock); 2676 return kvm; 2677 2678 out_err: 2679 mutex_unlock(&kvm->lock); 2680 return ERR_PTR(r); 2681 } 2682 2683 static void tdx_release_vm_state_locks(struct kvm *kvm) 2684 { 2685 mutex_unlock(&kvm->slots_lock); 2686 kvm_unlock_all_vcpus(kvm); 2687 mutex_unlock(&kvm->lock); 2688 } 2689 2690 DEFINE_CLASS(tdx_vm_state_guard, tdx_vm_state_guard_t, 2691 if (!IS_ERR(_T)) tdx_release_vm_state_locks(_T), 2692 tdx_acquire_vm_state_locks(kvm), struct kvm *kvm); 2693 2694 static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd) 2695 { 2696 struct kvm_tdx_init_vm __user *user_data = u64_to_user_ptr(cmd->data); 2697 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 2698 struct kvm_tdx_init_vm *init_vm; 2699 struct td_params *td_params = NULL; 2700 u32 nr_user_entries; 2701 int ret; 2702 2703 BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid)); 2704 BUILD_BUG_ON(sizeof(struct td_params) != 1024); 2705 2706 if (kvm_tdx->state != TD_STATE_UNINITIALIZED) 2707 return -EINVAL; 2708 2709 if (cmd->flags) 2710 return -EINVAL; 2711 2712 if 
(get_user(nr_user_entries, &user_data->cpuid.nent)) 2713 return -EFAULT; 2714 2715 if (nr_user_entries > KVM_MAX_CPUID_ENTRIES) 2716 return -E2BIG; 2717 2718 init_vm = memdup_user(user_data, 2719 struct_size(user_data, cpuid.entries, nr_user_entries)); 2720 if (IS_ERR(init_vm)) 2721 return PTR_ERR(init_vm); 2722 2723 if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) { 2724 ret = -EINVAL; 2725 goto out; 2726 } 2727 2728 if (init_vm->cpuid.padding) { 2729 ret = -EINVAL; 2730 goto out; 2731 } 2732 2733 td_params = kzalloc_obj(struct td_params); 2734 if (!td_params) { 2735 ret = -ENOMEM; 2736 goto out; 2737 } 2738 2739 ret = setup_tdparams(kvm, td_params, init_vm); 2740 if (ret) 2741 goto out; 2742 2743 ret = __tdx_td_init(kvm, td_params, &cmd->hw_error); 2744 if (ret) 2745 goto out; 2746 2747 kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET); 2748 kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER); 2749 kvm_tdx->attributes = td_params->attributes; 2750 kvm_tdx->xfam = td_params->xfam; 2751 2752 if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW) 2753 kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5; 2754 else 2755 kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4; 2756 2757 kvm_tdx->state = TD_STATE_INITIALIZED; 2758 out: 2759 /* kfree() accepts NULL. */ 2760 kfree(init_vm); 2761 kfree(td_params); 2762 2763 return ret; 2764 } 2765 2766 void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) 2767 { 2768 /* 2769 * flush_tlb_current() is invoked when the first time for the vcpu to 2770 * run or when root of shared EPT is invalidated. 2771 * KVM only needs to flush shared EPT because the TDX module handles TLB 2772 * invalidation for private EPT in tdh_vp_enter(); 2773 * 2774 * A single context invalidation for shared EPT can be performed here. 
2775 * However, this single context invalidation requires the private EPTP 2776 * rather than the shared EPTP to flush shared EPT, as shared EPT uses 2777 * private EPTP as its ASID for TLB invalidation. 2778 * 2779 * To avoid reading back private EPTP, perform a global invalidation for 2780 * shared EPT instead to keep this function simple. 2781 */ 2782 ept_sync_global(); 2783 } 2784 2785 void tdx_flush_tlb_all(struct kvm_vcpu *vcpu) 2786 { 2787 /* 2788 * TDX has called tdx_track() in tdx_sept_remove_private_spte() to 2789 * ensure that private EPT will be flushed on the next TD enter. No need 2790 * to call tdx_track() here again even when this callback is a result of 2791 * zapping private EPT. 2792 * 2793 * Due to the lack of the context to determine which EPT has been 2794 * affected by zapping, invoke invept() directly here for both shared 2795 * EPT and private EPT for simplicity, though it's not necessary for 2796 * private EPT. 2797 */ 2798 ept_sync_global(); 2799 } 2800 2801 static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd) 2802 { 2803 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 2804 2805 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) 2806 return -EINVAL; 2807 2808 cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td); 2809 if (tdx_operand_busy(cmd->hw_error)) 2810 return -EBUSY; 2811 if (TDX_BUG_ON(cmd->hw_error, TDH_MR_FINALIZE, kvm)) 2812 return -EIO; 2813 2814 kvm_tdx->state = TD_STATE_RUNNABLE; 2815 /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */ 2816 smp_wmb(); 2817 kvm->arch.pre_fault_allowed = true; 2818 return 0; 2819 } 2820 2821 static int tdx_get_cmd(void __user *argp, struct kvm_tdx_cmd *cmd) 2822 { 2823 if (copy_from_user(cmd, argp, sizeof(*cmd))) 2824 return -EFAULT; 2825 2826 /* 2827 * Userspace should never set hw_error. KVM writes hw_error to report 2828 * hardware-defined error back to userspace. 
2829 */ 2830 if (cmd->hw_error) 2831 return -EINVAL; 2832 2833 return 0; 2834 } 2835 2836 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) 2837 { 2838 struct kvm_tdx_cmd tdx_cmd; 2839 int r; 2840 2841 r = tdx_get_cmd(argp, &tdx_cmd); 2842 if (r) 2843 return r; 2844 2845 if (tdx_cmd.id == KVM_TDX_CAPABILITIES) 2846 return tdx_get_capabilities(&tdx_cmd); 2847 2848 CLASS(tdx_vm_state_guard, guard)(kvm); 2849 if (IS_ERR(guard)) 2850 return PTR_ERR(guard); 2851 2852 switch (tdx_cmd.id) { 2853 case KVM_TDX_INIT_VM: 2854 r = tdx_td_init(kvm, &tdx_cmd); 2855 break; 2856 case KVM_TDX_FINALIZE_VM: 2857 r = tdx_td_finalize(kvm, &tdx_cmd); 2858 break; 2859 default: 2860 return -EINVAL; 2861 } 2862 2863 if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd))) 2864 return -EFAULT; 2865 2866 return r; 2867 } 2868 2869 /* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */ 2870 static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) 2871 { 2872 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 2873 struct vcpu_tdx *tdx = to_tdx(vcpu); 2874 struct page *page; 2875 int ret, i; 2876 u64 err; 2877 2878 page = alloc_page(GFP_KERNEL); 2879 if (!page) 2880 return -ENOMEM; 2881 tdx->vp.tdvpr_page = page; 2882 2883 /* 2884 * page_to_phys() does not work in 'noinstr' code, like guest 2885 * entry via tdh_vp_enter(). Precalculate and store it instead 2886 * of doing it at runtime later. 
2887 */ 2888 tdx->vp.tdvpr_pa = page_to_phys(tdx->vp.tdvpr_page); 2889 2890 tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages), 2891 GFP_KERNEL); 2892 if (!tdx->vp.tdcx_pages) { 2893 ret = -ENOMEM; 2894 goto free_tdvpr; 2895 } 2896 2897 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { 2898 page = alloc_page(GFP_KERNEL); 2899 if (!page) { 2900 ret = -ENOMEM; 2901 goto free_tdcx; 2902 } 2903 tdx->vp.tdcx_pages[i] = page; 2904 } 2905 2906 err = tdh_vp_create(&kvm_tdx->td, &tdx->vp); 2907 if (TDX_BUG_ON(err, TDH_VP_CREATE, vcpu->kvm)) { 2908 ret = -EIO; 2909 goto free_tdcx; 2910 } 2911 2912 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { 2913 err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]); 2914 if (TDX_BUG_ON(err, TDH_VP_ADDCX, vcpu->kvm)) { 2915 /* 2916 * Pages already added are reclaimed by the vcpu_free 2917 * method, but the rest are freed here. 2918 */ 2919 for (; i < kvm_tdx->td.tdcx_nr_pages; i++) { 2920 __free_page(tdx->vp.tdcx_pages[i]); 2921 tdx->vp.tdcx_pages[i] = NULL; 2922 } 2923 return -EIO; 2924 } 2925 } 2926 2927 /* 2928 * tdh_vp_init() can take an exclusive lock of the TDR resource inside 2929 * the TDX-Module. The TDR resource is also taken as shared in several 2930 * no-fail MMU paths, which could return TDX_OPERAND_BUSY on contention 2931 * (TDX-Module locks are try-lock implementations with no slow path). 2932 * Take mmu_lock for write to reflect the nature of the lock taken by 2933 * the TDX-Module, and to ensure the no-fail MMU paths succeed, e.g. if 2934 * a concurrent PUNCH_HOLE on guest_memfd triggers removal of SPTEs. 
2935 */ 2936 scoped_guard(write_lock, &vcpu->kvm->mmu_lock) { 2937 err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id); 2938 if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm)) 2939 return -EIO; 2940 } 2941 2942 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 2943 2944 return 0; 2945 2946 free_tdcx: 2947 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { 2948 if (tdx->vp.tdcx_pages[i]) 2949 __free_page(tdx->vp.tdcx_pages[i]); 2950 tdx->vp.tdcx_pages[i] = NULL; 2951 } 2952 kfree(tdx->vp.tdcx_pages); 2953 tdx->vp.tdcx_pages = NULL; 2954 2955 free_tdvpr: 2956 if (tdx->vp.tdvpr_page) 2957 __free_page(tdx->vp.tdvpr_page); 2958 tdx->vp.tdvpr_page = NULL; 2959 tdx->vp.tdvpr_pa = 0; 2960 2961 return ret; 2962 } 2963 2964 /* Sometimes reads multiple subleafs. Returns 0 if at least one entry was written (*entry_index is advanced once per entry written), non-zero otherwise. */ 2965 static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index, 2966 struct kvm_cpuid_entry2 *output_e) 2967 { 2968 int sub_leaf = 0; 2969 int ret; 2970 2971 /* First try without a subleaf */ 2972 ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e); 2973 2974 /* On success, or on an invalid leaf, there is nothing more to try */ 2975 if (ret != -EIO) 2976 return ret; 2977 2978 /* 2979 * If the try without a subleaf failed, try reading subleafs until 2980 * failure. The TDX module only supports 7 bits (6:0) of subleaf index. 2981 */ 2982 while (1) { 2983 /* Keep reading subleafs until there is a failure.
*/ 2984 if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e)) 2985 return !sub_leaf; 2986 2987 sub_leaf++; 2988 output_e++; 2989 } 2990 2991 return 0; 2992 } 2993 2994 static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) 2995 { 2996 struct kvm_cpuid2 __user *output; 2997 struct kvm_cpuid2 *td_cpuid; 2998 int r = 0, i = 0, leaf; 2999 u32 level; 3000 3001 output = u64_to_user_ptr(cmd->data); 3002 td_cpuid = kzalloc(sizeof(*td_cpuid) + 3003 sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES, 3004 GFP_KERNEL); 3005 if (!td_cpuid) 3006 return -ENOMEM; 3007 3008 if (copy_from_user(td_cpuid, output, sizeof(*output))) { 3009 r = -EFAULT; 3010 goto out; 3011 } 3012 3013 /* Read max CPUID for normal range */ 3014 if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) { 3015 r = -EIO; 3016 goto out; 3017 } 3018 level = td_cpuid->entries[0].eax; 3019 3020 for (leaf = 1; leaf <= level; leaf++) 3021 tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]); 3022 3023 /* Read max CPUID for extended range */ 3024 if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) { 3025 r = -EIO; 3026 goto out; 3027 } 3028 level = td_cpuid->entries[i - 1].eax; 3029 3030 for (leaf = 0x80000001; leaf <= level; leaf++) 3031 tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]); 3032 3033 if (td_cpuid->nent < i) 3034 r = -E2BIG; 3035 td_cpuid->nent = i; 3036 3037 if (copy_to_user(output, td_cpuid, sizeof(*output))) { 3038 r = -EFAULT; 3039 goto out; 3040 } 3041 3042 if (r == -E2BIG) 3043 goto out; 3044 3045 if (copy_to_user(output->entries, td_cpuid->entries, 3046 td_cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 3047 r = -EFAULT; 3048 3049 out: 3050 kfree(td_cpuid); 3051 3052 return r; 3053 } 3054 3055 static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) 3056 { 3057 u64 apic_base; 3058 struct vcpu_tdx *tdx = to_tdx(vcpu); 3059 int ret; 3060 3061 if (cmd->flags) 3062 return -EINVAL; 3063 3064 
if (tdx->state != VCPU_TD_STATE_UNINITIALIZED) 3065 return -EINVAL; 3066 3067 /* 3068 * TDX requires X2APIC, userspace is responsible for configuring guest 3069 * CPUID accordingly. 3070 */ 3071 apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC | 3072 (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0); 3073 if (kvm_apic_set_base(vcpu, apic_base, true)) 3074 return -EINVAL; 3075 3076 ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data); 3077 if (ret) 3078 return ret; 3079 3080 td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR); 3081 td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc)); 3082 td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR); 3083 3084 tdx->state = VCPU_TD_STATE_INITIALIZED; 3085 3086 return 0; 3087 } 3088 3089 void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 3090 { 3091 /* 3092 * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all 3093 * INIT events. 3094 * 3095 * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as 3096 * userspace needs to define the vCPU model before KVM can initialize 3097 * vCPU state, e.g. to enable x2APIC. 
3098 */ 3099 WARN_ON_ONCE(init_event); 3100 } 3101 3102 struct tdx_gmem_post_populate_arg { 3103 struct kvm_vcpu *vcpu; 3104 __u32 flags; 3105 }; 3106 3107 static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, 3108 struct page *src_page, void *_arg) 3109 { 3110 struct tdx_gmem_post_populate_arg *arg = _arg; 3111 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 3112 u64 err, entry, level_state; 3113 gpa_t gpa = gfn_to_gpa(gfn); 3114 int ret, i; 3115 3116 if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm)) 3117 return -EIO; 3118 3119 if (!src_page) 3120 return -EOPNOTSUPP; 3121 3122 kvm_tdx->page_add_src = src_page; 3123 ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn); 3124 kvm_tdx->page_add_src = NULL; 3125 3126 if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION)) 3127 return ret; 3128 3129 /* 3130 * Note, MR.EXTEND can fail if the S-EPT mapping is somehow removed 3131 * between mapping the pfn and now, but slots_lock prevents memslot 3132 * updates, filemap_invalidate_lock() prevents guest_memfd updates, 3133 * mmu_notifier events can't reach S-EPT entries, and KVM's internal 3134 * zapping flows are mutually exclusive with S-EPT mappings. 3135 */ 3136 for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) { 3137 err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, &level_state); 3138 if (TDX_BUG_ON_2(err, TDH_MR_EXTEND, entry, level_state, kvm)) 3139 return -EIO; 3140 } 3141 3142 return 0; 3143 } 3144 3145 static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) 3146 { 3147 struct vcpu_tdx *tdx = to_tdx(vcpu); 3148 struct kvm *kvm = vcpu->kvm; 3149 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 3150 struct kvm_tdx_init_mem_region region; 3151 struct tdx_gmem_post_populate_arg arg; 3152 long gmem_ret; 3153 int ret; 3154 3155 if (tdx->state != VCPU_TD_STATE_INITIALIZED) 3156 return -EINVAL; 3157 3158 /* Once TD is finalized, the initial guest memory is fixed. 
*/ 3159 if (kvm_tdx->state == TD_STATE_RUNNABLE) 3160 return -EINVAL; 3161 3162 if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION) 3163 return -EINVAL; 3164 3165 if (copy_from_user(®ion, u64_to_user_ptr(cmd->data), sizeof(region))) 3166 return -EFAULT; 3167 3168 if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) || 3169 !region.nr_pages || 3170 region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa || 3171 !vt_is_tdx_private_gpa(kvm, region.gpa) || 3172 !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1)) 3173 return -EINVAL; 3174 3175 ret = 0; 3176 while (region.nr_pages) { 3177 if (signal_pending(current)) { 3178 ret = -EINTR; 3179 break; 3180 } 3181 3182 arg = (struct tdx_gmem_post_populate_arg) { 3183 .vcpu = vcpu, 3184 .flags = cmd->flags, 3185 }; 3186 gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa), 3187 u64_to_user_ptr(region.source_addr), 3188 1, tdx_gmem_post_populate, &arg); 3189 if (gmem_ret < 0) { 3190 ret = gmem_ret; 3191 break; 3192 } 3193 3194 if (gmem_ret != 1) { 3195 ret = -EIO; 3196 break; 3197 } 3198 3199 region.source_addr += PAGE_SIZE; 3200 region.gpa += PAGE_SIZE; 3201 region.nr_pages--; 3202 3203 cond_resched(); 3204 } 3205 3206 if (copy_to_user(u64_to_user_ptr(cmd->data), ®ion, sizeof(region))) 3207 ret = -EFAULT; 3208 return ret; 3209 } 3210 3211 int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp) 3212 { 3213 struct kvm *kvm = vcpu->kvm; 3214 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 3215 struct kvm_tdx_cmd cmd; 3216 int r; 3217 3218 r = tdx_get_cmd(argp, &cmd); 3219 if (r) 3220 return r; 3221 3222 CLASS(tdx_vm_state_guard, guard)(kvm); 3223 if (IS_ERR(guard)) 3224 return PTR_ERR(guard); 3225 3226 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) 3227 return -EINVAL; 3228 3229 vcpu_load(vcpu); 3230 3231 switch (cmd.id) { 3232 case KVM_TDX_INIT_MEM_REGION: 3233 r = tdx_vcpu_init_mem_region(vcpu, &cmd); 3234 break; 3235 case KVM_TDX_INIT_VCPU: 3236 r 
= tdx_vcpu_init(vcpu, &cmd); 3237 break; 3238 default: 3239 r = -ENOIOCTLCMD; 3240 break; 3241 } 3242 3243 vcpu_put(vcpu); 3244 3245 return r; 3246 } 3247 3248 int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) 3249 { 3250 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 3251 struct kvm_tdx_cmd cmd; 3252 int ret; 3253 3254 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) 3255 return -EINVAL; 3256 3257 ret = tdx_get_cmd(argp, &cmd); 3258 if (ret) 3259 return ret; 3260 3261 switch (cmd.id) { 3262 case KVM_TDX_GET_CPUID: 3263 ret = tdx_vcpu_get_cpuid(vcpu, &cmd); 3264 break; 3265 default: 3266 ret = -EINVAL; 3267 break; 3268 } 3269 3270 return ret; 3271 } 3272 3273 int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private) 3274 { 3275 if (!is_private) 3276 return 0; 3277 3278 return PG_LEVEL_4K; 3279 } 3280 3281 void tdx_hardware_unsetup(void) 3282 { 3283 misc_cg_set_capacity(MISC_CG_RES_TDX, 0); 3284 } 3285 3286 static int __init __tdx_hardware_setup(void) 3287 { 3288 const struct tdx_sys_info_td_conf *td_conf; 3289 int i; 3290 3291 for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) { 3292 /* 3293 * Check if MSRs (tdx_uret_msrs) can be saved/restored 3294 * before returning to user space. 
3295 */ 3296 tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr); 3297 if (tdx_uret_msrs[i].slot == -1) { 3298 /* If any MSR isn't supported, it is a KVM bug */ 3299 pr_err("MSR %x isn't included by kvm_find_user_return_msr\n", 3300 tdx_uret_msrs[i].msr); 3301 return -EIO; 3302 } 3303 } 3304 3305 /* Get TDX global information for later use */ 3306 tdx_sysinfo = tdx_get_sysinfo(); 3307 if (!tdx_sysinfo) 3308 return -ENODEV; 3309 3310 /* Check TDX module and KVM capabilities */ 3311 if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) || 3312 !tdx_get_supported_xfam(&tdx_sysinfo->td_conf)) 3313 return -EINVAL; 3314 3315 if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)) 3316 return -EINVAL; 3317 3318 /* 3319 * TDX has its own limit of maximum vCPUs it can support for all 3320 * TDX guests in addition to KVM_MAX_VCPUS. Userspace needs to 3321 * query TDX guest's maximum vCPUs by checking KVM_CAP_MAX_VCPU 3322 * extension on per-VM basis. 3323 * 3324 * TDX module reports such limit via the MAX_VCPU_PER_TD global 3325 * metadata. Different modules may report different values. 3326 * Some old module may also not support this metadata (in which 3327 * case this limit is U16_MAX). 3328 * 3329 * In practice, the reported value reflects the maximum logical 3330 * CPUs that ALL the platforms that the module supports can 3331 * possibly have. 3332 * 3333 * Simply forwarding the MAX_VCPU_PER_TD to userspace could 3334 * result in an unpredictable ABI. KVM instead always advertise 3335 * the number of logical CPUs the platform has as the maximum 3336 * vCPUs for TDX guests. 3337 * 3338 * Make sure MAX_VCPU_PER_TD reported by TDX module is not 3339 * smaller than the number of logical CPUs, otherwise KVM will 3340 * report an unsupported value to userspace. 
3341 * 3342 * Note, a platform with TDX enabled in the BIOS cannot support 3343 * physical CPU hotplug, and TDX requires the BIOS has marked 3344 * all logical CPUs in MADT table as enabled. Just use 3345 * num_present_cpus() for the number of logical CPUs. 3346 */ 3347 td_conf = &tdx_sysinfo->td_conf; 3348 if (td_conf->max_vcpus_per_td < num_present_cpus()) { 3349 pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n", 3350 td_conf->max_vcpus_per_td, num_present_cpus()); 3351 return -EINVAL; 3352 } 3353 3354 if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) 3355 return -EINVAL; 3356 3357 return 0; 3358 } 3359 3360 int __init tdx_hardware_setup(void) 3361 { 3362 int r, i; 3363 3364 /* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */ 3365 for_each_possible_cpu(i) 3366 INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i)); 3367 3368 if (!enable_tdx) 3369 return 0; 3370 3371 if (!enable_ept) { 3372 pr_err("EPT is required for TDX\n"); 3373 goto success_disable_tdx; 3374 } 3375 3376 if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) { 3377 pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n"); 3378 goto success_disable_tdx; 3379 } 3380 3381 if (!enable_apicv) { 3382 pr_err("APICv is required for TDX\n"); 3383 goto success_disable_tdx; 3384 } 3385 3386 if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) { 3387 pr_err("tdx: OSXSAVE is required for TDX\n"); 3388 goto success_disable_tdx; 3389 } 3390 3391 if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) { 3392 pr_err("TDX not supported by the host platform\n"); 3393 goto success_disable_tdx; 3394 } 3395 3396 r = __tdx_hardware_setup(); 3397 if (r) { 3398 /* 3399 * Disable TDX only but don't fail to load module if the TDX 3400 * module could not be loaded. No need to print message saying 3401 * "module is not loaded" because it was printed when the first 3402 * SEAMCALL failed. 
Don't bother unwinding the S-EPT hooks or 3403 * vm_size, as kvm_x86_ops have already been finalized (and are 3404 * intentionally not exported). The S-EPT code is unreachable, 3405 * and allocating a few more bytes per VM in a should-be-rare 3406 * failure scenario is a non-issue. 3407 */ 3408 if (r == -ENODEV) 3409 goto success_disable_tdx; 3410 3411 return r; 3412 } 3413 3414 KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx); 3415 3416 vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx)); 3417 3418 vt_x86_ops.link_external_spt = tdx_sept_link_private_spt; 3419 vt_x86_ops.set_external_spte = tdx_sept_set_private_spte; 3420 vt_x86_ops.free_external_spt = tdx_sept_free_private_spt; 3421 vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte; 3422 vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt; 3423 return 0; 3424 3425 success_disable_tdx: 3426 enable_tdx = 0; 3427 return 0; 3428 } 3429