// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) 2021-2022 Intel Corporation */

#undef pr_fmt
#define pr_fmt(fmt)	"tdx: " fmt

#include <linux/cpufeature.h>
#include <linux/export.h>
#include <linux/io.h>
#include <linux/kexec.h>
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
#include <asm/ia32.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/paravirt_types.h>
#include <asm/pgtable.h>
#include <asm/set_memory.h>
#include <asm/traps.h>

/* MMIO direction */
#define EPT_READ	0
#define EPT_WRITE	1

/* Port I/O direction */
#define PORT_READ	0
#define PORT_WRITE	1

/* See Exit Qualification for I/O Instructions in VMX documentation */
#define VE_IS_IO_IN(e)		((e) & BIT(3))
#define VE_GET_IO_SIZE(e)	(((e) & GENMASK(2, 0)) + 1)
#define VE_GET_PORT_NUM(e)	((e) >> 16)
#define VE_IS_IO_STRING(e)	((e) & BIT(4))

/* TDX Module call error codes */
#define TDCALL_RETURN_CODE(a)	((a) >> 32)
#define TDCALL_INVALID_OPERAND	0xc0000100
#define TDCALL_OPERAND_BUSY	0x80000200

#define TDREPORT_SUBTYPE_0	0

static atomic_long_t nr_shared;

/* Called from __tdx_hypercall() for unrecoverable failure */
noinstr void __noreturn __tdx_hypercall_failed(void)
{
	instrumentation_begin();
	panic("TDVMCALL failed. TDX module bug?");
}

#ifdef CONFIG_KVM_GUEST
long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2,
		       unsigned long p3, unsigned long p4)
{
	struct tdx_module_args args = {
		.r10 = nr,
		.r11 = p1,
		.r12 = p2,
		.r13 = p3,
		.r14 = p4,
	};

	return __tdx_hypercall(&args);
}
EXPORT_SYMBOL_GPL(tdx_kvm_hypercall);
#endif

/*
 * Used for TDX guests to make calls directly to the TD module. This
 * should only be used for calls that have no legitimate reason to fail
 * or where the kernel can not survive the call failing.
 */
static inline void tdcall(u64 fn, struct tdx_module_args *args)
{
	if (__tdcall_ret(fn, args))
		panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
}

/* Read TD-scoped metadata */
static inline u64 tdg_vm_rd(u64 field, u64 *value)
{
	struct tdx_module_args args = {
		.rdx = field,
	};
	u64 ret;

	ret = __tdcall_ret(TDG_VM_RD, &args);
	*value = args.r8;

	return ret;
}

/* Write TD-scoped metadata */
static inline u64 tdg_vm_wr(u64 field, u64 value, u64 mask)
{
	struct tdx_module_args args = {
		.rdx = field,
		.r8 = value,
		.r9 = mask,
	};

	return __tdcall(TDG_VM_WR, &args);
}

/**
 * tdx_mcall_get_report0() - Wrapper to get TDREPORT0 (a.k.a. TDREPORT
 *                           subtype 0) using TDG.MR.REPORT TDCALL.
 * @reportdata: Address of the input buffer which contains user-defined
 *              REPORTDATA to be included into TDREPORT.
 * @tdreport: Address of the output buffer to store TDREPORT.
 *
 * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module v1.0
 * specification for more information on TDG.MR.REPORT TDCALL.
 *
 * It is used in the TDX guest driver module to get the TDREPORT0.
 *
 * Return 0 on success, -ENXIO for invalid operands, -EBUSY for busy operation,
 * or -EIO on other TDCALL failures.
 */
int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
{
	struct tdx_module_args args = {
		.rcx = virt_to_phys(tdreport),
		.rdx = virt_to_phys(reportdata),
		.r8 = TDREPORT_SUBTYPE_0,
	};
	u64 ret;

	ret = __tdcall(TDG_MR_REPORT, &args);
	if (ret) {
		if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
			return -ENXIO;
		else if (TDCALL_RETURN_CODE(ret) == TDCALL_OPERAND_BUSY)
			return -EBUSY;
		return -EIO;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tdx_mcall_get_report0);
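
/*
 * Illustrative sketch, not used by this file: how a caller such as the TDX
 * guest attestation driver might invoke tdx_mcall_get_report0(). The 64-byte
 * REPORTDATA and 1024-byte TDREPORT sizes follow the TDX module spec; both
 * buffers must be physically contiguous kernel memory because only their
 * physical addresses are handed to the TDX module. "user_data" is an assumed
 * caller-provided buffer.
 *
 *	u8 *reportdata = kzalloc(64, GFP_KERNEL);
 *	u8 *tdreport = kzalloc(1024, GFP_KERNEL);
 *	int ret = -ENOMEM;
 *
 *	if (reportdata && tdreport) {
 *		memcpy(reportdata, user_data, 64);
 *		ret = tdx_mcall_get_report0(reportdata, tdreport);
 *	}
 *	// on success, tdreport now holds TDREPORT0
 */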

/**
 * tdx_mcall_extend_rtmr() - Wrapper to extend RTMR registers using
 *                           TDG.MR.RTMR.EXTEND TDCALL.
 * @index: Index of RTMR register to be extended.
 * @data: Address of the input buffer with RTMR register extend data.
 *
 * Refer to section titled "TDG.MR.RTMR.EXTEND leaf" in the TDX Module v1.0
 * specification for more information on TDG.MR.RTMR.EXTEND TDCALL.
 *
 * It is used in the TDX guest driver module to allow a user to extend the
 * RTMR registers.
 *
 * Return 0 on success, -ENXIO for invalid operands, -EBUSY for busy operation,
 * or -EIO on other TDCALL failures.
 */
int tdx_mcall_extend_rtmr(u8 index, u8 *data)
{
	struct tdx_module_args args = {
		.rcx = virt_to_phys(data),
		.rdx = index,
	};
	u64 ret;

	ret = __tdcall(TDG_MR_RTMR_EXTEND, &args);
	if (ret) {
		if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
			return -ENXIO;
		if (TDCALL_RETURN_CODE(ret) == TDCALL_OPERAND_BUSY)
			return -EBUSY;
		return -EIO;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tdx_mcall_extend_rtmr);

/**
 * tdx_hcall_get_quote() - Wrapper to request TD Quote using GetQuote
 *                         hypercall.
 * @buf: Address of the directly mapped shared kernel buffer which
 *       contains TDREPORT. The same buffer will be used by VMM to
 *       store the generated TD Quote output.
 * @size: size of the tdquote buffer (4KB-aligned).
 *
 * Refer to section titled "TDG.VP.VMCALL<GetQuote>" in the TDX GHCI
 * v1.0 specification for more information on GetQuote hypercall.
 * It is used in the TDX guest driver module to get the TD Quote.
 *
 * Return 0 on success or error code on failure.
 */
u64 tdx_hcall_get_quote(u8 *buf, size_t size)
{
	/* Since buf is shared memory, set the shared (decrypted) bits */
	return _tdx_hypercall(TDVMCALL_GET_QUOTE, cc_mkdec(virt_to_phys(buf)), size, 0, 0);
}
EXPORT_SYMBOL_GPL(tdx_hcall_get_quote);
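
/*
 * Illustrative sketch, not used here: the caller is responsible for handing
 * in memory that is already shared with the VMM, for example:
 *
 *	buf = alloc_pages_exact(buf_size, GFP_KERNEL | __GFP_ZERO);
 *	if (set_memory_decrypted((unsigned long)buf, buf_size / PAGE_SIZE))
 *		return -EIO;		// buffer could not be shared
 *	memcpy(buf, tdreport, TDREPORT_SIZE);
 *	ret = tdx_hcall_get_quote(buf, buf_size);
 *
 * buf_size and TDREPORT_SIZE are assumed caller-chosen values; buf_size must
 * be 4KB-aligned and large enough for the Quote the VMM writes back.
 */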

static void __noreturn tdx_panic(const char *msg)
{
	struct tdx_module_args args = {
		.r10 = TDX_HYPERCALL_STANDARD,
		.r11 = TDVMCALL_REPORT_FATAL_ERROR,
		.r12 = 0, /* Error code: 0 is Panic */
	};
	union {
		/* Define register order according to the GHCI */
		struct { u64 r14, r15, rbx, rdi, rsi, r8, r9, rdx; };

		char bytes[64] __nonstring;
	} message;

	/* VMM assumes '\0' in byte 65, if the message took all 64 bytes */
	strtomem_pad(message.bytes, msg, '\0');

	args.r8  = message.r8;
	args.r9  = message.r9;
	args.r14 = message.r14;
	args.r15 = message.r15;
	args.rdi = message.rdi;
	args.rsi = message.rsi;
	args.rbx = message.rbx;
	args.rdx = message.rdx;

	/*
	 * This hypercall should never return and it is not safe
	 * to keep the guest running. Call it forever if it
	 * happens to return.
	 */
	while (1)
		__tdx_hypercall(&args);
}
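
/*
 * Worked example of the register packing above (illustrative): for
 * msg = "Oops", strtomem_pad() zero-pads the 64-byte buffer, the first eight
 * bytes land in message.r14, and the hypercall is issued with
 * args.r14 == 0x73706f4f ("Oops" read as a little-endian u64) while the
 * remaining GHCI message registers (r15, rbx, rdi, rsi, r8, r9, rdx) are 0.
 */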

/*
 * The kernel cannot handle #VEs when accessing normal kernel memory. Ensure
 * that no #VE will be delivered for accesses to TD-private memory.
 *
 * TDX 1.0 does not allow the guest to disable SEPT #VE on its own. The VMM
 * controls if the guest will receive such #VE with TD attribute
 * TDX_ATTR_SEPT_VE_DISABLE.
 *
 * Newer TDX modules allow the guest to control if it wants to receive SEPT
 * violation #VEs.
 *
 * Check if the feature is available and disable SEPT #VE if possible.
 *
 * If the TD is allowed to disable/enable SEPT #VEs, the TDX_ATTR_SEPT_VE_DISABLE
 * attribute is no longer reliable. It reflects the initial state of the
 * control for the TD, but it will not be updated if someone (e.g. bootloader)
 * changes it before the kernel starts. The kernel must check the TDCS_TD_CTLS
 * bit to determine if SEPT #VEs are enabled or disabled.
 */
static void disable_sept_ve(u64 td_attr)
{
	const char *msg = "TD misconfiguration: SEPT #VE has to be disabled";
	bool debug = td_attr & TDX_ATTR_DEBUG;
	u64 config, controls;

	/* Is this TD allowed to disable SEPT #VE? */
	tdg_vm_rd(TDCS_CONFIG_FLAGS, &config);
	if (!(config & TDCS_CONFIG_FLEXIBLE_PENDING_VE)) {
		/* No SEPT #VE controls for the guest: check the attribute */
		if (td_attr & TDX_ATTR_SEPT_VE_DISABLE)
			return;

		/* Relax SEPT_VE_DISABLE check for debug TD for backtraces */
		if (debug)
			pr_warn("%s\n", msg);
		else
			tdx_panic(msg);
		return;
	}

	/* Check if SEPT #VE has been disabled before us */
	tdg_vm_rd(TDCS_TD_CTLS, &controls);
	if (controls & TD_CTLS_PENDING_VE_DISABLE)
		return;

	/* Keep #VEs enabled for splats in debugging environments */
	if (debug)
		return;

	/* Disable SEPT #VEs */
	tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_PENDING_VE_DISABLE,
		  TD_CTLS_PENDING_VE_DISABLE);
}

/*
 * TDX 1.0 generates a #VE when accessing topology-related CPUID leafs (0xB and
 * 0x1F) and the X2APIC_APICID MSR. The kernel returns all zeros on CPUID #VEs.
 * In practice, this means that the kernel can only boot with a plain topology.
 * Any complications will cause problems.
 *
 * The ENUM_TOPOLOGY feature allows the VMM to provide topology information.
 * Enabling the feature eliminates topology-related #VEs: the TDX module
 * virtualizes accesses to the CPUID leafs and the MSR.
 *
 * Enable ENUM_TOPOLOGY if it is available.
 */
static void enable_cpu_topology_enumeration(void)
{
	u64 configured;

	/* Has the VMM provided a valid topology configuration? */
	tdg_vm_rd(TDCS_TOPOLOGY_ENUM_CONFIGURED, &configured);
	if (!configured) {
		pr_err("VMM did not configure X2APIC_IDs properly\n");
		return;
	}

	tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_ENUM_TOPOLOGY, TD_CTLS_ENUM_TOPOLOGY);
}

static void reduce_unnecessary_ve(void)
{
	u64 err = tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_REDUCE_VE, TD_CTLS_REDUCE_VE);

	if (err == TDX_SUCCESS)
		return;

	/*
	 * Enabling REDUCE_VE includes ENUM_TOPOLOGY. Only try to
	 * enable ENUM_TOPOLOGY if REDUCE_VE was not successful.
	 */
	enable_cpu_topology_enumeration();
}

static void tdx_setup(u64 *cc_mask)
{
	struct tdx_module_args args = {};
	unsigned int gpa_width;
	u64 td_attr;

	/*
	 * TDINFO TDX module call is used to get the TD execution environment
	 * information like GPA width, number of available vcpus, debug mode
	 * information, etc. More details about the ABI can be found in TDX
	 * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL
	 * [TDG.VP.INFO].
	 */
	tdcall(TDG_VP_INFO, &args);

	/*
	 * The highest bit of a guest physical address is the "sharing" bit.
	 * Set it for shared pages and clear it for private pages.
	 *
	 * The GPA width that comes out of this call is critical. TDX guests
	 * can not meaningfully run without it.
	 */
	gpa_width = args.rcx & GENMASK(5, 0);
	*cc_mask = BIT_ULL(gpa_width - 1);

	td_attr = args.rdx;

	/* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
	tdg_vm_wr(TDCS_NOTIFY_ENABLES, 0, -1ULL);

	disable_sept_ve(td_attr);

	reduce_unnecessary_ve();
}
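
/*
 * Worked example (illustrative): for a TD reporting gpa_width == 52,
 * cc_mask == BIT_ULL(51). A GPA with bit 51 set refers to a shared mapping
 * visible to the VMM, while the same GPA with bit 51 clear refers to
 * TD-private memory. For TDX, cc_mkenc() clears and cc_mkdec() sets exactly
 * this bit.
 */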

/*
 * The TDX module spec states that #VE may be injected for a limited set of
 * reasons:
 *
 *  - Emulation of the architectural #VE injection on EPT violation;
 *
 *  - As a result of guest TD execution of a disallowed instruction,
 *    a disallowed MSR access, or CPUID virtualization;
 *
 *  - A notification to the guest TD about anomalous behavior;
 *
 * The last one is opt-in and is not used by the kernel.
 *
 * The Intel Software Developer's Manual describes the cases when the
 * instruction length field can be used in section "Information for VM Exits
 * Due to Instruction Execution".
 *
 * For TDX, it ultimately means GET_VEINFO provides reliable instruction length
 * information if #VE occurred due to instruction execution, but not for EPT
 * violations.
 */
static int ve_instr_len(struct ve_info *ve)
{
	switch (ve->exit_reason) {
	case EXIT_REASON_HLT:
	case EXIT_REASON_MSR_READ:
	case EXIT_REASON_MSR_WRITE:
	case EXIT_REASON_CPUID:
	case EXIT_REASON_IO_INSTRUCTION:
		/* It is safe to use ve->instr_len for #VE due to instructions */
		return ve->instr_len;
	case EXIT_REASON_EPT_VIOLATION:
		/*
		 * For EPT violations, ve->instr_len is not defined. For those,
		 * the kernel must decode instructions manually and should not
		 * be using this function.
		 */
		WARN_ONCE(1, "ve->instr_len is not defined for EPT violations");
		return 0;
	default:
		WARN_ONCE(1, "Unexpected #VE-type: %lld\n", ve->exit_reason);
		return ve->instr_len;
	}
}
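
/*
 * For instance, a CPUID-induced #VE reports instr_len == 2 (the two-byte
 * 0F A2 opcode); after handle_cpuid() succeeds, tdx_handle_virt_exception()
 * below advances RIP by exactly that much so the guest resumes at the
 * instruction following the CPUID.
 */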

static u64 __cpuidle __halt(const bool irq_disabled)
{
	struct tdx_module_args args = {
		.r10 = TDX_HYPERCALL_STANDARD,
		.r11 = hcall_func(EXIT_REASON_HLT),
		.r12 = irq_disabled,
	};

	/*
	 * Emulate HLT operation via hypercall. More info about ABI
	 * can be found in TDX Guest-Host-Communication Interface
	 * (GHCI), section 3.8 TDG.VP.VMCALL<Instruction.HLT>.
	 *
	 * The VMM uses the "IRQ disabled" param to understand IRQ
	 * enabled status (RFLAGS.IF) of the TD guest and to determine
	 * whether or not it should schedule the halted vCPU if an
	 * IRQ becomes pending. E.g. if IRQs are disabled, the VMM
	 * can keep the vCPU in virtual HLT, even if an IRQ is
	 * pending, without hanging/breaking the guest.
	 */
	return __tdx_hypercall(&args);
}

static int handle_halt(struct ve_info *ve)
{
	const bool irq_disabled = irqs_disabled();

	/*
	 * HLT with IRQs enabled is unsafe, as an IRQ that is intended to be a
	 * wake event may be consumed before requesting HLT emulation, leaving
	 * the vCPU blocking indefinitely.
	 */
	if (WARN_ONCE(!irq_disabled, "HLT emulation with IRQs enabled"))
		return -EIO;

	if (__halt(irq_disabled))
		return -EIO;

	return ve_instr_len(ve);
}

void __cpuidle tdx_halt(void)
{
	const bool irq_disabled = false;

	/*
	 * Use WARN_ONCE() to report the failure.
	 */
	if (__halt(irq_disabled))
		WARN_ONCE(1, "HLT instruction emulation failed\n");
}

static void __cpuidle tdx_safe_halt(void)
{
	tdx_halt();
	/*
	 * "__cpuidle" section doesn't support instrumentation, so stick
	 * with raw_* variant that avoids tracing hooks.
	 */
	raw_local_irq_enable();
}

static int read_msr(struct pt_regs *regs, struct ve_info *ve)
{
	struct tdx_module_args args = {
		.r10 = TDX_HYPERCALL_STANDARD,
		.r11 = hcall_func(EXIT_REASON_MSR_READ),
		.r12 = regs->cx,
	};

	/*
	 * Emulate the MSR read via hypercall. More info about ABI
	 * can be found in TDX Guest-Host-Communication Interface
	 * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
	 */
	if (__tdx_hypercall(&args))
		return -EIO;

	regs->ax = lower_32_bits(args.r11);
	regs->dx = upper_32_bits(args.r11);
	return ve_instr_len(ve);
}

static int write_msr(struct pt_regs *regs, struct ve_info *ve)
{
	struct tdx_module_args args = {
		.r10 = TDX_HYPERCALL_STANDARD,
		.r11 = hcall_func(EXIT_REASON_MSR_WRITE),
		.r12 = regs->cx,
		.r13 = (u64)regs->dx << 32 | regs->ax,
	};

	/*
	 * Emulate the MSR write via hypercall. More info about ABI
	 * can be found in TDX Guest-Host-Communication Interface
	 * (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
	 */
	if (__tdx_hypercall(&args))
		return -EIO;

	return ve_instr_len(ve);
}

static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
{
	struct tdx_module_args args = {
		.r10 = TDX_HYPERCALL_STANDARD,
		.r11 = hcall_func(EXIT_REASON_CPUID),
		.r12 = regs->ax,
		.r13 = regs->cx,
	};

	/*
	 * Only allow VMM to control range reserved for hypervisor
	 * communication.
	 *
	 * Return all-zeros for any CPUID outside the range. It matches CPU
	 * behaviour for a non-supported leaf.
	 */
	if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) {
		regs->ax = regs->bx = regs->cx = regs->dx = 0;
		return ve_instr_len(ve);
	}

	/*
	 * Emulate the CPUID instruction via a hypercall. More info about
	 * ABI can be found in TDX Guest-Host-Communication Interface
	 * (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
	 */
	if (__tdx_hypercall(&args))
		return -EIO;

	/*
	 * As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of
	 * EAX, EBX, ECX, EDX registers after the CPUID instruction execution.
	 * So copy the register contents back to pt_regs.
	 */
	regs->ax = args.r12;
	regs->bx = args.r13;
	regs->cx = args.r14;
	regs->dx = args.r15;

	return ve_instr_len(ve);
}

static bool mmio_read(int size, unsigned long addr, unsigned long *val)
{
	struct tdx_module_args args = {
		.r10 = TDX_HYPERCALL_STANDARD,
		.r11 = hcall_func(EXIT_REASON_EPT_VIOLATION),
		.r12 = size,
		.r13 = EPT_READ,
		.r14 = addr,
	};

	if (__tdx_hypercall(&args))
		return false;

	*val = args.r11;
	return true;
}

static bool mmio_write(int size, unsigned long addr, unsigned long val)
{
	return !_tdx_hypercall(hcall_func(EXIT_REASON_EPT_VIOLATION), size,
			       EPT_WRITE, addr, val);
}

static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
{
	unsigned long *reg, val, vaddr;
	char buffer[MAX_INSN_SIZE];
	enum insn_mmio_type mmio;
	struct insn insn = {};
	int size, extend_size;
	u8 extend_val = 0;

	/* Only in-kernel MMIO is supported */
	if (WARN_ON_ONCE(user_mode(regs)))
		return -EFAULT;

	if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE))
		return -EFAULT;

	if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64))
		return -EINVAL;

	mmio = insn_decode_mmio(&insn, &size);
	if (WARN_ON_ONCE(mmio == INSN_MMIO_DECODE_FAILED))
		return -EINVAL;

	if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
		reg = insn_get_modrm_reg_ptr(&insn, regs);
		if (!reg)
			return -EINVAL;
	}

	if (!fault_in_kernel_space(ve->gla)) {
		WARN_ONCE(1, "Access to userspace address is not supported");
		return -EINVAL;
	}

	/*
	 * Reject EPT violation #VEs that split pages.
	 *
	 * MMIO accesses are supposed to be naturally aligned and therefore
	 * never cross page boundaries. Seeing split page accesses indicates
	 * a bug or a load_unaligned_zeropad() that stepped into an MMIO page.
	 *
	 * load_unaligned_zeropad() will recover using exception fixups.
	 */
	vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
	if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE)
		return -EFAULT;

	/* Handle writes first */
	switch (mmio) {
	case INSN_MMIO_WRITE:
		memcpy(&val, reg, size);
		if (!mmio_write(size, ve->gpa, val))
			return -EIO;
		return insn.length;
	case INSN_MMIO_WRITE_IMM:
		val = insn.immediate.value;
		if (!mmio_write(size, ve->gpa, val))
			return -EIO;
		return insn.length;
	case INSN_MMIO_READ:
	case INSN_MMIO_READ_ZERO_EXTEND:
	case INSN_MMIO_READ_SIGN_EXTEND:
		/* Reads are handled below */
		break;
	case INSN_MMIO_MOVS:
	case INSN_MMIO_DECODE_FAILED:
		/*
		 * MMIO was accessed with an instruction that could not be
		 * decoded or handled properly. It was likely not using io.h
		 * helpers or accessed MMIO accidentally.
		 */
		return -EINVAL;
	default:
		WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?");
		return -EINVAL;
	}

	/* Handle reads */
	if (!mmio_read(size, ve->gpa, &val))
		return -EIO;

	switch (mmio) {
	case INSN_MMIO_READ:
		/* Zero-extend for 32-bit operation */
		extend_size = size == 4 ? sizeof(*reg) : 0;
		break;
	case INSN_MMIO_READ_ZERO_EXTEND:
		/* Zero extend based on operand size */
		extend_size = insn.opnd_bytes;
		break;
	case INSN_MMIO_READ_SIGN_EXTEND:
		/* Sign extend based on operand size */
		extend_size = insn.opnd_bytes;
		if (size == 1 && val & BIT(7))
			extend_val = 0xFF;
		else if (size > 1 && val & BIT(15))
			extend_val = 0xFF;
		break;
	default:
		/* All other cases have to be covered with the first switch() */
		WARN_ON_ONCE(1);
		return -EINVAL;
	}

	if (extend_size)
		memset(reg, extend_val, extend_size);
	memcpy(reg, &val, size);
	return insn.length;
}
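
/*
 * Illustrative sketch, not part of this file: MMIO in TDX guests is expected
 * to go through the io.h helpers so that the faulting instruction is a plain
 * MOV which insn_decode_mmio() above can handle:
 *
 *	void __iomem *base = ioremap(phys_addr, len);
 *	u32 v = readl(base + 0x10);		// decodes as INSN_MMIO_READ
 *	writel(v | BIT(0), base + 0x10);	// decodes as INSN_MMIO_WRITE
 *
 * memcpy() or other string operations against MMIO typically compile into
 * REP MOVS, which decodes as INSN_MMIO_MOVS and is rejected above. phys_addr,
 * len and the 0x10 offset are only placeholders.
 */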

static bool handle_in(struct pt_regs *regs, int size, int port)
{
	struct tdx_module_args args = {
		.r10 = TDX_HYPERCALL_STANDARD,
		.r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION),
		.r12 = size,
		.r13 = PORT_READ,
		.r14 = port,
	};
	u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
	bool success;

	/*
	 * Emulate the I/O read via hypercall. More info about ABI can be found
	 * in TDX Guest-Host-Communication Interface (GHCI) section titled
	 * "TDG.VP.VMCALL<Instruction.IO>".
	 */
	success = !__tdx_hypercall(&args);

	/* Update part of the register affected by the emulated instruction */
	regs->ax &= ~mask;
	if (success)
		regs->ax |= args.r11 & mask;

	return success;
}

static bool handle_out(struct pt_regs *regs, int size, int port)
{
	u64 mask = GENMASK(BITS_PER_BYTE * size, 0);

	/*
	 * Emulate the I/O write via hypercall. More info about ABI can be found
	 * in TDX Guest-Host-Communication Interface (GHCI) section titled
	 * "TDG.VP.VMCALL<Instruction.IO>".
	 */
	return !_tdx_hypercall(hcall_func(EXIT_REASON_IO_INSTRUCTION), size,
			       PORT_WRITE, port, regs->ax & mask);
}

/*
 * Emulate I/O using hypercall.
 *
 * Assumes the IO instruction was using ax, which is enforced
 * by the standard io.h macros.
 *
 * Return the number of bytes to advance RIP on success or -errno on failure.
 */
static int handle_io(struct pt_regs *regs, struct ve_info *ve)
{
	u32 exit_qual = ve->exit_qual;
	int size, port;
	bool in, ret;

	if (VE_IS_IO_STRING(exit_qual))
		return -EIO;

	in   = VE_IS_IO_IN(exit_qual);
	size = VE_GET_IO_SIZE(exit_qual);
	port = VE_GET_PORT_NUM(exit_qual);

	if (in)
		ret = handle_in(regs, size, port);
	else
		ret = handle_out(regs, size, port);
	if (!ret)
		return -EIO;

	return ve_instr_len(ve);
}
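
/*
 * Worked example of the exit qualification decode above (illustrative
 * values): an "outb %al, %dx" with DX == 0x3f8 (a legacy serial port) that
 * triggers a #VE arrives with exit_qual == 0x03f80000: bits 2:0 == 0
 * (1-byte access), bit 3 == 0 (OUT), bit 4 == 0 (not a string op) and
 * bits 31:16 == 0x03f8 (port number). VE_GET_IO_SIZE() therefore returns 1,
 * VE_IS_IO_IN() is false and VE_GET_PORT_NUM() returns 0x3f8.
 */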

/*
 * Early #VE exception handler. Only handles a subset of port I/O.
 * Intended only for earlyprintk. If it fails, return false.
 */
__init bool tdx_early_handle_ve(struct pt_regs *regs)
{
	struct ve_info ve;
	int insn_len;

	tdx_get_ve_info(&ve);

	if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION)
		return false;

	insn_len = handle_io(regs, &ve);
	if (insn_len < 0)
		return false;

	regs->ip += insn_len;
	return true;
}

void tdx_get_ve_info(struct ve_info *ve)
{
	struct tdx_module_args args = {};

	/*
	 * Called during #VE handling to retrieve the #VE info from the
	 * TDX module.
	 *
	 * This has to be called early in #VE handling. A "nested" #VE which
	 * occurs before this will raise a #DF and is not recoverable.
	 *
	 * The call retrieves the #VE info from the TDX module, which also
	 * clears the "#VE valid" flag. This must be done before anything else
	 * because any #VE that occurs while the valid flag is set will lead to
	 * #DF.
	 *
	 * Note, the TDX module treats virtual NMIs as inhibited if the #VE
	 * valid flag is set. It means that NMI=>#VE will not result in a #DF.
	 */
	tdcall(TDG_VP_VEINFO_GET, &args);

	/* Transfer the output parameters */
	ve->exit_reason = args.rcx;
	ve->exit_qual   = args.rdx;
	ve->gla         = args.r8;
	ve->gpa         = args.r9;
	ve->instr_len   = lower_32_bits(args.r10);
	ve->instr_info  = upper_32_bits(args.r10);
}

/*
 * Handle the user-initiated #VE.
 *
 * On success, returns the number of bytes RIP should be incremented (>=0)
 * or -errno on error.
 */
static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
{
	switch (ve->exit_reason) {
	case EXIT_REASON_CPUID:
		return handle_cpuid(regs, ve);
	default:
		pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
		return -EIO;
	}
}

static inline bool is_private_gpa(u64 gpa)
{
	return gpa == cc_mkenc(gpa);
}

/*
 * Handle the kernel #VE.
 *
 * On success, returns the number of bytes RIP should be incremented (>=0)
 * or -errno on error.
 */
static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
{
	switch (ve->exit_reason) {
	case EXIT_REASON_HLT:
		return handle_halt(ve);
	case EXIT_REASON_MSR_READ:
		return read_msr(regs, ve);
	case EXIT_REASON_MSR_WRITE:
		return write_msr(regs, ve);
	case EXIT_REASON_CPUID:
		return handle_cpuid(regs, ve);
	case EXIT_REASON_EPT_VIOLATION:
		if (is_private_gpa(ve->gpa))
			panic("Unexpected EPT-violation on private memory.");
		return handle_mmio(regs, ve);
	case EXIT_REASON_IO_INSTRUCTION:
		return handle_io(regs, ve);
	default:
		pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
		return -EIO;
	}
}

bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
{
	int insn_len;

	if (user_mode(regs))
		insn_len = virt_exception_user(regs, ve);
	else
		insn_len = virt_exception_kernel(regs, ve);
	if (insn_len < 0)
		return false;

	/* After successful #VE handling, move the IP */
	regs->ip += insn_len;

	return true;
}
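
/*
 * Simplified sketch, not part of this file, of how the handlers above are
 * reached: the #VE IDT entry in traps.c (exc_virtualization_exception)
 * latches the #VE details first, which also re-arms #VE delivery, and only
 * then dispatches:
 *
 *	struct ve_info ve;
 *
 *	tdx_get_ve_info(&ve);
 *	if (!tdx_handle_virt_exception(regs, &ve))
 *		ve_raise_fault(regs, 0, ve.gla);	// treated like #GP(0)
 *
 * Interrupt enable/disable around the dispatch is omitted here.
 */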

static bool tdx_tlb_flush_required(bool private)
{
	/*
	 * TDX guest is responsible for flushing TLB on private->shared
	 * transition. VMM is responsible for flushing on shared->private.
	 *
	 * The VMM _can't_ flush private addresses as it can't generate PAs
	 * with the guest's HKID. Shared memory isn't subject to integrity
	 * checking, i.e. the VMM doesn't need to flush for its own protection.
	 *
	 * There's no need to flush when converting from shared to private,
	 * as flushing is the VMM's responsibility in this case, e.g. it must
	 * flush to avoid integrity failures in the face of a buggy or
	 * malicious guest.
	 */
	return !private;
}

static bool tdx_cache_flush_required(void)
{
	/*
	 * AMD SME/SEV can avoid cache flushing if HW enforces cache coherence.
	 * TDX doesn't have such capability.
	 *
	 * Flush cache unconditionally.
	 */
	return true;
}

/*
 * Notify the VMM about page mapping conversion. More info about ABI
 * can be found in TDX Guest-Host-Communication Interface (GHCI),
 * section "TDG.VP.VMCALL<MapGPA>".
 */
static bool tdx_map_gpa(phys_addr_t start, phys_addr_t end, bool enc)
{
	/* Retrying the hypercall a second time should succeed; use 3 just in case */
	const int max_retries_per_page = 3;
	int retry_count = 0;

	if (!enc) {
		/* Set the shared (decrypted) bits: */
		start |= cc_mkdec(0);
		end   |= cc_mkdec(0);
	}

	while (retry_count < max_retries_per_page) {
		struct tdx_module_args args = {
			.r10 = TDX_HYPERCALL_STANDARD,
			.r11 = TDVMCALL_MAP_GPA,
			.r12 = start,
			.r13 = end - start };

		u64 map_fail_paddr;
		u64 ret = __tdx_hypercall(&args);

		if (ret != TDVMCALL_STATUS_RETRY)
			return !ret;
		/*
		 * The guest must retry the operation for the pages in the
		 * region starting at the GPA specified in R11. R11 comes
		 * from the untrusted VMM. Sanity check it.
		 */
		map_fail_paddr = args.r11;
		if (map_fail_paddr < start || map_fail_paddr >= end)
			return false;

		/* "Consume" a retry without forward progress */
		if (map_fail_paddr == start) {
			retry_count++;
			continue;
		}

		start = map_fail_paddr;
		retry_count = 0;
	}

	return false;
}

/*
 * Inform the VMM of the guest's intent for this physical page: shared with
 * the VMM or private to the guest. The VMM is expected to change its mapping
 * of the page in response.
 */
static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
{
	phys_addr_t start = __pa(vaddr);
	phys_addr_t end   = __pa(vaddr + numpages * PAGE_SIZE);

	if (!tdx_map_gpa(start, end, enc))
		return false;

	/* shared->private conversion requires memory to be accepted before use */
	if (enc)
		return tdx_accept_memory(start, end);

	return true;
}

static int tdx_enc_status_change_prepare(unsigned long vaddr, int numpages,
					 bool enc)
{
	/*
	 * Only handle shared->private conversion here.
	 * See the comment in tdx_early_init().
	 */
	if (enc && !tdx_enc_status_changed(vaddr, numpages, enc))
		return -EIO;

	return 0;
}

static int tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
					bool enc)
{
	/*
	 * Only handle private->shared conversion here.
	 * See the comment in tdx_early_init().
	 */
	if (!enc && !tdx_enc_status_changed(vaddr, numpages, enc))
		return -EIO;

	if (enc)
		atomic_long_sub(numpages, &nr_shared);
	else
		atomic_long_add(numpages, &nr_shared);

	return 0;
}
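
/*
 * Illustrative sketch, not part of this file: drivers do not call
 * tdx_enc_status_changed() directly. Memory is shared with the VMM through
 * the generic set_memory helpers, which invoke the callbacks registered in
 * tdx_early_init() below:
 *
 *	vaddr = (unsigned long)page_address(page);
 *	if (set_memory_decrypted(vaddr, nr_pages))	// private -> shared
 *		goto err;
 *	...
 *	set_memory_encrypted(vaddr, nr_pages);		// back to private
 *
 * "page" and "nr_pages" are assumed caller-owned; error handling is elided.
 */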

/* Stop new private<->shared conversions */
static void tdx_kexec_begin(void)
{
	if (!IS_ENABLED(CONFIG_KEXEC_CORE))
		return;

	/*
	 * Crash kernel reaches here with interrupts disabled: can't wait for
	 * conversions to finish.
	 *
	 * If a race happened, just report and proceed.
	 */
	if (!set_memory_enc_stop_conversion())
		pr_warn("Failed to stop shared<->private conversions\n");
}

/* Walk direct mapping and convert all shared memory back to private */
static void tdx_kexec_finish(void)
{
	unsigned long addr, end;
	long found = 0, shared;

	if (!IS_ENABLED(CONFIG_KEXEC_CORE))
		return;

	lockdep_assert_irqs_disabled();

	addr = PAGE_OFFSET;
	end  = PAGE_OFFSET + get_max_mapped();

	while (addr < end) {
		unsigned long size;
		unsigned int level;
		pte_t *pte;

		pte = lookup_address(addr, &level);
		size = page_level_size(level);

		if (pte && pte_decrypted(*pte)) {
			int pages = size / PAGE_SIZE;

			/*
			 * Touching memory with shared bit set triggers implicit
			 * conversion to shared.
			 *
			 * Make sure nobody touches the shared range from
			 * now on.
			 */
			set_pte(pte, __pte(0));

			/*
			 * Memory encryption state persists across kexec.
			 * If tdx_enc_status_changed() fails in the first
			 * kernel, it leaves memory in an unknown state.
			 *
			 * If that memory remains shared, accessing it in the
			 * *next* kernel through a private mapping will result
			 * in an unrecoverable guest shutdown.
			 *
			 * The kdump kernel boot is not impacted as it uses
			 * a pre-reserved memory range that is always private.
			 * However, gathering crash information could lead to
			 * a crash if it accesses unconverted memory through
			 * a private mapping which is possible when accessing
			 * that memory through /proc/vmcore, for example.
			 *
			 * In all cases, print error info in order to leave
			 * enough bread crumbs for debugging.
			 */
			if (!tdx_enc_status_changed(addr, pages, true)) {
				pr_err("Failed to unshare range %#lx-%#lx\n",
				       addr, addr + size);
			}

			found += pages;
		}

		addr += size;
	}

	__flush_tlb_all();

	shared = atomic_long_read(&nr_shared);
	if (shared != found) {
		pr_err("shared page accounting is off\n");
		pr_err("nr_shared = %ld, nr_found = %ld\n", shared, found);
	}
}

static __init void tdx_announce(void)
{
	struct tdx_module_args args = {};
	u64 controls;

	pr_info("Guest detected\n");

	tdcall(TDG_VP_INFO, &args);
	tdx_dump_attributes(args.rdx);

	tdg_vm_rd(TDCS_TD_CTLS, &controls);
	tdx_dump_td_ctls(controls);
}
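
/*
 * Illustrative note: TDX presence is detected below via CPUID leaf 0x21
 * (TDX_CPUID_LEAF_ID). Inside a TD, that leaf reports the TDX_IDENT
 * signature ("IntelTDX    ") in EBX, EDX, ECX, which is why the signature
 * registers are gathered into sig[] in EBX, EDX, ECX order before the
 * memcmp().
 */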

void __init tdx_early_init(void)
{
	u64 cc_mask;
	u32 eax, sig[3];

	cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]);

	if (memcmp(TDX_IDENT, sig, sizeof(sig)))
		return;

	setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);

	/* TSC is the only reliable clock in TDX guest */
	setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);

	cc_vendor = CC_VENDOR_INTEL;

	/* Configure the TD */
	tdx_setup(&cc_mask);

	cc_set_mask(cc_mask);

	/*
	 * All bits above GPA width are reserved and the kernel treats the
	 * shared bit as a flag, not as part of the physical address.
	 *
	 * Adjust physical mask to only cover valid GPA bits.
	 */
	physical_mask &= cc_mask - 1;

	/*
	 * The kernel mapping should match the TDX metadata for the page.
	 * load_unaligned_zeropad() can touch memory *adjacent* to that which is
	 * owned by the caller and can catch even _momentary_ mismatches. Bad
	 * things happen on mismatch:
	 *
	 *   - Private mapping => Shared Page  == Guest shutdown
	 *   - Shared mapping  => Private Page == Recoverable #VE
	 *
	 * guest.enc_status_change_prepare() converts the page from
	 * shared=>private before the mapping becomes private.
	 *
	 * guest.enc_status_change_finish() converts the page from
	 * private=>shared after the mapping becomes shared.
	 *
	 * In both cases there is a temporary shared mapping to a private page,
	 * which can result in a #VE. But, there is never a private mapping to
	 * a shared page.
	 */
	x86_platform.guest.enc_status_change_prepare = tdx_enc_status_change_prepare;
	x86_platform.guest.enc_status_change_finish  = tdx_enc_status_change_finish;

	x86_platform.guest.enc_cache_flush_required  = tdx_cache_flush_required;
	x86_platform.guest.enc_tlb_flush_required    = tdx_tlb_flush_required;

	x86_platform.guest.enc_kexec_begin  = tdx_kexec_begin;
	x86_platform.guest.enc_kexec_finish = tdx_kexec_finish;

	/*
	 * Avoid "sti;hlt" execution in TDX guests as HLT induces a #VE that
	 * will enable interrupts before HLT TDCALL invocation if executed
	 * in STI-shadow, possibly resulting in missed wakeup events.
	 *
	 * Modify all possible HLT execution paths to use TDX specific routines
	 * that directly execute TDCALL and toggle the interrupt state as
	 * needed after TDCALL completion. This also reduces HLT related #VEs
	 * in addition to having a reliable halt logic execution.
	 */
	pv_ops.irq.safe_halt = tdx_safe_halt;
	pv_ops.irq.halt = tdx_halt;

	/*
	 * TDX intercepts the RDMSR to read the X2APIC ID in the parallel
	 * bringup low level code. That raises #VE which cannot be handled
	 * there.
	 *
	 * Intel-TDX has a secure RDMSR hypercall, but that needs to be
	 * implemented separately in the low level startup ASM code.
	 * Until that is in place, disable parallel bringup for TDX.
	 */
	x86_cpuinit.parallel_bringup = false;

	tdx_announce();
}