// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) 2021-2022 Intel Corporation */

#undef pr_fmt
#define pr_fmt(fmt)     "tdx: " fmt

#include <linux/cpufeature.h>
#include <linux/export.h>
#include <linux/io.h>
#include <linux/kexec.h>
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
#include <asm/ia32.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/pgtable.h>
#include <asm/set_memory.h>

/* MMIO direction */
#define EPT_READ        0
#define EPT_WRITE       1

/* Port I/O direction */
#define PORT_READ       0
#define PORT_WRITE      1

/* See Exit Qualification for I/O Instructions in VMX documentation */
#define VE_IS_IO_IN(e)          ((e) & BIT(3))
#define VE_GET_IO_SIZE(e)       (((e) & GENMASK(2, 0)) + 1)
#define VE_GET_PORT_NUM(e)      ((e) >> 16)
#define VE_IS_IO_STRING(e)      ((e) & BIT(4))
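
/*
 * Worked example (illustrative): a 1-byte IN from port 0x3f8 (the
 * earlyprintk serial port) yields exit_qual == 0x03f80008:
 *
 *      VE_GET_IO_SIZE()  == 1          (bits 2:0 hold size - 1)
 *      VE_IS_IO_IN()     != 0          (bit 3 set: it is an IN)
 *      VE_IS_IO_STRING() == 0          (bit 4 clear: not INS/OUTS)
 *      VE_GET_PORT_NUM() == 0x3f8      (bits 31:16 hold the port)
 */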

#define ATTR_DEBUG              BIT(0)
#define ATTR_SEPT_VE_DISABLE    BIT(28)

/* TDX Module call error codes */
#define TDCALL_RETURN_CODE(a)   ((a) >> 32)
#define TDCALL_INVALID_OPERAND  0xc0000100
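
/*
 * Example (illustrative): a raw TDCALL status of 0xc000010000000000
 * carries its error class in the upper 32 bits, so
 * TDCALL_RETURN_CODE() of that value equals TDCALL_INVALID_OPERAND.
 */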

#define TDREPORT_SUBTYPE_0      0

static atomic_long_t nr_shared;

/* Called from __tdx_hypercall() for unrecoverable failure */
noinstr void __noreturn __tdx_hypercall_failed(void)
{
        instrumentation_begin();
        panic("TDVMCALL failed. TDX module bug?");
}

#ifdef CONFIG_KVM_GUEST
long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2,
                       unsigned long p3, unsigned long p4)
{
        struct tdx_module_args args = {
                .r10 = nr,
                .r11 = p1,
                .r12 = p2,
                .r13 = p3,
                .r14 = p4,
        };

        return __tdx_hypercall(&args);
}
EXPORT_SYMBOL_GPL(tdx_kvm_hypercall);
#endif
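
/*
 * Usage sketch (hypothetical caller, for illustration only): a KVM
 * paravirt feature taking two arguments would pass zero for the unused
 * parameters:
 *
 *      long ret = tdx_kvm_hypercall(nr, arg1, arg2, 0, 0);
 *
 *      if (ret)
 *              pr_warn("hypercall %u failed: %ld\n", nr, ret);
 */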

/*
 * Used for TDX guests to make calls directly to the TD module.  This
 * should only be used for calls that have no legitimate reason to fail
 * or where the kernel can not survive the call failing.
 */
static inline void tdcall(u64 fn, struct tdx_module_args *args)
{
        if (__tdcall_ret(fn, args))
                panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
}

/**
 * tdx_mcall_get_report0() - Wrapper to get TDREPORT0 (a.k.a. TDREPORT
 *                           subtype 0) using TDG.MR.REPORT TDCALL.
 * @reportdata: Address of the input buffer which contains user-defined
 *              REPORTDATA to be included into TDREPORT.
 * @tdreport: Address of the output buffer to store TDREPORT.
 *
 * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module
 * v1.0 specification for more information on TDG.MR.REPORT TDCALL.
 * It is used in the TDX guest driver module to get the TDREPORT0.
 *
 * Return 0 on success, -EINVAL for invalid operands, or -EIO on
 * other TDCALL failures.
 */
int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
{
        struct tdx_module_args args = {
                .rcx = virt_to_phys(tdreport),
                .rdx = virt_to_phys(reportdata),
                .r8 = TDREPORT_SUBTYPE_0,
        };
        u64 ret;

        ret = __tdcall(TDG_MR_REPORT, &args);
        if (ret) {
                if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
                        return -EINVAL;
                return -EIO;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(tdx_mcall_get_report0);
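
/*
 * Minimal usage sketch (hypothetical caller; the in-tree user is the TDX
 * guest driver). TDX_REPORTDATA_LEN (64) and TDX_REPORT_LEN (1024) come
 * from <uapi/linux/tdx-guest.h>; the TDX module wants naturally aligned
 * buffers, which power-of-two kmalloc() allocations provide:
 *
 *      u8 *reportdata = kzalloc(TDX_REPORTDATA_LEN, GFP_KERNEL);
 *      u8 *tdreport = kzalloc(TDX_REPORT_LEN, GFP_KERNEL);
 *      int ret = -ENOMEM;
 *
 *      if (reportdata && tdreport)
 *              ret = tdx_mcall_get_report0(reportdata, tdreport);
 */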

/**
 * tdx_hcall_get_quote() - Wrapper to request TD Quote using GetQuote
 *                         hypercall.
 * @buf: Address of the directly mapped shared kernel buffer which
 *       contains TDREPORT. The same buffer will be used by VMM to
 *       store the generated TD Quote output.
 * @size: size of the tdquote buffer (4KB-aligned).
 *
 * Refer to section titled "TDG.VP.VMCALL<GetQuote>" in the TDX GHCI
 * v1.0 specification for more information on GetQuote hypercall.
 * It is used in the TDX guest driver module to get the TD Quote.
 *
 * Return 0 on success or error code on failure.
 */
u64 tdx_hcall_get_quote(u8 *buf, size_t size)
{
        /* Since buf is a shared memory, set the shared (decrypted) bits */
        return _tdx_hypercall(TDVMCALL_GET_QUOTE, cc_mkdec(virt_to_phys(buf)), size, 0, 0);
}
EXPORT_SYMBOL_GPL(tdx_hcall_get_quote);
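
/*
 * Usage sketch (hypothetical; shown only to illustrate the shared-buffer
 * requirement): the buffer must be in shared (decrypted) memory before
 * the VMM can write the Quote into it, e.g.:
 *
 *      buf = (u8 *)__get_free_pages(GFP_KERNEL, get_order(size));
 *      if (!buf)
 *              return -ENOMEM;
 *      if (set_memory_decrypted((unsigned long)buf, 1 << get_order(size)))
 *              goto err_free;
 *      ret = tdx_hcall_get_quote(buf, size);
 *
 * Per the GHCI GetQuote definition, completion is reported
 * asynchronously via the status field in the shared buffer.
 */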

static void __noreturn tdx_panic(const char *msg)
{
        struct tdx_module_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = TDVMCALL_REPORT_FATAL_ERROR,
                .r12 = 0, /* Error code: 0 is Panic */
        };
        union {
                /* Define register order according to the GHCI */
                struct { u64 r14, r15, rbx, rdi, rsi, r8, r9, rdx; };

                char str[64];
        } message;

        /* VMM assumes '\0' in byte 65, if the message took all 64 bytes */
        strtomem_pad(message.str, msg, '\0');

        args.r8  = message.r8;
        args.r9  = message.r9;
        args.r14 = message.r14;
        args.r15 = message.r15;
        args.rdi = message.rdi;
        args.rsi = message.rsi;
        args.rbx = message.rbx;
        args.rdx = message.rdx;

        /*
         * This hypercall should never return and it is not safe
         * to keep the guest running. Call it forever if it
         * happens to return.
         */
        while (1)
                __tdx_hypercall(&args);
}

static void tdx_parse_tdinfo(u64 *cc_mask)
{
        struct tdx_module_args args = {};
        unsigned int gpa_width;
        u64 td_attr;

        /*
         * TDINFO TDX module call is used to get the TD execution environment
         * information like GPA width, number of available vcpus, debug mode
         * information, etc. More details about the ABI can be found in TDX
         * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL
         * [TDG.VP.INFO].
         */
        tdcall(TDG_VP_INFO, &args);

        /*
         * The highest bit of a guest physical address is the "sharing" bit.
         * Set it for shared pages and clear it for private pages.
         *
         * The GPA width that comes out of this call is critical. TDX guests
         * can not meaningfully run without it.
         */
        gpa_width = args.rcx & GENMASK(5, 0);
        *cc_mask = BIT_ULL(gpa_width - 1);
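
        /*
         * Example (illustrative): with gpa_width == 52 the shared bit is
         * GPA bit 51, so cc_mask == BIT_ULL(51) and a shared mapping of
         * physical address 0x1000 uses GPA 0x0008000000001000.
         */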

        /*
         * The kernel can not handle #VE's when accessing normal kernel
         * memory. Ensure that no #VE will be delivered for accesses to
         * TD-private memory. Only VMM-shared memory (MMIO) will #VE.
         */
        td_attr = args.rdx;
        if (!(td_attr & ATTR_SEPT_VE_DISABLE)) {
                const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set.";

                /* Relax SEPT_VE_DISABLE check for debug TD. */
                if (td_attr & ATTR_DEBUG)
                        pr_warn("%s\n", msg);
                else
                        tdx_panic(msg);
        }
}

/*
 * The TDX module spec states that #VE may be injected for a limited set of
 * reasons:
 *
 *  - Emulation of the architectural #VE injection on EPT violation;
 *
 *  - As a result of guest TD execution of a disallowed instruction,
 *    a disallowed MSR access, or CPUID virtualization;
 *
 *  - A notification to the guest TD about anomalous behavior;
 *
 * The last one is opt-in and is not used by the kernel.
 *
 * The Intel Software Developer's Manual describes cases when the instruction
 * length field can be used in the section "Information for VM Exits Due to
 * Instruction Execution".
 *
 * For TDX, it ultimately means GET_VEINFO provides reliable instruction length
 * information if #VE occurred due to instruction execution, but not for EPT
 * violations.
 */
static int ve_instr_len(struct ve_info *ve)
{
        switch (ve->exit_reason) {
        case EXIT_REASON_HLT:
        case EXIT_REASON_MSR_READ:
        case EXIT_REASON_MSR_WRITE:
        case EXIT_REASON_CPUID:
        case EXIT_REASON_IO_INSTRUCTION:
                /* It is safe to use ve->instr_len for #VEs due to instruction execution */
                return ve->instr_len;
        case EXIT_REASON_EPT_VIOLATION:
                /*
                 * For EPT violations, ve->instr_len is not defined. For those,
                 * the kernel must decode instructions manually and should not
                 * be using this function.
                 */
                WARN_ONCE(1, "ve->instr_len is not defined for EPT violations");
                return 0;
        default:
                WARN_ONCE(1, "Unexpected #VE-type: %lld\n", ve->exit_reason);
                return ve->instr_len;
        }
}

static u64 __cpuidle __halt(const bool irq_disabled)
{
        struct tdx_module_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = hcall_func(EXIT_REASON_HLT),
                .r12 = irq_disabled,
        };

        /*
         * Emulate HLT operation via hypercall. More info about ABI
         * can be found in TDX Guest-Host-Communication Interface
         * (GHCI), section 3.8 TDG.VP.VMCALL<Instruction.HLT>.
         *
         * The VMM uses the "IRQ disabled" param to understand IRQ
         * enabled status (RFLAGS.IF) of the TD guest and to determine
         * whether or not it should schedule the halted vCPU if an
         * IRQ becomes pending. E.g. if IRQs are disabled, the VMM
         * can keep the vCPU in virtual HLT, even if an IRQ is
         * pending, without hanging/breaking the guest.
         */
        return __tdx_hypercall(&args);
}

static int handle_halt(struct ve_info *ve)
{
        const bool irq_disabled = irqs_disabled();

        if (__halt(irq_disabled))
                return -EIO;

        return ve_instr_len(ve);
}

void __cpuidle tdx_safe_halt(void)
{
        const bool irq_disabled = false;

        /*
         * Use WARN_ONCE() to report the failure.
         */
        if (__halt(irq_disabled))
                WARN_ONCE(1, "HLT instruction emulation failed\n");
}

static int read_msr(struct pt_regs *regs, struct ve_info *ve)
{
        struct tdx_module_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = hcall_func(EXIT_REASON_MSR_READ),
                .r12 = regs->cx,
        };

        /*
         * Emulate the MSR read via hypercall. More info about ABI
         * can be found in TDX Guest-Host-Communication Interface
         * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
         */
        if (__tdx_hypercall(&args))
                return -EIO;

        regs->ax = lower_32_bits(args.r11);
        regs->dx = upper_32_bits(args.r11);
        return ve_instr_len(ve);
}

static int write_msr(struct pt_regs *regs, struct ve_info *ve)
{
        struct tdx_module_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = hcall_func(EXIT_REASON_MSR_WRITE),
                .r12 = regs->cx,
                .r13 = (u64)regs->dx << 32 | regs->ax,
        };

        /*
         * Emulate the MSR write via hypercall. More info about ABI
         * can be found in TDX Guest-Host-Communication Interface
         * (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
         */
        if (__tdx_hypercall(&args))
                return -EIO;

        return ve_instr_len(ve);
}

static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
{
        struct tdx_module_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = hcall_func(EXIT_REASON_CPUID),
                .r12 = regs->ax,
                .r13 = regs->cx,
        };

        /*
         * Only allow VMM to control range reserved for hypervisor
         * communication.
         *
         * Return all-zeros for any CPUID outside the range. This matches
         * CPU behaviour for an unsupported leaf.
         */
        if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) {
                regs->ax = regs->bx = regs->cx = regs->dx = 0;
                return ve_instr_len(ve);
        }

        /*
         * Emulate the CPUID instruction via a hypercall. More info about
         * ABI can be found in TDX Guest-Host-Communication Interface
         * (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
         */
        if (__tdx_hypercall(&args))
                return -EIO;

        /*
         * As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of
         * EAX, EBX, ECX, EDX registers after the CPUID instruction execution.
         * So copy the register contents back to pt_regs.
         */
        regs->ax = args.r12;
        regs->bx = args.r13;
        regs->cx = args.r14;
        regs->dx = args.r15;

        return ve_instr_len(ve);
}
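
/*
 * Example (illustrative): a CPUID leaf inside the hypervisor range,
 * e.g. 0x40000000, is forwarded to the VMM and the VMM-provided
 * r12..r15 are copied back as EAX..EDX. Any other leaf that raises a
 * #VE gets all-zero registers, mirroring native behaviour for an
 * unsupported leaf.
 */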

static bool mmio_read(int size, unsigned long addr, unsigned long *val)
{
        struct tdx_module_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = hcall_func(EXIT_REASON_EPT_VIOLATION),
                .r12 = size,
                .r13 = EPT_READ,
                .r14 = addr,
        };

        if (__tdx_hypercall(&args))
                return false;

        *val = args.r11;
        return true;
}

static bool mmio_write(int size, unsigned long addr, unsigned long val)
{
        return !_tdx_hypercall(hcall_func(EXIT_REASON_EPT_VIOLATION), size,
                               EPT_WRITE, addr, val);
}

static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
{
        unsigned long *reg, val, vaddr;
        char buffer[MAX_INSN_SIZE];
        enum insn_mmio_type mmio;
        struct insn insn = {};
        int size, extend_size;
        u8 extend_val = 0;

        /* Only in-kernel MMIO is supported */
        if (WARN_ON_ONCE(user_mode(regs)))
                return -EFAULT;

        if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE))
                return -EFAULT;

        if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64))
                return -EINVAL;

        mmio = insn_decode_mmio(&insn, &size);
        if (WARN_ON_ONCE(mmio == INSN_MMIO_DECODE_FAILED))
                return -EINVAL;

        if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
                reg = insn_get_modrm_reg_ptr(&insn, regs);
                if (!reg)
                        return -EINVAL;
        }

        /*
         * Reject EPT violation #VEs that split pages.
         *
         * MMIO accesses are supposed to be naturally aligned and therefore
         * never cross page boundaries. Seeing split page accesses indicates
         * a bug or a load_unaligned_zeropad() that stepped into an MMIO page.
         *
         * load_unaligned_zeropad() will recover using exception fixups.
         */
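        /*
         * Example (illustrative): a 4-byte read at a vaddr ending in 0xffd
         * touches offsets 0xffd through 0x1000 and therefore crosses a page
         * boundary, so it is rejected below.
         */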
        vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
        if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE)
                return -EFAULT;

        /* Handle writes first */
        switch (mmio) {
        case INSN_MMIO_WRITE:
                memcpy(&val, reg, size);
                if (!mmio_write(size, ve->gpa, val))
                        return -EIO;
                return insn.length;
        case INSN_MMIO_WRITE_IMM:
                val = insn.immediate.value;
                if (!mmio_write(size, ve->gpa, val))
                        return -EIO;
                return insn.length;
        case INSN_MMIO_READ:
        case INSN_MMIO_READ_ZERO_EXTEND:
        case INSN_MMIO_READ_SIGN_EXTEND:
                /* Reads are handled below */
                break;
        case INSN_MMIO_MOVS:
        case INSN_MMIO_DECODE_FAILED:
                /*
                 * MMIO was accessed with an instruction that could not be
                 * decoded or handled properly. It was likely not using io.h
                 * helpers or accessed MMIO accidentally.
                 */
                return -EINVAL;
        default:
                WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?");
                return -EINVAL;
        }

        /* Handle reads */
        if (!mmio_read(size, ve->gpa, &val))
                return -EIO;

        switch (mmio) {
        case INSN_MMIO_READ:
                /* Zero-extend for 32-bit operation */
                extend_size = size == 4 ? sizeof(*reg) : 0;
                break;
        case INSN_MMIO_READ_ZERO_EXTEND:
                /* Zero extend based on operand size */
                extend_size = insn.opnd_bytes;
                break;
        case INSN_MMIO_READ_SIGN_EXTEND:
                /* Sign extend based on operand size */
                extend_size = insn.opnd_bytes;
                if (size == 1 && val & BIT(7))
                        extend_val = 0xFF;
                else if (size > 1 && val & BIT(15))
                        extend_val = 0xFF;
                break;
        default:
                /* All other cases have to be covered by the first switch() */
                WARN_ON_ONCE(1);
                return -EINVAL;
        }

        if (extend_size)
                memset(reg, extend_val, extend_size);
        memcpy(reg, &val, size);
        return insn.length;
}
static bool handle_in(struct pt_regs *regs, int size, int port)
{
        struct tdx_module_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION),
                .r12 = size,
                .r13 = PORT_READ,
                .r14 = port,
        };
        u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
        bool success;

        /*
         * Emulate the I/O read via hypercall. More info about ABI can be found
         * in TDX Guest-Host-Communication Interface (GHCI) section titled
         * "TDG.VP.VMCALL<Instruction.IO>".
         */
        success = !__tdx_hypercall(&args);

        /* Update part of the register affected by the emulated instruction */
        regs->ax &= ~mask;
        if (success)
                regs->ax |= args.r11 & mask;

        return success;
}

static bool handle_out(struct pt_regs *regs, int size, int port)
{
        u64 mask = GENMASK(BITS_PER_BYTE * size, 0);

        /*
         * Emulate the I/O write via hypercall. More info about ABI can be found
         * in TDX Guest-Host-Communication Interface (GHCI) section titled
         * "TDG.VP.VMCALL<Instruction.IO>".
         */
        return !_tdx_hypercall(hcall_func(EXIT_REASON_IO_INSTRUCTION), size,
                               PORT_WRITE, port, regs->ax & mask);
}
/*
 * Emulate I/O using hypercall.
 *
 * Assumes the IO instruction was using ax, which is enforced
 * by the standard io.h macros.
 *
 * Return the number of bytes RIP should be advanced on success or
 * -errno on failure.
 */
static int handle_io(struct pt_regs *regs, struct ve_info *ve)
{
        u32 exit_qual = ve->exit_qual;
        int size, port;
        bool in, ret;

        if (VE_IS_IO_STRING(exit_qual))
                return -EIO;

        in   = VE_IS_IO_IN(exit_qual);
        size = VE_GET_IO_SIZE(exit_qual);
        port = VE_GET_PORT_NUM(exit_qual);

        if (in)
                ret = handle_in(regs, size, port);
        else
                ret = handle_out(regs, size, port);
        if (!ret)
                return -EIO;

        return ve_instr_len(ve);
}

/*
 * Early #VE exception handler. Only handles a subset of port I/O.
 * Intended only for earlyprintk. If it fails, return false.
 */
__init bool tdx_early_handle_ve(struct pt_regs *regs)
{
        struct ve_info ve;
        int insn_len;

        tdx_get_ve_info(&ve);

        if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION)
                return false;

        insn_len = handle_io(regs, &ve);
        if (insn_len < 0)
                return false;

        regs->ip += insn_len;
        return true;
}

void tdx_get_ve_info(struct ve_info *ve)
{
        struct tdx_module_args args = {};

        /*
         * Called during #VE handling to retrieve the #VE info from the
         * TDX module.
         *
         * This has to be called early in #VE handling. A "nested" #VE which
         * occurs before this will raise a #DF and is not recoverable.
         *
         * The call retrieves the #VE info from the TDX module, which also
         * clears the "#VE valid" flag. This must be done before anything else
         * because any #VE that occurs while the valid flag is set will lead to
         * #DF.
         *
         * Note, the TDX module treats virtual NMIs as inhibited if the #VE
         * valid flag is set. It means that NMI=>#VE will not result in a #DF.
         */
        tdcall(TDG_VP_VEINFO_GET, &args);

        /* Transfer the output parameters */
        ve->exit_reason = args.rcx;
        ve->exit_qual   = args.rdx;
        ve->gla         = args.r8;
        ve->gpa         = args.r9;
        ve->instr_len   = lower_32_bits(args.r10);
        ve->instr_info  = upper_32_bits(args.r10);
}

/*
 * Handle the user initiated #VE.
 *
 * On success, returns the number of bytes RIP should be incremented (>=0)
 * or -errno on error.
 */
static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
{
        switch (ve->exit_reason) {
        case EXIT_REASON_CPUID:
                return handle_cpuid(regs, ve);
        default:
                pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
                return -EIO;
        }
}

static inline bool is_private_gpa(u64 gpa)
{
        return gpa == cc_mkenc(gpa);
}
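
/*
 * Example (illustrative): with the shared bit at GPA bit 51, cc_mkenc()
 * clears that bit, so is_private_gpa(0x0008000000001000) is false
 * (shared GPA) while is_private_gpa(0x1000) is true (private GPA).
 */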

/*
 * Handle the kernel #VE.
 *
 * On success, returns the number of bytes RIP should be incremented (>=0)
 * or -errno on error.
 */
static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
{
        switch (ve->exit_reason) {
        case EXIT_REASON_HLT:
                return handle_halt(ve);
        case EXIT_REASON_MSR_READ:
                return read_msr(regs, ve);
        case EXIT_REASON_MSR_WRITE:
                return write_msr(regs, ve);
        case EXIT_REASON_CPUID:
                return handle_cpuid(regs, ve);
        case EXIT_REASON_EPT_VIOLATION:
                if (is_private_gpa(ve->gpa))
                        panic("Unexpected EPT-violation on private memory.");
                return handle_mmio(regs, ve);
        case EXIT_REASON_IO_INSTRUCTION:
                return handle_io(regs, ve);
        default:
                pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
                return -EIO;
        }
}

bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
{
        int insn_len;

        if (user_mode(regs))
                insn_len = virt_exception_user(regs, ve);
        else
                insn_len = virt_exception_kernel(regs, ve);
        if (insn_len < 0)
                return false;

        /* After successful #VE handling, move the IP */
        regs->ip += insn_len;

        return true;
}

static bool tdx_tlb_flush_required(bool private)
{
        /*
         * TDX guest is responsible for flushing TLB on private->shared
         * transition. VMM is responsible for flushing on shared->private.
         *
         * The VMM _can't_ flush private addresses as it can't generate PAs
         * with the guest's HKID. Shared memory isn't subject to integrity
         * checking, i.e. the VMM doesn't need to flush for its own protection.
         *
         * There's no need to flush when converting from shared to private,
         * as flushing is the VMM's responsibility in this case, e.g. it must
         * flush to avoid integrity failures in the face of a buggy or
         * malicious guest.
         */
        return !private;
}

static bool tdx_cache_flush_required(void)
{
        /*
         * AMD SME/SEV can avoid cache flushing if HW enforces cache coherence.
         * TDX doesn't have such capability.
         *
         * Flush cache unconditionally.
         */
        return true;
}

/*
 * Notify the VMM about page mapping conversion. More info about ABI
 * can be found in TDX Guest-Host-Communication Interface (GHCI),
 * section "TDG.VP.VMCALL<MapGPA>".
 */
static bool tdx_map_gpa(phys_addr_t start, phys_addr_t end, bool enc)
{
        /* Retrying the hypercall a second time should succeed; use 3 just in case */
        const int max_retries_per_page = 3;
        int retry_count = 0;

        if (!enc) {
                /* Set the shared (decrypted) bits: */
                start |= cc_mkdec(0);
                end   |= cc_mkdec(0);
        }

        while (retry_count < max_retries_per_page) {
                struct tdx_module_args args = {
                        .r10 = TDX_HYPERCALL_STANDARD,
                        .r11 = TDVMCALL_MAP_GPA,
                        .r12 = start,
                        .r13 = end - start };

                u64 map_fail_paddr;
                u64 ret = __tdx_hypercall(&args);

                if (ret != TDVMCALL_STATUS_RETRY)
                        return !ret;
                /*
                 * The guest must retry the operation for the pages in the
                 * region starting at the GPA specified in R11. R11 comes
                 * from the untrusted VMM. Sanity check it.
                 */
                map_fail_paddr = args.r11;
                if (map_fail_paddr < start || map_fail_paddr >= end)
                        return false;

                /* "Consume" a retry without forward progress */
                if (map_fail_paddr == start) {
                        retry_count++;
                        continue;
                }

                start = map_fail_paddr;
                retry_count = 0;
        }

        return false;
}
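
/*
 * Retry flow example (illustrative): for a 0x3000-byte region starting
 * at GPA S, the VMM may return TDVMCALL_STATUS_RETRY with r11 == S +
 * 0x2000 after converting only the first two pages. The loop above then
 * resumes from S + 0x2000 with a fresh retry budget; only repeated
 * failures at the *same* GPA (no forward progress) consume the three
 * retries.
 */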

/*
 * Inform the VMM of the guest's intent for this physical page: shared with
 * the VMM or private to the guest. The VMM is expected to change its mapping
 * of the page in response.
 */
static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
{
        phys_addr_t start = __pa(vaddr);
        phys_addr_t end   = __pa(vaddr + numpages * PAGE_SIZE);

        if (!tdx_map_gpa(start, end, enc))
                return false;

        /* shared->private conversion requires memory to be accepted before use */
        if (enc)
                return tdx_accept_memory(start, end);

        return true;
}

static int tdx_enc_status_change_prepare(unsigned long vaddr, int numpages,
                                         bool enc)
{
        /*
         * Only handle shared->private conversion here.
         * See the comment in tdx_early_init().
         */
        if (enc && !tdx_enc_status_changed(vaddr, numpages, enc))
                return -EIO;

        return 0;
}

static int tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
                                        bool enc)
{
        /*
         * Only handle private->shared conversion here.
         * See the comment in tdx_early_init().
         */
        if (!enc && !tdx_enc_status_changed(vaddr, numpages, enc))
                return -EIO;

        if (enc)
                atomic_long_sub(numpages, &nr_shared);
        else
                atomic_long_add(numpages, &nr_shared);

        return 0;
}

/* Stop new private<->shared conversions */
static void tdx_kexec_begin(void)
{
        if (!IS_ENABLED(CONFIG_KEXEC_CORE))
                return;

        /*
         * Crash kernel reaches here with interrupts disabled: can't wait for
         * conversions to finish.
         *
         * If a race happened, just report and proceed.
         */
        if (!set_memory_enc_stop_conversion())
                pr_warn("Failed to stop shared<->private conversions\n");
}
/* Walk direct mapping and convert all shared memory back to private */
static void tdx_kexec_finish(void)
{
        unsigned long addr, end;
        long found = 0, shared;

        if (!IS_ENABLED(CONFIG_KEXEC_CORE))
                return;

        lockdep_assert_irqs_disabled();

        addr = PAGE_OFFSET;
        end  = PAGE_OFFSET + get_max_mapped();

        while (addr < end) {
                unsigned long size;
                unsigned int level;
                pte_t *pte;

                pte  = lookup_address(addr, &level);
                size = page_level_size(level);

                if (pte && pte_decrypted(*pte)) {
                        int pages = size / PAGE_SIZE;

                        /*
                         * Touching memory with shared bit set triggers implicit
                         * conversion to shared.
                         *
                         * Make sure nobody touches the shared range from
                         * now on.
                         */
                        set_pte(pte, __pte(0));

                        /*
                         * Memory encryption state persists across kexec.
                         * If tdx_enc_status_changed() fails in the first
                         * kernel, it leaves memory in an unknown state.
                         *
                         * If that memory remains shared, accessing it in the
                         * *next* kernel through a private mapping will result
                         * in an unrecoverable guest shutdown.
                         *
                         * The kdump kernel boot is not impacted as it uses
                         * a pre-reserved memory range that is always private.
                         * However, gathering crash information could lead to
                         * a crash if it accesses unconverted memory through
                         * a private mapping which is possible when accessing
                         * that memory through /proc/vmcore, for example.
                         *
                         * In all cases, print error info in order to leave
                         * enough bread crumbs for debugging.
                         */
                        if (!tdx_enc_status_changed(addr, pages, true)) {
                                pr_err("Failed to unshare range %#lx-%#lx\n",
                                       addr, addr + size);
                        }

                        found += pages;
                }

                addr += size;
        }

        __flush_tlb_all();

        shared = atomic_long_read(&nr_shared);
        if (shared != found) {
                pr_err("shared page accounting is off\n");
                pr_err("nr_shared = %ld, nr_found = %ld\n", shared, found);
        }
}

void __init tdx_early_init(void)
{
        struct tdx_module_args args = {
                .rdx = TDCS_NOTIFY_ENABLES,
                .r9  = -1ULL,
        };
        u64 cc_mask;
        u32 eax, sig[3];

        cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]);

        if (memcmp(TDX_IDENT, sig, sizeof(sig)))
                return;

        setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);

        /* TSC is the only reliable clock in TDX guest */
        setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);

        cc_vendor = CC_VENDOR_INTEL;
        tdx_parse_tdinfo(&cc_mask);
        cc_set_mask(cc_mask);

        /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
        tdcall(TDG_VM_WR, &args);

        /*
         * All bits above GPA width are reserved and kernel treats shared bit
         * as flag, not as part of physical address.
         *
         * Adjust physical mask to only cover valid GPA bits.
         */
        physical_mask &= cc_mask - 1;
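
        /*
         * Example (illustrative): for gpa_width == 52, cc_mask is
         * BIT_ULL(51) and "cc_mask - 1" covers bits 50:0, so the shared
         * bit and all reserved bits above it are cleared from
         * physical_mask.
         */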

        /*
         * The kernel mapping should match the TDX metadata for the page.
         * load_unaligned_zeropad() can touch memory *adjacent* to that which is
         * owned by the caller and can catch even _momentary_ mismatches. Bad
         * things happen on mismatch:
         *
         *   - Private mapping => Shared Page  == Guest shutdown
         *   - Shared mapping  => Private Page == Recoverable #VE
         *
         * guest.enc_status_change_prepare() converts the page from
         * shared=>private before the mapping becomes private.
         *
         * guest.enc_status_change_finish() converts the page from
         * private=>shared after the mapping becomes shared.
         *
         * In both cases there is a temporary shared mapping to a private page,
         * which can result in a #VE. But, there is never a private mapping to
         * a shared page.
         */
        x86_platform.guest.enc_status_change_prepare = tdx_enc_status_change_prepare;
        x86_platform.guest.enc_status_change_finish = tdx_enc_status_change_finish;

        x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required;
        x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required;

        x86_platform.guest.enc_kexec_begin = tdx_kexec_begin;
        x86_platform.guest.enc_kexec_finish = tdx_kexec_finish;

        /*
         * TDX intercepts the RDMSR to read the X2APIC ID in the parallel
         * bringup low level code. That raises #VE which cannot be handled
         * there.
         *
         * Intel-TDX has a secure RDMSR hypercall, but that needs to be
         * implemented separately in the low level startup ASM code.
         * Until that is in place, disable parallel bringup for TDX.
         */
        x86_cpuinit.parallel_bringup = false;

        pr_info("Guest detected\n");
}
999