xref: /linux/arch/x86/coco/tdx/tdx.c (revision 335bbdf01d25517ae832ac1807fd8323c1f4f3b9)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (C) 2021-2022 Intel Corporation */
3 
4 #undef pr_fmt
5 #define pr_fmt(fmt)     "tdx: " fmt
6 
7 #include <linux/cpufeature.h>
8 #include <linux/export.h>
9 #include <linux/io.h>
10 #include <asm/coco.h>
11 #include <asm/tdx.h>
12 #include <asm/vmx.h>
13 #include <asm/insn.h>
14 #include <asm/insn-eval.h>
15 #include <asm/pgtable.h>
16 
17 /* MMIO direction */
18 #define EPT_READ	0
19 #define EPT_WRITE	1
20 
21 /* Port I/O direction */
22 #define PORT_READ	0
23 #define PORT_WRITE	1
24 
25 /* See Exit Qualification for I/O Instructions in VMX documentation */
26 #define VE_IS_IO_IN(e)		((e) & BIT(3))
27 #define VE_GET_IO_SIZE(e)	(((e) & GENMASK(2, 0)) + 1)
28 #define VE_GET_PORT_NUM(e)	((e) >> 16)
29 #define VE_IS_IO_STRING(e)	((e) & BIT(4))
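/*
 * Worked example of the exit qualification layout: a #VE for "inb" from
 * port 0x3f8 encodes exit_qual = (0x3f8 << 16) | BIT(3) = 0x03f80008, so
 * VE_IS_IO_IN() is true, VE_GET_IO_SIZE() is 1 byte and VE_GET_PORT_NUM()
 * is 0x3f8.
 */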
30 
31 #define ATTR_DEBUG		BIT(0)
32 #define ATTR_SEPT_VE_DISABLE	BIT(28)
33 
34 /* TDX Module call error codes */
35 #define TDCALL_RETURN_CODE(a)	((a) >> 32)
36 #define TDCALL_INVALID_OPERAND	0xc0000100
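/*
 * Example: a raw TDCALL status of 0xc000010000000000 carries the status
 * code in its upper 32 bits, so TDCALL_RETURN_CODE() yields 0xc0000100,
 * i.e. TDCALL_INVALID_OPERAND.
 */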
37 
38 #define TDREPORT_SUBTYPE_0	0
39 
40 /* Called from __tdx_hypercall() for unrecoverable failure */
41 noinstr void __noreturn __tdx_hypercall_failed(void)
42 {
43 	instrumentation_begin();
44 	panic("TDVMCALL failed. TDX module bug?");
45 }
46 
47 #ifdef CONFIG_KVM_GUEST
48 long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2,
49 		       unsigned long p3, unsigned long p4)
50 {
51 	struct tdx_module_args args = {
52 		.r10 = nr,
53 		.r11 = p1,
54 		.r12 = p2,
55 		.r13 = p3,
56 		.r14 = p4,
57 	};
58 
59 	return __tdx_hypercall(&args);
60 }
61 EXPORT_SYMBOL_GPL(tdx_kvm_hypercall);
62 #endif
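
/*
 * Note: the kvm_hypercall*() wrappers in asm/kvm_para.h are expected to
 * dispatch to tdx_kvm_hypercall() instead of executing VMCALL when
 * X86_FEATURE_TDX_GUEST is set, so regular KVM guest call sites need no
 * TDX-specific changes.
 */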
63 
64 /*
65  * Used for TDX guests to make calls directly to the TDX module.  This
66  * should only be used for calls that have no legitimate reason to fail
67  * or where the kernel can not survive the call failing.
68  */
69 static inline void tdcall(u64 fn, struct tdx_module_args *args)
70 {
71 	if (__tdcall_ret(fn, args))
72 		panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
73 }
74 
75 /**
76  * tdx_mcall_get_report0() - Wrapper to get TDREPORT0 (a.k.a. TDREPORT
77  *                           subtype 0) using TDG.MR.REPORT TDCALL.
78  * @reportdata: Address of the input buffer which contains user-defined
79  *              REPORTDATA to be included into TDREPORT.
80  * @tdreport: Address of the output buffer to store TDREPORT.
81  *
82  * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module
83  * v1.0 specification for more information on TDG.MR.REPORT TDCALL.
84  * It is used in the TDX guest driver module to get the TDREPORT0.
85  *
86  * Return 0 on success, -EINVAL for invalid operands, or -EIO on
87  * other TDCALL failures.
88  */
89 int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
90 {
91 	struct tdx_module_args args = {
92 		.rcx = virt_to_phys(tdreport),
93 		.rdx = virt_to_phys(reportdata),
94 		.r8 = TDREPORT_SUBTYPE_0,
95 	};
96 	u64 ret;
97 
98 	ret = __tdcall(TDG_MR_REPORT, &args);
99 	if (ret) {
100 		if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
101 			return -EINVAL;
102 		return -EIO;
103 	}
104 
105 	return 0;
106 }
107 EXPORT_SYMBOL_GPL(tdx_mcall_get_report0);
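
/*
 * Buffer size note (not enforced here): per the TDX module spec, REPORTDATA
 * is expected to be a 64-byte input and TDREPORT0 a 1024-byte output, so
 * callers such as the TDX guest driver pass buffers of at least those sizes.
 */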
108 
109 /**
110  * tdx_hcall_get_quote() - Wrapper to request TD Quote using GetQuote
111  *                         hypercall.
112  * @buf: Address of the directly mapped shared kernel buffer which
113  *       contains TDREPORT. The same buffer will be used by VMM to
114  *       store the generated TD Quote output.
115  * @size: size of the tdquote buffer (4KB-aligned).
116  *
117  * Refer to section titled "TDG.VP.VMCALL<GetQuote>" in the TDX GHCI
118  * v1.0 specification for more information on GetQuote hypercall.
119  * It is used in the TDX guest driver module to get the TD Quote.
120  *
121  * Return 0 on success or error code on failure.
122  */
123 u64 tdx_hcall_get_quote(u8 *buf, size_t size)
124 {
125 	/* Since buf is shared memory, set the shared (decrypted) bits */
126 	return _tdx_hypercall(TDVMCALL_GET_QUOTE, cc_mkdec(virt_to_phys(buf)), size, 0, 0);
127 }
128 EXPORT_SYMBOL_GPL(tdx_hcall_get_quote);
129 
130 static void __noreturn tdx_panic(const char *msg)
131 {
132 	struct tdx_module_args args = {
133 		.r10 = TDX_HYPERCALL_STANDARD,
134 		.r11 = TDVMCALL_REPORT_FATAL_ERROR,
135 		.r12 = 0, /* Error code: 0 is Panic */
136 	};
137 	union {
138 		/* Define register order according to the GHCI */
139 		struct { u64 r14, r15, rbx, rdi, rsi, r8, r9, rdx; };
140 
141 		char str[64];
142 	} message;
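	/*
	 * The union overlays the 64-byte string onto eight u64s: message
	 * bytes 0-7 travel in R14, bytes 8-15 in R15, and so on through RDX,
	 * in the register order the GHCI defines for ReportFatalError.
	 */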
143 
144 	/* VMM assumes '\0' in byte 65, if the message took all 64 bytes */
145 	strtomem_pad(message.str, msg, '\0');
146 
147 	args.r8  = message.r8;
148 	args.r9  = message.r9;
149 	args.r14 = message.r14;
150 	args.r15 = message.r15;
151 	args.rdi = message.rdi;
152 	args.rsi = message.rsi;
153 	args.rbx = message.rbx;
154 	args.rdx = message.rdx;
155 
156 	/*
157 	 * This hypercall should never return and it is not safe
158 	 * to keep the guest running. Call it forever if it
159 	 * happens to return.
160 	 */
161 	while (1)
162 		__tdx_hypercall(&args);
163 }
164 
165 static void tdx_parse_tdinfo(u64 *cc_mask)
166 {
167 	struct tdx_module_args args = {};
168 	unsigned int gpa_width;
169 	u64 td_attr;
170 
171 	/*
172 	 * TDINFO TDX module call is used to get the TD execution environment
173 	 * information like GPA width, number of available vcpus, debug mode
174 	 * information, etc. More details about the ABI can be found in TDX
175 	 * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL
176 	 * [TDG.VP.INFO].
177 	 */
178 	tdcall(TDG_VP_INFO, &args);
179 
180 	/*
181 	 * The highest bit of a guest physical address is the "sharing" bit.
182 	 * Set it for shared pages and clear it for private pages.
183 	 *
184 	 * The GPA width that comes out of this call is critical. TDX guests
185 	 * can not meaningfully run without it.
186 	 */
187 	gpa_width = args.rcx & GENMASK(5, 0);
188 	*cc_mask = BIT_ULL(gpa_width - 1);
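	/*
	 * Example: a 52-bit GPA width yields cc_mask = BIT_ULL(51), i.e. GPA
	 * bit 51 is the shared bit.
	 */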
189 
190 	/*
191 	 * The kernel can not handle #VE's when accessing normal kernel
192 	 * memory.  Ensure that no #VE will be delivered for accesses to
193 	 * TD-private memory.  Only VMM-shared memory (MMIO) will #VE.
194 	 */
195 	td_attr = args.rdx;
196 	if (!(td_attr & ATTR_SEPT_VE_DISABLE)) {
197 		const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set.";
198 
199 		/* Relax SEPT_VE_DISABLE check for debug TD. */
200 		if (td_attr & ATTR_DEBUG)
201 			pr_warn("%s\n", msg);
202 		else
203 			tdx_panic(msg);
204 	}
205 }
206 
207 /*
208  * The TDX module spec states that #VE may be injected for a limited set of
209  * reasons:
210  *
211  *  - Emulation of the architectural #VE injection on EPT violation;
212  *
213  *  - As a result of guest TD execution of a disallowed instruction,
214  *    a disallowed MSR access, or CPUID virtualization;
215  *
216  *  - A notification to the guest TD about anomalous behavior;
217  *
218  * The last one is opt-in and is not used by the kernel.
219  *
220  * The Intel Software Developer's Manual describes cases when instruction
221  * length field can be used in section "Information for VM Exits Due to
222  * Instruction Execution".
223  *
224  * For TDX, it ultimately means GET_VEINFO provides reliable instruction length
225  * information if #VE occurred due to instruction execution, but not for EPT
226  * violations.
227  */
228 static int ve_instr_len(struct ve_info *ve)
229 {
230 	switch (ve->exit_reason) {
231 	case EXIT_REASON_HLT:
232 	case EXIT_REASON_MSR_READ:
233 	case EXIT_REASON_MSR_WRITE:
234 	case EXIT_REASON_CPUID:
235 	case EXIT_REASON_IO_INSTRUCTION:
236 		/* It is safe to use ve->instr_len for #VE due to instructions */
237 		return ve->instr_len;
238 	case EXIT_REASON_EPT_VIOLATION:
239 		/*
240 		 * For EPT violations, ve->instr_len is not defined. For those,
241 		 * the kernel must decode instructions manually and should not
242 		 * be using this function.
243 		 */
244 		WARN_ONCE(1, "ve->instr_len is not defined for EPT violations");
245 		return 0;
246 	default:
247 		WARN_ONCE(1, "Unexpected #VE-type: %lld\n", ve->exit_reason);
248 		return ve->instr_len;
249 	}
250 }
251 
252 static u64 __cpuidle __halt(const bool irq_disabled)
253 {
254 	struct tdx_module_args args = {
255 		.r10 = TDX_HYPERCALL_STANDARD,
256 		.r11 = hcall_func(EXIT_REASON_HLT),
257 		.r12 = irq_disabled,
258 	};
259 
260 	/*
261 	 * Emulate HLT operation via hypercall. More info about ABI
262 	 * can be found in TDX Guest-Host-Communication Interface
263 	 * (GHCI), section 3.8 TDG.VP.VMCALL<Instruction.HLT>.
264 	 *
265 	 * The VMM uses the "IRQ disabled" param to understand IRQ
266 	 * enabled status (RFLAGS.IF) of the TD guest and to determine
267 	 * whether or not it should schedule the halted vCPU if an
268 	 * IRQ becomes pending. E.g. if IRQs are disabled, the VMM
269 	 * can keep the vCPU in virtual HLT, even if an IRQ is
270 	 * pending, without hanging/breaking the guest.
271 	 */
272 	return __tdx_hypercall(&args);
273 }
274 
275 static int handle_halt(struct ve_info *ve)
276 {
277 	const bool irq_disabled = irqs_disabled();
278 
279 	if (__halt(irq_disabled))
280 		return -EIO;
281 
282 	return ve_instr_len(ve);
283 }
284 
285 void __cpuidle tdx_safe_halt(void)
286 {
287 	const bool irq_disabled = false;
288 
289 	/*
290 	 * Use WARN_ONCE() to report the failure.
291 	 */
292 	if (__halt(irq_disabled))
293 		WARN_ONCE(1, "HLT instruction emulation failed\n");
294 }
295 
296 static int read_msr(struct pt_regs *regs, struct ve_info *ve)
297 {
298 	struct tdx_module_args args = {
299 		.r10 = TDX_HYPERCALL_STANDARD,
300 		.r11 = hcall_func(EXIT_REASON_MSR_READ),
301 		.r12 = regs->cx,
302 	};
303 
304 	/*
305 	 * Emulate the MSR read via hypercall. More info about ABI
306 	 * can be found in TDX Guest-Host-Communication Interface
307 	 * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
308 	 */
309 	if (__tdx_hypercall(&args))
310 		return -EIO;
311 
312 	regs->ax = lower_32_bits(args.r11);
313 	regs->dx = upper_32_bits(args.r11);
314 	return ve_instr_len(ve);
315 }
316 
317 static int write_msr(struct pt_regs *regs, struct ve_info *ve)
318 {
319 	struct tdx_module_args args = {
320 		.r10 = TDX_HYPERCALL_STANDARD,
321 		.r11 = hcall_func(EXIT_REASON_MSR_WRITE),
322 		.r12 = regs->cx,
323 		.r13 = (u64)regs->dx << 32 | regs->ax,
324 	};
325 
326 	/*
327 	 * Emulate the MSR write via hypercall. More info about ABI
328 	 * can be found in TDX Guest-Host-Communication Interface
329 	 * (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
330 	 */
331 	if (__tdx_hypercall(&args))
332 		return -EIO;
333 
334 	return ve_instr_len(ve);
335 }
336 
337 static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
338 {
339 	struct tdx_module_args args = {
340 		.r10 = TDX_HYPERCALL_STANDARD,
341 		.r11 = hcall_func(EXIT_REASON_CPUID),
342 		.r12 = regs->ax,
343 		.r13 = regs->cx,
344 	};
345 
346 	/*
347 	 * Only allow VMM to control range reserved for hypervisor
348 	 * communication.
349 	 *
350 	 * Return all-zeros for any CPUID leaf outside the range. This matches
351 	 * CPU behaviour for an unsupported leaf.
352 	 */
353 	if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) {
354 		regs->ax = regs->bx = regs->cx = regs->dx = 0;
355 		return ve_instr_len(ve);
356 	}
357 
358 	/*
359 	 * Emulate the CPUID instruction via a hypercall. More info about
360 	 * ABI can be found in TDX Guest-Host-Communication Interface
361 	 * (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
362 	 */
363 	if (__tdx_hypercall(&args))
364 		return -EIO;
365 
366 	/*
367 	 * As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of
368 	 * EAX, EBX, ECX, EDX registers after the CPUID instruction execution.
369 	 * So copy the register contents back to pt_regs.
370 	 */
371 	regs->ax = args.r12;
372 	regs->bx = args.r13;
373 	regs->cx = args.r14;
374 	regs->dx = args.r15;
375 
376 	return ve_instr_len(ve);
377 }
378 
379 static bool mmio_read(int size, unsigned long addr, unsigned long *val)
380 {
381 	struct tdx_module_args args = {
382 		.r10 = TDX_HYPERCALL_STANDARD,
383 		.r11 = hcall_func(EXIT_REASON_EPT_VIOLATION),
384 		.r12 = size,
385 		.r13 = EPT_READ,
386 		.r14 = addr,
387 		.r15 = *val,
388 	};
389 
390 	if (__tdx_hypercall(&args))
391 		return false;
392 
393 	*val = args.r11;
394 	return true;
395 }
396 
397 static bool mmio_write(int size, unsigned long addr, unsigned long val)
398 {
399 	return !_tdx_hypercall(hcall_func(EXIT_REASON_EPT_VIOLATION), size,
400 			       EPT_WRITE, addr, val);
401 }
402 
403 static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
404 {
405 	unsigned long *reg, val, vaddr;
406 	char buffer[MAX_INSN_SIZE];
407 	enum insn_mmio_type mmio;
408 	struct insn insn = {};
409 	int size, extend_size;
410 	u8 extend_val = 0;
411 
412 	/* Only in-kernel MMIO is supported */
413 	if (WARN_ON_ONCE(user_mode(regs)))
414 		return -EFAULT;
415 
416 	if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE))
417 		return -EFAULT;
418 
419 	if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64))
420 		return -EINVAL;
421 
422 	mmio = insn_decode_mmio(&insn, &size);
423 	if (WARN_ON_ONCE(mmio == INSN_MMIO_DECODE_FAILED))
424 		return -EINVAL;
425 
426 	if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
427 		reg = insn_get_modrm_reg_ptr(&insn, regs);
428 		if (!reg)
429 			return -EINVAL;
430 	}
431 
432 	/*
433 	 * Reject EPT violation #VEs that split pages.
434 	 *
435 	 * MMIO accesses are supposed to be naturally aligned and therefore
436 	 * never cross page boundaries. Seeing split page accesses indicates
437 	 * a bug or a load_unaligned_zeropad() that stepped into an MMIO page.
438 	 *
439 	 * load_unaligned_zeropad() will recover using exception fixups.
440 	 */
441 	vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
442 	if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE)
443 		return -EFAULT;
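	/*
	 * Example: a 4-byte access whose page offset is 0xffe would span
	 * offsets 0xffe-0x1001 and thus two pages, so it is rejected here
	 * instead of being emulated.
	 */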
444 
445 	/* Handle writes first */
446 	switch (mmio) {
447 	case INSN_MMIO_WRITE:
448 		memcpy(&val, reg, size);
449 		if (!mmio_write(size, ve->gpa, val))
450 			return -EIO;
451 		return insn.length;
452 	case INSN_MMIO_WRITE_IMM:
453 		val = insn.immediate.value;
454 		if (!mmio_write(size, ve->gpa, val))
455 			return -EIO;
456 		return insn.length;
457 	case INSN_MMIO_READ:
458 	case INSN_MMIO_READ_ZERO_EXTEND:
459 	case INSN_MMIO_READ_SIGN_EXTEND:
460 		/* Reads are handled below */
461 		break;
462 	case INSN_MMIO_MOVS:
463 	case INSN_MMIO_DECODE_FAILED:
464 		/*
465 		 * MMIO was accessed with an instruction that could not be
466 		 * decoded or handled properly. It was likely not using io.h
467 		 * helpers or accessed MMIO accidentally.
468 		 */
469 		return -EINVAL;
470 	default:
471 		WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?");
472 		return -EINVAL;
473 	}
474 
475 	/* Handle reads */
476 	if (!mmio_read(size, ve->gpa, &val))
477 		return -EIO;
478 
479 	switch (mmio) {
480 	case INSN_MMIO_READ:
481 		/* Zero-extend for 32-bit operation */
482 		extend_size = size == 4 ? sizeof(*reg) : 0;
483 		break;
484 	case INSN_MMIO_READ_ZERO_EXTEND:
485 		/* Zero extend based on operand size */
486 		extend_size = insn.opnd_bytes;
487 		break;
488 	case INSN_MMIO_READ_SIGN_EXTEND:
489 		/* Sign extend based on operand size */
490 		extend_size = insn.opnd_bytes;
491 		if (size == 1 && val & BIT(7))
492 			extend_val = 0xFF;
493 		else if (size > 1 && val & BIT(15))
494 			extend_val = 0xFF;
495 		break;
496 	default:
497 		/* All other cases have to be covered by the first switch() */
498 		WARN_ON_ONCE(1);
499 		return -EINVAL;
500 	}
501 
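	/*
	 * Example: "movsx eax, byte ptr [mmio]" is INSN_MMIO_READ_SIGN_EXTEND
	 * with size == 1 and opnd_bytes == 4; if bit 7 of the byte read is
	 * set, the low four bytes of the destination are filled with 0xff
	 * before the MMIO byte is copied over the lowest one below.
	 */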
502 	if (extend_size)
503 		memset(reg, extend_val, extend_size);
504 	memcpy(reg, &val, size);
505 	return insn.length;
506 }
507 
508 static bool handle_in(struct pt_regs *regs, int size, int port)
509 {
510 	struct tdx_module_args args = {
511 		.r10 = TDX_HYPERCALL_STANDARD,
512 		.r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION),
513 		.r12 = size,
514 		.r13 = PORT_READ,
515 		.r14 = port,
516 	};
517 	u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
518 	bool success;
519 
520 	/*
521 	 * Emulate the I/O read via hypercall. More info about ABI can be found
522 	 * in TDX Guest-Host-Communication Interface (GHCI) section titled
523 	 * "TDG.VP.VMCALL<Instruction.IO>".
524 	 */
525 	success = !__tdx_hypercall(&args);
526 
527 	/* Update part of the register affected by the emulated instruction */
528 	regs->ax &= ~mask;
529 	if (success)
530 		regs->ax |= args.r11 & mask;
531 
532 	return success;
533 }
534 
535 static bool handle_out(struct pt_regs *regs, int size, int port)
536 {
537 	u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
538 
539 	/*
540 	 * Emulate the I/O write via hypercall. More info about ABI can be found
541 	 * in TDX Guest-Host-Communication Interface (GHCI) section titled
542 	 * "TDG.VP.VMCALL<Instruction.IO>".
543 	 */
544 	return !_tdx_hypercall(hcall_func(EXIT_REASON_IO_INSTRUCTION), size,
545 			       PORT_WRITE, port, regs->ax & mask);
546 }
547 
548 /*
549  * Emulate I/O using hypercall.
550  *
551  * Assumes the IO instruction was using ax, which is enforced
552  * by the standard io.h macros.
553  *
554  * Return the instruction length on success or -errno on failure.
555  */
556 static int handle_io(struct pt_regs *regs, struct ve_info *ve)
557 {
558 	u32 exit_qual = ve->exit_qual;
559 	int size, port;
560 	bool in, ret;
561 
562 	if (VE_IS_IO_STRING(exit_qual))
563 		return -EIO;
564 
565 	in   = VE_IS_IO_IN(exit_qual);
566 	size = VE_GET_IO_SIZE(exit_qual);
567 	port = VE_GET_PORT_NUM(exit_qual);
568 
569 
570 	if (in)
571 		ret = handle_in(regs, size, port);
572 	else
573 		ret = handle_out(regs, size, port);
574 	if (!ret)
575 		return -EIO;
576 
577 	return ve_instr_len(ve);
578 }
579 
580 /*
581  * Early #VE exception handler. Only handles a subset of port I/O.
582  * Intended only for earlyprintk. Returns false on failure.
583  */
584 __init bool tdx_early_handle_ve(struct pt_regs *regs)
585 {
586 	struct ve_info ve;
587 	int insn_len;
588 
589 	tdx_get_ve_info(&ve);
590 
591 	if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION)
592 		return false;
593 
594 	insn_len = handle_io(regs, &ve);
595 	if (insn_len < 0)
596 		return false;
597 
598 	regs->ip += insn_len;
599 	return true;
600 }
601 
602 void tdx_get_ve_info(struct ve_info *ve)
603 {
604 	struct tdx_module_args args = {};
605 
606 	/*
607 	 * Called during #VE handling to retrieve the #VE info from the
608 	 * TDX module.
609 	 *
610 	 * This has to be called early in #VE handling.  A "nested" #VE which
611 	 * occurs before this will raise a #DF and is not recoverable.
612 	 *
613 	 * The call retrieves the #VE info from the TDX module, which also
614 	 * clears the "#VE valid" flag. This must be done before anything else
615 	 * because any #VE that occurs while the valid flag is set will lead to
616 	 * #DF.
617 	 *
618 	 * Note, the TDX module treats virtual NMIs as inhibited if the #VE
619 	 * valid flag is set. It means that NMI=>#VE will not result in a #DF.
620 	 */
621 	tdcall(TDG_VP_VEINFO_GET, &args);
622 
623 	/* Transfer the output parameters */
624 	ve->exit_reason = args.rcx;
625 	ve->exit_qual   = args.rdx;
626 	ve->gla         = args.r8;
627 	ve->gpa         = args.r9;
628 	ve->instr_len   = lower_32_bits(args.r10);
629 	ve->instr_info  = upper_32_bits(args.r10);
630 }
631 
632 /*
633  * Handle the user initiated #VE.
634  *
635  * On success, returns the number of bytes RIP should be incremented (>=0)
636  * or -errno on error.
637  */
638 static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
639 {
640 	switch (ve->exit_reason) {
641 	case EXIT_REASON_CPUID:
642 		return handle_cpuid(regs, ve);
643 	default:
644 		pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
645 		return -EIO;
646 	}
647 }
648 
649 static inline bool is_private_gpa(u64 gpa)
650 {
651 	return gpa == cc_mkenc(gpa);
652 }
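
/*
 * Example: with cc_mask = BIT_ULL(51), cc_mkenc() clears GPA bit 51, so a
 * GPA counts as private exactly when its shared bit is not set.
 */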
653 
654 /*
655  * Handle the kernel #VE.
656  *
657  * On success, returns the number of bytes RIP should be incremented (>=0)
658  * or -errno on error.
659  */
660 static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
661 {
662 	switch (ve->exit_reason) {
663 	case EXIT_REASON_HLT:
664 		return handle_halt(ve);
665 	case EXIT_REASON_MSR_READ:
666 		return read_msr(regs, ve);
667 	case EXIT_REASON_MSR_WRITE:
668 		return write_msr(regs, ve);
669 	case EXIT_REASON_CPUID:
670 		return handle_cpuid(regs, ve);
671 	case EXIT_REASON_EPT_VIOLATION:
672 		if (is_private_gpa(ve->gpa))
673 			panic("Unexpected EPT-violation on private memory.");
674 		return handle_mmio(regs, ve);
675 	case EXIT_REASON_IO_INSTRUCTION:
676 		return handle_io(regs, ve);
677 	default:
678 		pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
679 		return -EIO;
680 	}
681 }
682 
683 bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
684 {
685 	int insn_len;
686 
687 	if (user_mode(regs))
688 		insn_len = virt_exception_user(regs, ve);
689 	else
690 		insn_len = virt_exception_kernel(regs, ve);
691 	if (insn_len < 0)
692 		return false;
693 
694 	/* After successful #VE handling, move the IP */
695 	regs->ip += insn_len;
696 
697 	return true;
698 }
699 
700 static bool tdx_tlb_flush_required(bool private)
701 {
702 	/*
703 	 * TDX guest is responsible for flushing TLB on private->shared
704 	 * transition. VMM is responsible for flushing on shared->private.
705 	 *
706 	 * The VMM _can't_ flush private addresses as it can't generate PAs
707 	 * with the guest's HKID.  Shared memory isn't subject to integrity
708 	 * checking, i.e. the VMM doesn't need to flush for its own protection.
709 	 *
710 	 * There's no need to flush when converting from shared to private,
711 	 * as flushing is the VMM's responsibility in this case, e.g. it must
712 	 * flush to avoid integrity failures in the face of a buggy or
713 	 * malicious guest.
714 	 */
715 	return !private;
716 }
717 
718 static bool tdx_cache_flush_required(void)
719 {
720 	/*
721 	 * AMD SME/SEV can avoid cache flushing if HW enforces cache coherence.
722 	 * TDX doesn't have such capability.
723 	 *
724 	 * Flush cache unconditionally.
725 	 */
726 	return true;
727 }
728 
729 /*
730  * Notify the VMM about page mapping conversion. More info about ABI
731  * can be found in TDX Guest-Host-Communication Interface (GHCI),
732  * section "TDG.VP.VMCALL<MapGPA>".
733  */
734 static bool tdx_map_gpa(phys_addr_t start, phys_addr_t end, bool enc)
735 {
736 	/* Retrying the hypercall a second time should succeed; use 3 just in case */
737 	const int max_retries_per_page = 3;
738 	int retry_count = 0;
739 
740 	if (!enc) {
741 		/* Set the shared (decrypted) bits: */
742 		start |= cc_mkdec(0);
743 		end   |= cc_mkdec(0);
744 	}
745 
746 	while (retry_count < max_retries_per_page) {
747 		struct tdx_module_args args = {
748 			.r10 = TDX_HYPERCALL_STANDARD,
749 			.r11 = TDVMCALL_MAP_GPA,
750 			.r12 = start,
751 			.r13 = end - start };
752 
753 		u64 map_fail_paddr;
754 		u64 ret = __tdx_hypercall(&args);
755 
756 		if (ret != TDVMCALL_STATUS_RETRY)
757 			return !ret;
758 		/*
759 		 * The guest must retry the operation for the pages in the
760 		 * region starting at the GPA specified in R11. R11 comes
761 		 * from the untrusted VMM. Sanity check it.
762 		 */
763 		map_fail_paddr = args.r11;
764 		if (map_fail_paddr < start || map_fail_paddr >= end)
765 			return false;
766 
767 		/* "Consume" a retry without forward progress */
768 		if (map_fail_paddr == start) {
769 			retry_count++;
770 			continue;
771 		}
772 
773 		start = map_fail_paddr;
774 		retry_count = 0;
775 	}
776 
777 	return false;
778 }
779 
780 /*
781  * Inform the VMM of the guest's intent for this physical page: shared with
782  * the VMM or private to the guest.  The VMM is expected to change its mapping
783  * of the page in response.
784  */
785 static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
786 {
787 	phys_addr_t start = __pa(vaddr);
788 	phys_addr_t end   = __pa(vaddr + numpages * PAGE_SIZE);
789 
790 	if (!tdx_map_gpa(start, end, enc))
791 		return false;
792 
793 	/* shared->private conversion requires memory to be accepted before use */
794 	if (enc)
795 		return tdx_accept_memory(start, end);
796 
797 	return true;
798 }
799 
800 static bool tdx_enc_status_change_prepare(unsigned long vaddr, int numpages,
801 					  bool enc)
802 {
803 	/*
804 	 * Only handle shared->private conversion here.
805 	 * See the comment in tdx_early_init().
806 	 */
807 	if (enc)
808 		return tdx_enc_status_changed(vaddr, numpages, enc);
809 	return true;
810 }
811 
812 static bool tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
813 					 bool enc)
814 {
815 	/*
816 	 * Only handle private->shared conversion here.
817 	 * See the comment in tdx_early_init().
818 	 */
819 	if (!enc)
820 		return tdx_enc_status_changed(vaddr, numpages, enc);
821 	return true;
822 }
823 
824 void __init tdx_early_init(void)
825 {
826 	struct tdx_module_args args = {
827 		.rdx = TDCS_NOTIFY_ENABLES,
828 		.r9 = -1ULL,
829 	};
830 	u64 cc_mask;
831 	u32 eax, sig[3];
832 
833 	cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2],  &sig[1]);
834 
835 	if (memcmp(TDX_IDENT, sig, sizeof(sig)))
836 		return;
837 
838 	setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);
839 
840 	/* TSC is the only reliable clock in TDX guest */
841 	setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
842 
843 	cc_vendor = CC_VENDOR_INTEL;
844 	tdx_parse_tdinfo(&cc_mask);
845 	cc_set_mask(cc_mask);
846 
847 	/* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
848 	tdcall(TDG_VM_WR, &args);
849 
850 	/*
851 	 * All bits above the GPA width are reserved and the kernel treats the
852 	 * shared bit as a flag, not as part of the physical address.
853 	 *
854 	 * Adjust physical mask to only cover valid GPA bits.
855 	 */
856 	physical_mask &= cc_mask - 1;
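	/*
	 * Example: with cc_mask = BIT_ULL(51), cc_mask - 1 covers bits 0-50,
	 * so the shared bit and all bits above it are dropped from
	 * physical_mask.
	 */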
857 
858 	/*
859 	 * The kernel mapping should match the TDX metadata for the page.
860 	 * load_unaligned_zeropad() can touch memory *adjacent* to that which is
861 	 * owned by the caller and can catch even _momentary_ mismatches.  Bad
862 	 * things happen on mismatch:
863 	 *
864 	 *   - Private mapping => Shared Page  == Guest shutdown
865 	 *   - Shared mapping  => Private Page == Recoverable #VE
866 	 *
867 	 * guest.enc_status_change_prepare() converts the page from
868 	 * shared=>private before the mapping becomes private.
869 	 *
870 	 * guest.enc_status_change_finish() converts the page from
871 	 * private=>shared after the mapping becomes shared.
872 	 *
873 	 * In both cases there is a temporary shared mapping to a private page,
874 	 * which can result in a #VE.  But, there is never a private mapping to
875 	 * a shared page.
876 	 */
877 	x86_platform.guest.enc_status_change_prepare = tdx_enc_status_change_prepare;
878 	x86_platform.guest.enc_status_change_finish  = tdx_enc_status_change_finish;
879 
880 	x86_platform.guest.enc_cache_flush_required  = tdx_cache_flush_required;
881 	x86_platform.guest.enc_tlb_flush_required    = tdx_tlb_flush_required;
882 
883 	/*
884 	 * TDX intercepts the RDMSR to read the X2APIC ID in the parallel
885 	 * bringup low level code. That raises #VE which cannot be handled
886 	 * there.
887 	 *
888 	 * Intel-TDX has a secure RDMSR hypercall, but that needs to be
889 	 * implemented separately in the low level startup ASM code.
890 	 * Until that is in place, disable parallel bringup for TDX.
891 	 */
892 	x86_cpuinit.parallel_bringup = false;
893 
894 	pr_info("Guest detected\n");
895 }
896