xref: /linux/arch/x86/coco/tdx/tdx.c (revision e6b9d8eddb1772d99a676a906d42865293934edd)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (C) 2021-2022 Intel Corporation */
3 
4 #undef pr_fmt
5 #define pr_fmt(fmt)     "tdx: " fmt
6 
7 #include <linux/cpufeature.h>
8 #include <linux/export.h>
9 #include <linux/io.h>
10 #include <asm/coco.h>
11 #include <asm/tdx.h>
12 #include <asm/vmx.h>
13 #include <asm/insn.h>
14 #include <asm/insn-eval.h>
15 #include <asm/pgtable.h>
16 
/*
 * TDX module call (TDCALL) leaf numbers, used as the function id when
 * calling into the TDX module.
 */
#define TDX_GET_INFO			1
#define TDX_GET_VEINFO			3
#define TDX_GET_REPORT			4
#define TDX_ACCEPT_PAGE			6
#define TDX_WR				8

/* TDCS field identifiers, for the TDG.VM.WR and TDG.VM.RD module calls */
#define TDCS_NOTIFY_ENABLES		0x9100000000000010

/* TDX hypercall (TDVMCALL) leaf numbers */
#define TDVMCALL_MAP_GPA		0x10001
#define TDVMCALL_REPORT_FATAL_ERROR	0x10003

/* Direction of an emulated MMIO access */
#define EPT_READ	0
#define EPT_WRITE	1

/* Direction of an emulated port I/O access */
#define PORT_READ	0
#define PORT_WRITE	1

/*
 * Decoders for the exit qualification of an I/O instruction (see "Exit
 * Qualification for I/O Instructions" in the VMX documentation):
 *
 *   bits  2:0  - access size in bytes, minus one
 *   bit   3    - set for IN, clear for OUT
 *   bit   4    - set for string instructions
 *   bits 16+   - port number
 */
#define VE_IS_IO_IN(e)		((e) & BIT(3))
#define VE_GET_IO_SIZE(e)	(((e) & GENMASK(2, 0)) + 1)
#define VE_GET_PORT_NUM(e)	((e) >> 16)
#define VE_IS_IO_STRING(e)	((e) & BIT(4))

/* TD attribute bits tested by this file */
#define ATTR_DEBUG		BIT(0)
#define ATTR_SEPT_VE_DISABLE	BIT(28)

/* TDX module call error codes: the code is the upper 32 bits of the status */
#define TDCALL_RETURN_CODE(a)	((a) >> 32)
#define TDCALL_INVALID_OPERAND	0xc0000100

/* TDREPORT subtype requested by tdx_mcall_get_report0() */
#define TDREPORT_SUBTYPE_0	0
53 
54 /*
55  * Wrapper for standard use of __tdx_hypercall with no output aside from
56  * return code.
57  */
58 static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15)
59 {
60 	struct tdx_hypercall_args args = {
61 		.r10 = TDX_HYPERCALL_STANDARD,
62 		.r11 = fn,
63 		.r12 = r12,
64 		.r13 = r13,
65 		.r14 = r14,
66 		.r15 = r15,
67 	};
68 
69 	return __tdx_hypercall(&args);
70 }
71 
72 /* Called from __tdx_hypercall() for unrecoverable failure */
73 noinstr void __tdx_hypercall_failed(void)
74 {
75 	instrumentation_begin();
76 	panic("TDVMCALL failed. TDX module bug?");
77 }
78 
79 /*
80  * The TDG.VP.VMCALL-Instruction-execution sub-functions are defined
81  * independently from but are currently matched 1:1 with VMX EXIT_REASONs.
82  * Reusing the KVM EXIT_REASON macros makes it easier to connect the host and
83  * guest sides of these calls.
84  */
85 static __always_inline u64 hcall_func(u64 exit_reason)
86 {
87 	return exit_reason;
88 }
89 
#ifdef CONFIG_KVM_GUEST
/*
 * Issue a KVM hypercall from a TDX guest.
 *
 * The hypercall number @nr goes in r10 and the parameters @p1-@p4 in
 * r11-r14. Returns whatever __tdx_hypercall() returns.
 */
long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2,
		       unsigned long p3, unsigned long p4)
{
	struct tdx_hypercall_args hcall = { .r10 = nr };

	hcall.r11 = p1;
	hcall.r12 = p2;
	hcall.r13 = p3;
	hcall.r14 = p4;

	return __tdx_hypercall(&hcall);
}
EXPORT_SYMBOL_GPL(tdx_kvm_hypercall);
#endif
106 
107 /*
108  * Used for TDX guests to make calls directly to the TD module.  This
109  * should only be used for calls that have no legitimate reason to fail
110  * or where the kernel can not survive the call failing.
111  */
112 static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
113 				   struct tdx_module_output *out)
114 {
115 	if (__tdx_module_call(fn, rcx, rdx, r8, r9, out))
116 		panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
117 }
118 
119 /**
120  * tdx_mcall_get_report0() - Wrapper to get TDREPORT0 (a.k.a. TDREPORT
121  *                           subtype 0) using TDG.MR.REPORT TDCALL.
122  * @reportdata: Address of the input buffer which contains user-defined
123  *              REPORTDATA to be included into TDREPORT.
124  * @tdreport: Address of the output buffer to store TDREPORT.
125  *
126  * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module
127  * v1.0 specification for more information on TDG.MR.REPORT TDCALL.
128  * It is used in the TDX guest driver module to get the TDREPORT0.
129  *
130  * Return 0 on success, -EINVAL for invalid operands, or -EIO on
131  * other TDCALL failures.
132  */
133 int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
134 {
135 	u64 ret;
136 
137 	ret = __tdx_module_call(TDX_GET_REPORT, virt_to_phys(tdreport),
138 				virt_to_phys(reportdata), TDREPORT_SUBTYPE_0,
139 				0, NULL);
140 	if (ret) {
141 		if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
142 			return -EINVAL;
143 		return -EIO;
144 	}
145 
146 	return 0;
147 }
148 EXPORT_SYMBOL_GPL(tdx_mcall_get_report0);
149 
150 static void __noreturn tdx_panic(const char *msg)
151 {
152 	struct tdx_hypercall_args args = {
153 		.r10 = TDX_HYPERCALL_STANDARD,
154 		.r11 = TDVMCALL_REPORT_FATAL_ERROR,
155 		.r12 = 0, /* Error code: 0 is Panic */
156 	};
157 	union {
158 		/* Define register order according to the GHCI */
159 		struct { u64 r14, r15, rbx, rdi, rsi, r8, r9, rdx; };
160 
161 		char str[64];
162 	} message;
163 
164 	/* VMM assumes '\0' in byte 65, if the message took all 64 bytes */
165 	strncpy(message.str, msg, 64);
166 
167 	args.r8  = message.r8;
168 	args.r9  = message.r9;
169 	args.r14 = message.r14;
170 	args.r15 = message.r15;
171 	args.rdi = message.rdi;
172 	args.rsi = message.rsi;
173 	args.rbx = message.rbx;
174 	args.rdx = message.rdx;
175 
176 	/*
177 	 * This hypercall should never return and it is not safe
178 	 * to keep the guest running. Call it forever if it
179 	 * happens to return.
180 	 */
181 	while (1)
182 		__tdx_hypercall(&args);
183 }
184 
185 static void tdx_parse_tdinfo(u64 *cc_mask)
186 {
187 	struct tdx_module_output out;
188 	unsigned int gpa_width;
189 	u64 td_attr;
190 
191 	/*
192 	 * TDINFO TDX module call is used to get the TD execution environment
193 	 * information like GPA width, number of available vcpus, debug mode
194 	 * information, etc. More details about the ABI can be found in TDX
195 	 * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL
196 	 * [TDG.VP.INFO].
197 	 */
198 	tdx_module_call(TDX_GET_INFO, 0, 0, 0, 0, &out);
199 
200 	/*
201 	 * The highest bit of a guest physical address is the "sharing" bit.
202 	 * Set it for shared pages and clear it for private pages.
203 	 *
204 	 * The GPA width that comes out of this call is critical. TDX guests
205 	 * can not meaningfully run without it.
206 	 */
207 	gpa_width = out.rcx & GENMASK(5, 0);
208 	*cc_mask = BIT_ULL(gpa_width - 1);
209 
210 	/*
211 	 * The kernel can not handle #VE's when accessing normal kernel
212 	 * memory.  Ensure that no #VE will be delivered for accesses to
213 	 * TD-private memory.  Only VMM-shared memory (MMIO) will #VE.
214 	 */
215 	td_attr = out.rdx;
216 	if (!(td_attr & ATTR_SEPT_VE_DISABLE)) {
217 		const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set.";
218 
219 		/* Relax SEPT_VE_DISABLE check for debug TD. */
220 		if (td_attr & ATTR_DEBUG)
221 			pr_warn("%s\n", msg);
222 		else
223 			tdx_panic(msg);
224 	}
225 }
226 
227 /*
228  * The TDX module spec states that #VE may be injected for a limited set of
229  * reasons:
230  *
231  *  - Emulation of the architectural #VE injection on EPT violation;
232  *
233  *  - As a result of guest TD execution of a disallowed instruction,
234  *    a disallowed MSR access, or CPUID virtualization;
235  *
236  *  - A notification to the guest TD about anomalous behavior;
237  *
238  * The last one is opt-in and is not used by the kernel.
239  *
240  * The Intel Software Developer's Manual describes cases when instruction
241  * length field can be used in section "Information for VM Exits Due to
242  * Instruction Execution".
243  *
244  * For TDX, it ultimately means GET_VEINFO provides reliable instruction length
245  * information if #VE occurred due to instruction execution, but not for EPT
246  * violations.
247  */
248 static int ve_instr_len(struct ve_info *ve)
249 {
250 	switch (ve->exit_reason) {
251 	case EXIT_REASON_HLT:
252 	case EXIT_REASON_MSR_READ:
253 	case EXIT_REASON_MSR_WRITE:
254 	case EXIT_REASON_CPUID:
255 	case EXIT_REASON_IO_INSTRUCTION:
256 		/* It is safe to use ve->instr_len for #VE due instructions */
257 		return ve->instr_len;
258 	case EXIT_REASON_EPT_VIOLATION:
259 		/*
260 		 * For EPT violations, ve->insn_len is not defined. For those,
261 		 * the kernel must decode instructions manually and should not
262 		 * be using this function.
263 		 */
264 		WARN_ONCE(1, "ve->instr_len is not defined for EPT violations");
265 		return 0;
266 	default:
267 		WARN_ONCE(1, "Unexpected #VE-type: %lld\n", ve->exit_reason);
268 		return ve->instr_len;
269 	}
270 }
271 
272 static u64 __cpuidle __halt(const bool irq_disabled)
273 {
274 	struct tdx_hypercall_args args = {
275 		.r10 = TDX_HYPERCALL_STANDARD,
276 		.r11 = hcall_func(EXIT_REASON_HLT),
277 		.r12 = irq_disabled,
278 	};
279 
280 	/*
281 	 * Emulate HLT operation via hypercall. More info about ABI
282 	 * can be found in TDX Guest-Host-Communication Interface
283 	 * (GHCI), section 3.8 TDG.VP.VMCALL<Instruction.HLT>.
284 	 *
285 	 * The VMM uses the "IRQ disabled" param to understand IRQ
286 	 * enabled status (RFLAGS.IF) of the TD guest and to determine
287 	 * whether or not it should schedule the halted vCPU if an
288 	 * IRQ becomes pending. E.g. if IRQs are disabled, the VMM
289 	 * can keep the vCPU in virtual HLT, even if an IRQ is
290 	 * pending, without hanging/breaking the guest.
291 	 */
292 	return __tdx_hypercall(&args);
293 }
294 
295 static int handle_halt(struct ve_info *ve)
296 {
297 	const bool irq_disabled = irqs_disabled();
298 
299 	if (__halt(irq_disabled))
300 		return -EIO;
301 
302 	return ve_instr_len(ve);
303 }
304 
305 void __cpuidle tdx_safe_halt(void)
306 {
307 	const bool irq_disabled = false;
308 
309 	/*
310 	 * Use WARN_ONCE() to report the failure.
311 	 */
312 	if (__halt(irq_disabled))
313 		WARN_ONCE(1, "HLT instruction emulation failed\n");
314 }
315 
316 static int read_msr(struct pt_regs *regs, struct ve_info *ve)
317 {
318 	struct tdx_hypercall_args args = {
319 		.r10 = TDX_HYPERCALL_STANDARD,
320 		.r11 = hcall_func(EXIT_REASON_MSR_READ),
321 		.r12 = regs->cx,
322 	};
323 
324 	/*
325 	 * Emulate the MSR read via hypercall. More info about ABI
326 	 * can be found in TDX Guest-Host-Communication Interface
327 	 * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
328 	 */
329 	if (__tdx_hypercall_ret(&args))
330 		return -EIO;
331 
332 	regs->ax = lower_32_bits(args.r11);
333 	regs->dx = upper_32_bits(args.r11);
334 	return ve_instr_len(ve);
335 }
336 
337 static int write_msr(struct pt_regs *regs, struct ve_info *ve)
338 {
339 	struct tdx_hypercall_args args = {
340 		.r10 = TDX_HYPERCALL_STANDARD,
341 		.r11 = hcall_func(EXIT_REASON_MSR_WRITE),
342 		.r12 = regs->cx,
343 		.r13 = (u64)regs->dx << 32 | regs->ax,
344 	};
345 
346 	/*
347 	 * Emulate the MSR write via hypercall. More info about ABI
348 	 * can be found in TDX Guest-Host-Communication Interface
349 	 * (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
350 	 */
351 	if (__tdx_hypercall(&args))
352 		return -EIO;
353 
354 	return ve_instr_len(ve);
355 }
356 
357 static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
358 {
359 	struct tdx_hypercall_args args = {
360 		.r10 = TDX_HYPERCALL_STANDARD,
361 		.r11 = hcall_func(EXIT_REASON_CPUID),
362 		.r12 = regs->ax,
363 		.r13 = regs->cx,
364 	};
365 
366 	/*
367 	 * Only allow VMM to control range reserved for hypervisor
368 	 * communication.
369 	 *
370 	 * Return all-zeros for any CPUID outside the range. It matches CPU
371 	 * behaviour for non-supported leaf.
372 	 */
373 	if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) {
374 		regs->ax = regs->bx = regs->cx = regs->dx = 0;
375 		return ve_instr_len(ve);
376 	}
377 
378 	/*
379 	 * Emulate the CPUID instruction via a hypercall. More info about
380 	 * ABI can be found in TDX Guest-Host-Communication Interface
381 	 * (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
382 	 */
383 	if (__tdx_hypercall_ret(&args))
384 		return -EIO;
385 
386 	/*
387 	 * As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of
388 	 * EAX, EBX, ECX, EDX registers after the CPUID instruction execution.
389 	 * So copy the register contents back to pt_regs.
390 	 */
391 	regs->ax = args.r12;
392 	regs->bx = args.r13;
393 	regs->cx = args.r14;
394 	regs->dx = args.r15;
395 
396 	return ve_instr_len(ve);
397 }
398 
399 static bool mmio_read(int size, unsigned long addr, unsigned long *val)
400 {
401 	struct tdx_hypercall_args args = {
402 		.r10 = TDX_HYPERCALL_STANDARD,
403 		.r11 = hcall_func(EXIT_REASON_EPT_VIOLATION),
404 		.r12 = size,
405 		.r13 = EPT_READ,
406 		.r14 = addr,
407 		.r15 = *val,
408 	};
409 
410 	if (__tdx_hypercall_ret(&args))
411 		return false;
412 	*val = args.r11;
413 	return true;
414 }
415 
416 static bool mmio_write(int size, unsigned long addr, unsigned long val)
417 {
418 	return !_tdx_hypercall(hcall_func(EXIT_REASON_EPT_VIOLATION), size,
419 			       EPT_WRITE, addr, val);
420 }
421 
422 static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
423 {
424 	unsigned long *reg, val, vaddr;
425 	char buffer[MAX_INSN_SIZE];
426 	enum insn_mmio_type mmio;
427 	struct insn insn = {};
428 	int size, extend_size;
429 	u8 extend_val = 0;
430 
431 	/* Only in-kernel MMIO is supported */
432 	if (WARN_ON_ONCE(user_mode(regs)))
433 		return -EFAULT;
434 
435 	if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE))
436 		return -EFAULT;
437 
438 	if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64))
439 		return -EINVAL;
440 
441 	mmio = insn_decode_mmio(&insn, &size);
442 	if (WARN_ON_ONCE(mmio == INSN_MMIO_DECODE_FAILED))
443 		return -EINVAL;
444 
445 	if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
446 		reg = insn_get_modrm_reg_ptr(&insn, regs);
447 		if (!reg)
448 			return -EINVAL;
449 	}
450 
451 	/*
452 	 * Reject EPT violation #VEs that split pages.
453 	 *
454 	 * MMIO accesses are supposed to be naturally aligned and therefore
455 	 * never cross page boundaries. Seeing split page accesses indicates
456 	 * a bug or a load_unaligned_zeropad() that stepped into an MMIO page.
457 	 *
458 	 * load_unaligned_zeropad() will recover using exception fixups.
459 	 */
460 	vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
461 	if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE)
462 		return -EFAULT;
463 
464 	/* Handle writes first */
465 	switch (mmio) {
466 	case INSN_MMIO_WRITE:
467 		memcpy(&val, reg, size);
468 		if (!mmio_write(size, ve->gpa, val))
469 			return -EIO;
470 		return insn.length;
471 	case INSN_MMIO_WRITE_IMM:
472 		val = insn.immediate.value;
473 		if (!mmio_write(size, ve->gpa, val))
474 			return -EIO;
475 		return insn.length;
476 	case INSN_MMIO_READ:
477 	case INSN_MMIO_READ_ZERO_EXTEND:
478 	case INSN_MMIO_READ_SIGN_EXTEND:
479 		/* Reads are handled below */
480 		break;
481 	case INSN_MMIO_MOVS:
482 	case INSN_MMIO_DECODE_FAILED:
483 		/*
484 		 * MMIO was accessed with an instruction that could not be
485 		 * decoded or handled properly. It was likely not using io.h
486 		 * helpers or accessed MMIO accidentally.
487 		 */
488 		return -EINVAL;
489 	default:
490 		WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?");
491 		return -EINVAL;
492 	}
493 
494 	/* Handle reads */
495 	if (!mmio_read(size, ve->gpa, &val))
496 		return -EIO;
497 
498 	switch (mmio) {
499 	case INSN_MMIO_READ:
500 		/* Zero-extend for 32-bit operation */
501 		extend_size = size == 4 ? sizeof(*reg) : 0;
502 		break;
503 	case INSN_MMIO_READ_ZERO_EXTEND:
504 		/* Zero extend based on operand size */
505 		extend_size = insn.opnd_bytes;
506 		break;
507 	case INSN_MMIO_READ_SIGN_EXTEND:
508 		/* Sign extend based on operand size */
509 		extend_size = insn.opnd_bytes;
510 		if (size == 1 && val & BIT(7))
511 			extend_val = 0xFF;
512 		else if (size > 1 && val & BIT(15))
513 			extend_val = 0xFF;
514 		break;
515 	default:
516 		/* All other cases has to be covered with the first switch() */
517 		WARN_ON_ONCE(1);
518 		return -EINVAL;
519 	}
520 
521 	if (extend_size)
522 		memset(reg, extend_val, extend_size);
523 	memcpy(reg, &val, size);
524 	return insn.length;
525 }
526 
527 static bool handle_in(struct pt_regs *regs, int size, int port)
528 {
529 	struct tdx_hypercall_args args = {
530 		.r10 = TDX_HYPERCALL_STANDARD,
531 		.r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION),
532 		.r12 = size,
533 		.r13 = PORT_READ,
534 		.r14 = port,
535 	};
536 	u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
537 	bool success;
538 
539 	/*
540 	 * Emulate the I/O read via hypercall. More info about ABI can be found
541 	 * in TDX Guest-Host-Communication Interface (GHCI) section titled
542 	 * "TDG.VP.VMCALL<Instruction.IO>".
543 	 */
544 	success = !__tdx_hypercall_ret(&args);
545 
546 	/* Update part of the register affected by the emulated instruction */
547 	regs->ax &= ~mask;
548 	if (success)
549 		regs->ax |= args.r11 & mask;
550 
551 	return success;
552 }
553 
554 static bool handle_out(struct pt_regs *regs, int size, int port)
555 {
556 	u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
557 
558 	/*
559 	 * Emulate the I/O write via hypercall. More info about ABI can be found
560 	 * in TDX Guest-Host-Communication Interface (GHCI) section titled
561 	 * "TDG.VP.VMCALL<Instruction.IO>".
562 	 */
563 	return !_tdx_hypercall(hcall_func(EXIT_REASON_IO_INSTRUCTION), size,
564 			       PORT_WRITE, port, regs->ax & mask);
565 }
566 
567 /*
568  * Emulate I/O using hypercall.
569  *
570  * Assumes the IO instruction was using ax, which is enforced
571  * by the standard io.h macros.
572  *
573  * Return True on success or False on failure.
574  */
575 static int handle_io(struct pt_regs *regs, struct ve_info *ve)
576 {
577 	u32 exit_qual = ve->exit_qual;
578 	int size, port;
579 	bool in, ret;
580 
581 	if (VE_IS_IO_STRING(exit_qual))
582 		return -EIO;
583 
584 	in   = VE_IS_IO_IN(exit_qual);
585 	size = VE_GET_IO_SIZE(exit_qual);
586 	port = VE_GET_PORT_NUM(exit_qual);
587 
588 
589 	if (in)
590 		ret = handle_in(regs, size, port);
591 	else
592 		ret = handle_out(regs, size, port);
593 	if (!ret)
594 		return -EIO;
595 
596 	return ve_instr_len(ve);
597 }
598 
599 /*
600  * Early #VE exception handler. Only handles a subset of port I/O.
601  * Intended only for earlyprintk. If failed, return false.
602  */
603 __init bool tdx_early_handle_ve(struct pt_regs *regs)
604 {
605 	struct ve_info ve;
606 	int insn_len;
607 
608 	tdx_get_ve_info(&ve);
609 
610 	if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION)
611 		return false;
612 
613 	insn_len = handle_io(regs, &ve);
614 	if (insn_len < 0)
615 		return false;
616 
617 	regs->ip += insn_len;
618 	return true;
619 }
620 
621 void tdx_get_ve_info(struct ve_info *ve)
622 {
623 	struct tdx_module_output out;
624 
625 	/*
626 	 * Called during #VE handling to retrieve the #VE info from the
627 	 * TDX module.
628 	 *
629 	 * This has to be called early in #VE handling.  A "nested" #VE which
630 	 * occurs before this will raise a #DF and is not recoverable.
631 	 *
632 	 * The call retrieves the #VE info from the TDX module, which also
633 	 * clears the "#VE valid" flag. This must be done before anything else
634 	 * because any #VE that occurs while the valid flag is set will lead to
635 	 * #DF.
636 	 *
637 	 * Note, the TDX module treats virtual NMIs as inhibited if the #VE
638 	 * valid flag is set. It means that NMI=>#VE will not result in a #DF.
639 	 */
640 	tdx_module_call(TDX_GET_VEINFO, 0, 0, 0, 0, &out);
641 
642 	/* Transfer the output parameters */
643 	ve->exit_reason = out.rcx;
644 	ve->exit_qual   = out.rdx;
645 	ve->gla         = out.r8;
646 	ve->gpa         = out.r9;
647 	ve->instr_len   = lower_32_bits(out.r10);
648 	ve->instr_info  = upper_32_bits(out.r10);
649 }
650 
651 /*
652  * Handle the user initiated #VE.
653  *
654  * On success, returns the number of bytes RIP should be incremented (>=0)
655  * or -errno on error.
656  */
657 static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
658 {
659 	switch (ve->exit_reason) {
660 	case EXIT_REASON_CPUID:
661 		return handle_cpuid(regs, ve);
662 	default:
663 		pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
664 		return -EIO;
665 	}
666 }
667 
668 static inline bool is_private_gpa(u64 gpa)
669 {
670 	return gpa == cc_mkenc(gpa);
671 }
672 
673 /*
674  * Handle the kernel #VE.
675  *
676  * On success, returns the number of bytes RIP should be incremented (>=0)
677  * or -errno on error.
678  */
679 static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
680 {
681 	switch (ve->exit_reason) {
682 	case EXIT_REASON_HLT:
683 		return handle_halt(ve);
684 	case EXIT_REASON_MSR_READ:
685 		return read_msr(regs, ve);
686 	case EXIT_REASON_MSR_WRITE:
687 		return write_msr(regs, ve);
688 	case EXIT_REASON_CPUID:
689 		return handle_cpuid(regs, ve);
690 	case EXIT_REASON_EPT_VIOLATION:
691 		if (is_private_gpa(ve->gpa))
692 			panic("Unexpected EPT-violation on private memory.");
693 		return handle_mmio(regs, ve);
694 	case EXIT_REASON_IO_INSTRUCTION:
695 		return handle_io(regs, ve);
696 	default:
697 		pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
698 		return -EIO;
699 	}
700 }
701 
702 bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
703 {
704 	int insn_len;
705 
706 	if (user_mode(regs))
707 		insn_len = virt_exception_user(regs, ve);
708 	else
709 		insn_len = virt_exception_kernel(regs, ve);
710 	if (insn_len < 0)
711 		return false;
712 
713 	/* After successful #VE handling, move the IP */
714 	regs->ip += insn_len;
715 
716 	return true;
717 }
718 
/*
 * Tell the caller whether the guest must flush the TLB for a conversion to
 * private (@private == true) or to shared (@private == false).
 *
 * TDX guest is responsible for flushing TLB on private->shared
 * transition. VMM is responsible for flushing on shared->private.
 *
 * The VMM _can't_ flush private addresses as it can't generate PAs
 * with the guest's HKID.  Shared memory isn't subject to integrity
 * checking, i.e. the VMM doesn't need to flush for its own protection.
 *
 * There's no need to flush when converting from shared to private,
 * as flushing is the VMM's responsibility in this case, e.g. it must
 * flush to avoid integrity failures in the face of a buggy or
 * malicious guest.
 */
static bool tdx_tlb_flush_required(bool private)
{
	if (private)
		return false;

	return true;
}
736 
/*
 * Tell the caller whether a cache flush is needed when the encryption
 * status of memory changes.
 *
 * AMD SME/SEV can avoid cache flushing if HW enforces cache coherence.
 * TDX doesn't have such capability, so the answer is always "yes".
 */
static bool tdx_cache_flush_required(void)
{
	const bool flush_needed = true;

	return flush_needed;
}
747 
748 static bool try_accept_one(phys_addr_t *start, unsigned long len,
749 			  enum pg_level pg_level)
750 {
751 	unsigned long accept_size = page_level_size(pg_level);
752 	u64 tdcall_rcx;
753 	u8 page_size;
754 
755 	if (!IS_ALIGNED(*start, accept_size))
756 		return false;
757 
758 	if (len < accept_size)
759 		return false;
760 
761 	/*
762 	 * Pass the page physical address to the TDX module to accept the
763 	 * pending, private page.
764 	 *
765 	 * Bits 2:0 of RCX encode page size: 0 - 4K, 1 - 2M, 2 - 1G.
766 	 */
767 	switch (pg_level) {
768 	case PG_LEVEL_4K:
769 		page_size = 0;
770 		break;
771 	case PG_LEVEL_2M:
772 		page_size = 1;
773 		break;
774 	case PG_LEVEL_1G:
775 		page_size = 2;
776 		break;
777 	default:
778 		return false;
779 	}
780 
781 	tdcall_rcx = *start | page_size;
782 	if (__tdx_module_call(TDX_ACCEPT_PAGE, tdcall_rcx, 0, 0, 0, NULL))
783 		return false;
784 
785 	*start += accept_size;
786 	return true;
787 }
788 
789 /*
790  * Inform the VMM of the guest's intent for this physical page: shared with
791  * the VMM or private to the guest.  The VMM is expected to change its mapping
792  * of the page in response.
793  */
794 static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
795 {
796 	phys_addr_t start = __pa(vaddr);
797 	phys_addr_t end   = __pa(vaddr + numpages * PAGE_SIZE);
798 
799 	if (!enc) {
800 		/* Set the shared (decrypted) bits: */
801 		start |= cc_mkdec(0);
802 		end   |= cc_mkdec(0);
803 	}
804 
805 	/*
806 	 * Notify the VMM about page mapping conversion. More info about ABI
807 	 * can be found in TDX Guest-Host-Communication Interface (GHCI),
808 	 * section "TDG.VP.VMCALL<MapGPA>"
809 	 */
810 	if (_tdx_hypercall(TDVMCALL_MAP_GPA, start, end - start, 0, 0))
811 		return false;
812 
813 	/* private->shared conversion  requires only MapGPA call */
814 	if (!enc)
815 		return true;
816 
817 	/*
818 	 * For shared->private conversion, accept the page using
819 	 * TDX_ACCEPT_PAGE TDX module call.
820 	 */
821 	while (start < end) {
822 		unsigned long len = end - start;
823 
824 		/*
825 		 * Try larger accepts first. It gives chance to VMM to keep
826 		 * 1G/2M SEPT entries where possible and speeds up process by
827 		 * cutting number of hypercalls (if successful).
828 		 */
829 
830 		if (try_accept_one(&start, len, PG_LEVEL_1G))
831 			continue;
832 
833 		if (try_accept_one(&start, len, PG_LEVEL_2M))
834 			continue;
835 
836 		if (!try_accept_one(&start, len, PG_LEVEL_4K))
837 			return false;
838 	}
839 
840 	return true;
841 }
842 
843 void __init tdx_early_init(void)
844 {
845 	u64 cc_mask;
846 	u32 eax, sig[3];
847 
848 	cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2],  &sig[1]);
849 
850 	if (memcmp(TDX_IDENT, sig, sizeof(sig)))
851 		return;
852 
853 	setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);
854 
855 	cc_set_vendor(CC_VENDOR_INTEL);
856 	tdx_parse_tdinfo(&cc_mask);
857 	cc_set_mask(cc_mask);
858 
859 	/* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
860 	tdx_module_call(TDX_WR, 0, TDCS_NOTIFY_ENABLES, 0, -1ULL, NULL);
861 
862 	/*
863 	 * All bits above GPA width are reserved and kernel treats shared bit
864 	 * as flag, not as part of physical address.
865 	 *
866 	 * Adjust physical mask to only cover valid GPA bits.
867 	 */
868 	physical_mask &= cc_mask - 1;
869 
870 	x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required;
871 	x86_platform.guest.enc_tlb_flush_required   = tdx_tlb_flush_required;
872 	x86_platform.guest.enc_status_change_finish = tdx_enc_status_changed;
873 
874 	pr_info("Guest detected\n");
875 }
876