1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * AMD Memory Encryption Support
4 *
5 * Copyright (C) 2019 SUSE
6 *
7 * Author: Joerg Roedel <jroedel@suse.de>
8 */
9
10 #define pr_fmt(fmt) "SEV: " fmt
11
12 #include <linux/sched/debug.h> /* For show_regs() */
13 #include <linux/percpu-defs.h>
14 #include <linux/cc_platform.h>
15 #include <linux/printk.h>
16 #include <linux/mm_types.h>
17 #include <linux/set_memory.h>
18 #include <linux/memblock.h>
19 #include <linux/kernel.h>
20 #include <linux/mm.h>
21 #include <linux/cpumask.h>
22 #include <linux/efi.h>
23 #include <linux/platform_device.h>
24 #include <linux/io.h>
25 #include <linux/psp-sev.h>
26 #include <linux/dmi.h>
27 #include <uapi/linux/sev-guest.h>
28 #include <crypto/gcm.h>
29
30 #include <asm/init.h>
31 #include <asm/cpu_entry_area.h>
32 #include <asm/stacktrace.h>
33 #include <asm/sev.h>
34 #include <asm/insn-eval.h>
35 #include <asm/fpu/xcr.h>
36 #include <asm/processor.h>
37 #include <asm/realmode.h>
38 #include <asm/setup.h>
39 #include <asm/traps.h>
40 #include <asm/svm.h>
41 #include <asm/smp.h>
42 #include <asm/cpu.h>
43 #include <asm/apic.h>
44 #include <asm/cpuid.h>
45 #include <asm/cmdline.h>
46
47 #define DR7_RESET_VALUE 0x400
48
49 /* AP INIT values as documented in the APM2 section "Processor Initialization State" */
50 #define AP_INIT_CS_LIMIT 0xffff
51 #define AP_INIT_DS_LIMIT 0xffff
52 #define AP_INIT_LDTR_LIMIT 0xffff
53 #define AP_INIT_GDTR_LIMIT 0xffff
54 #define AP_INIT_IDTR_LIMIT 0xffff
55 #define AP_INIT_TR_LIMIT 0xffff
56 #define AP_INIT_RFLAGS_DEFAULT 0x2
57 #define AP_INIT_DR6_DEFAULT 0xffff0ff0
58 #define AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL
59 #define AP_INIT_XCR0_DEFAULT 0x1
60 #define AP_INIT_X87_FTW_DEFAULT 0x5555
61 #define AP_INIT_X87_FCW_DEFAULT 0x0040
62 #define AP_INIT_CR0_DEFAULT 0x60000010
63 #define AP_INIT_MXCSR_DEFAULT 0x1f80
64
65 static const char * const sev_status_feat_names[] = {
66 [MSR_AMD64_SEV_ENABLED_BIT] = "SEV",
67 [MSR_AMD64_SEV_ES_ENABLED_BIT] = "SEV-ES",
68 [MSR_AMD64_SEV_SNP_ENABLED_BIT] = "SEV-SNP",
69 [MSR_AMD64_SNP_VTOM_BIT] = "vTom",
70 [MSR_AMD64_SNP_REFLECT_VC_BIT] = "ReflectVC",
71 [MSR_AMD64_SNP_RESTRICTED_INJ_BIT] = "RI",
72 [MSR_AMD64_SNP_ALT_INJ_BIT] = "AI",
73 [MSR_AMD64_SNP_DEBUG_SWAP_BIT] = "DebugSwap",
74 [MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT] = "NoHostIBS",
75 [MSR_AMD64_SNP_BTB_ISOLATION_BIT] = "BTBIsol",
76 [MSR_AMD64_SNP_VMPL_SSS_BIT] = "VmplSSS",
77 [MSR_AMD64_SNP_SECURE_TSC_BIT] = "SecureTSC",
78 [MSR_AMD64_SNP_VMGEXIT_PARAM_BIT] = "VMGExitParam",
79 [MSR_AMD64_SNP_IBS_VIRT_BIT] = "IBSVirt",
80 [MSR_AMD64_SNP_VMSA_REG_PROT_BIT] = "VMSARegProt",
81 [MSR_AMD64_SNP_SMT_PROT_BIT] = "SMTProt",
82 };
83
84 /* For early boot hypervisor communication in SEV-ES enabled guests */
85 static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
86
87 /*
88 * Needs to be in the .data section because we need it NULL before bss is
89 * cleared
90 */
91 static struct ghcb *boot_ghcb __section(".data");
92
93 /* Bitmap of SEV features supported by the hypervisor */
94 static u64 sev_hv_features __ro_after_init;
95
96 /* Secrets page physical address from the CC blob */
97 static u64 secrets_pa __ro_after_init;
98
99 /*
100 * For Secure TSC guests, the BSP fetches TSC_INFO using SNP guest messaging and
101 * initializes snp_tsc_scale and snp_tsc_offset. These values are replicated
102 * across each AP's VMSA fields (TSC_SCALE and TSC_OFFSET).
103 */
104 static u64 snp_tsc_scale __ro_after_init;
105 static u64 snp_tsc_offset __ro_after_init;
106 static u64 snp_tsc_freq_khz __ro_after_init;
107
108 /* #VC handler runtime per-CPU data */
109 struct sev_es_runtime_data {
110 struct ghcb ghcb_page;
111
112 /*
113 * Reserve one page per CPU as backup storage for the unencrypted GHCB.
114 * It is needed when an NMI happens while the #VC handler uses the real
115 * GHCB, and the NMI handler itself causes another #VC exception. In
116 * that case the GHCB content of the first handler needs to be backed up
117 * and restored.
118 */
119 struct ghcb backup_ghcb;
120
121 /*
122 * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions.
123 * There is no need for it to be atomic, because nothing is written to
124 * the GHCB between the read and the write of ghcb_active. So it is safe
125 * to use it when a nested #VC exception happens before the write.
126 *
127 * This is necessary for example in the #VC->NMI->#VC case when the NMI
128 * happens while the first #VC handler uses the GHCB. When the NMI code
129 * raises a second #VC handler it might overwrite the contents of the
130 * GHCB written by the first handler. To avoid this the content of the
131 * GHCB is saved and restored when the GHCB is detected to be in use
132 * already.
133 */
134 bool ghcb_active;
135 bool backup_ghcb_active;
136
137 /*
138 * Cached DR7 value - write it on DR7 writes and return it on reads.
139 * That value will never make it to the real hardware DR7 as debugging
140 * is currently unsupported in SEV-ES guests.
141 */
142 unsigned long dr7;
143 };
144
145 struct ghcb_state {
146 struct ghcb *ghcb;
147 };
148
149 /* For early boot SVSM communication */
150 static struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE);
151
152 static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
153 static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa);
154 static DEFINE_PER_CPU(struct svsm_ca *, svsm_caa);
155 static DEFINE_PER_CPU(u64, svsm_caa_pa);
156
157 static __always_inline bool on_vc_stack(struct pt_regs *regs)
158 {
159 unsigned long sp = regs->sp;
160
161 /* User-mode RSP is not trusted */
162 if (user_mode(regs))
163 return false;
164
165 /* SYSCALL gap still has user-mode RSP */
166 if (ip_within_syscall_gap(regs))
167 return false;
168
169 return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
170 }
171
172 /*
173 * This function handles the case when an NMI is raised in the #VC
174 * exception handler entry code, before the #VC handler has switched off
175 * its IST stack. In this case, the IST entry for #VC must be adjusted,
176 * so that any nested #VC exception will not overwrite the stack
177 * contents of the interrupted #VC handler.
178 *
179 * The IST entry is adjusted unconditionally so that it can also be
180 * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a
181 * nested sev_es_ist_exit() call may adjust back the IST entry too
182 * early.
183 *
184 * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run
185 * on the NMI IST stack, as they are only called from NMI handling code
186 * right now.
187 */
188 void noinstr __sev_es_ist_enter(struct pt_regs *regs)
189 {
190 unsigned long old_ist, new_ist;
191
192 /* Read old IST entry */
193 new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
194
195 /*
196 * If NMI happened while on the #VC IST stack, set the new IST
197 * value below regs->sp, so that the interrupted stack frame is
198 * not overwritten by subsequent #VC exceptions.
199 */
200 if (on_vc_stack(regs))
201 new_ist = regs->sp;
202
203 /*
204 * Reserve additional 8 bytes and store old IST value so this
205 * adjustment can be unrolled in __sev_es_ist_exit().
206 */
207 new_ist -= sizeof(old_ist);
208 *(unsigned long *)new_ist = old_ist;
209
210 /* Set new IST entry */
211 this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
212 }
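/*
* Illustration of the adjustment above (addresses are examples only): if
* the #VC IST entry was 0xfffffe0000010000 and an NMI interrupted the #VC
* handler at regs->sp == 0xfffffe000000fe40, the new IST entry becomes
* 0xfffffe000000fe38 and the old value 0xfffffe0000010000 is stored at
* that address, ready to be restored by __sev_es_ist_exit().
*/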
213
214 void noinstr __sev_es_ist_exit(void)
215 {
216 unsigned long ist;
217
218 /* Read IST entry */
219 ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
220
221 if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
222 return;
223
224 /* Read back old IST entry and write it to the TSS */
225 this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist);
226 }
227
228 /*
229 * Nothing shall interrupt this code path while holding the per-CPU
230 * GHCB. The backup GHCB is only for NMIs interrupting this path.
231 *
232 * Callers must disable local interrupts around it.
233 */
234 static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
235 {
236 struct sev_es_runtime_data *data;
237 struct ghcb *ghcb;
238
239 WARN_ON(!irqs_disabled());
240
241 data = this_cpu_read(runtime_data);
242 ghcb = &data->ghcb_page;
243
244 if (unlikely(data->ghcb_active)) {
245 /* GHCB is already in use - save its contents */
246
247 if (unlikely(data->backup_ghcb_active)) {
248 /*
249 * Backup-GHCB is also already in use. There is no way
250 * to continue here so just kill the machine. To make
251 * panic() work, mark GHCBs inactive so that messages
252 * can be printed out.
253 */
254 data->ghcb_active = false;
255 data->backup_ghcb_active = false;
256
257 instrumentation_begin();
258 panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
259 instrumentation_end();
260 }
261
262 /* Mark backup_ghcb active before writing to it */
263 data->backup_ghcb_active = true;
264
265 state->ghcb = &data->backup_ghcb;
266
267 /* Backup GHCB content */
268 *state->ghcb = *ghcb;
269 } else {
270 state->ghcb = NULL;
271 data->ghcb_active = true;
272 }
273
274 return ghcb;
275 }
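/*
* Typical usage pattern around this helper (a sketch mirroring the
* callers further down in this file):
*
*	struct ghcb_state state;
*	unsigned long flags;
*	struct ghcb *ghcb;
*
*	local_irq_save(flags);
*	ghcb = __sev_get_ghcb(&state);
*	... fill the GHCB, VMGEXIT(), read results ...
*	__sev_put_ghcb(&state);
*	local_irq_restore(flags);
*/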
276
277 static inline u64 sev_es_rd_ghcb_msr(void)
278 {
279 return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
280 }
281
282 static __always_inline void sev_es_wr_ghcb_msr(u64 val)
283 {
284 u32 low, high;
285
286 low = (u32)(val);
287 high = (u32)(val >> 32);
288
289 native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high);
290 }
291
292 static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
293 unsigned char *buffer)
294 {
295 return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
296 }
297
298 static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt)
299 {
300 char buffer[MAX_INSN_SIZE];
301 int insn_bytes;
302
303 insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer);
304 if (insn_bytes == 0) {
305 /* Nothing could be copied */
306 ctxt->fi.vector = X86_TRAP_PF;
307 ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER;
308 ctxt->fi.cr2 = ctxt->regs->ip;
309 return ES_EXCEPTION;
310 } else if (insn_bytes == -EINVAL) {
311 /* Effective RIP could not be calculated */
312 ctxt->fi.vector = X86_TRAP_GP;
313 ctxt->fi.error_code = 0;
314 ctxt->fi.cr2 = 0;
315 return ES_EXCEPTION;
316 }
317
318 if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes))
319 return ES_DECODE_FAILED;
320
321 if (ctxt->insn.immediate.got)
322 return ES_OK;
323 else
324 return ES_DECODE_FAILED;
325 }
326
327 static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt)
328 {
329 char buffer[MAX_INSN_SIZE];
330 int res, ret;
331
332 res = vc_fetch_insn_kernel(ctxt, buffer);
333 if (res) {
334 ctxt->fi.vector = X86_TRAP_PF;
335 ctxt->fi.error_code = X86_PF_INSTR;
336 ctxt->fi.cr2 = ctxt->regs->ip;
337 return ES_EXCEPTION;
338 }
339
340 ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64);
341 if (ret < 0)
342 return ES_DECODE_FAILED;
343 else
344 return ES_OK;
345 }
346
347 static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
348 {
349 if (user_mode(ctxt->regs))
350 return __vc_decode_user_insn(ctxt);
351 else
352 return __vc_decode_kern_insn(ctxt);
353 }
354
355 static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
356 char *dst, char *buf, size_t size)
357 {
358 unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
359
360 /*
361 * This function uses __put_user() independent of whether kernel or user
362 * memory is accessed. This works fine because __put_user() does no
363 * sanity checks of the pointer being accessed. All that it does is
364 * to report when the access failed.
365 *
366 * Also, this function runs in atomic context, so __put_user() is not
367 * allowed to sleep. The page-fault handler detects that it is running
368 * in atomic context and will not try to take mmap_sem and handle the
369 * fault, so additional pagefault_enable()/disable() calls are not
370 * needed.
371 *
372 * The access can't be done via copy_to_user() here because
373 * vc_write_mem() must not use string instructions to access unsafe
374 * memory. The reason is that MOVS is emulated by the #VC handler by
375 * splitting the move up into a read and a write and taking a nested #VC
376 * exception on whichever of them is the MMIO access. Using string
377 * instructions here would cause infinite nesting.
378 */
379 switch (size) {
380 case 1: {
381 u8 d1;
382 u8 __user *target = (u8 __user *)dst;
383
384 memcpy(&d1, buf, 1);
385 if (__put_user(d1, target))
386 goto fault;
387 break;
388 }
389 case 2: {
390 u16 d2;
391 u16 __user *target = (u16 __user *)dst;
392
393 memcpy(&d2, buf, 2);
394 if (__put_user(d2, target))
395 goto fault;
396 break;
397 }
398 case 4: {
399 u32 d4;
400 u32 __user *target = (u32 __user *)dst;
401
402 memcpy(&d4, buf, 4);
403 if (__put_user(d4, target))
404 goto fault;
405 break;
406 }
407 case 8: {
408 u64 d8;
409 u64 __user *target = (u64 __user *)dst;
410
411 memcpy(&d8, buf, 8);
412 if (__put_user(d8, target))
413 goto fault;
414 break;
415 }
416 default:
417 WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
418 return ES_UNSUPPORTED;
419 }
420
421 return ES_OK;
422
423 fault:
424 if (user_mode(ctxt->regs))
425 error_code |= X86_PF_USER;
426
427 ctxt->fi.vector = X86_TRAP_PF;
428 ctxt->fi.error_code = error_code;
429 ctxt->fi.cr2 = (unsigned long)dst;
430
431 return ES_EXCEPTION;
432 }
433
434 static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
435 char *src, char *buf, size_t size)
436 {
437 unsigned long error_code = X86_PF_PROT;
438
439 /*
440 * This function uses __get_user() independent of whether kernel or user
441 * memory is accessed. This works fine because __get_user() does no
442 * sanity checks of the pointer being accessed. All that it does is
443 * to report when the access failed.
444 *
445 * Also, this function runs in atomic context, so __get_user() is not
446 * allowed to sleep. The page-fault handler detects that it is running
447 * in atomic context and will not try to take mmap_sem and handle the
448 * fault, so additional pagefault_enable()/disable() calls are not
449 * needed.
450 *
451 * The access can't be done via copy_from_user() here because
452 * vc_read_mem() must not use string instructions to access unsafe
453 * memory. The reason is that MOVS is emulated by the #VC handler by
454 * splitting the move up into a read and a write and taking a nested #VC
455 * exception on whichever of them is the MMIO access. Using string
456 * instructions here would cause infinite nesting.
457 */
458 switch (size) {
459 case 1: {
460 u8 d1;
461 u8 __user *s = (u8 __user *)src;
462
463 if (__get_user(d1, s))
464 goto fault;
465 memcpy(buf, &d1, 1);
466 break;
467 }
468 case 2: {
469 u16 d2;
470 u16 __user *s = (u16 __user *)src;
471
472 if (__get_user(d2, s))
473 goto fault;
474 memcpy(buf, &d2, 2);
475 break;
476 }
477 case 4: {
478 u32 d4;
479 u32 __user *s = (u32 __user *)src;
480
481 if (__get_user(d4, s))
482 goto fault;
483 memcpy(buf, &d4, 4);
484 break;
485 }
486 case 8: {
487 u64 d8;
488 u64 __user *s = (u64 __user *)src;
489 if (__get_user(d8, s))
490 goto fault;
491 memcpy(buf, &d8, 8);
492 break;
493 }
494 default:
495 WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
496 return ES_UNSUPPORTED;
497 }
498
499 return ES_OK;
500
501 fault:
502 if (user_mode(ctxt->regs))
503 error_code |= X86_PF_USER;
504
505 ctxt->fi.vector = X86_TRAP_PF;
506 ctxt->fi.error_code = error_code;
507 ctxt->fi.cr2 = (unsigned long)src;
508
509 return ES_EXCEPTION;
510 }
511
512 static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
513 unsigned long vaddr, phys_addr_t *paddr)
514 {
515 unsigned long va = (unsigned long)vaddr;
516 unsigned int level;
517 phys_addr_t pa;
518 pgd_t *pgd;
519 pte_t *pte;
520
521 pgd = __va(read_cr3_pa());
522 pgd = &pgd[pgd_index(va)];
523 pte = lookup_address_in_pgd(pgd, va, &level);
524 if (!pte) {
525 ctxt->fi.vector = X86_TRAP_PF;
526 ctxt->fi.cr2 = vaddr;
527 ctxt->fi.error_code = 0;
528
529 if (user_mode(ctxt->regs))
530 ctxt->fi.error_code |= X86_PF_USER;
531
532 return ES_EXCEPTION;
533 }
534
535 if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC))
536 /* Emulated MMIO to/from encrypted memory not supported */
537 return ES_UNSUPPORTED;
538
539 pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
540 pa |= va & ~page_level_mask(level);
541
542 *paddr = pa;
543
544 return ES_OK;
545 }
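/*
* Composition example for the code above (illustrative values): for a 2M
* mapping, ~page_level_mask(level) keeps the low 21 bits of the virtual
* address, so a vaddr with a 2M-page offset of 0x1234 backed by the
* 2M-aligned PFN 0x12200 yields pa = (0x12200 << PAGE_SHIFT) | 0x1234,
* i.e. 0x12201234.
*/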
546
547 static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size)
548 {
549 BUG_ON(size > 4);
550
551 if (user_mode(ctxt->regs)) {
552 struct thread_struct *t = &current->thread;
553 struct io_bitmap *iobm = t->io_bitmap;
554 size_t idx;
555
556 if (!iobm)
557 goto fault;
558
559 for (idx = port; idx < port + size; ++idx) {
560 if (test_bit(idx, iobm->bitmap))
561 goto fault;
562 }
563 }
564
565 return ES_OK;
566
567 fault:
568 ctxt->fi.vector = X86_TRAP_GP;
569 ctxt->fi.error_code = 0;
570
571 return ES_EXCEPTION;
572 }
573
574 static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt)
575 {
576 long error_code = ctxt->fi.error_code;
577 int trapnr = ctxt->fi.vector;
578
579 ctxt->regs->orig_ax = ctxt->fi.error_code;
580
581 switch (trapnr) {
582 case X86_TRAP_GP:
583 exc_general_protection(ctxt->regs, error_code);
584 break;
585 case X86_TRAP_UD:
586 exc_invalid_op(ctxt->regs);
587 break;
588 case X86_TRAP_PF:
589 write_cr2(ctxt->fi.cr2);
590 exc_page_fault(ctxt->regs, error_code);
591 break;
592 case X86_TRAP_AC:
593 exc_alignment_check(ctxt->regs, error_code);
594 break;
595 default:
596 pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n");
597 BUG();
598 }
599 }
600
601 /* Include code shared with pre-decompression boot stage */
602 #include "shared.c"
603
604 static inline struct svsm_ca *svsm_get_caa(void)
605 {
606 /*
607 * Use rIP-relative references when called early in the boot. If
608 * ->use_cas is set, then it is late in the boot and no need
609 * to worry about rIP-relative references.
610 */
611 if (RIP_REL_REF(sev_cfg).use_cas)
612 return this_cpu_read(svsm_caa);
613 else
614 return RIP_REL_REF(boot_svsm_caa);
615 }
616
617 static u64 svsm_get_caa_pa(void)
618 {
619 /*
620 * Use rIP-relative references when called early in the boot. If
621 * ->use_cas is set, then it is late in the boot and no need
622 * to worry about rIP-relative references.
623 */
624 if (RIP_REL_REF(sev_cfg).use_cas)
625 return this_cpu_read(svsm_caa_pa);
626 else
627 return RIP_REL_REF(boot_svsm_caa_pa);
628 }
629
630 static noinstr void __sev_put_ghcb(struct ghcb_state *state)
631 {
632 struct sev_es_runtime_data *data;
633 struct ghcb *ghcb;
634
635 WARN_ON(!irqs_disabled());
636
637 data = this_cpu_read(runtime_data);
638 ghcb = &data->ghcb_page;
639
640 if (state->ghcb) {
641 /* Restore GHCB from Backup */
642 *ghcb = *state->ghcb;
643 data->backup_ghcb_active = false;
644 state->ghcb = NULL;
645 } else {
646 /*
647 * Invalidate the GHCB so a VMGEXIT instruction issued
648 * from userspace won't appear to be valid.
649 */
650 vc_ghcb_invalidate(ghcb);
651 data->ghcb_active = false;
652 }
653 }
654
655 static int svsm_perform_call_protocol(struct svsm_call *call)
656 {
657 struct ghcb_state state;
658 unsigned long flags;
659 struct ghcb *ghcb;
660 int ret;
661
662 /*
663 * This can be called very early in the boot, use native functions in
664 * order to avoid paravirt issues.
665 */
666 flags = native_local_irq_save();
667
668 /*
669 * Use rip-relative references when called early in the boot. If
670 * ghcbs_initialized is set, then it is late in the boot and no need
671 * to worry about rip-relative references in called functions.
672 */
673 if (RIP_REL_REF(sev_cfg).ghcbs_initialized)
674 ghcb = __sev_get_ghcb(&state);
675 else if (RIP_REL_REF(boot_ghcb))
676 ghcb = RIP_REL_REF(boot_ghcb);
677 else
678 ghcb = NULL;
679
680 do {
681 ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call)
682 : svsm_perform_msr_protocol(call);
683 } while (ret == -EAGAIN);
684
685 if (RIP_REL_REF(sev_cfg).ghcbs_initialized)
686 __sev_put_ghcb(&state);
687
688 native_local_irq_restore(flags);
689
690 return ret;
691 }
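/*
* Call-building sketch (mirrors snp_set_vmsa() further down): the caller
* fills a struct svsm_call with the CAA pointer, RAX set via
* SVSM_CORE_CALL() and the call-specific register arguments, then relies
* on this helper to pick the GHCB or MSR transport and to retry while the
* call returns -EAGAIN.
*/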
692
693 void noinstr __sev_es_nmi_complete(void)
694 {
695 struct ghcb_state state;
696 struct ghcb *ghcb;
697
698 ghcb = __sev_get_ghcb(&state);
699
700 vc_ghcb_invalidate(ghcb);
701 ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE);
702 ghcb_set_sw_exit_info_1(ghcb, 0);
703 ghcb_set_sw_exit_info_2(ghcb, 0);
704
705 sev_es_wr_ghcb_msr(__pa_nodebug(ghcb));
706 VMGEXIT();
707
708 __sev_put_ghcb(&state);
709 }
710
711 static u64 __init get_snp_jump_table_addr(void)
712 {
713 struct snp_secrets_page *secrets;
714 void __iomem *mem;
715 u64 addr;
716
717 mem = ioremap_encrypted(secrets_pa, PAGE_SIZE);
718 if (!mem) {
719 pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n");
720 return 0;
721 }
722
723 secrets = (__force struct snp_secrets_page *)mem;
724
725 addr = secrets->os_area.ap_jump_table_pa;
726 iounmap(mem);
727
728 return addr;
729 }
730
731 static u64 __init get_jump_table_addr(void)
732 {
733 struct ghcb_state state;
734 unsigned long flags;
735 struct ghcb *ghcb;
736 u64 ret = 0;
737
738 if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
739 return get_snp_jump_table_addr();
740
741 local_irq_save(flags);
742
743 ghcb = __sev_get_ghcb(&state);
744
745 vc_ghcb_invalidate(ghcb);
746 ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE);
747 ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE);
748 ghcb_set_sw_exit_info_2(ghcb, 0);
749
750 sev_es_wr_ghcb_msr(__pa(ghcb));
751 VMGEXIT();
752
753 if (ghcb_sw_exit_info_1_is_valid(ghcb) &&
754 ghcb_sw_exit_info_2_is_valid(ghcb))
755 ret = ghcb->save.sw_exit_info_2;
756
757 __sev_put_ghcb(&state);
758
759 local_irq_restore(flags);
760
761 return ret;
762 }
763
764 static void __head
765 early_set_pages_state(unsigned long vaddr, unsigned long paddr,
766 unsigned long npages, enum psc_op op)
767 {
768 unsigned long paddr_end;
769 u64 val;
770
771 vaddr = vaddr & PAGE_MASK;
772
773 paddr = paddr & PAGE_MASK;
774 paddr_end = paddr + (npages << PAGE_SHIFT);
775
776 while (paddr < paddr_end) {
777 /* Page validation must be rescinded before changing to shared */
778 if (op == SNP_PAGE_STATE_SHARED)
779 pvalidate_4k_page(vaddr, paddr, false);
780
781 /*
782 * Use the MSR protocol because this function can be called before
783 * the GHCB is established.
784 */
785 sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op));
786 VMGEXIT();
787
788 val = sev_es_rd_ghcb_msr();
789
790 if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP)
791 goto e_term;
792
793 if (GHCB_MSR_PSC_RESP_VAL(val))
794 goto e_term;
795
796 /* Page validation must be performed after changing to private */
797 if (op == SNP_PAGE_STATE_PRIVATE)
798 pvalidate_4k_page(vaddr, paddr, true);
799
800 vaddr += PAGE_SIZE;
801 paddr += PAGE_SIZE;
802 }
803
804 return;
805
806 e_term:
807 sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
808 }
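/*
* Flow sketch of one MSR-protocol page-state change above: pack the
* operation and GFN (paddr >> PAGE_SHIFT) into the GHCB MSR via
* GHCB_MSR_PSC_REQ_GFN(), issue VMGEXIT, then read the MSR back and treat
* the request as successful only if GHCB_RESP_CODE() is GHCB_MSR_PSC_RESP
* and GHCB_MSR_PSC_RESP_VAL() reports no error. Anything else is fatal
* this early, hence the termination path.
*/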
809
810 void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
811 unsigned long npages)
812 {
813 /*
814 * This can be invoked in early boot while running identity mapped, so
815 * use an open coded check for SNP instead of using cc_platform_has().
816 * This eliminates worries about jump tables or checking boot_cpu_data
817 * in the cc_platform_has() function.
818 */
819 if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED))
820 return;
821
822 /*
823 * Ask the hypervisor to mark the memory pages as private in the RMP
824 * table.
825 */
826 early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE);
827 }
828
829 void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
830 unsigned long npages)
831 {
832 /*
833 * This can be invoked in early boot while running identity mapped, so
834 * use an open coded check for SNP instead of using cc_platform_has().
835 * This eliminates worries about jump tables or checking boot_cpu_data
836 * in the cc_platform_has() function.
837 */
838 if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED))
839 return;
840
841 /* Ask hypervisor to mark the memory pages shared in the RMP table. */
842 early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED);
843 }
844
845 static unsigned long __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr,
846 unsigned long vaddr_end, int op)
847 {
848 struct ghcb_state state;
849 bool use_large_entry;
850 struct psc_hdr *hdr;
851 struct psc_entry *e;
852 unsigned long flags;
853 unsigned long pfn;
854 struct ghcb *ghcb;
855 int i;
856
857 hdr = &data->hdr;
858 e = data->entries;
859
860 memset(data, 0, sizeof(*data));
861 i = 0;
862
863 while (vaddr < vaddr_end && i < ARRAY_SIZE(data->entries)) {
864 hdr->end_entry = i;
865
866 if (is_vmalloc_addr((void *)vaddr)) {
867 pfn = vmalloc_to_pfn((void *)vaddr);
868 use_large_entry = false;
869 } else {
870 pfn = __pa(vaddr) >> PAGE_SHIFT;
871 use_large_entry = true;
872 }
873
874 e->gfn = pfn;
875 e->operation = op;
876
877 if (use_large_entry && IS_ALIGNED(vaddr, PMD_SIZE) &&
878 (vaddr_end - vaddr) >= PMD_SIZE) {
879 e->pagesize = RMP_PG_SIZE_2M;
880 vaddr += PMD_SIZE;
881 } else {
882 e->pagesize = RMP_PG_SIZE_4K;
883 vaddr += PAGE_SIZE;
884 }
885
886 e++;
887 i++;
888 }
889
890 /* Page validation must be rescinded before changing to shared */
891 if (op == SNP_PAGE_STATE_SHARED)
892 pvalidate_pages(data);
893
894 local_irq_save(flags);
895
896 if (sev_cfg.ghcbs_initialized)
897 ghcb = __sev_get_ghcb(&state);
898 else
899 ghcb = boot_ghcb;
900
901 /* Invoke the hypervisor to perform the page state changes */
902 if (!ghcb || vmgexit_psc(ghcb, data))
903 sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
904
905 if (sev_cfg.ghcbs_initialized)
906 __sev_put_ghcb(&state);
907
908 local_irq_restore(flags);
909
910 /* Page validation must be performed after changing to private */
911 if (op == SNP_PAGE_STATE_PRIVATE)
912 pvalidate_pages(data);
913
914 return vaddr;
915 }
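/*
* Entry-size illustration for the loop above: a 2M-aligned 4M range in
* the direct map is described by two RMP_PG_SIZE_2M entries, while 4M of
* vmalloc memory (physically discontiguous) needs 1024 RMP_PG_SIZE_4K
* entries and therefore possibly multiple trips through this function,
* bounded by ARRAY_SIZE(data->entries).
*/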
916
917 static void set_pages_state(unsigned long vaddr, unsigned long npages, int op)
918 {
919 struct snp_psc_desc desc;
920 unsigned long vaddr_end;
921
922 /* Use the MSR protocol when a GHCB is not available. */
923 if (!boot_ghcb)
924 return early_set_pages_state(vaddr, __pa(vaddr), npages, op);
925
926 vaddr = vaddr & PAGE_MASK;
927 vaddr_end = vaddr + (npages << PAGE_SHIFT);
928
929 while (vaddr < vaddr_end)
930 vaddr = __set_pages_state(&desc, vaddr, vaddr_end, op);
931 }
932
933 void snp_set_memory_shared(unsigned long vaddr, unsigned long npages)
934 {
935 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
936 return;
937
938 set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED);
939 }
940
941 void snp_set_memory_private(unsigned long vaddr, unsigned long npages)
942 {
943 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
944 return;
945
946 set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
947 }
948
949 void snp_accept_memory(phys_addr_t start, phys_addr_t end)
950 {
951 unsigned long vaddr, npages;
952
953 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
954 return;
955
956 vaddr = (unsigned long)__va(start);
957 npages = (end - start) >> PAGE_SHIFT;
958
959 set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
960 }
961
962 static void set_pte_enc(pte_t *kpte, int level, void *va)
963 {
964 struct pte_enc_desc d = {
965 .kpte = kpte,
966 .pte_level = level,
967 .va = va,
968 .encrypt = true
969 };
970
971 prepare_pte_enc(&d);
972 set_pte_enc_mask(kpte, d.pfn, d.new_pgprot);
973 }
974
975 static void unshare_all_memory(void)
976 {
977 unsigned long addr, end, size, ghcb;
978 struct sev_es_runtime_data *data;
979 unsigned int npages, level;
980 bool skipped_addr;
981 pte_t *pte;
982 int cpu;
983
984 /* Unshare the direct mapping. */
985 addr = PAGE_OFFSET;
986 end = PAGE_OFFSET + get_max_mapped();
987
988 while (addr < end) {
989 pte = lookup_address(addr, &level);
990 size = page_level_size(level);
991 npages = size / PAGE_SIZE;
992 skipped_addr = false;
993
994 if (!pte || !pte_decrypted(*pte) || pte_none(*pte)) {
995 addr += size;
996 continue;
997 }
998
999 /*
1000 * Ensure that all the per-CPU GHCBs are made private at the
1001 * end of the unsharing loop so that the switch to the slower
1002 * MSR protocol happens last.
1003 */
1004 for_each_possible_cpu(cpu) {
1005 data = per_cpu(runtime_data, cpu);
1006 ghcb = (unsigned long)&data->ghcb_page;
1007
1008 if (addr <= ghcb && ghcb <= addr + size) {
1009 skipped_addr = true;
1010 break;
1011 }
1012 }
1013
1014 if (!skipped_addr) {
1015 set_pte_enc(pte, level, (void *)addr);
1016 snp_set_memory_private(addr, npages);
1017 }
1018 addr += size;
1019 }
1020
1021 /* Unshare all bss decrypted memory. */
1022 addr = (unsigned long)__start_bss_decrypted;
1023 end = (unsigned long)__start_bss_decrypted_unused;
1024 npages = (end - addr) >> PAGE_SHIFT;
1025
1026 for (; addr < end; addr += PAGE_SIZE) {
1027 pte = lookup_address(addr, &level);
1028 if (!pte || !pte_decrypted(*pte) || pte_none(*pte))
1029 continue;
1030
1031 set_pte_enc(pte, level, (void *)addr);
1032 }
1033 addr = (unsigned long)__start_bss_decrypted;
1034 snp_set_memory_private(addr, npages);
1035
1036 __flush_tlb_all();
1037 }
1038
1039 /* Stop new private<->shared conversions */
1040 void snp_kexec_begin(void)
1041 {
1042 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
1043 return;
1044
1045 if (!IS_ENABLED(CONFIG_KEXEC_CORE))
1046 return;
1047
1048 /*
1049 * Crash kernel ends up here with interrupts disabled: can't wait for
1050 * conversions to finish.
1051 *
1052 * If a race happened, just report and proceed.
1053 */
1054 if (!set_memory_enc_stop_conversion())
1055 pr_warn("Failed to stop shared<->private conversions\n");
1056 }
1057
1058 void snp_kexec_finish(void)
1059 {
1060 struct sev_es_runtime_data *data;
1061 unsigned int level, cpu;
1062 unsigned long size;
1063 struct ghcb *ghcb;
1064 pte_t *pte;
1065
1066 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
1067 return;
1068
1069 if (!IS_ENABLED(CONFIG_KEXEC_CORE))
1070 return;
1071
1072 unshare_all_memory();
1073
1074 /*
1075 * Switch to using the MSR protocol to change per-CPU GHCBs to
1076 * private. All the per-CPU GHCBs have been switched back to private,
1077 * so can't do any more GHCB calls to the hypervisor beyond this point
1078 * until the kexec'ed kernel starts running.
1079 */
1080 boot_ghcb = NULL;
1081 sev_cfg.ghcbs_initialized = false;
1082
1083 for_each_possible_cpu(cpu) {
1084 data = per_cpu(runtime_data, cpu);
1085 ghcb = &data->ghcb_page;
1086 pte = lookup_address((unsigned long)ghcb, &level);
1087 size = page_level_size(level);
1088 set_pte_enc(pte, level, (void *)ghcb);
1089 snp_set_memory_private((unsigned long)ghcb, (size / PAGE_SIZE));
1090 }
1091 }
1092
1093 static int snp_set_vmsa(void *va, void *caa, int apic_id, bool make_vmsa)
1094 {
1095 int ret;
1096
1097 if (snp_vmpl) {
1098 struct svsm_call call = {};
1099 unsigned long flags;
1100
1101 local_irq_save(flags);
1102
1103 call.caa = this_cpu_read(svsm_caa);
1104 call.rcx = __pa(va);
1105
1106 if (make_vmsa) {
1107 /* Protocol 0, Call ID 2 */
1108 call.rax = SVSM_CORE_CALL(SVSM_CORE_CREATE_VCPU);
1109 call.rdx = __pa(caa);
1110 call.r8 = apic_id;
1111 } else {
1112 /* Protocol 0, Call ID 3 */
1113 call.rax = SVSM_CORE_CALL(SVSM_CORE_DELETE_VCPU);
1114 }
1115
1116 ret = svsm_perform_call_protocol(&call);
1117
1118 local_irq_restore(flags);
1119 } else {
1120 /*
1121 * If the kernel runs at VMPL0, it can change the VMSA
1122 * bit for a page using the RMPADJUST instruction.
1123 * However, for the instruction to succeed it must
1124 * target the permissions of a lesser privileged (higher
1125 * numbered) VMPL level, so use VMPL1.
1126 */
1127 u64 attrs = 1;
1128
1129 if (make_vmsa)
1130 attrs |= RMPADJUST_VMSA_PAGE_BIT;
1131
1132 ret = rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs);
1133 }
1134
1135 return ret;
1136 }
1137
1138 #define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK)
1139 #define INIT_CS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK)
1140 #define INIT_DS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_WRITE_MASK)
1141
1142 #define INIT_LDTR_ATTRIBS (SVM_SELECTOR_P_MASK | 2)
1143 #define INIT_TR_ATTRIBS (SVM_SELECTOR_P_MASK | 3)
1144
1145 static void *snp_alloc_vmsa_page(int cpu)
1146 {
1147 struct page *p;
1148
1149 /*
1150 * Allocate VMSA page to work around the SNP erratum where the CPU will
1151 * incorrectly signal an RMP violation #PF if a large page (2MB or 1GB)
1152 * collides with the RMP entry of VMSA page. The recommended workaround
1153 * is to not use a large page.
1154 *
1155 * Allocate an 8k page which is also 8k-aligned.
1156 */
1157 p = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
1158 if (!p)
1159 return NULL;
1160
1161 split_page(p, 1);
1162
1163 /* Free the first 4k. This page may be 2M/1G aligned and cannot be used. */
1164 __free_page(p);
1165
1166 return page_address(p + 1);
1167 }
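/*
* Allocation layout sketch: the order-1 (8k, 8k-aligned) allocation is
* split into two 4k pages; the first page is freed because it may be
* 2M/1G aligned and thus subject to the erratum, while the second page
* can never be 2M/1G aligned and is returned as the VMSA page.
*/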
1168
1169 static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa, int apic_id)
1170 {
1171 int err;
1172
1173 err = snp_set_vmsa(vmsa, NULL, apic_id, false);
1174 if (err)
1175 pr_err("clear VMSA page failed (%u), leaking page\n", err);
1176 else
1177 free_page((unsigned long)vmsa);
1178 }
1179
1180 static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
1181 {
1182 struct sev_es_save_area *cur_vmsa, *vmsa;
1183 struct ghcb_state state;
1184 struct svsm_ca *caa;
1185 unsigned long flags;
1186 struct ghcb *ghcb;
1187 u8 sipi_vector;
1188 int cpu, ret;
1189 u64 cr4;
1190
1191 /*
1192 * The hypervisor SNP feature support check has happened earlier; just check
1193 * the AP_CREATION one here.
1194 */
1195 if (!(sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION))
1196 return -EOPNOTSUPP;
1197
1198 /*
1199 * Verify the desired start IP against the known trampoline start IP
1200 * to catch any future new trampolines that may be introduced that
1201 * would require a new protected guest entry point.
1202 */
1203 if (WARN_ONCE(start_ip != real_mode_header->trampoline_start,
1204 "Unsupported SNP start_ip: %lx\n", start_ip))
1205 return -EINVAL;
1206
1207 /* Override start_ip with known protected guest start IP */
1208 start_ip = real_mode_header->sev_es_trampoline_start;
1209
1210 /* Find the logical CPU for the APIC ID */
1211 for_each_present_cpu(cpu) {
1212 if (arch_match_cpu_phys_id(cpu, apic_id))
1213 break;
1214 }
1215 if (cpu >= nr_cpu_ids)
1216 return -EINVAL;
1217
1218 cur_vmsa = per_cpu(sev_vmsa, cpu);
1219
1220 /*
1221 * A new VMSA is created each time because there is no guarantee that
1222 * the current VMSA is the kernel's or that the vCPU is not running. If
1223 * an attempt was made to use the current VMSA with a running vCPU, a
1224 * #VMEXIT of that vCPU would wipe out all of the settings being done
1225 * here.
1226 */
1227 vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page(cpu);
1228 if (!vmsa)
1229 return -ENOMEM;
1230
1231 /* If an SVSM is present, the SVSM per-CPU CAA will be !NULL */
1232 caa = per_cpu(svsm_caa, cpu);
1233
1234 /* CR4 should maintain the MCE value */
1235 cr4 = native_read_cr4() & X86_CR4_MCE;
1236
1237 /* Set the CS value based on the start_ip converted to a SIPI vector */
1238 sipi_vector = (start_ip >> 12);
1239 vmsa->cs.base = sipi_vector << 12;
1240 vmsa->cs.limit = AP_INIT_CS_LIMIT;
1241 vmsa->cs.attrib = INIT_CS_ATTRIBS;
1242 vmsa->cs.selector = sipi_vector << 8;
1243
1244 /* Set the RIP value based on start_ip */
1245 vmsa->rip = start_ip & 0xfff;
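/*
* Worked example (illustrative trampoline address): for start_ip 0x9c000
* the SIPI vector is 0x9c, giving CS.base = 0x9c000, CS.selector = 0x9c00
* and RIP = 0x000, which matches the CS:IP of 0x9c00:0x0000 that a real
* INIT-SIPI-SIPI sequence would use.
*/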
1246
1247 /* Set AP INIT defaults as documented in the APM */
1248 vmsa->ds.limit = AP_INIT_DS_LIMIT;
1249 vmsa->ds.attrib = INIT_DS_ATTRIBS;
1250 vmsa->es = vmsa->ds;
1251 vmsa->fs = vmsa->ds;
1252 vmsa->gs = vmsa->ds;
1253 vmsa->ss = vmsa->ds;
1254
1255 vmsa->gdtr.limit = AP_INIT_GDTR_LIMIT;
1256 vmsa->ldtr.limit = AP_INIT_LDTR_LIMIT;
1257 vmsa->ldtr.attrib = INIT_LDTR_ATTRIBS;
1258 vmsa->idtr.limit = AP_INIT_IDTR_LIMIT;
1259 vmsa->tr.limit = AP_INIT_TR_LIMIT;
1260 vmsa->tr.attrib = INIT_TR_ATTRIBS;
1261
1262 vmsa->cr4 = cr4;
1263 vmsa->cr0 = AP_INIT_CR0_DEFAULT;
1264 vmsa->dr7 = DR7_RESET_VALUE;
1265 vmsa->dr6 = AP_INIT_DR6_DEFAULT;
1266 vmsa->rflags = AP_INIT_RFLAGS_DEFAULT;
1267 vmsa->g_pat = AP_INIT_GPAT_DEFAULT;
1268 vmsa->xcr0 = AP_INIT_XCR0_DEFAULT;
1269 vmsa->mxcsr = AP_INIT_MXCSR_DEFAULT;
1270 vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT;
1271 vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT;
1272
1273 /* SVME must be set. */
1274 vmsa->efer = EFER_SVME;
1275
1276 /*
1277 * Set the SNP-specific fields for this VMSA:
1278 * VMPL level
1279 * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits)
1280 */
1281 vmsa->vmpl = snp_vmpl;
1282 vmsa->sev_features = sev_status >> 2;
1283
1284 /* Populate AP's TSC scale/offset to get accurate TSC values. */
1285 if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC)) {
1286 vmsa->tsc_scale = snp_tsc_scale;
1287 vmsa->tsc_offset = snp_tsc_offset;
1288 }
1289
1290 /* Switch the page over to a VMSA page now that it is initialized */
1291 ret = snp_set_vmsa(vmsa, caa, apic_id, true);
1292 if (ret) {
1293 pr_err("set VMSA page failed (%u)\n", ret);
1294 free_page((unsigned long)vmsa);
1295
1296 return -EINVAL;
1297 }
1298
1299 /* Issue VMGEXIT AP Creation NAE event */
1300 local_irq_save(flags);
1301
1302 ghcb = __sev_get_ghcb(&state);
1303
1304 vc_ghcb_invalidate(ghcb);
1305 ghcb_set_rax(ghcb, vmsa->sev_features);
1306 ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION);
1307 ghcb_set_sw_exit_info_1(ghcb,
1308 ((u64)apic_id << 32) |
1309 ((u64)snp_vmpl << 16) |
1310 SVM_VMGEXIT_AP_CREATE);
1311 ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa));
1312
1313 sev_es_wr_ghcb_msr(__pa(ghcb));
1314 VMGEXIT();
1315
1316 if (!ghcb_sw_exit_info_1_is_valid(ghcb) ||
1317 lower_32_bits(ghcb->save.sw_exit_info_1)) {
1318 pr_err("SNP AP Creation error\n");
1319 ret = -EINVAL;
1320 }
1321
1322 __sev_put_ghcb(&state);
1323
1324 local_irq_restore(flags);
1325
1326 /* Perform cleanup if there was an error */
1327 if (ret) {
1328 snp_cleanup_vmsa(vmsa, apic_id);
1329 vmsa = NULL;
1330 }
1331
1332 /* Free up any previous VMSA page */
1333 if (cur_vmsa)
1334 snp_cleanup_vmsa(cur_vmsa, apic_id);
1335
1336 /* Record the current VMSA page */
1337 per_cpu(sev_vmsa, cpu) = vmsa;
1338
1339 return ret;
1340 }
1341
1342 void __init snp_set_wakeup_secondary_cpu(void)
1343 {
1344 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
1345 return;
1346
1347 /*
1348 * Always set this override if SNP is enabled. This makes it the
1349 * required method to start APs under SNP. If the hypervisor does
1350 * not support AP creation, then no APs will be started.
1351 */
1352 apic_update_callback(wakeup_secondary_cpu, wakeup_cpu_via_vmgexit);
1353 }
1354
1355 int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh)
1356 {
1357 u16 startup_cs, startup_ip;
1358 phys_addr_t jump_table_pa;
1359 u64 jump_table_addr;
1360 u16 __iomem *jump_table;
1361
1362 jump_table_addr = get_jump_table_addr();
1363
1364 /* On UP guests there is no jump table so this is not a failure */
1365 if (!jump_table_addr)
1366 return 0;
1367
1368 /* Check if AP Jump Table is page-aligned */
1369 if (jump_table_addr & ~PAGE_MASK)
1370 return -EINVAL;
1371
1372 jump_table_pa = jump_table_addr & PAGE_MASK;
1373
1374 startup_cs = (u16)(rmh->trampoline_start >> 4);
1375 startup_ip = (u16)(rmh->sev_es_trampoline_start -
1376 rmh->trampoline_start);
1377
1378 jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE);
1379 if (!jump_table)
1380 return -EIO;
1381
1382 writew(startup_ip, &jump_table[0]);
1383 writew(startup_cs, &jump_table[1]);
1384
1385 iounmap(jump_table);
1386
1387 return 0;
1388 }
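/*
* Jump table layout written above (illustrative trampoline at 0x9c000):
* jump_table[0] holds the 16-bit IP offset of sev_es_trampoline_start
* within the trampoline and jump_table[1] holds the real-mode CS, here
* 0x9c000 >> 4 = 0x9c00, which the AP loads before jumping into the
* trampoline.
*/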
1389
1390 /*
1391 * This is needed by the OVMF UEFI firmware which will use whatever it finds in
1392 * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu
1393 * runtime GHCBs used by the kernel are also mapped in the EFI page-table.
1394 */
1395 int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
1396 {
1397 struct sev_es_runtime_data *data;
1398 unsigned long address, pflags;
1399 int cpu;
1400 u64 pfn;
1401
1402 if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
1403 return 0;
1404
1405 pflags = _PAGE_NX | _PAGE_RW;
1406
1407 for_each_possible_cpu(cpu) {
1408 data = per_cpu(runtime_data, cpu);
1409
1410 address = __pa(&data->ghcb_page);
1411 pfn = address >> PAGE_SHIFT;
1412
1413 if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags))
1414 return 1;
1415 }
1416
1417 return 0;
1418 }
1419
1420 /* Writes to the SVSM CAA MSR are ignored */
1421 static enum es_result __vc_handle_msr_caa(struct pt_regs *regs, bool write)
1422 {
1423 if (write)
1424 return ES_OK;
1425
1426 regs->ax = lower_32_bits(this_cpu_read(svsm_caa_pa));
1427 regs->dx = upper_32_bits(this_cpu_read(svsm_caa_pa));
1428
1429 return ES_OK;
1430 }
1431
1432 /*
1433 * TSC related accesses should not exit to the hypervisor when a guest is
1434 * executing with Secure TSC enabled, so special handling is required for
1435 * accesses of MSR_IA32_TSC and MSR_AMD64_GUEST_TSC_FREQ.
1436 */
1437 static enum es_result __vc_handle_secure_tsc_msrs(struct pt_regs *regs, bool write)
1438 {
1439 u64 tsc;
1440
1441 /*
1442 * GUEST_TSC_FREQ should not be intercepted when Secure TSC is enabled.
1443 * Terminate the SNP guest when the interception is enabled.
1444 */
1445 if (regs->cx == MSR_AMD64_GUEST_TSC_FREQ)
1446 return ES_VMM_ERROR;
1447
1448 /*
1449 * Writes: Writing to MSR_IA32_TSC can cause subsequent reads of the TSC
1450 * to return undefined values, so ignore all writes.
1451 *
1452 * Reads: Reads of MSR_IA32_TSC should return the current TSC value, use
1453 * the value returned by rdtsc_ordered().
1454 */
1455 if (write) {
1456 WARN_ONCE(1, "TSC MSR writes are verboten!\n");
1457 return ES_OK;
1458 }
1459
1460 tsc = rdtsc_ordered();
1461 regs->ax = lower_32_bits(tsc);
1462 regs->dx = upper_32_bits(tsc);
1463
1464 return ES_OK;
1465 }
1466
1467 static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
1468 {
1469 struct pt_regs *regs = ctxt->regs;
1470 enum es_result ret;
1471 bool write;
1472
1473 /* Is it a WRMSR? */
1474 write = ctxt->insn.opcode.bytes[1] == 0x30;
1475
1476 switch (regs->cx) {
1477 case MSR_SVSM_CAA:
1478 return __vc_handle_msr_caa(regs, write);
1479 case MSR_IA32_TSC:
1480 case MSR_AMD64_GUEST_TSC_FREQ:
1481 if (sev_status & MSR_AMD64_SNP_SECURE_TSC)
1482 return __vc_handle_secure_tsc_msrs(regs, write);
1483 break;
1484 default:
1485 break;
1486 }
1487
1488 ghcb_set_rcx(ghcb, regs->cx);
1489 if (write) {
1490 ghcb_set_rax(ghcb, regs->ax);
1491 ghcb_set_rdx(ghcb, regs->dx);
1492 }
1493
1494 ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, write, 0);
1495
1496 if ((ret == ES_OK) && !write) {
1497 regs->ax = ghcb->save.rax;
1498 regs->dx = ghcb->save.rdx;
1499 }
1500
1501 return ret;
1502 }
1503
1504 static void snp_register_per_cpu_ghcb(void)
1505 {
1506 struct sev_es_runtime_data *data;
1507 struct ghcb *ghcb;
1508
1509 data = this_cpu_read(runtime_data);
1510 ghcb = &data->ghcb_page;
1511
1512 snp_register_ghcb_early(__pa(ghcb));
1513 }
1514
1515 void setup_ghcb(void)
1516 {
1517 if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
1518 return;
1519
1520 /*
1521 * Check whether the runtime #VC exception handler is active. It uses
1522 * the per-CPU GHCB page which is set up by sev_es_init_vc_handling().
1523 *
1524 * If SNP is active, register the per-CPU GHCB page so that the runtime
1525 * exception handler can use it.
1526 */
1527 if (initial_vc_handler == (unsigned long)kernel_exc_vmm_communication) {
1528 if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
1529 snp_register_per_cpu_ghcb();
1530
1531 sev_cfg.ghcbs_initialized = true;
1532
1533 return;
1534 }
1535
1536 /*
1537 * Make sure the hypervisor talks a supported protocol.
1538 * This gets called only in the BSP boot phase.
1539 */
1540 if (!sev_es_negotiate_protocol())
1541 sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
1542
1543 /*
1544 * Clear the boot_ghcb. The first exception comes in before the bss
1545 * section is cleared.
1546 */
1547 memset(&boot_ghcb_page, 0, PAGE_SIZE);
1548
1549 /* Alright - Make the boot-ghcb public */
1550 boot_ghcb = &boot_ghcb_page;
1551
1552 /* SNP guest requires that GHCB GPA must be registered. */
1553 if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
1554 snp_register_ghcb_early(__pa(&boot_ghcb_page));
1555 }
1556
1557 #ifdef CONFIG_HOTPLUG_CPU
1558 static void sev_es_ap_hlt_loop(void)
1559 {
1560 struct ghcb_state state;
1561 struct ghcb *ghcb;
1562
1563 ghcb = __sev_get_ghcb(&state);
1564
1565 while (true) {
1566 vc_ghcb_invalidate(ghcb);
1567 ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP);
1568 ghcb_set_sw_exit_info_1(ghcb, 0);
1569 ghcb_set_sw_exit_info_2(ghcb, 0);
1570
1571 sev_es_wr_ghcb_msr(__pa(ghcb));
1572 VMGEXIT();
1573
1574 /* Wakeup signal? */
1575 if (ghcb_sw_exit_info_2_is_valid(ghcb) &&
1576 ghcb->save.sw_exit_info_2)
1577 break;
1578 }
1579
1580 __sev_put_ghcb(&state);
1581 }
1582
1583 /*
1584 * Play_dead handler when running under SEV-ES. This is needed because
1585 * the hypervisor can't deliver an SIPI request to restart the AP.
1586 * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the
1587 * hypervisor wakes it up again.
1588 */
1589 static void sev_es_play_dead(void)
1590 {
1591 play_dead_common();
1592
1593 /* IRQs now disabled */
1594
1595 sev_es_ap_hlt_loop();
1596
1597 /*
1598 * If we get here, the VCPU was woken up again. Jump to CPU
1599 * startup code to get it back online.
1600 */
1601 soft_restart_cpu();
1602 }
1603 #else /* CONFIG_HOTPLUG_CPU */
1604 #define sev_es_play_dead native_play_dead
1605 #endif /* CONFIG_HOTPLUG_CPU */
1606
1607 #ifdef CONFIG_SMP
1608 static void __init sev_es_setup_play_dead(void)
1609 {
1610 smp_ops.play_dead = sev_es_play_dead;
1611 }
1612 #else
1613 static inline void sev_es_setup_play_dead(void) { }
1614 #endif
1615
1616 static void __init alloc_runtime_data(int cpu)
1617 {
1618 struct sev_es_runtime_data *data;
1619
1620 data = memblock_alloc_node(sizeof(*data), PAGE_SIZE, cpu_to_node(cpu));
1621 if (!data)
1622 panic("Can't allocate SEV-ES runtime data");
1623
1624 per_cpu(runtime_data, cpu) = data;
1625
1626 if (snp_vmpl) {
1627 struct svsm_ca *caa;
1628
1629 /* Allocate the SVSM CA page if an SVSM is present */
1630 caa = memblock_alloc_or_panic(sizeof(*caa), PAGE_SIZE);
1631
1632 per_cpu(svsm_caa, cpu) = caa;
1633 per_cpu(svsm_caa_pa, cpu) = __pa(caa);
1634 }
1635 }
1636
1637 static void __init init_ghcb(int cpu)
1638 {
1639 struct sev_es_runtime_data *data;
1640 int err;
1641
1642 data = per_cpu(runtime_data, cpu);
1643
1644 err = early_set_memory_decrypted((unsigned long)&data->ghcb_page,
1645 sizeof(data->ghcb_page));
1646 if (err)
1647 panic("Can't map GHCBs unencrypted");
1648
1649 memset(&data->ghcb_page, 0, sizeof(data->ghcb_page));
1650
1651 data->ghcb_active = false;
1652 data->backup_ghcb_active = false;
1653 }
1654
1655 void __init sev_es_init_vc_handling(void)
1656 {
1657 int cpu;
1658
1659 BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE);
1660
1661 if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
1662 return;
1663
1664 if (!sev_es_check_cpu_features())
1665 panic("SEV-ES CPU Features missing");
1666
1667 /*
1668 * SNP is supported in v2 of the GHCB spec which mandates support for HV
1669 * features.
1670 */
1671 if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) {
1672 sev_hv_features = get_hv_features();
1673
1674 if (!(sev_hv_features & GHCB_HV_FT_SNP))
1675 sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
1676 }
1677
1678 /* Initialize per-cpu GHCB pages */
1679 for_each_possible_cpu(cpu) {
1680 alloc_runtime_data(cpu);
1681 init_ghcb(cpu);
1682 }
1683
1684 /* If running under an SVSM, switch to the per-cpu CA */
1685 if (snp_vmpl) {
1686 struct svsm_call call = {};
1687 unsigned long flags;
1688 int ret;
1689
1690 local_irq_save(flags);
1691
1692 /*
1693 * SVSM_CORE_REMAP_CA call:
1694 * RAX = 0 (Protocol=0, CallID=0)
1695 * RCX = New CA GPA
1696 */
1697 call.caa = svsm_get_caa();
1698 call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA);
1699 call.rcx = this_cpu_read(svsm_caa_pa);
1700 ret = svsm_perform_call_protocol(&call);
1701 if (ret)
1702 panic("Can't remap the SVSM CA, ret=%d, rax_out=0x%llx\n",
1703 ret, call.rax_out);
1704
1705 sev_cfg.use_cas = true;
1706
1707 local_irq_restore(flags);
1708 }
1709
1710 sev_es_setup_play_dead();
1711
1712 /* Secondary CPUs use the runtime #VC handler */
1713 initial_vc_handler = (unsigned long)kernel_exc_vmm_communication;
1714 }
1715
1716 static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
1717 {
1718 int trapnr = ctxt->fi.vector;
1719
1720 if (trapnr == X86_TRAP_PF)
1721 native_write_cr2(ctxt->fi.cr2);
1722
1723 ctxt->regs->orig_ax = ctxt->fi.error_code;
1724 do_early_exception(ctxt->regs, trapnr);
1725 }
1726
1727 static long *vc_insn_get_rm(struct es_em_ctxt *ctxt)
1728 {
1729 long *reg_array;
1730 int offset;
1731
1732 reg_array = (long *)ctxt->regs;
1733 offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs);
1734
1735 if (offset < 0)
1736 return NULL;
1737
1738 offset /= sizeof(long);
1739
1740 return reg_array + offset;
1741 }
1742 static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
1743 unsigned int bytes, bool read)
1744 {
1745 u64 exit_code, exit_info_1, exit_info_2;
1746 unsigned long ghcb_pa = __pa(ghcb);
1747 enum es_result res;
1748 phys_addr_t paddr;
1749 void __user *ref;
1750
1751 ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs);
1752 if (ref == (void __user *)-1L)
1753 return ES_UNSUPPORTED;
1754
1755 exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE;
1756
1757 res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr);
1758 if (res != ES_OK) {
1759 if (res == ES_EXCEPTION && !read)
1760 ctxt->fi.error_code |= X86_PF_WRITE;
1761
1762 return res;
1763 }
1764
1765 exit_info_1 = paddr;
1766 /* Can never be greater than 8 */
1767 exit_info_2 = bytes;
1768
1769 ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer));
1770
1771 return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2);
1772 }
1773
1774 /*
1775 * The MOVS instruction has two memory operands, which raises the
1776 * problem that it is not known whether the access to the source or the
1777 * destination caused the #VC exception (and hence whether an MMIO read
1778 * or write operation needs to be emulated).
1779 *
1780 * Instead of playing games with walking page-tables and trying to guess
1781 * whether the source or destination is an MMIO range, split the move
1782 * into two operations, a read and a write with only one memory operand.
1783 * This will cause a nested #VC exception on the MMIO address which can
1784 * then be handled.
1785 *
1786 * This implementation has the benefit that it also supports MOVS where
1787 * source _and_ destination are MMIO regions.
1788 *
1789 * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a
1790 * rare operation. If it turns out to be a performance problem the split
1791 * operations can be moved to memcpy_fromio() and memcpy_toio().
1792 */
1793 static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt,
1794 unsigned int bytes)
1795 {
1796 unsigned long ds_base, es_base;
1797 unsigned char *src, *dst;
1798 unsigned char buffer[8];
1799 enum es_result ret;
1800 bool rep;
1801 int off;
1802
1803 ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS);
1804 es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES);
1805
1806 if (ds_base == -1L || es_base == -1L) {
1807 ctxt->fi.vector = X86_TRAP_GP;
1808 ctxt->fi.error_code = 0;
1809 return ES_EXCEPTION;
1810 }
1811
1812 src = ds_base + (unsigned char *)ctxt->regs->si;
1813 dst = es_base + (unsigned char *)ctxt->regs->di;
1814
1815 ret = vc_read_mem(ctxt, src, buffer, bytes);
1816 if (ret != ES_OK)
1817 return ret;
1818
1819 ret = vc_write_mem(ctxt, dst, buffer, bytes);
1820 if (ret != ES_OK)
1821 return ret;
1822
1823 if (ctxt->regs->flags & X86_EFLAGS_DF)
1824 off = -bytes;
1825 else
1826 off = bytes;
1827
1828 ctxt->regs->si += off;
1829 ctxt->regs->di += off;
1830
1831 rep = insn_has_rep_prefix(&ctxt->insn);
1832 if (rep)
1833 ctxt->regs->cx -= 1;
1834
1835 if (!rep || ctxt->regs->cx == 0)
1836 return ES_OK;
1837 else
1838 return ES_RETRY;
1839 }
1840
1841 static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
1842 {
1843 struct insn *insn = &ctxt->insn;
1844 enum insn_mmio_type mmio;
1845 unsigned int bytes = 0;
1846 enum es_result ret;
1847 u8 sign_byte;
1848 long *reg_data;
1849
1850 mmio = insn_decode_mmio(insn, &bytes);
1851 if (mmio == INSN_MMIO_DECODE_FAILED)
1852 return ES_DECODE_FAILED;
1853
1854 if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
1855 reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs);
1856 if (!reg_data)
1857 return ES_DECODE_FAILED;
1858 }
1859
1860 if (user_mode(ctxt->regs))
1861 return ES_UNSUPPORTED;
1862
1863 switch (mmio) {
1864 case INSN_MMIO_WRITE:
1865 memcpy(ghcb->shared_buffer, reg_data, bytes);
1866 ret = vc_do_mmio(ghcb, ctxt, bytes, false);
1867 break;
1868 case INSN_MMIO_WRITE_IMM:
1869 memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes);
1870 ret = vc_do_mmio(ghcb, ctxt, bytes, false);
1871 break;
1872 case INSN_MMIO_READ:
1873 ret = vc_do_mmio(ghcb, ctxt, bytes, true);
1874 if (ret)
1875 break;
1876
1877 /* Zero-extend for 32-bit operation */
1878 if (bytes == 4)
1879 *reg_data = 0;
1880
1881 memcpy(reg_data, ghcb->shared_buffer, bytes);
1882 break;
1883 case INSN_MMIO_READ_ZERO_EXTEND:
1884 ret = vc_do_mmio(ghcb, ctxt, bytes, true);
1885 if (ret)
1886 break;
1887
1888 /* Zero extend based on operand size */
1889 memset(reg_data, 0, insn->opnd_bytes);
1890 memcpy(reg_data, ghcb->shared_buffer, bytes);
1891 break;
1892 case INSN_MMIO_READ_SIGN_EXTEND:
1893 ret = vc_do_mmio(ghcb, ctxt, bytes, true);
1894 if (ret)
1895 break;
1896
1897 if (bytes == 1) {
1898 u8 *val = (u8 *)ghcb->shared_buffer;
1899
1900 sign_byte = (*val & 0x80) ? 0xff : 0x00;
1901 } else {
1902 u16 *val = (u16 *)ghcb->shared_buffer;
1903
1904 sign_byte = (*val & 0x8000) ? 0xff : 0x00;
1905 }
1906
1907 /* Sign extend based on operand size */
1908 memset(reg_data, sign_byte, insn->opnd_bytes);
1909 memcpy(reg_data, ghcb->shared_buffer, bytes);
1910 break;
1911 case INSN_MMIO_MOVS:
1912 ret = vc_handle_mmio_movs(ctxt, bytes);
1913 break;
1914 default:
1915 ret = ES_UNSUPPORTED;
1916 break;
1917 }
1918
1919 return ret;
1920 }
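/*
 * Minimal sketch (hypothetical helper, not called by the handler) of the
 * sign-extension trick used for INSN_MMIO_READ_SIGN_EXTEND above: filling
 * the destination with 0xff or 0x00 before overlaying the low bytes
 * emulates MOVSX without any shifts. As in vc_handle_mmio(), bytes is 1 or
 * 2 and opnd_bytes is at most sizeof(long). For example, a 1-byte MMIO
 * read of 0x80 with a 4-byte operand yields 0xffffff80 in the low 32 bits.
 */
static __maybe_unused long mmio_sign_extend_sketch(const void *buf,
						   unsigned int bytes,
						   unsigned int opnd_bytes)
{
	long out = 0;
	u8 sign_byte;

	if (bytes == 1)
		sign_byte = (*(const u8 *)buf & 0x80) ? 0xff : 0x00;
	else
		sign_byte = (*(const u16 *)buf & 0x8000) ? 0xff : 0x00;

	/* Replicate the sign bit across the operand, then overlay the data */
	memset(&out, sign_byte, opnd_bytes);
	memcpy(&out, buf, bytes);

	return out;
}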
1921
1922 static enum es_result vc_handle_dr7_write(struct ghcb *ghcb,
1923 struct es_em_ctxt *ctxt)
1924 {
1925 struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
1926 long val, *reg = vc_insn_get_rm(ctxt);
1927 enum es_result ret;
1928
1929 if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP)
1930 return ES_VMM_ERROR;
1931
1932 if (!reg)
1933 return ES_DECODE_FAILED;
1934
1935 val = *reg;
1936
1937 /* Upper 32 bits must be written as zeroes */
1938 if (val >> 32) {
1939 ctxt->fi.vector = X86_TRAP_GP;
1940 ctxt->fi.error_code = 0;
1941 return ES_EXCEPTION;
1942 }
1943
1944 /* Clear out other reserved bits and set bit 10 */
1945 val = (val & 0xffff23ffL) | BIT(10);
1946
1947 /* Early non-zero writes to DR7 are not supported */
1948 if (!data && (val & ~DR7_RESET_VALUE))
1949 return ES_UNSUPPORTED;
1950
1951 /* Using a value of 0 for ExitInfo1 means RAX holds the value */
1952 ghcb_set_rax(ghcb, val);
1953 ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0);
1954 if (ret != ES_OK)
1955 return ret;
1956
1957 if (data)
1958 data->dr7 = val;
1959
1960 return ES_OK;
1961 }
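/*
 * Worked example of the sanitization above (illustrative): a guest write of
 * 0x1 (enable local breakpoint 0) becomes (0x1 & 0xffff23ffL) | BIT(10) =
 * 0x401, matching hardware where DR7 bit 10 always reads as 1, while any
 * value with bits 63:32 set is rejected with a #GP injected back into the
 * guest.
 */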
1962
1963 static enum es_result vc_handle_dr7_read(struct ghcb *ghcb,
1964 struct es_em_ctxt *ctxt)
1965 {
1966 struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
1967 long *reg = vc_insn_get_rm(ctxt);
1968
1969 if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP)
1970 return ES_VMM_ERROR;
1971
1972 if (!reg)
1973 return ES_DECODE_FAILED;
1974
1975 if (data)
1976 *reg = data->dr7;
1977 else
1978 *reg = DR7_RESET_VALUE;
1979
1980 return ES_OK;
1981 }
1982
1983 static enum es_result vc_handle_wbinvd(struct ghcb *ghcb,
1984 struct es_em_ctxt *ctxt)
1985 {
1986 return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0);
1987 }
1988
1989 static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
1990 {
1991 enum es_result ret;
1992
1993 ghcb_set_rcx(ghcb, ctxt->regs->cx);
1994
1995 ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0);
1996 if (ret != ES_OK)
1997 return ret;
1998
1999 if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb)))
2000 return ES_VMM_ERROR;
2001
2002 ctxt->regs->ax = ghcb->save.rax;
2003 ctxt->regs->dx = ghcb->save.rdx;
2004
2005 return ES_OK;
2006 }
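/*
 * vc_handle_rdpmc() above follows the general GHCB calling convention used
 * throughout this file: copy the instruction's register inputs into the GHCB
 * save area with the ghcb_set_*() helpers (which also mark the fields valid),
 * issue the VMGEXIT via sev_es_ghcb_hv_call(), verify that the hypervisor
 * marked the expected output registers valid with ghcb_*_is_valid(), and only
 * then copy the results back into the guest register state.
 */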
2007
2008 static enum es_result vc_handle_monitor(struct ghcb *ghcb,
2009 struct es_em_ctxt *ctxt)
2010 {
2011 /*
2012 * Treat it as a NOP and do not leak a physical address to the
2013 * hypervisor.
2014 */
2015 return ES_OK;
2016 }
2017
2018 static enum es_result vc_handle_mwait(struct ghcb *ghcb,
2019 struct es_em_ctxt *ctxt)
2020 {
2021 /* Treat the same as MONITOR/MONITORX */
2022 return ES_OK;
2023 }
2024
2025 static enum es_result vc_handle_vmmcall(struct ghcb *ghcb,
2026 struct es_em_ctxt *ctxt)
2027 {
2028 enum es_result ret;
2029
2030 ghcb_set_rax(ghcb, ctxt->regs->ax);
2031 ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0);
2032
2033 if (x86_platform.hyper.sev_es_hcall_prepare)
2034 x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs);
2035
2036 ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0);
2037 if (ret != ES_OK)
2038 return ret;
2039
2040 if (!ghcb_rax_is_valid(ghcb))
2041 return ES_VMM_ERROR;
2042
2043 ctxt->regs->ax = ghcb->save.rax;
2044
2045 /*
2046 * Call sev_es_hcall_finish() after regs->ax is already set.
2047 * This allows the hypervisor handler to overwrite it again if
2048 * necessary.
2049 */
2050 if (x86_platform.hyper.sev_es_hcall_finish &&
2051 !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs))
2052 return ES_VMM_ERROR;
2053
2054 return ES_OK;
2055 }
2056
2057 static enum es_result vc_handle_trap_ac(struct ghcb *ghcb,
2058 struct es_em_ctxt *ctxt)
2059 {
2060 /*
2061 * Calling exc_alignment_check() directly does not work, because it
2062 * enables IRQs and the GHCB is active. Forward the exception and call
2063 * it later from vc_forward_exception().
2064 */
2065 ctxt->fi.vector = X86_TRAP_AC;
2066 ctxt->fi.error_code = 0;
2067 return ES_EXCEPTION;
2068 }
2069
2070 static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
2071 struct ghcb *ghcb,
2072 unsigned long exit_code)
2073 {
2074 enum es_result result = vc_check_opcode_bytes(ctxt, exit_code);
2075
2076 if (result != ES_OK)
2077 return result;
2078
2079 switch (exit_code) {
2080 case SVM_EXIT_READ_DR7:
2081 result = vc_handle_dr7_read(ghcb, ctxt);
2082 break;
2083 case SVM_EXIT_WRITE_DR7:
2084 result = vc_handle_dr7_write(ghcb, ctxt);
2085 break;
2086 case SVM_EXIT_EXCP_BASE + X86_TRAP_AC:
2087 result = vc_handle_trap_ac(ghcb, ctxt);
2088 break;
2089 case SVM_EXIT_RDTSC:
2090 case SVM_EXIT_RDTSCP:
2091 result = vc_handle_rdtsc(ghcb, ctxt, exit_code);
2092 break;
2093 case SVM_EXIT_RDPMC:
2094 result = vc_handle_rdpmc(ghcb, ctxt);
2095 break;
2096 case SVM_EXIT_INVD:
2097 pr_err_ratelimited("#VC exception for INVD??? Seriously???\n");
2098 result = ES_UNSUPPORTED;
2099 break;
2100 case SVM_EXIT_CPUID:
2101 result = vc_handle_cpuid(ghcb, ctxt);
2102 break;
2103 case SVM_EXIT_IOIO:
2104 result = vc_handle_ioio(ghcb, ctxt);
2105 break;
2106 case SVM_EXIT_MSR:
2107 result = vc_handle_msr(ghcb, ctxt);
2108 break;
2109 case SVM_EXIT_VMMCALL:
2110 result = vc_handle_vmmcall(ghcb, ctxt);
2111 break;
2112 case SVM_EXIT_WBINVD:
2113 result = vc_handle_wbinvd(ghcb, ctxt);
2114 break;
2115 case SVM_EXIT_MONITOR:
2116 result = vc_handle_monitor(ghcb, ctxt);
2117 break;
2118 case SVM_EXIT_MWAIT:
2119 result = vc_handle_mwait(ghcb, ctxt);
2120 break;
2121 case SVM_EXIT_NPF:
2122 result = vc_handle_mmio(ghcb, ctxt);
2123 break;
2124 default:
2125 /*
2126 * Unexpected #VC exception
2127 */
2128 result = ES_UNSUPPORTED;
2129 }
2130
2131 return result;
2132 }
2133
2134 static __always_inline bool is_vc2_stack(unsigned long sp)
2135 {
2136 return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
2137 }
2138
2139 static __always_inline bool vc_from_invalid_context(struct pt_regs *regs)
2140 {
2141 unsigned long sp, prev_sp;
2142
2143 sp = (unsigned long)regs;
2144 prev_sp = regs->sp;
2145
2146 /*
2147 * If the code was already executing on the VC2 stack when the #VC
2148 * happened, let it proceed to the normal handling routine. This way the
2149 * code executing on the VC2 stack can cause #VC exceptions to get handled.
2150 */
2151 return is_vc2_stack(sp) && !is_vc2_stack(prev_sp);
2152 }
2153
2154 static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code)
2155 {
2156 struct ghcb_state state;
2157 struct es_em_ctxt ctxt;
2158 enum es_result result;
2159 struct ghcb *ghcb;
2160 bool ret = true;
2161
2162 ghcb = __sev_get_ghcb(&state);
2163
2164 vc_ghcb_invalidate(ghcb);
2165 result = vc_init_em_ctxt(&ctxt, regs, error_code);
2166
2167 if (result == ES_OK)
2168 result = vc_handle_exitcode(&ctxt, ghcb, error_code);
2169
2170 __sev_put_ghcb(&state);
2171
2172 /* Done - now check the result */
2173 switch (result) {
2174 case ES_OK:
2175 vc_finish_insn(&ctxt);
2176 break;
2177 case ES_UNSUPPORTED:
2178 pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n",
2179 error_code, regs->ip);
2180 ret = false;
2181 break;
2182 case ES_VMM_ERROR:
2183 pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
2184 error_code, regs->ip);
2185 ret = false;
2186 break;
2187 case ES_DECODE_FAILED:
2188 pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
2189 error_code, regs->ip);
2190 ret = false;
2191 break;
2192 case ES_EXCEPTION:
2193 vc_forward_exception(&ctxt);
2194 break;
2195 case ES_RETRY:
2196 /* Nothing to do */
2197 break;
2198 default:
2199 pr_emerg("Unknown result in %s():%d\n", __func__, result);
2200 /*
2201 * Emulating the instruction which caused the #VC exception
2202 * failed - can't continue so print debug information
2203 */
2204 BUG();
2205 }
2206
2207 return ret;
2208 }
2209
2210 static __always_inline bool vc_is_db(unsigned long error_code)
2211 {
2212 return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB;
2213 }
2214
2215 /*
2216 * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode
2217 * and will panic when an error happens.
2218 */
2219 DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication)
2220 {
2221 irqentry_state_t irq_state;
2222
2223 /*
2224 * With the current implementation it is always possible to switch to a
2225 * safe stack because #VC exceptions only happen at known places, like
2226 * intercepted instructions or accesses to MMIO areas/IO ports. They can
2227 * also happen with code instrumentation when the hypervisor intercepts
2228 * #DB, but the critical paths are forbidden to be instrumented, so #DB
2229 * exceptions currently also only happen in safe places.
2230 *
2231 * But keep this here in case the noinstr annotations are violated due
2232 * to a bug elsewhere.
2233 */
2234 if (unlikely(vc_from_invalid_context(regs))) {
2235 instrumentation_begin();
2236 panic("Can't handle #VC exception from unsupported context\n");
2237 instrumentation_end();
2238 }
2239
2240 /*
2241 * Handle #DB before calling into !noinstr code to avoid recursive #DB.
2242 */
2243 if (vc_is_db(error_code)) {
2244 exc_debug(regs);
2245 return;
2246 }
2247
2248 irq_state = irqentry_nmi_enter(regs);
2249
2250 instrumentation_begin();
2251
2252 if (!vc_raw_handle_exception(regs, error_code)) {
2253 /* Show some debug info */
2254 show_regs(regs);
2255
2256 /* Ask hypervisor to sev_es_terminate */
2257 sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
2258
2259 /* If that fails and we get here - just panic */
2260 panic("Returned from Terminate-Request to Hypervisor\n");
2261 }
2262
2263 instrumentation_end();
2264 irqentry_nmi_exit(regs, irq_state);
2265 }
2266
2267 /*
2268 * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode
2269 * and will kill the current task with SIGBUS when an error happens.
2270 */
2271 DEFINE_IDTENTRY_VC_USER(exc_vmm_communication)
2272 {
2273 /*
2274 * Handle #DB before calling into !noinstr code to avoid recursive #DB.
2275 */
2276 if (vc_is_db(error_code)) {
2277 noist_exc_debug(regs);
2278 return;
2279 }
2280
2281 irqentry_enter_from_user_mode(regs);
2282 instrumentation_begin();
2283
2284 if (!vc_raw_handle_exception(regs, error_code)) {
2285 /*
2286 * Do not kill the machine if user-space triggered the
2287 * exception. Send SIGBUS instead and let user-space deal with
2288 * it.
2289 */
2290 force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
2291 }
2292
2293 instrumentation_end();
2294 irqentry_exit_to_user_mode(regs);
2295 }
2296
2297 bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
2298 {
2299 unsigned long exit_code = regs->orig_ax;
2300 struct es_em_ctxt ctxt;
2301 enum es_result result;
2302
2303 vc_ghcb_invalidate(boot_ghcb);
2304
2305 result = vc_init_em_ctxt(&ctxt, regs, exit_code);
2306 if (result == ES_OK)
2307 result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code);
2308
2309 /* Done - now check the result */
2310 switch (result) {
2311 case ES_OK:
2312 vc_finish_insn(&ctxt);
2313 break;
2314 case ES_UNSUPPORTED:
2315 early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
2316 exit_code, regs->ip);
2317 goto fail;
2318 case ES_VMM_ERROR:
2319 early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
2320 exit_code, regs->ip);
2321 goto fail;
2322 case ES_DECODE_FAILED:
2323 early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
2324 exit_code, regs->ip);
2325 goto fail;
2326 case ES_EXCEPTION:
2327 vc_early_forward_exception(&ctxt);
2328 break;
2329 case ES_RETRY:
2330 /* Nothing to do */
2331 break;
2332 default:
2333 BUG();
2334 }
2335
2336 return true;
2337
2338 fail:
2339 show_regs(regs);
2340
2341 sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
2342 }
2343
2344 /*
2345 * Initial set up of SNP relies on information provided by the
2346 * Confidential Computing blob, which can be passed to the kernel
2347 * in the following ways, depending on how it is booted:
2348 *
2349 * - when booted via the boot/decompress kernel:
2350 * - via boot_params
2351 *
2352 * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH):
2353 * - via a setup_data entry, as defined by the Linux Boot Protocol
2354 *
2355 * Scan for the blob in that order.
2356 */
2357 static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
2358 {
2359 struct cc_blob_sev_info *cc_info;
2360
2361 /* Boot kernel would have passed the CC blob via boot_params. */
2362 if (bp->cc_blob_address) {
2363 cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address;
2364 goto found_cc_info;
2365 }
2366
2367 /*
2368 * If kernel was booted directly, without the use of the
2369 * boot/decompression kernel, the CC blob may have been passed via
2370 * setup_data instead.
2371 */
2372 cc_info = find_cc_blob_setup_data(bp);
2373 if (!cc_info)
2374 return NULL;
2375
2376 found_cc_info:
2377 if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
2378 snp_abort();
2379
2380 return cc_info;
2381 }
2382
2383 static __head void svsm_setup(struct cc_blob_sev_info *cc_info)
2384 {
2385 struct svsm_call call = {};
2386 int ret;
2387 u64 pa;
2388
2389 /*
2390 * Record the SVSM Calling Area address (CAA) if the guest is not
2391 * running at VMPL0. The CA will be used to communicate with the
2392 * SVSM to perform the SVSM services.
2393 */
2394 if (!svsm_setup_ca(cc_info))
2395 return;
2396
2397 /*
2398 * It is very early in the boot and the kernel is running identity
2399 * mapped but without having adjusted the pagetables to where the
2400 * kernel was loaded (physbase), so get the CA address using
2401 * RIP-relative addressing.
2402 */
2403 pa = (u64)&RIP_REL_REF(boot_svsm_ca_page);
2404
2405 /*
2406 * Switch over to the boot SVSM CA while the current CA is still
2407 * addressable. There is no GHCB at this point so use the MSR protocol.
2408 *
2409 * SVSM_CORE_REMAP_CA call:
2410 * RAX = 0 (Protocol=0, CallID=0)
2411 * RCX = New CA GPA
2412 */
2413 call.caa = svsm_get_caa();
2414 call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA);
2415 call.rcx = pa;
2416 ret = svsm_perform_call_protocol(&call);
2417 if (ret)
2418 sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL);
2419
2420 RIP_REL_REF(boot_svsm_caa) = (struct svsm_ca *)pa;
2421 RIP_REL_REF(boot_svsm_caa_pa) = pa;
2422 }
2423
2424 bool __head snp_init(struct boot_params *bp)
2425 {
2426 struct cc_blob_sev_info *cc_info;
2427
2428 if (!bp)
2429 return false;
2430
2431 cc_info = find_cc_blob(bp);
2432 if (!cc_info)
2433 return false;
2434
2435 if (cc_info->secrets_phys && cc_info->secrets_len == PAGE_SIZE)
2436 secrets_pa = cc_info->secrets_phys;
2437 else
2438 return false;
2439
2440 setup_cpuid_table(cc_info);
2441
2442 svsm_setup(cc_info);
2443
2444 /*
2445 * The CC blob will be used later to access the secrets page. Cache
2446 * it here like the boot kernel does.
2447 */
2448 bp->cc_blob_address = (u32)(unsigned long)cc_info;
2449
2450 return true;
2451 }
2452
2453 void __head __noreturn snp_abort(void)
2454 {
2455 sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
2456 }
2457
2458 /*
2459 * SEV-SNP guests should only execute dmi_setup() if EFI_CONFIG_TABLES are
2460 * enabled, as the alternative (fallback) logic for DMI probing in the legacy
2461 * ROM region can cause a crash since this region is not pre-validated.
2462 */
2463 void __init snp_dmi_setup(void)
2464 {
2465 if (efi_enabled(EFI_CONFIG_TABLES))
2466 dmi_setup();
2467 }
2468
2469 static void dump_cpuid_table(void)
2470 {
2471 const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
2472 int i = 0;
2473
2474 pr_info("count=%d reserved=0x%x reserved2=0x%llx\n",
2475 cpuid_table->count, cpuid_table->__reserved1, cpuid_table->__reserved2);
2476
2477 for (i = 0; i < SNP_CPUID_COUNT_MAX; i++) {
2478 const struct snp_cpuid_fn *fn = &cpuid_table->fn[i];
2479
2480 pr_info("index=%3d fn=0x%08x subfn=0x%08x: eax=0x%08x ebx=0x%08x ecx=0x%08x edx=0x%08x xcr0_in=0x%016llx xss_in=0x%016llx reserved=0x%016llx\n",
2481 i, fn->eax_in, fn->ecx_in, fn->eax, fn->ebx, fn->ecx,
2482 fn->edx, fn->xcr0_in, fn->xss_in, fn->__reserved);
2483 }
2484 }
2485
2486 /*
2487 * It is useful from an auditing/testing perspective to provide an easy way
2488 * for the guest owner to know that the CPUID table has been initialized as
2489 * expected, but that initialization happens too early in boot to print any
2490 * sort of indicator, and there's not really any other good place to do it,
2491 * so do it here.
2492 *
2493 * If running as an SNP guest, report the current VM privilege level (VMPL).
2494 */
2495 static int __init report_snp_info(void)
2496 {
2497 const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
2498
2499 if (cpuid_table->count) {
2500 pr_info("Using SNP CPUID table, %d entries present.\n",
2501 cpuid_table->count);
2502
2503 if (sev_cfg.debug)
2504 dump_cpuid_table();
2505 }
2506
2507 if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
2508 pr_info("SNP running at VMPL%u.\n", snp_vmpl);
2509
2510 return 0;
2511 }
2512 arch_initcall(report_snp_info);
2513
2514 static void update_attest_input(struct svsm_call *call, struct svsm_attest_call *input)
2515 {
2516 /* If (new) lengths have been returned, propagate them up */
2517 if (call->rcx_out != call->rcx)
2518 input->manifest_buf.len = call->rcx_out;
2519
2520 if (call->rdx_out != call->rdx)
2521 input->certificates_buf.len = call->rdx_out;
2522
2523 if (call->r8_out != call->r8)
2524 input->report_buf.len = call->r8_out;
2525 }
2526
2527 int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call,
2528 struct svsm_attest_call *input)
2529 {
2530 struct svsm_attest_call *ac;
2531 unsigned long flags;
2532 u64 attest_call_pa;
2533 int ret;
2534
2535 if (!snp_vmpl)
2536 return -EINVAL;
2537
2538 local_irq_save(flags);
2539
2540 call->caa = svsm_get_caa();
2541
2542 ac = (struct svsm_attest_call *)call->caa->svsm_buffer;
2543 attest_call_pa = svsm_get_caa_pa() + offsetof(struct svsm_ca, svsm_buffer);
2544
2545 *ac = *input;
2546
2547 /*
2548 * Set input registers for the request and set RDX and R8 to known
2549 * values in order to detect length values being returned in them.
2550 */
2551 call->rax = call_id;
2552 call->rcx = attest_call_pa;
2553 call->rdx = -1;
2554 call->r8 = -1;
2555 ret = svsm_perform_call_protocol(call);
2556 update_attest_input(call, input);
2557
2558 local_irq_restore(flags);
2559
2560 return ret;
2561 }
2562 EXPORT_SYMBOL_GPL(snp_issue_svsm_attest_req);
2563
2564 static int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input,
2565 struct snp_guest_request_ioctl *rio)
2566 {
2567 struct ghcb_state state;
2568 struct es_em_ctxt ctxt;
2569 unsigned long flags;
2570 struct ghcb *ghcb;
2571 int ret;
2572
2573 rio->exitinfo2 = SEV_RET_NO_FW_CALL;
2574
2575 /*
2576 * __sev_get_ghcb() needs to run with IRQs disabled because it is using
2577 * a per-CPU GHCB.
2578 */
2579 local_irq_save(flags);
2580
2581 ghcb = __sev_get_ghcb(&state);
2582 if (!ghcb) {
2583 ret = -EIO;
2584 goto e_restore_irq;
2585 }
2586
2587 vc_ghcb_invalidate(ghcb);
2588
2589 if (req->exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) {
2590 ghcb_set_rax(ghcb, input->data_gpa);
2591 ghcb_set_rbx(ghcb, input->data_npages);
2592 }
2593
2594 ret = sev_es_ghcb_hv_call(ghcb, &ctxt, req->exit_code, input->req_gpa, input->resp_gpa);
2595 if (ret)
2596 goto e_put;
2597
2598 rio->exitinfo2 = ghcb->save.sw_exit_info_2;
2599 switch (rio->exitinfo2) {
2600 case 0:
2601 break;
2602
2603 case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_BUSY):
2604 ret = -EAGAIN;
2605 break;
2606
2607 case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN):
2608 /* The number of expected pages is returned in RBX */
2609 if (req->exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) {
2610 input->data_npages = ghcb_get_rbx(ghcb);
2611 ret = -ENOSPC;
2612 break;
2613 }
2614 fallthrough;
2615 default:
2616 ret = -EIO;
2617 break;
2618 }
2619
2620 e_put:
2621 __sev_put_ghcb(&state);
2622 e_restore_irq:
2623 local_irq_restore(flags);
2624
2625 return ret;
2626 }
2627
2628 static struct platform_device sev_guest_device = {
2629 .name = "sev-guest",
2630 .id = -1,
2631 };
2632
2633 static int __init snp_init_platform_device(void)
2634 {
2635 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
2636 return -ENODEV;
2637
2638 if (platform_device_register(&sev_guest_device))
2639 return -ENODEV;
2640
2641 pr_info("SNP guest platform device initialized.\n");
2642 return 0;
2643 }
2644 device_initcall(snp_init_platform_device);
2645
2646 void sev_show_status(void)
2647 {
2648 int i;
2649
2650 pr_info("Status: ");
2651 for (i = 0; i < MSR_AMD64_SNP_RESV_BIT; i++) {
2652 if (sev_status & BIT_ULL(i)) {
2653 if (!sev_status_feat_names[i])
2654 continue;
2655
2656 pr_cont("%s ", sev_status_feat_names[i]);
2657 }
2658 }
2659 pr_cont("\n");
2660 }
2661
2662 void __init snp_update_svsm_ca(void)
2663 {
2664 if (!snp_vmpl)
2665 return;
2666
2667 /* Update the CAA to a proper kernel address */
2668 boot_svsm_caa = &boot_svsm_ca_page;
2669 }
2670
2671 #ifdef CONFIG_SYSFS
2672 static ssize_t vmpl_show(struct kobject *kobj,
2673 struct kobj_attribute *attr, char *buf)
2674 {
2675 return sysfs_emit(buf, "%d\n", snp_vmpl);
2676 }
2677
2678 static struct kobj_attribute vmpl_attr = __ATTR_RO(vmpl);
2679
2680 static struct attribute *vmpl_attrs[] = {
2681 &vmpl_attr.attr,
2682 NULL
2683 };
2684
2685 static struct attribute_group sev_attr_group = {
2686 .attrs = vmpl_attrs,
2687 };
2688
2689 static int __init sev_sysfs_init(void)
2690 {
2691 struct kobject *sev_kobj;
2692 struct device *dev_root;
2693 int ret;
2694
2695 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
2696 return -ENODEV;
2697
2698 dev_root = bus_get_dev_root(&cpu_subsys);
2699 if (!dev_root)
2700 return -ENODEV;
2701
2702 sev_kobj = kobject_create_and_add("sev", &dev_root->kobj);
2703 put_device(dev_root);
2704
2705 if (!sev_kobj)
2706 return -ENOMEM;
2707
2708 ret = sysfs_create_group(sev_kobj, &sev_attr_group);
2709 if (ret)
2710 kobject_put(sev_kobj);
2711
2712 return ret;
2713 }
2714 arch_initcall(sev_sysfs_init);
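/*
 * The "sev" kobject is created under the cpu subsystem root device, so on an
 * SNP guest the privilege level can be read from sysfs (path follows from the
 * kobject hierarchy set up above) at /sys/devices/system/cpu/sev/vmpl.
 */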
2715 #endif // CONFIG_SYSFS
2716
2717 static void free_shared_pages(void *buf, size_t sz)
2718 {
2719 unsigned int npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
2720 int ret;
2721
2722 if (!buf)
2723 return;
2724
2725 ret = set_memory_encrypted((unsigned long)buf, npages);
2726 if (ret) {
2727 WARN_ONCE(ret, "failed to restore encryption mask (leak it)\n");
2728 return;
2729 }
2730
2731 __free_pages(virt_to_page(buf), get_order(sz));
2732 }
2733
2734 static void *alloc_shared_pages(size_t sz)
2735 {
2736 unsigned int npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
2737 struct page *page;
2738 int ret;
2739
2740 page = alloc_pages(GFP_KERNEL_ACCOUNT, get_order(sz));
2741 if (!page)
2742 return NULL;
2743
2744 ret = set_memory_decrypted((unsigned long)page_address(page), npages);
2745 if (ret) {
2746 pr_err("failed to mark page shared, ret=%d\n", ret);
2747 __free_pages(page, get_order(sz));
2748 return NULL;
2749 }
2750
2751 return page_address(page);
2752 }
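/*
 * Minimal usage sketch of the two helpers above (hypothetical, not part of
 * the driver flow): buffers handed to the hypervisor must be converted to
 * shared before use and back to private before being returned to the page
 * allocator; otherwise free_shared_pages() intentionally leaks them.
 */
static __maybe_unused int shared_buffer_sketch(void)
{
	void *buf = alloc_shared_pages(PAGE_SIZE);

	if (!buf)
		return -ENOMEM;

	/* ... exchange request/response data with the hypervisor ... */

	free_shared_pages(buf, PAGE_SIZE);

	return 0;
}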
2753
2754 static u8 *get_vmpck(int id, struct snp_secrets_page *secrets, u32 **seqno)
2755 {
2756 u8 *key = NULL;
2757
2758 switch (id) {
2759 case 0:
2760 *seqno = &secrets->os_area.msg_seqno_0;
2761 key = secrets->vmpck0;
2762 break;
2763 case 1:
2764 *seqno = &secrets->os_area.msg_seqno_1;
2765 key = secrets->vmpck1;
2766 break;
2767 case 2:
2768 *seqno = &secrets->os_area.msg_seqno_2;
2769 key = secrets->vmpck2;
2770 break;
2771 case 3:
2772 *seqno = &secrets->os_area.msg_seqno_3;
2773 key = secrets->vmpck3;
2774 break;
2775 default:
2776 break;
2777 }
2778
2779 return key;
2780 }
2781
2782 static struct aesgcm_ctx *snp_init_crypto(u8 *key, size_t keylen)
2783 {
2784 struct aesgcm_ctx *ctx;
2785
2786 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
2787 if (!ctx)
2788 return NULL;
2789
2790 if (aesgcm_expandkey(ctx, key, keylen, AUTHTAG_LEN)) {
2791 pr_err("Crypto context initialization failed\n");
2792 kfree(ctx);
2793 return NULL;
2794 }
2795
2796 return ctx;
2797 }
2798
2799 int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id)
2800 {
2801 /* Adjust the default VMPCK key based on the executing VMPL level */
2802 if (vmpck_id == -1)
2803 vmpck_id = snp_vmpl;
2804
2805 mdesc->vmpck = get_vmpck(vmpck_id, mdesc->secrets, &mdesc->os_area_msg_seqno);
2806 if (!mdesc->vmpck) {
2807 pr_err("Invalid VMPCK%d communication key\n", vmpck_id);
2808 return -EINVAL;
2809 }
2810
2811 /* Verify that VMPCK is not zero. */
2812 if (!memchr_inv(mdesc->vmpck, 0, VMPCK_KEY_LEN)) {
2813 pr_err("Empty VMPCK%d communication key\n", vmpck_id);
2814 return -EINVAL;
2815 }
2816
2817 mdesc->vmpck_id = vmpck_id;
2818
2819 mdesc->ctx = snp_init_crypto(mdesc->vmpck, VMPCK_KEY_LEN);
2820 if (!mdesc->ctx)
2821 return -ENOMEM;
2822
2823 return 0;
2824 }
2825 EXPORT_SYMBOL_GPL(snp_msg_init);
2826
2827 struct snp_msg_desc *snp_msg_alloc(void)
2828 {
2829 struct snp_msg_desc *mdesc;
2830 void __iomem *mem;
2831
2832 BUILD_BUG_ON(sizeof(struct snp_guest_msg) > PAGE_SIZE);
2833
2834 mdesc = kzalloc(sizeof(struct snp_msg_desc), GFP_KERNEL);
2835 if (!mdesc)
2836 return ERR_PTR(-ENOMEM);
2837
2838 mem = ioremap_encrypted(secrets_pa, PAGE_SIZE);
2839 if (!mem)
2840 goto e_free_mdesc;
2841
2842 mdesc->secrets = (__force struct snp_secrets_page *)mem;
2843
2844 /* Allocate the shared page used for the request and response message. */
2845 mdesc->request = alloc_shared_pages(sizeof(struct snp_guest_msg));
2846 if (!mdesc->request)
2847 goto e_unmap;
2848
2849 mdesc->response = alloc_shared_pages(sizeof(struct snp_guest_msg));
2850 if (!mdesc->response)
2851 goto e_free_request;
2852
2853 return mdesc;
2854
2855 e_free_request:
2856 free_shared_pages(mdesc->request, sizeof(struct snp_guest_msg));
2857 e_unmap:
2858 iounmap(mem);
2859 e_free_mdesc:
2860 kfree(mdesc);
2861
2862 return ERR_PTR(-ENOMEM);
2863 }
2864 EXPORT_SYMBOL_GPL(snp_msg_alloc);
2865
2866 void snp_msg_free(struct snp_msg_desc *mdesc)
2867 {
2868 if (!mdesc)
2869 return;
2870
2871 kfree(mdesc->ctx);
2872 free_shared_pages(mdesc->response, sizeof(struct snp_guest_msg));
2873 free_shared_pages(mdesc->request, sizeof(struct snp_guest_msg));
2874 iounmap((__force void __iomem *)mdesc->secrets);
2875
2876 memset(mdesc, 0, sizeof(*mdesc));
2877 kfree(mdesc);
2878 }
2879 EXPORT_SYMBOL_GPL(snp_msg_free);
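/*
 * Typical lifecycle of a message descriptor (see snp_get_tsc_info() below for
 * the in-file user): snp_msg_alloc() maps the secrets page and allocates the
 * shared request/response pages, snp_msg_init() selects and validates the
 * VMPCK and sets up the AES-GCM context, snp_send_guest_request() carries the
 * encrypted messages, and snp_msg_free() tears it all down again.
 */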
2880
2881 /* Mutex to serialize the shared buffer access and command handling. */
2882 static DEFINE_MUTEX(snp_cmd_mutex);
2883
2884 /*
2885 * If an error is received from the host or AMD Secure Processor (ASP) there
2886 * are two options. Either retry the exact same encrypted request or discontinue
2887 * using the VMPCK.
2888 *
2889 * This is because in the current encryption scheme GHCB v2 uses AES-GCM to
2890 * encrypt the requests. The IV for this scheme is the sequence number. GCM
2891 * cannot tolerate IV reuse.
2892 *
2893 * The ASP FW v1.51 only increments the sequence numbers on a successful
2894 * guest<->ASP back and forth and only accepts messages at its exact sequence
2895 * number.
2896 *
2897 * So if the sequence number were to be reused, the encryption scheme would be
2898 * vulnerable. If the sequence number were instead incremented for a fresh IV,
2899 * the ASP would reject the request because it no longer matches.
2900 */
2901 static void snp_disable_vmpck(struct snp_msg_desc *mdesc)
2902 {
2903 pr_alert("Disabling VMPCK%d communication key to prevent IV reuse.\n",
2904 mdesc->vmpck_id);
2905 memzero_explicit(mdesc->vmpck, VMPCK_KEY_LEN);
2906 mdesc->vmpck = NULL;
2907 }
2908
2909 static inline u64 __snp_get_msg_seqno(struct snp_msg_desc *mdesc)
2910 {
2911 u64 count;
2912
2913 lockdep_assert_held(&snp_cmd_mutex);
2914
2915 /* Read the current message sequence counter from secrets pages */
2916 count = *mdesc->os_area_msg_seqno;
2917
2918 return count + 1;
2919 }
2920
2921 /* Return a non-zero sequence number on success, 0 on overflow */
2922 static u64 snp_get_msg_seqno(struct snp_msg_desc *mdesc)
2923 {
2924 u64 count = __snp_get_msg_seqno(mdesc);
2925
2926 /*
2927 * The message sequence counter for the SNP guest request is a 64-bit
2928 * value, but version 2 of the GHCB specification defines only 32-bit storage
2929 * for it. If the counter exceeds the 32-bit range then return zero.
2930 * The caller should check the return value; if the caller happens to not
2931 * check it and uses zero anyway, the firmware treats zero as an invalid
2932 * sequence number and will fail the message request.
2933 */
2934 if (count >= UINT_MAX) {
2935 pr_err("request message sequence counter overflow\n");
2936 return 0;
2937 }
2938
2939 return count;
2940 }
2941
2942 static void snp_inc_msg_seqno(struct snp_msg_desc *mdesc)
2943 {
2944 /*
2945 * The counter is also incremented by the PSP, so increment it by 2
2946 * and save it in the secrets page.
2947 */
2948 *mdesc->os_area_msg_seqno += 2;
2949 }
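/*
 * Worked example (illustrative, assuming the counter starts at zero): the
 * first request goes out with sequence number 1 (__snp_get_msg_seqno()
 * returns count + 1), the PSP answers with sequence number 2, and
 * snp_inc_msg_seqno() stores 2, so the next request uses 3. Guest requests
 * therefore use odd sequence numbers and responses the following even ones.
 */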
2950
2951 static int verify_and_dec_payload(struct snp_msg_desc *mdesc, struct snp_guest_req *req)
2952 {
2953 struct snp_guest_msg *resp_msg = &mdesc->secret_response;
2954 struct snp_guest_msg *req_msg = &mdesc->secret_request;
2955 struct snp_guest_msg_hdr *req_msg_hdr = &req_msg->hdr;
2956 struct snp_guest_msg_hdr *resp_msg_hdr = &resp_msg->hdr;
2957 struct aesgcm_ctx *ctx = mdesc->ctx;
2958 u8 iv[GCM_AES_IV_SIZE] = {};
2959
2960 pr_debug("response [seqno %lld type %d version %d sz %d]\n",
2961 resp_msg_hdr->msg_seqno, resp_msg_hdr->msg_type, resp_msg_hdr->msg_version,
2962 resp_msg_hdr->msg_sz);
2963
2964 /* Copy response from shared memory to encrypted memory. */
2965 memcpy(resp_msg, mdesc->response, sizeof(*resp_msg));
2966
2967 /* Verify that the sequence counter is incremented by 1 */
2968 if (unlikely(resp_msg_hdr->msg_seqno != (req_msg_hdr->msg_seqno + 1)))
2969 return -EBADMSG;
2970
2971 /* Verify response message type and version number. */
2972 if (resp_msg_hdr->msg_type != (req_msg_hdr->msg_type + 1) ||
2973 resp_msg_hdr->msg_version != req_msg_hdr->msg_version)
2974 return -EBADMSG;
2975
2976 /*
2977 * If the message size is greater than our buffer length then return
2978 * an error.
2979 */
2980 if (unlikely((resp_msg_hdr->msg_sz + ctx->authsize) > req->resp_sz))
2981 return -EBADMSG;
2982
2983 /* Decrypt the payload */
2984 memcpy(iv, &resp_msg_hdr->msg_seqno, min(sizeof(iv), sizeof(resp_msg_hdr->msg_seqno)));
2985 if (!aesgcm_decrypt(ctx, req->resp_buf, resp_msg->payload, resp_msg_hdr->msg_sz,
2986 &resp_msg_hdr->algo, AAD_LEN, iv, resp_msg_hdr->authtag))
2987 return -EBADMSG;
2988
2989 return 0;
2990 }
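/*
 * Example of the checks above (illustrative): a request sent with
 * msg_seqno == 5 and message type N must be answered with msg_seqno == 6,
 * message type N + 1 and the same msg_version, and its payload plus authtag
 * must fit into the caller-supplied response buffer; anything else is
 * rejected with -EBADMSG before the decrypted payload is used.
 */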
2991
2992 static int enc_payload(struct snp_msg_desc *mdesc, u64 seqno, struct snp_guest_req *req)
2993 {
2994 struct snp_guest_msg *msg = &mdesc->secret_request;
2995 struct snp_guest_msg_hdr *hdr = &msg->hdr;
2996 struct aesgcm_ctx *ctx = mdesc->ctx;
2997 u8 iv[GCM_AES_IV_SIZE] = {};
2998
2999 memset(msg, 0, sizeof(*msg));
3000
3001 hdr->algo = SNP_AEAD_AES_256_GCM;
3002 hdr->hdr_version = MSG_HDR_VER;
3003 hdr->hdr_sz = sizeof(*hdr);
3004 hdr->msg_type = req->msg_type;
3005 hdr->msg_version = req->msg_version;
3006 hdr->msg_seqno = seqno;
3007 hdr->msg_vmpck = req->vmpck_id;
3008 hdr->msg_sz = req->req_sz;
3009
3010 /* Verify the sequence number is non-zero */
3011 if (!hdr->msg_seqno)
3012 return -ENOSR;
3013
3014 pr_debug("request [seqno %lld type %d version %d sz %d]\n",
3015 hdr->msg_seqno, hdr->msg_type, hdr->msg_version, hdr->msg_sz);
3016
3017 if (WARN_ON((req->req_sz + ctx->authsize) > sizeof(msg->payload)))
3018 return -EBADMSG;
3019
3020 memcpy(iv, &hdr->msg_seqno, min(sizeof(iv), sizeof(hdr->msg_seqno)));
3021 aesgcm_encrypt(ctx, msg->payload, req->req_buf, req->req_sz, &hdr->algo,
3022 AAD_LEN, iv, hdr->authtag);
3023
3024 return 0;
3025 }
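/*
 * Minimal sketch (hypothetical helper, not used above) of how both
 * enc_payload() and verify_and_dec_payload() derive the 96-bit AES-GCM IV:
 * the 64-bit message sequence number fills the low eight bytes and the rest
 * stays zero. This is why a sequence number must never be reused; doing so
 * would reuse the IV under the same VMPCK.
 */
static __maybe_unused void snp_msg_iv_sketch(u64 msg_seqno,
					     u8 iv[GCM_AES_IV_SIZE])
{
	memset(iv, 0, GCM_AES_IV_SIZE);
	memcpy(iv, &msg_seqno, sizeof(msg_seqno));
}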
3026
3027 static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req,
3028 struct snp_guest_request_ioctl *rio)
3029 {
3030 unsigned long req_start = jiffies;
3031 unsigned int override_npages = 0;
3032 u64 override_err = 0;
3033 int rc;
3034
3035 retry_request:
3036 /*
3037 * Call firmware to process the request. In this function the encrypted
3038 * message enters shared memory with the host. So after this call the
3039 * sequence number must be incremented or the VMPCK must be deleted to
3040 * prevent reuse of the IV.
3041 */
3042 rc = snp_issue_guest_request(req, &req->input, rio);
3043 switch (rc) {
3044 case -ENOSPC:
3045 /*
3046 * If the extended guest request fails due to having too
3047 * small of a certificate data buffer, retry the same
3048 * guest request without the extended data request in
3049 * order to increment the sequence number and thus avoid
3050 * IV reuse.
3051 */
3052 override_npages = req->input.data_npages;
3053 req->exit_code = SVM_VMGEXIT_GUEST_REQUEST;
3054
3055 /*
3056 * Override the error to inform callers the given extended
3057 * request buffer size was too small and give the caller the
3058 * required buffer size.
3059 */
3060 override_err = SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN);
3061
3062 /*
3063 * If this call to the firmware succeeds, the sequence number can
3064 * be incremented allowing for continued use of the VMPCK. If
3065 * there is an error reflected in the return value, this value
3066 * is checked further down and the result will be the deletion
3067 * of the VMPCK and the error code being propagated back to the
3068 * user as an ioctl() return code.
3069 */
3070 goto retry_request;
3071
3072 /*
3073 * The host may return SNP_GUEST_VMM_ERR_BUSY if the request has been
3074 * throttled. Retry in the driver to avoid returning and reusing the
3075 * message sequence number on a different message.
3076 */
3077 case -EAGAIN:
3078 if (jiffies - req_start > SNP_REQ_MAX_RETRY_DURATION) {
3079 rc = -ETIMEDOUT;
3080 break;
3081 }
3082 schedule_timeout_killable(SNP_REQ_RETRY_DELAY);
3083 goto retry_request;
3084 }
3085
3086 /*
3087 * Increment the message sequence number. There is no harm in doing
3088 * this now because decryption uses the value stored in the response
3089 * structure and any failure will wipe the VMPCK, preventing further
3090 * use anyway.
3091 */
3092 snp_inc_msg_seqno(mdesc);
3093
3094 if (override_err) {
3095 rio->exitinfo2 = override_err;
3096
3097 /*
3098 * If an extended guest request was issued and the supplied certificate
3099 * buffer was not large enough, a standard guest request was issued to
3100 * prevent IV reuse. If the standard request was successful, return -EIO
3101 * back to the caller as would have originally been returned.
3102 */
3103 if (!rc && override_err == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN))
3104 rc = -EIO;
3105 }
3106
3107 if (override_npages)
3108 req->input.data_npages = override_npages;
3109
3110 return rc;
3111 }
3112
3113 int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req,
3114 struct snp_guest_request_ioctl *rio)
3115 {
3116 u64 seqno;
3117 int rc;
3118
3119 guard(mutex)(&snp_cmd_mutex);
3120
3121 /* Check if the VMPCK is not empty */
3122 if (!mdesc->vmpck || !memchr_inv(mdesc->vmpck, 0, VMPCK_KEY_LEN)) {
3123 pr_err_ratelimited("VMPCK is disabled\n");
3124 return -ENOTTY;
3125 }
3126
3127 /* Get the message sequence number and verify that it is non-zero */
3128 seqno = snp_get_msg_seqno(mdesc);
3129 if (!seqno)
3130 return -EIO;
3131
3132 /* Clear shared memory's response for the host to populate. */
3133 memset(mdesc->response, 0, sizeof(struct snp_guest_msg));
3134
3135 /* Encrypt the userspace provided payload in mdesc->secret_request. */
3136 rc = enc_payload(mdesc, seqno, req);
3137 if (rc)
3138 return rc;
3139
3140 /*
3141 * Write the fully encrypted request to the shared unencrypted
3142 * request page.
3143 */
3144 memcpy(mdesc->request, &mdesc->secret_request, sizeof(mdesc->secret_request));
3145
3146 /* Initialize the input address for guest request */
3147 req->input.req_gpa = __pa(mdesc->request);
3148 req->input.resp_gpa = __pa(mdesc->response);
3149 req->input.data_gpa = req->certs_data ? __pa(req->certs_data) : 0;
3150
3151 rc = __handle_guest_request(mdesc, req, rio);
3152 if (rc) {
3153 if (rc == -EIO &&
3154 rio->exitinfo2 == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN))
3155 return rc;
3156
3157 pr_alert("Detected error from ASP request. rc: %d, exitinfo2: 0x%llx\n",
3158 rc, rio->exitinfo2);
3159
3160 snp_disable_vmpck(mdesc);
3161 return rc;
3162 }
3163
3164 rc = verify_and_dec_payload(mdesc, req);
3165 if (rc) {
3166 pr_alert("Detected unexpected decode failure from ASP. rc: %d\n", rc);
3167 snp_disable_vmpck(mdesc);
3168 return rc;
3169 }
3170
3171 return 0;
3172 }
3173 EXPORT_SYMBOL_GPL(snp_send_guest_request);
3174
3175 static int __init snp_get_tsc_info(void)
3176 {
3177 struct snp_guest_request_ioctl *rio;
3178 struct snp_tsc_info_resp *tsc_resp;
3179 struct snp_tsc_info_req *tsc_req;
3180 struct snp_msg_desc *mdesc;
3181 struct snp_guest_req *req;
3182 int rc = -ENOMEM;
3183
3184 tsc_req = kzalloc(sizeof(*tsc_req), GFP_KERNEL);
3185 if (!tsc_req)
3186 return rc;
3187
3188 /*
3189 * The intermediate response buffer is used while decrypting the
3190 * response payload. Make sure that it has enough space to cover
3191 * the authtag.
3192 */
3193 tsc_resp = kzalloc(sizeof(*tsc_resp) + AUTHTAG_LEN, GFP_KERNEL);
3194 if (!tsc_resp)
3195 goto e_free_tsc_req;
3196
3197 req = kzalloc(sizeof(*req), GFP_KERNEL);
3198 if (!req)
3199 goto e_free_tsc_resp;
3200
3201 rio = kzalloc(sizeof(*rio), GFP_KERNEL);
3202 if (!rio)
3203 goto e_free_req;
3204
3205 mdesc = snp_msg_alloc();
3206 if (IS_ERR_OR_NULL(mdesc))
3207 goto e_free_rio;
3208
3209 rc = snp_msg_init(mdesc, snp_vmpl);
3210 if (rc)
3211 goto e_free_mdesc;
3212
3213 req->msg_version = MSG_HDR_VER;
3214 req->msg_type = SNP_MSG_TSC_INFO_REQ;
3215 req->vmpck_id = snp_vmpl;
3216 req->req_buf = tsc_req;
3217 req->req_sz = sizeof(*tsc_req);
3218 req->resp_buf = (void *)tsc_resp;
3219 req->resp_sz = sizeof(*tsc_resp) + AUTHTAG_LEN;
3220 req->exit_code = SVM_VMGEXIT_GUEST_REQUEST;
3221
3222 rc = snp_send_guest_request(mdesc, req, rio);
3223 if (rc)
3224 goto e_request;
3225
3226 pr_debug("%s: response status 0x%x scale 0x%llx offset 0x%llx factor 0x%x\n",
3227 __func__, tsc_resp->status, tsc_resp->tsc_scale, tsc_resp->tsc_offset,
3228 tsc_resp->tsc_factor);
3229
3230 if (!tsc_resp->status) {
3231 snp_tsc_scale = tsc_resp->tsc_scale;
3232 snp_tsc_offset = tsc_resp->tsc_offset;
3233 } else {
3234 pr_err("Failed to get TSC info, response status 0x%x\n", tsc_resp->status);
3235 rc = -EIO;
3236 }
3237
3238 e_request:
3239 /* The response buffer contains sensitive data, explicitly clear it. */
3240 memzero_explicit(tsc_resp, sizeof(*tsc_resp) + AUTHTAG_LEN);
3241 e_free_mdesc:
3242 snp_msg_free(mdesc);
3243 e_free_rio:
3244 kfree(rio);
3245 e_free_req:
3246 kfree(req);
3247 e_free_tsc_resp:
3248 kfree(tsc_resp);
3249 e_free_tsc_req:
3250 kfree(tsc_req);
3251
3252 return rc;
3253 }
3254
3255 void __init snp_secure_tsc_prepare(void)
3256 {
3257 if (!cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
3258 return;
3259
3260 if (snp_get_tsc_info()) {
3261 pr_alert("Unable to retrieve Secure TSC info from ASP\n");
3262 sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SECURE_TSC);
3263 }
3264
3265 pr_debug("SecureTSC enabled\n");
3266 }
3267
3268 static unsigned long securetsc_get_tsc_khz(void)
3269 {
3270 return snp_tsc_freq_khz;
3271 }
3272
3273 void __init snp_secure_tsc_init(void)
3274 {
3275 unsigned long long tsc_freq_mhz;
3276
3277 if (!cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
3278 return;
3279
3280 setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
3281 rdmsrl(MSR_AMD64_GUEST_TSC_FREQ, tsc_freq_mhz);
3282 snp_tsc_freq_khz = (unsigned long)(tsc_freq_mhz * 1000);
3283
3284 x86_platform.calibrate_cpu = securetsc_get_tsc_khz;
3285 x86_platform.calibrate_tsc = securetsc_get_tsc_khz;
3286 }
3287