1*94212d34SMukesh Rathor // SPDX-License-Identifier: GPL-2.0-only
2*94212d34SMukesh Rathor /*
3*94212d34SMukesh Rathor * X86 specific Hyper-V root partition kdump/crash support module
4*94212d34SMukesh Rathor *
5*94212d34SMukesh Rathor * Copyright (C) 2025, Microsoft, Inc.
6*94212d34SMukesh Rathor *
7*94212d34SMukesh Rathor * This module implements hypervisor RAM collection into vmcore for both
8*94212d34SMukesh Rathor * cases of the hypervisor crash and Linux root crash. Hyper-V implements
9*94212d34SMukesh Rathor * a disable hypercall with a 32bit protected mode ABI callback. This
10*94212d34SMukesh Rathor * mechanism must be used to unlock hypervisor RAM. Since the hypervisor RAM
11*94212d34SMukesh Rathor * is already mapped in Linux, it is automatically collected into Linux vmcore,
12*94212d34SMukesh Rathor * and can be examined by the crash command (raw RAM dump) or windbg.
13*94212d34SMukesh Rathor *
14*94212d34SMukesh Rathor * At a high level:
15*94212d34SMukesh Rathor *
16*94212d34SMukesh Rathor * Hypervisor Crash:
17*94212d34SMukesh Rathor * Upon crash, hypervisor goes into an emergency minimal dispatch loop, a
18*94212d34SMukesh Rathor * restrictive mode with very limited hypercall and MSR support. Each cpu
19*94212d34SMukesh Rathor * then injects NMIs into root vcpus. A shared page is used to check
20*94212d34SMukesh Rathor * by Linux in the NMI handler if the hypervisor has crashed. This shared
21*94212d34SMukesh Rathor * page is setup in hv_root_crash_init during boot.
22*94212d34SMukesh Rathor *
23*94212d34SMukesh Rathor * Linux Crash:
24*94212d34SMukesh Rathor * In case of Linux crash, the callback hv_crash_stop_other_cpus will send
25*94212d34SMukesh Rathor * NMIs to all cpus, then proceed to the crash_nmi_callback where it waits
26*94212d34SMukesh Rathor * for all cpus to be in NMI.
27*94212d34SMukesh Rathor *
28*94212d34SMukesh Rathor * NMI Handler (upon quorum):
29*94212d34SMukesh Rathor * Eventually, in both cases, all cpus will end up in the NMI handler.
30*94212d34SMukesh Rathor * Hyper-V requires the disable hypervisor must be done from the BSP. So
31*94212d34SMukesh Rathor * the BSP NMI handler saves current context, does some fixups and makes
32*94212d34SMukesh Rathor * the hypercall to disable the hypervisor, ie, devirtualize. Hypervisor
33*94212d34SMukesh Rathor * at that point will suspend all vcpus (except the BSP), unlock all its
34*94212d34SMukesh Rathor * RAM, and return to Linux at the 32bit mode entry RIP.
35*94212d34SMukesh Rathor *
36*94212d34SMukesh Rathor * Linux 32bit entry trampoline will then restore long mode and call C
37*94212d34SMukesh Rathor * function here to restore context and continue execution to crash kexec.
38*94212d34SMukesh Rathor */
39*94212d34SMukesh Rathor
40*94212d34SMukesh Rathor #include <linux/delay.h>
41*94212d34SMukesh Rathor #include <linux/kexec.h>
42*94212d34SMukesh Rathor #include <linux/crash_dump.h>
43*94212d34SMukesh Rathor #include <linux/panic.h>
44*94212d34SMukesh Rathor #include <asm/apic.h>
45*94212d34SMukesh Rathor #include <asm/desc.h>
46*94212d34SMukesh Rathor #include <asm/page.h>
47*94212d34SMukesh Rathor #include <asm/pgalloc.h>
48*94212d34SMukesh Rathor #include <asm/mshyperv.h>
49*94212d34SMukesh Rathor #include <asm/nmi.h>
50*94212d34SMukesh Rathor #include <asm/idtentry.h>
51*94212d34SMukesh Rathor #include <asm/reboot.h>
52*94212d34SMukesh Rathor #include <asm/intel_pt.h>
53*94212d34SMukesh Rathor
/* True once both Linux and hypervisor kdump support is fully set up */
bool hv_crash_enabled;
EXPORT_SYMBOL_GPL(hv_crash_enabled);
56*94212d34SMukesh Rathor
/*
 * CPU context saved on the BSP before the disable-hypervisor hypercall and
 * restored in hv_crash_c_entry() once the hypervisor returns control to
 * Linux via the 32bit trampoline.
 */
struct hv_crash_ctxt {
	ulong rsp;	/* stack pointer, restored before any C calls */
	ulong cr0;
	ulong cr2;
	ulong cr4;
	ulong cr8;

	/* segment selectors; the hyp returns with these nulled */
	u16 cs;
	u16 ss;
	u16 ds;
	u16 es;
	u16 fs;
	u16 gs;

	/* fill fields: presumably align the desc_ptr base — TODO confirm */
	u16 gdt_fill;
	struct desc_ptr gdtr;
	char idt_fill[6];
	struct desc_ptr idtr;

	u64 gsbase;
	u64 efer;
	u64 pat;
};
static struct hv_crash_ctxt hv_crash_ctxt;
81*94212d34SMukesh Rathor
/* Shared hypervisor page that contains crash dump area we peek into.
 * NB: windbg looks for "hv_cda" symbol so don't change it.
 */
static struct hv_crashdump_area *hv_cda;

static u32 trampoline_pa;	/* below-4G pa of the asm trampoline page */
static u32 devirt_arg;		/* pa of hv_crash_tramp_data, hypercall arg */
static atomic_t crash_cpus_wait;	/* cpus that reached the NMI handler */
static void *hv_crash_ptpgs[4];	/* trampoline page table, one page/level */
static bool hv_has_crashed, lx_has_crashed;	/* which side crashed first */
91*94212d34SMukesh Rathor
hv_panic_timeout_reboot(void)92*94212d34SMukesh Rathor static void __noreturn hv_panic_timeout_reboot(void)
93*94212d34SMukesh Rathor {
94*94212d34SMukesh Rathor #define PANIC_TIMER_STEP 100
95*94212d34SMukesh Rathor
96*94212d34SMukesh Rathor if (panic_timeout > 0) {
97*94212d34SMukesh Rathor int i;
98*94212d34SMukesh Rathor
99*94212d34SMukesh Rathor for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP)
100*94212d34SMukesh Rathor mdelay(PANIC_TIMER_STEP);
101*94212d34SMukesh Rathor }
102*94212d34SMukesh Rathor
103*94212d34SMukesh Rathor if (panic_timeout)
104*94212d34SMukesh Rathor native_wrmsrq(HV_X64_MSR_RESET, 1); /* get hyp to reboot */
105*94212d34SMukesh Rathor
106*94212d34SMukesh Rathor for (;;)
107*94212d34SMukesh Rathor cpu_relax();
108*94212d34SMukesh Rathor }
109*94212d34SMukesh Rathor
/* This cannot be inlined as it needs stack */
static noinline __noclone void hv_crash_restore_tss(void)
{
	/* Reload TR; the TSS descriptor was marked available beforehand */
	load_TR_desc();
}
115*94212d34SMukesh Rathor
/* This cannot be inlined as it needs stack */
static noinline void hv_crash_clear_kernpt(void)
{
	pgd_t *pgd;
	p4d_t *p4d;

	/* Clear entry so it's not confusing to someone looking at the core */
	pgd = pgd_offset_k(trampoline_pa);
	p4d = p4d_offset(pgd, trampoline_pa);
	/* Undoes the p4d_populate() done in hv_crash_fixup_kernpt() */
	native_p4d_clear(p4d);
}
127*94212d34SMukesh Rathor
128*94212d34SMukesh Rathor /*
129*94212d34SMukesh Rathor * This is the C entry point from the asm glue code after the disable hypercall.
130*94212d34SMukesh Rathor * We enter here in IA32-e long mode, ie, full 64bit mode running on kernel
131*94212d34SMukesh Rathor * page tables with our below 4G page identity mapped, but using a temporary
132*94212d34SMukesh Rathor * GDT. ds/fs/gs/es are null. ss is not usable. bp is null. stack is not
133*94212d34SMukesh Rathor * available. We restore kernel GDT, and rest of the context, and continue
134*94212d34SMukesh Rathor * to kexec.
135*94212d34SMukesh Rathor */
hv_crash_c_entry(void)136*94212d34SMukesh Rathor static asmlinkage void __noreturn hv_crash_c_entry(void)
137*94212d34SMukesh Rathor {
138*94212d34SMukesh Rathor struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
139*94212d34SMukesh Rathor
140*94212d34SMukesh Rathor /* first thing, restore kernel gdt */
141*94212d34SMukesh Rathor native_load_gdt(&ctxt->gdtr);
142*94212d34SMukesh Rathor
143*94212d34SMukesh Rathor asm volatile("movw %%ax, %%ss" : : "a"(ctxt->ss));
144*94212d34SMukesh Rathor asm volatile("movq %0, %%rsp" : : "m"(ctxt->rsp));
145*94212d34SMukesh Rathor
146*94212d34SMukesh Rathor asm volatile("movw %%ax, %%ds" : : "a"(ctxt->ds));
147*94212d34SMukesh Rathor asm volatile("movw %%ax, %%es" : : "a"(ctxt->es));
148*94212d34SMukesh Rathor asm volatile("movw %%ax, %%fs" : : "a"(ctxt->fs));
149*94212d34SMukesh Rathor asm volatile("movw %%ax, %%gs" : : "a"(ctxt->gs));
150*94212d34SMukesh Rathor
151*94212d34SMukesh Rathor native_wrmsrq(MSR_IA32_CR_PAT, ctxt->pat);
152*94212d34SMukesh Rathor asm volatile("movq %0, %%cr0" : : "r"(ctxt->cr0));
153*94212d34SMukesh Rathor
154*94212d34SMukesh Rathor asm volatile("movq %0, %%cr8" : : "r"(ctxt->cr8));
155*94212d34SMukesh Rathor asm volatile("movq %0, %%cr4" : : "r"(ctxt->cr4));
156*94212d34SMukesh Rathor asm volatile("movq %0, %%cr2" : : "r"(ctxt->cr4));
157*94212d34SMukesh Rathor
158*94212d34SMukesh Rathor native_load_idt(&ctxt->idtr);
159*94212d34SMukesh Rathor native_wrmsrq(MSR_GS_BASE, ctxt->gsbase);
160*94212d34SMukesh Rathor native_wrmsrq(MSR_EFER, ctxt->efer);
161*94212d34SMukesh Rathor
162*94212d34SMukesh Rathor /* restore the original kernel CS now via far return */
163*94212d34SMukesh Rathor asm volatile("movzwq %0, %%rax\n\t"
164*94212d34SMukesh Rathor "pushq %%rax\n\t"
165*94212d34SMukesh Rathor "pushq $1f\n\t"
166*94212d34SMukesh Rathor "lretq\n\t"
167*94212d34SMukesh Rathor "1:nop\n\t" : : "m"(ctxt->cs) : "rax");
168*94212d34SMukesh Rathor
169*94212d34SMukesh Rathor /* We are in asmlinkage without stack frame, hence make C function
170*94212d34SMukesh Rathor * calls which will buy stack frames.
171*94212d34SMukesh Rathor */
172*94212d34SMukesh Rathor hv_crash_restore_tss();
173*94212d34SMukesh Rathor hv_crash_clear_kernpt();
174*94212d34SMukesh Rathor
175*94212d34SMukesh Rathor /* we are now fully in devirtualized normal kernel mode */
176*94212d34SMukesh Rathor __crash_kexec(NULL);
177*94212d34SMukesh Rathor
178*94212d34SMukesh Rathor hv_panic_timeout_reboot();
179*94212d34SMukesh Rathor }
180*94212d34SMukesh Rathor /* Tell gcc we are using lretq long jump in the above function intentionally */
181*94212d34SMukesh Rathor STACK_FRAME_NON_STANDARD(hv_crash_c_entry);
182*94212d34SMukesh Rathor
hv_mark_tss_not_busy(void)183*94212d34SMukesh Rathor static void hv_mark_tss_not_busy(void)
184*94212d34SMukesh Rathor {
185*94212d34SMukesh Rathor struct desc_struct *desc = get_current_gdt_rw();
186*94212d34SMukesh Rathor tss_desc tss;
187*94212d34SMukesh Rathor
188*94212d34SMukesh Rathor memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));
189*94212d34SMukesh Rathor tss.type = 0x9; /* available 64-bit TSS. 0xB is busy TSS */
190*94212d34SMukesh Rathor write_gdt_entry(desc, GDT_ENTRY_TSS, &tss, DESC_TSS);
191*94212d34SMukesh Rathor }
192*94212d34SMukesh Rathor
/* Save essential context, to be restored in hv_crash_c_entry() */
static void hv_hvcrash_ctxt_save(void)
{
	struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;

	/* stack pointer of this cpu; used again after devirtualization */
	asm volatile("movq %%rsp,%0" : "=m"(ctxt->rsp));

	ctxt->cr0 = native_read_cr0();
	ctxt->cr4 = native_read_cr4();

	/* cr2/cr8 read via rax ("=a") as there is no accessor helper here */
	asm volatile("movq %%cr2, %0" : "=a"(ctxt->cr2));
	asm volatile("movq %%cr8, %0" : "=a"(ctxt->cr8));

	/* segment selectors; the hyp nulls these on the devirt return path */
	asm volatile("movl %%cs, %%eax" : "=a"(ctxt->cs));
	asm volatile("movl %%ss, %%eax" : "=a"(ctxt->ss));
	asm volatile("movl %%ds, %%eax" : "=a"(ctxt->ds));
	asm volatile("movl %%es, %%eax" : "=a"(ctxt->es));
	asm volatile("movl %%fs, %%eax" : "=a"(ctxt->fs));
	asm volatile("movl %%gs, %%eax" : "=a"(ctxt->gs));

	native_store_gdt(&ctxt->gdtr);
	store_idt(&ctxt->idtr);

	ctxt->gsbase = __rdmsr(MSR_GS_BASE);
	ctxt->efer = __rdmsr(MSR_EFER);
	ctxt->pat = __rdmsr(MSR_IA32_CR_PAT);
}
220*94212d34SMukesh Rathor
/* Add trampoline page to the kernel pagetable for transition to kernel PT */
static void hv_crash_fixup_kernpt(void)
{
	pgd_t *pgd;
	p4d_t *p4d;

	pgd = pgd_offset_k(trampoline_pa);
	p4d = p4d_offset(pgd, trampoline_pa);

	/* trampoline_pa is below 4G, so no pre-existing entry to clobber */
	p4d_populate(&init_mm, p4d, (pud_t *)hv_crash_ptpgs[1]);
	/* the identity-mapped asm stub must be executable after cr3 switch */
	p4d->p4d = p4d->p4d & ~(_PAGE_NX); /* enable execute */
}
234*94212d34SMukesh Rathor
/*
 * Notify the hyp that Linux has crashed. This will cause the hyp to quiesce
 * and suspend all guest VPs.
 */
static void hv_notify_prepare_hyp(void)
{
	u64 status;
	struct hv_input_notify_partition_event *input;
	struct hv_partition_event_root_crashdump_input *cda;

	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
	cda = &input->input.crashdump_input;
	memset(input, 0, sizeof(*input));
	input->event = HV_PARTITION_EVENT_ROOT_CRASHDUMP;

	/* step 1: announce that root crashdump processing has begun */
	cda->crashdump_action = HV_CRASHDUMP_ENTRY;
	status = hv_do_hypercall(HVCALL_NOTIFY_PARTITION_EVENT, input, NULL);
	if (!hv_result_success(status))
		return;	/* best effort: skip suspend if entry notify failed */

	/* step 2: ask the hyp to suspend all guest virtual processors */
	cda->crashdump_action = HV_CRASHDUMP_SUSPEND_ALL_VPS;
	hv_do_hypercall(HVCALL_NOTIFY_PARTITION_EVENT, input, NULL);
}
258*94212d34SMukesh Rathor
/*
 * Common function for all cpus before devirtualization.
 *
 * Hypervisor crash: all cpus get here in NMI context.
 * Linux crash: the panicking cpu gets here at base level, all others in NMI
 * context. Note, panicking cpu may not be the BSP.
 *
 * The function is not inlined so it will show on the stack. It is named so
 * because the crash cmd looks for certain well known function names on the
 * stack before looking into the cpu saved note in the elf section, and
 * that work is currently incomplete.
 *
 * Notes:
 * Hypervisor crash:
 *  - the hypervisor is in a very restrictive mode at this point and any
 *    vmexit it cannot handle would result in reboot. So, no mumbo jumbo,
 *    just get to kexec as quickly as possible.
 *
 * Devirtualization is supported from the BSP only at present.
 */
static noinline __noclone void crash_nmi_callback(struct pt_regs *regs)
{
	struct hv_input_disable_hyp_ex *input;
	u64 status;
	int msecs = 1000, ccpu = smp_processor_id();

	if (ccpu == 0) {
		/* crash_save_cpu() will be done in the kexec path */
		cpu_emergency_stop_pt(); /* disable performance trace */
		atomic_inc(&crash_cpus_wait);
	} else {
		crash_save_cpu(regs, ccpu);
		cpu_emergency_stop_pt(); /* disable performance trace */
		atomic_inc(&crash_cpus_wait);
		/* non-BSP cpus park here; hyp suspends them on devirt */
		for (;;)
			cpu_relax();
	}

	/* BSP only from here on: wait up to ~1s for all cpus to check in */
	while (atomic_read(&crash_cpus_wait) < num_online_cpus() && msecs--)
		mdelay(1);

	stop_nmi();
	/* on a Linux-only crash the hyp is healthy and must be told first */
	if (!hv_has_crashed)
		hv_notify_prepare_hyp();

	if (crashing_cpu == -1)
		crashing_cpu = ccpu; /* crash cmd uses this */

	hv_hvcrash_ctxt_save();
	hv_mark_tss_not_busy();
	hv_crash_fixup_kernpt();

	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
	memset(input, 0, sizeof(*input));
	input->rip = trampoline_pa;	/* 32bit entry the hyp jumps to */
	input->arg = devirt_arg;	/* pa of struct hv_crash_tramp_data */

	/* on success the hyp does not return here: it unlocks its RAM and
	 * resumes this cpu at the 32bit trampoline (see file header)
	 */
	status = hv_do_hypercall(HVCALL_DISABLE_HYP_EX, input, NULL);

	hv_panic_timeout_reboot();
}
320*94212d34SMukesh Rathor
321*94212d34SMukesh Rathor
/* Serializes the no-kdump reboot path: one cpu resets, the rest spin */
static DEFINE_SPINLOCK(hv_crash_reboot_lk);
323*94212d34SMukesh Rathor
/*
 * Generic NMI callback handler: could be called without any crash also.
 *  hv crash: hypervisor injects NMI's into all cpus
 *  lx crash: panicking cpu sends NMI to all but self via crash_stop_other_cpus
 */
static int hv_crash_nmi_local(unsigned int cmd, struct pt_regs *regs)
{
	/* cda_valid in the shared page is how the hyp signals its crash */
	if (!hv_has_crashed && hv_cda && hv_cda->cda_valid)
		hv_has_crashed = true;

	if (!hv_has_crashed && !lx_has_crashed)
		return NMI_DONE; /* ignore the NMI */

	/* hyp crashed but no kdump kernel is loaded: nothing to collect
	 * into, so just reboot. First cpu to get the lock does the reset.
	 */
	if (hv_has_crashed && !kexec_crash_loaded()) {
		if (spin_trylock(&hv_crash_reboot_lk))
			hv_panic_timeout_reboot();
		else
			for (;;)
				cpu_relax();
	}

	crash_nmi_callback(regs);

	/* not reached: crash_nmi_callback() does not return */
	return NMI_DONE;
}
349*94212d34SMukesh Rathor
/*
 * hv_crash_stop_other_cpus() == smp_ops.crash_stop_other_cpus
 *
 * On normal Linux panic, this is called twice: first from panic and then again
 * from native_machine_crash_shutdown.
 *
 * In case of hyperv, 3 ways to get here:
 *  1. hv crash (only BSP will get here):
 *       BSP : NMI callback -> DisableHv -> hv_crash_asm32 -> hv_crash_c_entry
 *	       -> __crash_kexec -> native_machine_crash_shutdown
 *	       -> crash_smp_send_stop -> smp_ops.crash_stop_other_cpus
 *  Linux panic:
 *   2. panic cpu x: panic() -> crash_smp_send_stop
 *		      -> smp_ops.crash_stop_other_cpus
 *   3. BSP: native_machine_crash_shutdown -> crash_smp_send_stop
 *
 * NB: noclone and non standard stack because of call to crash_setup_regs().
 */
static void __noclone hv_crash_stop_other_cpus(void)
{
	static bool crash_stop_done;
	struct pt_regs lregs;
	int ccpu = smp_processor_id();

	if (hv_has_crashed)
		return; /* all cpus already in NMI handler path */

	/* no kdump kernel: nothing to collect into, notify hyp and reboot */
	if (!kexec_crash_loaded()) {
		hv_notify_prepare_hyp();
		hv_panic_timeout_reboot(); /* no return */
	}

	/* If the hv crashes also, we could come here again before cpus_stopped
	 * is set in crash_smp_send_stop(). So use our own check.
	 */
	if (crash_stop_done)
		return;
	crash_stop_done = true;

	/* Linux has crashed: hv is healthy, we can IPI safely */
	lx_has_crashed = true;
	wmb(); /* NMI handlers look at lx_has_crashed */

	apic->send_IPI_allbutself(NMI_VECTOR);

	if (crashing_cpu == -1)
		crashing_cpu = ccpu; /* crash cmd uses this */

	/* crash_setup_regs() happens in kexec also, but for the kexec cpu which
	 * is the BSP. We could be here on non-BSP cpu, collect regs if so.
	 */
	if (ccpu)
		crash_setup_regs(&lregs, NULL);

	/* NOTE(review): on the BSP (ccpu == 0) lregs is passed uninitialized;
	 * crash_nmi_callback() does not read regs for cpu 0 — confirm.
	 */
	crash_nmi_callback(&lregs);
}
STACK_FRAME_NON_STANDARD(hv_crash_stop_other_cpus);
407*94212d34SMukesh Rathor
/* This GDT is accessed in IA32-e compat mode which uses 32bits addresses */
struct hv_gdtreg_32 {
	u16 fill;	/* pad so limit/address line up for a 32bit lgdt */
	u16 limit;
	u32 address;
} __packed;

/* We need a CS with L bit to goto IA32-e long mode from 32bit compat mode */
struct hv_crash_tramp_gdt {
	u64 null;	/* index 0, selector 0, null selector */
	u64 cs64;	/* index 1, selector 8, cs64 selector */
} __packed;

/* No stack, so jump via far ptr in memory to load the 64bit CS */
struct hv_cs_jmptgt {
	u32 address;	/* 32bit target of the far jump */
	u16 csval;	/* selector 8 == cs64 above */
	u16 fill;
} __packed;

/* Linux use only, hypervisor doesn't look at this struct.
 * Field offsets are hard-coded in the asm file; see the BUILD_BUG_ONs in
 * hv_crash_setup_trampdata().
 */
struct hv_crash_tramp_data {
	u64 tramp32_cr3;	/* pa of the below-4G trampoline page table */
	u64 kernel_cr3;		/* pa of init_mm's pgd, to switch back to */
	struct hv_gdtreg_32 gdtr32;
	struct hv_crash_tramp_gdt tramp_gdt;
	struct hv_cs_jmptgt cs_jmptgt;
	u64 c_entry_addr;	/* va of hv_crash_c_entry() */
} __packed;
437*94212d34SMukesh Rathor
438*94212d34SMukesh Rathor /*
439*94212d34SMukesh Rathor * Setup a temporary gdt to allow the asm code to switch to the long mode.
440*94212d34SMukesh Rathor * Since the asm code is relocated/copied to a below 4G page, it cannot use rip
441*94212d34SMukesh Rathor * relative addressing, hence we must use trampoline_pa here. Also, save other
442*94212d34SMukesh Rathor * info like jmp and C entry targets for same reasons.
443*94212d34SMukesh Rathor *
444*94212d34SMukesh Rathor * Returns: 0 on success, -1 on error
445*94212d34SMukesh Rathor */
hv_crash_setup_trampdata(u64 trampoline_va)446*94212d34SMukesh Rathor static int hv_crash_setup_trampdata(u64 trampoline_va)
447*94212d34SMukesh Rathor {
448*94212d34SMukesh Rathor int size, offs;
449*94212d34SMukesh Rathor void *dest;
450*94212d34SMukesh Rathor struct hv_crash_tramp_data *tramp;
451*94212d34SMukesh Rathor
452*94212d34SMukesh Rathor /* These must match exactly the ones in the corresponding asm file */
453*94212d34SMukesh Rathor BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, tramp32_cr3) != 0);
454*94212d34SMukesh Rathor BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, kernel_cr3) != 8);
455*94212d34SMukesh Rathor BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, gdtr32.limit) != 18);
456*94212d34SMukesh Rathor BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data,
457*94212d34SMukesh Rathor cs_jmptgt.address) != 40);
458*94212d34SMukesh Rathor BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, c_entry_addr) != 48);
459*94212d34SMukesh Rathor
460*94212d34SMukesh Rathor /* hv_crash_asm_end is beyond last byte by 1 */
461*94212d34SMukesh Rathor size = &hv_crash_asm_end - &hv_crash_asm32;
462*94212d34SMukesh Rathor if (size + sizeof(struct hv_crash_tramp_data) > PAGE_SIZE) {
463*94212d34SMukesh Rathor pr_err("%s: trampoline page overflow\n", __func__);
464*94212d34SMukesh Rathor return -1;
465*94212d34SMukesh Rathor }
466*94212d34SMukesh Rathor
467*94212d34SMukesh Rathor dest = (void *)trampoline_va;
468*94212d34SMukesh Rathor memcpy(dest, &hv_crash_asm32, size);
469*94212d34SMukesh Rathor
470*94212d34SMukesh Rathor dest += size;
471*94212d34SMukesh Rathor dest = (void *)round_up((ulong)dest, 16);
472*94212d34SMukesh Rathor tramp = (struct hv_crash_tramp_data *)dest;
473*94212d34SMukesh Rathor
474*94212d34SMukesh Rathor /* see MAX_ASID_AVAILABLE in tlb.c: "PCID 0 is reserved for use by
475*94212d34SMukesh Rathor * non-PCID-aware users". Build cr3 with pcid 0
476*94212d34SMukesh Rathor */
477*94212d34SMukesh Rathor tramp->tramp32_cr3 = __sme_pa(hv_crash_ptpgs[0]);
478*94212d34SMukesh Rathor
479*94212d34SMukesh Rathor /* Note, when restoring X86_CR4_PCIDE, cr3[11:0] must be zero */
480*94212d34SMukesh Rathor tramp->kernel_cr3 = __sme_pa(init_mm.pgd);
481*94212d34SMukesh Rathor
482*94212d34SMukesh Rathor tramp->gdtr32.limit = sizeof(struct hv_crash_tramp_gdt);
483*94212d34SMukesh Rathor tramp->gdtr32.address = trampoline_pa +
484*94212d34SMukesh Rathor (ulong)&tramp->tramp_gdt - trampoline_va;
485*94212d34SMukesh Rathor
486*94212d34SMukesh Rathor /* base:0 limit:0xfffff type:b dpl:0 P:1 L:1 D:0 avl:0 G:1 */
487*94212d34SMukesh Rathor tramp->tramp_gdt.cs64 = 0x00af9a000000ffff;
488*94212d34SMukesh Rathor
489*94212d34SMukesh Rathor tramp->cs_jmptgt.csval = 0x8;
490*94212d34SMukesh Rathor offs = (ulong)&hv_crash_asm64 - (ulong)&hv_crash_asm32;
491*94212d34SMukesh Rathor tramp->cs_jmptgt.address = trampoline_pa + offs;
492*94212d34SMukesh Rathor
493*94212d34SMukesh Rathor tramp->c_entry_addr = (u64)&hv_crash_c_entry;
494*94212d34SMukesh Rathor
495*94212d34SMukesh Rathor devirt_arg = trampoline_pa + (ulong)dest - trampoline_va;
496*94212d34SMukesh Rathor
497*94212d34SMukesh Rathor return 0;
498*94212d34SMukesh Rathor }
499*94212d34SMukesh Rathor
/*
 * Build 32bit trampoline page table for transition from protected mode
 * non-paging to long-mode paging. This transition needs pagetables below 4G.
 *
 * Only 4-level tables here: hv_root_crash_init() bails on 5-level, so
 * ptpgs[0] is the top level. Maps just the one trampoline page, identity
 * (va == pa), executable.
 */
static void hv_crash_build_tramp_pt(void)
{
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	u64 pa, addr = trampoline_pa;

	p4d = hv_crash_ptpgs[0] + pgd_index(addr) * sizeof(p4d);
	pa = virt_to_phys(hv_crash_ptpgs[1]);
	set_p4d(p4d, __p4d(_PAGE_TABLE | pa));
	p4d->p4d &= ~(_PAGE_NX);	/* enable execute */

	pud = hv_crash_ptpgs[1] + pud_index(addr) * sizeof(pud);
	pa = virt_to_phys(hv_crash_ptpgs[2]);
	set_pud(pud, __pud(_PAGE_TABLE | pa));

	pmd = hv_crash_ptpgs[2] + pmd_index(addr) * sizeof(pmd);
	pa = virt_to_phys(hv_crash_ptpgs[3]);
	set_pmd(pmd, __pmd(_PAGE_TABLE | pa));

	pte = hv_crash_ptpgs[3] + pte_index(addr) * sizeof(pte);
	set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
}
528*94212d34SMukesh Rathor
529*94212d34SMukesh Rathor /*
530*94212d34SMukesh Rathor * Setup trampoline for devirtualization:
531*94212d34SMukesh Rathor * - a page below 4G, ie 32bit addr containing asm glue code that hyp jmps to
532*94212d34SMukesh Rathor * in protected mode.
533*94212d34SMukesh Rathor * - 4 pages for a temporary page table that asm code uses to turn paging on
534*94212d34SMukesh Rathor * - a temporary gdt to use in the compat mode.
535*94212d34SMukesh Rathor *
536*94212d34SMukesh Rathor * Returns: 0 on success
537*94212d34SMukesh Rathor */
hv_crash_trampoline_setup(void)538*94212d34SMukesh Rathor static int hv_crash_trampoline_setup(void)
539*94212d34SMukesh Rathor {
540*94212d34SMukesh Rathor int i, rc, order;
541*94212d34SMukesh Rathor struct page *page;
542*94212d34SMukesh Rathor u64 trampoline_va;
543*94212d34SMukesh Rathor gfp_t flags32 = GFP_KERNEL | GFP_DMA32 | __GFP_ZERO;
544*94212d34SMukesh Rathor
545*94212d34SMukesh Rathor /* page for 32bit trampoline assembly code + hv_crash_tramp_data */
546*94212d34SMukesh Rathor page = alloc_page(flags32);
547*94212d34SMukesh Rathor if (page == NULL) {
548*94212d34SMukesh Rathor pr_err("%s: failed to alloc asm stub page\n", __func__);
549*94212d34SMukesh Rathor return -1;
550*94212d34SMukesh Rathor }
551*94212d34SMukesh Rathor
552*94212d34SMukesh Rathor trampoline_va = (u64)page_to_virt(page);
553*94212d34SMukesh Rathor trampoline_pa = (u32)page_to_phys(page);
554*94212d34SMukesh Rathor
555*94212d34SMukesh Rathor order = 2; /* alloc 2^2 pages */
556*94212d34SMukesh Rathor page = alloc_pages(flags32, order);
557*94212d34SMukesh Rathor if (page == NULL) {
558*94212d34SMukesh Rathor pr_err("%s: failed to alloc pt pages\n", __func__);
559*94212d34SMukesh Rathor free_page(trampoline_va);
560*94212d34SMukesh Rathor return -1;
561*94212d34SMukesh Rathor }
562*94212d34SMukesh Rathor
563*94212d34SMukesh Rathor for (i = 0; i < 4; i++, page++)
564*94212d34SMukesh Rathor hv_crash_ptpgs[i] = page_to_virt(page);
565*94212d34SMukesh Rathor
566*94212d34SMukesh Rathor hv_crash_build_tramp_pt();
567*94212d34SMukesh Rathor
568*94212d34SMukesh Rathor rc = hv_crash_setup_trampdata(trampoline_va);
569*94212d34SMukesh Rathor if (rc)
570*94212d34SMukesh Rathor goto errout;
571*94212d34SMukesh Rathor
572*94212d34SMukesh Rathor return 0;
573*94212d34SMukesh Rathor
574*94212d34SMukesh Rathor errout:
575*94212d34SMukesh Rathor free_page(trampoline_va);
576*94212d34SMukesh Rathor free_pages((ulong)hv_crash_ptpgs[0], order);
577*94212d34SMukesh Rathor
578*94212d34SMukesh Rathor return rc;
579*94212d34SMukesh Rathor }
580*94212d34SMukesh Rathor
/* Setup for kdump kexec to collect hypervisor RAM when running as root.
 * Registers the NMI handler, locates the hyp crash dump shared page, and
 * installs the trampoline and smp_ops hook. On any failure, only plain
 * Linux root kdump remains enabled.
 */
void hv_root_crash_init(void)
{
	int rc;
	struct hv_input_get_system_property *input;
	struct hv_output_get_system_property *output;
	unsigned long flags;
	u64 status;
	union hv_pfn_range cda_info;

	/* the trampoline page table fixups assume 4-level paging */
	if (pgtable_l5_enabled()) {
		pr_err("Hyper-V: crash dump not yet supported on 5level PTs\n");
		return;
	}

	rc = register_nmi_handler(NMI_LOCAL, hv_crash_nmi_local, NMI_FLAG_FIRST,
				  "hv_crash_nmi");
	if (rc) {
		pr_err("Hyper-V: failed to register crash nmi handler\n");
		return;
	}

	/* irqs off while borrowing this cpu's hypercall arg pages */
	local_irq_save(flags);
	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
	output = *this_cpu_ptr(hyperv_pcpu_output_arg);

	memset(input, 0, sizeof(*input));
	input->property_id = HV_SYSTEM_PROPERTY_CRASHDUMPAREA;

	status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
	cda_info.as_uint64 = output->hv_cda_info.as_uint64;
	local_irq_restore(flags);

	if (!hv_result_success(status)) {
		pr_err("Hyper-V: %s: property:%d %s\n", __func__,
		       input->property_id, hv_result_to_string(status));
		goto err_out;
	}

	if (cda_info.base_pfn == 0) {
		pr_err("Hyper-V: hypervisor crash dump area pfn is 0\n");
		goto err_out;
	}

	/* shared page checked by the NMI handler to detect a hyp crash */
	hv_cda = phys_to_virt(cda_info.base_pfn << HV_HYP_PAGE_SHIFT);

	rc = hv_crash_trampoline_setup();
	if (rc)
		goto err_out;

	smp_ops.crash_stop_other_cpus = hv_crash_stop_other_cpus;

	/* run panic notifiers before kexec so the hyp gets notified */
	crash_kexec_post_notifiers = true;
	hv_crash_enabled = true;
	pr_info("Hyper-V: both linux and hypervisor kdump support enabled\n");

	return;

err_out:
	unregister_nmi_handler(NMI_LOCAL, "hv_crash_nmi");
	pr_err("Hyper-V: only linux root kdump support enabled\n");
}
643