xref: /linux/arch/x86/hyperv/hv_crash.c (revision c3d13784d5b200fc4b4a1f5d5f5585b8e3a5777e)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * X86 specific Hyper-V root partition kdump/crash support module
4  *
5  * Copyright (C) 2025, Microsoft, Inc.
6  *
7  * This module implements hypervisor RAM collection into vmcore for both
8  * cases of the hypervisor crash and Linux root crash. Hyper-V implements
9  * a disable hypercall with a 32bit protected mode ABI callback. This
10  * mechanism must be used to unlock hypervisor RAM. Since the hypervisor RAM
11  * is already mapped in Linux, it is automatically collected into Linux vmcore,
12  * and can be examined by the crash command (raw RAM dump) or windbg.
13  *
14  * At a high level:
15  *
16  *  Hypervisor Crash:
17  *    Upon crash, hypervisor goes into an emergency minimal dispatch loop, a
18  *    restrictive mode with very limited hypercall and MSR support. Each cpu
19  *    then injects NMIs into root vcpus. In the NMI handler, Linux checks a
20  *    shared page to find out whether the hypervisor has crashed. This shared
21  *    page is set up in hv_root_crash_init during boot.
22  *
23  *  Linux Crash:
24  *    In case of Linux crash, the callback hv_crash_stop_other_cpus will send
25  *    NMIs to all cpus, then proceed to the crash_nmi_callback where it waits
26  *    for all cpus to be in NMI.
27  *
28  *  NMI Handler (upon quorum):
29  *    Eventually, in both cases, all cpus will end up in the NMI handler.
30  *    Hyper-V requires that disabling the hypervisor be done from the BSP. So
31  *    the BSP NMI handler saves current context, does some fixups and makes
32  *    the hypercall to disable the hypervisor, ie, devirtualize. Hypervisor
33  *    at that point will suspend all vcpus (except the BSP), unlock all its
34  *    RAM, and return to Linux at the 32bit mode entry RIP.
35  *
36  *  Linux 32bit entry trampoline will then restore long mode and call C
37  *  function here to restore context and continue execution to crash kexec.
38  */
39 
40 #include <linux/delay.h>
41 #include <linux/kexec.h>
42 #include <linux/crash_dump.h>
43 #include <linux/panic.h>
44 #include <asm/apic.h>
45 #include <asm/desc.h>
46 #include <asm/page.h>
47 #include <asm/pgalloc.h>
48 #include <asm/mshyperv.h>
49 #include <asm/nmi.h>
50 #include <asm/idtentry.h>
51 #include <asm/reboot.h>
52 #include <asm/intel_pt.h>
53 
/*
 * Set to true at the very end of hv_root_crash_init(), ie, only after the NMI
 * handler is registered, the crash dump area is located and the
 * devirtualization trampoline is set up. Stays false on any failure there.
 */
54 bool hv_crash_enabled;
55 EXPORT_SYMBOL_GPL(hv_crash_enabled);
56 
57 struct hv_crash_ctxt {
58 	ulong rsp;
59 	ulong cr0;
60 	ulong cr2;
61 	ulong cr4;
62 	ulong cr8;
63 
64 	u16 cs;
65 	u16 ss;
66 	u16 ds;
67 	u16 es;
68 	u16 fs;
69 	u16 gs;
70 
71 	u16 gdt_fill;
72 	struct desc_ptr gdtr;
73 	char idt_fill[6];
74 	struct desc_ptr idtr;
75 
76 	u64 gsbase;
77 	u64 efer;
78 	u64 pat;
79 };
80 static struct hv_crash_ctxt hv_crash_ctxt;
81 
82 /* Shared hypervisor page that contains crash dump area we peek into.
83  * NB: windbg looks for "hv_cda" symbol so don't change it.
84  */
85 static struct hv_crashdump_area *hv_cda;
86 
87 static u32 trampoline_pa, devirt_arg;
88 static atomic_t crash_cpus_wait;
89 static void *hv_crash_ptpgs[4];
90 static bool hv_has_crashed, lx_has_crashed;
91 
hv_panic_timeout_reboot(void)92 static void __noreturn hv_panic_timeout_reboot(void)
93 {
94 	#define PANIC_TIMER_STEP 100
95 
96 	if (panic_timeout > 0) {
97 		int i;
98 
99 		for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP)
100 			mdelay(PANIC_TIMER_STEP);
101 	}
102 
103 	if (panic_timeout)
104 		native_wrmsrq(HV_X64_MSR_RESET, 1);    /* get hyp to reboot */
105 
106 	for (;;)
107 		cpu_relax();
108 }
109 
/*
 * Reload the task register through load_TR_desc(). Called from
 * hv_crash_handle(), ie, after the hypervisor has been disabled.
 * hv_mark_tss_not_busy() switches the TSS descriptor back to the "available"
 * type before the disable hypercall; presumably that is what lets this reload
 * succeed - confirm.
 */
hv_crash_restore_tss(void)110 static void hv_crash_restore_tss(void)
111 {
112 	load_TR_desc();
113 }
114 
hv_crash_clear_kernpt(void)115 static void hv_crash_clear_kernpt(void)
116 {
117 	pgd_t *pgd;
118 	p4d_t *p4d;
119 
120 	/* Clear entry so it's not confusing to someone looking at the core */
121 	pgd = pgd_offset_k(trampoline_pa);
122 	p4d = p4d_offset(pgd, trampoline_pa);
123 	native_p4d_clear(p4d);
124 }
125 
126 
/*
 * Target of the far return at the end of hv_crash_c_entry(). Reloads the task
 * register, removes the trampoline mapping from the kernel page table and
 * then calls __crash_kexec(). If that call comes back, fall through to
 * hv_panic_timeout_reboot(). Never returns.
 */
hv_crash_handle(void)127 static void __noreturn hv_crash_handle(void)
128 {
	/* finish what hv_crash_c_entry() could not do from naked asm */
129 	hv_crash_restore_tss();
130 	hv_crash_clear_kernpt();
131 
132 	/* we are now fully in devirtualized normal kernel mode */
133 	__crash_kexec(NULL);
134 
	/* only reached if __crash_kexec() returned */
135 	hv_panic_timeout_reboot();
136 }
137 
/*
 * __naked functions do not permit function calls, not even to __always_inline
 * functions that only contain asm() blocks themselves. So use a macro instead.
 *
 * Writes the 64bit @val to @msr: MSR index in ecx, low half of the value in
 * eax, high half in edx. @val is parenthesized at each use so that compound
 * expressions expand correctly. Note @val is evaluated twice, so do not pass
 * an expression with side effects.
 */
#define hv_wrmsr(msr, val) \
	asm volatile("wrmsr" :: "c"(msr), "a"((u32)(val)), "d"((u32)((val) >> 32)) : "memory")
144 
145 /*
146  * This is the C entry point from the asm glue code after the disable hypercall.
147  * We enter here in IA32-e long mode, ie, full 64bit mode running on kernel
148  * page tables with our below 4G page identity mapped, but using a temporary
149  * GDT. ds/fs/gs/es are null. ss is not usable. bp is null. stack is not
150  * available. We restore kernel GDT, and rest of the context, and continue
151  * to kexec.
 *
 * Everything reloaded here comes from hv_crash_ctxt, which
 * hv_hvcrash_ctxt_save() filled in before the disable hypercall. The body is
 * made up only of asm statements and the hv_wrmsr() macro because the
 * function is __naked. It does not return to a caller: it leaves through the
 * far return at the end, which continues in hv_crash_handle().
152  */
hv_crash_c_entry(void)153 static void __naked hv_crash_c_entry(void)
154 {
155 	/* first thing, restore kernel gdt */
156 	asm volatile("lgdt %0" : : "m" (hv_crash_ctxt.gdtr));
157 
	/* stack segment and stack pointer, back to back in one asm statement */
158 	asm volatile("movw %0, %%ss\n\t"
159 		     "movq %1, %%rsp"
160 		     :: "m"(hv_crash_ctxt.ss), "m"(hv_crash_ctxt.rsp));
161 
	/* remaining data segment selectors */
162 	asm volatile("movw %0, %%ds" : : "m"(hv_crash_ctxt.ds));
163 	asm volatile("movw %0, %%es" : : "m"(hv_crash_ctxt.es));
164 	asm volatile("movw %0, %%fs" : : "m"(hv_crash_ctxt.fs));
165 	asm volatile("movw %0, %%gs" : : "m"(hv_crash_ctxt.gs));
166 
	/* PAT msr first, then the saved control registers */
167 	hv_wrmsr(MSR_IA32_CR_PAT, hv_crash_ctxt.pat);
168 	asm volatile("movq %0, %%cr0" : : "r"(hv_crash_ctxt.cr0));
169 
170 	asm volatile("movq %0, %%cr8" : : "r"(hv_crash_ctxt.cr8));
171 	asm volatile("movq %0, %%cr4" : : "r"(hv_crash_ctxt.cr4));
172 	asm volatile("movq %0, %%cr2" : : "r"(hv_crash_ctxt.cr2));
173 
	/* kernel idt, then gs base and efer as they were at save time */
174 	asm volatile("lidt %0" : : "m" (hv_crash_ctxt.idtr));
175 	hv_wrmsr(MSR_GS_BASE, hv_crash_ctxt.gsbase);
176 	hv_wrmsr(MSR_EFER, hv_crash_ctxt.efer);
177 
178 	/* restore the original kernel CS now via far return */
	/*
	 * Push the saved cs, then the address of hv_crash_handle: lretq pops
	 * the address first and the selector second, so execution continues
	 * in hv_crash_handle() with the saved kernel cs loaded.
	 */
179 	asm volatile("pushq %q0\n\t"
180 		     "pushq %q1\n\t"
181 		     "lretq"
182 		     :: "r"(hv_crash_ctxt.cs), "r"(hv_crash_handle));
183 }
184 /* Tell objtool we are using lretq long jump in the above function intentionally */
185 STACK_FRAME_NON_STANDARD(hv_crash_c_entry);
186 
hv_mark_tss_not_busy(void)187 static void hv_mark_tss_not_busy(void)
188 {
189 	struct desc_struct *desc = get_current_gdt_rw();
190 	tss_desc tss;
191 
192 	memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));
193 	tss.type = 0x9;        /* available 64-bit TSS. 0xB is busy TSS */
194 	write_gdt_entry(desc, GDT_ENTRY_TSS, &tss, DESC_TSS);
195 }
196 
197 /* Save essential context */
hv_hvcrash_ctxt_save(void)198 static void hv_hvcrash_ctxt_save(void)
199 {
200 	struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
201 
202 	ctxt->rsp = current_stack_pointer;
203 
204 	ctxt->cr0 = native_read_cr0();
205 	ctxt->cr4 = native_read_cr4();
206 
207 	asm volatile("movq %%cr2, %0" : "=r"(ctxt->cr2));
208 	asm volatile("movq %%cr8, %0" : "=r"(ctxt->cr8));
209 
210 	asm volatile("movw %%cs, %0" : "=m"(ctxt->cs));
211 	asm volatile("movw %%ss, %0" : "=m"(ctxt->ss));
212 	asm volatile("movw %%ds, %0" : "=m"(ctxt->ds));
213 	asm volatile("movw %%es, %0" : "=m"(ctxt->es));
214 	asm volatile("movw %%fs, %0" : "=m"(ctxt->fs));
215 	asm volatile("movw %%gs, %0" : "=m"(ctxt->gs));
216 
217 	native_store_gdt(&ctxt->gdtr);
218 	store_idt(&ctxt->idtr);
219 
220 	ctxt->gsbase = __rdmsr(MSR_GS_BASE);
221 	ctxt->efer = __rdmsr(MSR_EFER);
222 	ctxt->pat = __rdmsr(MSR_IA32_CR_PAT);
223 }
224 
225 /* Add trampoline page to the kernel pagetable for transition to kernel PT */
hv_crash_fixup_kernpt(void)226 static void hv_crash_fixup_kernpt(void)
227 {
228 	pgd_t *pgd;
229 	p4d_t *p4d;
230 
231 	pgd = pgd_offset_k(trampoline_pa);
232 	p4d = p4d_offset(pgd, trampoline_pa);
233 
234 	/* trampoline_pa is below 4G, so no pre-existing entry to clobber */
235 	p4d_populate(&init_mm, p4d, (pud_t *)hv_crash_ptpgs[1]);
236 	p4d->p4d = p4d->p4d & ~(_PAGE_NX);    /* enable execute */
237 }
238 
239 /*
240  * Notify the hyp that Linux has crashed. This will cause the hyp to quiesce
241  * and suspend all guest VPs.
242  */
hv_notify_prepare_hyp(void)243 static void hv_notify_prepare_hyp(void)
244 {
245 	u64 status;
246 	struct hv_input_notify_partition_event *input;
247 	struct hv_partition_event_root_crashdump_input *cda;
248 
249 	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
250 	cda = &input->input.crashdump_input;
251 	memset(input, 0, sizeof(*input));
252 	input->event = HV_PARTITION_EVENT_ROOT_CRASHDUMP;
253 
254 	cda->crashdump_action = HV_CRASHDUMP_ENTRY;
255 	status = hv_do_hypercall(HVCALL_NOTIFY_PARTITION_EVENT, input, NULL);
256 	if (!hv_result_success(status))
257 		return;
258 
259 	cda->crashdump_action = HV_CRASHDUMP_SUSPEND_ALL_VPS;
260 	hv_do_hypercall(HVCALL_NOTIFY_PARTITION_EVENT, input, NULL);
261 }
262 
263 /*
264  * Common function for all cpus before devirtualization.
265  *
266  * Hypervisor crash: all cpus get here in NMI context.
267  * Linux crash: the panicing cpu gets here at base level, all others in NMI
268  *		context. Note, panicing cpu may not be the BSP.
269  *
270  * The function is not inlined so it will show on the stack. It is named so
271  * because the crash cmd looks for certain well known function names on the
272  * stack before looking into the cpu saved note in the elf section, and
273  * that work is currently incomplete.
274  *
275  * Notes:
276  *  Hypervisor crash:
277  *    - the hypervisor is in a very restrictive mode at this point and any
278  *	vmexit it cannot handle would result in reboot. So, no mumbo jumbo,
279  *	just get to kexec as quickly as possible.
280  *
281  *  Devirtualization is supported from the BSP only at present.
282  */
crash_nmi_callback(struct pt_regs * regs)283 static noinline __noclone void crash_nmi_callback(struct pt_regs *regs)
284 {
285 	struct hv_input_disable_hyp_ex *input;
286 	int msecs = 1000, ccpu = smp_processor_id();
287 
288 	if (ccpu == 0) {
289 		/* crash_save_cpu() will be done in the kexec path */
290 		cpu_emergency_stop_pt();	/* disable performance trace */
291 		atomic_inc(&crash_cpus_wait);
292 	} else {
293 		crash_save_cpu(regs, ccpu);
294 		cpu_emergency_stop_pt();	/* disable performance trace */
295 		atomic_inc(&crash_cpus_wait);
296 		for (;;)
297 			cpu_relax();
298 	}
299 
300 	while (atomic_read(&crash_cpus_wait) < num_online_cpus() && msecs--)
301 		mdelay(1);
302 
303 	stop_nmi();
304 	if (!hv_has_crashed)
305 		hv_notify_prepare_hyp();
306 
307 	if (crashing_cpu == -1)
308 		crashing_cpu = ccpu;		/* crash cmd uses this */
309 
310 	hv_hvcrash_ctxt_save();
311 	hv_mark_tss_not_busy();
312 	hv_crash_fixup_kernpt();
313 
314 	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
315 	memset(input, 0, sizeof(*input));
316 	input->rip = trampoline_pa;
317 	input->arg = devirt_arg;
318 
319 	(void)hv_do_hypercall(HVCALL_DISABLE_HYP_EX, input, NULL);
320 
321 	hv_panic_timeout_reboot();
322 }
323 
324 
325 static DEFINE_SPINLOCK(hv_crash_reboot_lk);
326 
327 /*
328  * Generic NMI callback handler: could be called without any crash also.
329  *   hv crash: hypervisor injects NMI's into all cpus
330  *   lx crash: panicing cpu sends NMI to all but self via crash_stop_other_cpus
331  */
hv_crash_nmi_local(unsigned int cmd,struct pt_regs * regs)332 static int hv_crash_nmi_local(unsigned int cmd, struct pt_regs *regs)
333 {
334 	if (!hv_has_crashed && hv_cda && hv_cda->cda_valid)
335 		hv_has_crashed = true;
336 
337 	if (!hv_has_crashed && !lx_has_crashed)
338 		return NMI_DONE;	/* ignore the NMI */
339 
340 	if (hv_has_crashed && !kexec_crash_loaded()) {
341 		if (spin_trylock(&hv_crash_reboot_lk))
342 			hv_panic_timeout_reboot();
343 		else
344 			for (;;)
345 				cpu_relax();
346 	}
347 
348 	crash_nmi_callback(regs);
349 
350 	return NMI_DONE;
351 }
352 
353 /*
354  * hv_crash_stop_other_cpus() == smp_ops.crash_stop_other_cpus
355  *
356  * On normal Linux panic, this is called twice: first from panic and then again
357  * from native_machine_crash_shutdown.
358  *
359  * In case of hyperv, 3 ways to get here:
360  *  1. hv crash (only BSP will get here):
361  *	BSP : NMI callback -> DisableHv -> hv_crash_asm32 -> hv_crash_c_entry
362  *		  -> __crash_kexec -> native_machine_crash_shutdown
363  *		  -> crash_smp_send_stop -> smp_ops.crash_stop_other_cpus
364  *  Linux panic:
365  *	2. panic cpu x: panic() -> crash_smp_send_stop
366  *				     -> smp_ops.crash_stop_other_cpus
367  *	3. BSP: native_machine_crash_shutdown -> crash_smp_send_stop
368  *
369  * NB: noclone and non standard stack because of call to crash_setup_regs().
370  */
hv_crash_stop_other_cpus(void)371 static void __noclone hv_crash_stop_other_cpus(void)
372 {
373 	static bool crash_stop_done;
374 	struct pt_regs lregs;
375 	int ccpu = smp_processor_id();
376 
377 	if (hv_has_crashed)
378 		return;		/* all cpus already in NMI handler path */
379 
380 	if (!kexec_crash_loaded()) {
381 		hv_notify_prepare_hyp();
382 		hv_panic_timeout_reboot();	/* no return */
383 	}
384 
385 	/* If the hv crashes also, we could come here again before cpus_stopped
386 	 * is set in crash_smp_send_stop(). So use our own check.
387 	 */
388 	if (crash_stop_done)
389 		return;
390 	crash_stop_done = true;
391 
392 	/* Linux has crashed: hv is healthy, we can IPI safely */
393 	lx_has_crashed = true;
394 	wmb();			/* NMI handlers look at lx_has_crashed */
395 
396 	apic->send_IPI_allbutself(NMI_VECTOR);
397 
398 	if (crashing_cpu == -1)
399 		crashing_cpu = ccpu;		/* crash cmd uses this */
400 
401 	/* crash_setup_regs() happens in kexec also, but for the kexec cpu which
402 	 * is the BSP. We could be here on non-BSP cpu, collect regs if so.
403 	 */
404 	if (ccpu)
405 		crash_setup_regs(&lregs, NULL);
406 
407 	crash_nmi_callback(&lregs);
408 }
409 STACK_FRAME_NON_STANDARD(hv_crash_stop_other_cpus);
410 
411 /* This GDT is accessed in IA32-e compat mode which uses 32bits addresses */
/*
 * Limit/address pair describing the temporary GDT, embedded in
 * struct hv_crash_tramp_data and filled in by hv_crash_setup_trampdata().
 * Presumably this is what the asm stub loads the GDT register from; the stub
 * is not part of this file.
 */
412 struct hv_gdtreg_32 {
413 	u16 fill;	/* padding in front of the limit/address pair */
414 	u16 limit;	/* set to sizeof(struct hv_crash_tramp_gdt) */
415 	u32 address;	/* physical address of the temporary gdt */
416 } __packed;
417 
418 /* We need a CS with L bit to goto IA32-e long mode from 32bit compat mode */
/*
 * The temporary GDT itself. Only cs64 is written, by
 * hv_crash_setup_trampdata(); the null entry relies on the trampoline page
 * having been allocated zeroed.
 */
419 struct hv_crash_tramp_gdt {
420 	u64 null;	/* index 0, selector 0, null selector */
421 	u64 cs64;	/* index 1, selector 8, cs64 selector */
422 } __packed;
423 
424 /* No stack, so jump via far ptr in memory to load the 64bit CS */
/*
 * Far pointer operand: 32-bit target address followed by the code segment
 * selector. Filled in by hv_crash_setup_trampdata().
 */
425 struct hv_cs_jmptgt {
426 	u32 address;	/* trampoline_pa + offset of hv_crash_asm64 in the stub */
427 	u16 csval;	/* 0x8, ie, the cs64 entry of the temporary gdt */
428 	u16 fill;	/* padding, not written in this file */
429 } __packed;
430 
431 /* Linux use only, hypervisor doesn't look at this struct */
/*
 * Data block placed behind the copied asm stub in the trampoline page. The
 * member offsets are fixed: hv_crash_setup_trampdata() checks them with
 * BUILD_BUG_ON() against the values the asm file expects.
 */
432 struct hv_crash_tramp_data {
433 	u64 tramp32_cr3;		/* offset 0: temporary page table root */
434 	u64 kernel_cr3;			/* offset 8: init_mm.pgd, pcid bits zero */
435 	struct hv_gdtreg_32 gdtr32;	/* gdtr32.limit is at offset 18 */
436 	struct hv_crash_tramp_gdt tramp_gdt;
437 	struct hv_cs_jmptgt cs_jmptgt;	/* cs_jmptgt.address is at offset 40 */
438 	u64 c_entry_addr;		/* offset 48: &hv_crash_c_entry */
439 } __packed;
440 
441 /*
442  * Setup a temporary gdt to allow the asm code to switch to the long mode.
443  * Since the asm code is relocated/copied to a below 4G page, it cannot use rip
444  * relative addressing, hence we must use trampoline_pa here. Also, save other
445  * info like jmp and C entry targets for same reasons.
446  *
447  * Returns: 0 on success, -1 on error
448  */
hv_crash_setup_trampdata(u64 trampoline_va)449 static int hv_crash_setup_trampdata(u64 trampoline_va)
450 {
451 	int size, offs;
452 	void *dest;
453 	struct hv_crash_tramp_data *tramp;
454 
455 	/* These must match exactly the ones in the corresponding asm file */
456 	BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, tramp32_cr3) != 0);
457 	BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, kernel_cr3) != 8);
458 	BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, gdtr32.limit) != 18);
459 	BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data,
460 						     cs_jmptgt.address) != 40);
461 	BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, c_entry_addr) != 48);
462 
463 	/* hv_crash_asm_end is beyond last byte by 1 */
464 	size = &hv_crash_asm_end - &hv_crash_asm32;
465 	if (size + sizeof(struct hv_crash_tramp_data) > PAGE_SIZE) {
466 		pr_err("%s: trampoline page overflow\n", __func__);
467 		return -1;
468 	}
469 
470 	dest = (void *)trampoline_va;
471 	memcpy(dest, &hv_crash_asm32, size);
472 
473 	dest += size;
474 	dest = (void *)round_up((ulong)dest, 16);
475 	tramp = (struct hv_crash_tramp_data *)dest;
476 
477 	/* see MAX_ASID_AVAILABLE in tlb.c: "PCID 0 is reserved for use by
478 	 * non-PCID-aware users". Build cr3 with pcid 0
479 	 */
480 	tramp->tramp32_cr3 = __sme_pa(hv_crash_ptpgs[0]);
481 
482 	/* Note, when restoring X86_CR4_PCIDE, cr3[11:0] must be zero */
483 	tramp->kernel_cr3 = __sme_pa(init_mm.pgd);
484 
485 	tramp->gdtr32.limit = sizeof(struct hv_crash_tramp_gdt);
486 	tramp->gdtr32.address = trampoline_pa +
487 				   (ulong)&tramp->tramp_gdt - trampoline_va;
488 
489 	 /* base:0 limit:0xfffff type:b dpl:0 P:1 L:1 D:0 avl:0 G:1 */
490 	tramp->tramp_gdt.cs64 = 0x00af9a000000ffff;
491 
492 	tramp->cs_jmptgt.csval = 0x8;
493 	offs = (ulong)&hv_crash_asm64 - (ulong)&hv_crash_asm32;
494 	tramp->cs_jmptgt.address = trampoline_pa + offs;
495 
496 	tramp->c_entry_addr = (u64)&hv_crash_c_entry;
497 
498 	devirt_arg = trampoline_pa + (ulong)dest - trampoline_va;
499 
500 	return 0;
501 }
502 
503 /*
504  * Build 32bit trampoline page table for transition from protected mode
505  * non-paging to long-mode paging. This transition needs pagetables below 4G.
506  */
hv_crash_build_tramp_pt(void)507 static void hv_crash_build_tramp_pt(void)
508 {
509 	p4d_t *p4d;
510 	pud_t *pud;
511 	pmd_t *pmd;
512 	pte_t *pte;
513 	u64 pa, addr = trampoline_pa;
514 
515 	p4d = hv_crash_ptpgs[0] + pgd_index(addr) * sizeof(p4d);
516 	pa = virt_to_phys(hv_crash_ptpgs[1]);
517 	set_p4d(p4d, __p4d(_PAGE_TABLE | pa));
518 	p4d->p4d &= ~(_PAGE_NX);	/* enable execute */
519 
520 	pud = hv_crash_ptpgs[1] + pud_index(addr) * sizeof(pud);
521 	pa = virt_to_phys(hv_crash_ptpgs[2]);
522 	set_pud(pud, __pud(_PAGE_TABLE | pa));
523 
524 	pmd = hv_crash_ptpgs[2] + pmd_index(addr) * sizeof(pmd);
525 	pa = virt_to_phys(hv_crash_ptpgs[3]);
526 	set_pmd(pmd, __pmd(_PAGE_TABLE | pa));
527 
528 	pte = hv_crash_ptpgs[3] + pte_index(addr) * sizeof(pte);
529 	set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
530 }
531 
532 /*
533  * Setup trampoline for devirtualization:
534  *  - a page below 4G, ie 32bit addr containing asm glue code that hyp jmps to
535  *    in protected mode.
536  *  - 4 pages for a temporary page table that asm code uses to turn paging on
537  *  - a temporary gdt to use in the compat mode.
538  *
539  *  Returns: 0 on success
540  */
hv_crash_trampoline_setup(void)541 static int hv_crash_trampoline_setup(void)
542 {
543 	int i, rc, order;
544 	struct page *page;
545 	u64 trampoline_va;
546 	gfp_t flags32 = GFP_KERNEL | GFP_DMA32 | __GFP_ZERO;
547 
548 	/* page for 32bit trampoline assembly code + hv_crash_tramp_data */
549 	page = alloc_page(flags32);
550 	if (page == NULL) {
551 		pr_err("%s: failed to alloc asm stub page\n", __func__);
552 		return -1;
553 	}
554 
555 	trampoline_va = (u64)page_to_virt(page);
556 	trampoline_pa = (u32)page_to_phys(page);
557 
558 	order = 2;	   /* alloc 2^2 pages */
559 	page = alloc_pages(flags32, order);
560 	if (page == NULL) {
561 		pr_err("%s: failed to alloc pt pages\n", __func__);
562 		free_page(trampoline_va);
563 		return -1;
564 	}
565 
566 	for (i = 0; i < 4; i++, page++)
567 		hv_crash_ptpgs[i] = page_to_virt(page);
568 
569 	hv_crash_build_tramp_pt();
570 
571 	rc = hv_crash_setup_trampdata(trampoline_va);
572 	if (rc)
573 		goto errout;
574 
575 	return 0;
576 
577 errout:
578 	free_page(trampoline_va);
579 	free_pages((ulong)hv_crash_ptpgs[0], order);
580 
581 	return rc;
582 }
583 
584 /* Setup for kdump kexec to collect hypervisor RAM when running as root */
hv_root_crash_init(void)585 void hv_root_crash_init(void)
586 {
587 	int rc;
588 	struct hv_input_get_system_property *input;
589 	struct hv_output_get_system_property *output;
590 	unsigned long flags;
591 	u64 status;
592 	union hv_pfn_range cda_info;
593 
594 	if (pgtable_l5_enabled()) {
595 		pr_err("Hyper-V: crash dump not yet supported on 5level PTs\n");
596 		return;
597 	}
598 
599 	rc = register_nmi_handler(NMI_LOCAL, hv_crash_nmi_local, NMI_FLAG_FIRST,
600 				  "hv_crash_nmi");
601 	if (rc) {
602 		pr_err("Hyper-V: failed to register crash nmi handler\n");
603 		return;
604 	}
605 
606 	local_irq_save(flags);
607 	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
608 	output = *this_cpu_ptr(hyperv_pcpu_output_arg);
609 
610 	memset(input, 0, sizeof(*input));
611 	input->property_id = HV_SYSTEM_PROPERTY_CRASHDUMPAREA;
612 
613 	status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
614 	cda_info.as_uint64 = output->hv_cda_info.as_uint64;
615 	local_irq_restore(flags);
616 
617 	if (!hv_result_success(status)) {
618 		pr_err("Hyper-V: %s: property:%d %s\n", __func__,
619 		       input->property_id, hv_result_to_string(status));
620 		goto err_out;
621 	}
622 
623 	if (cda_info.base_pfn == 0) {
624 		pr_err("Hyper-V: hypervisor crash dump area pfn is 0\n");
625 		goto err_out;
626 	}
627 
628 	hv_cda = phys_to_virt(cda_info.base_pfn << HV_HYP_PAGE_SHIFT);
629 
630 	rc = hv_crash_trampoline_setup();
631 	if (rc)
632 		goto err_out;
633 
634 #ifdef CONFIG_SMP
635 	smp_ops.crash_stop_other_cpus = hv_crash_stop_other_cpus;
636 #endif
637 
638 	crash_kexec_post_notifiers = true;
639 	hv_crash_enabled = true;
640 	pr_info("Hyper-V: both linux and hypervisor kdump support enabled\n");
641 
642 	return;
643 
644 err_out:
645 	unregister_nmi_handler(NMI_LOCAL, "hv_crash_nmi");
646 	pr_err("Hyper-V: only linux root kdump support enabled\n");
647 }
648