xref: /illumos-gate/usr/src/uts/i86xpv/os/xpv_panic.c (revision 843e19887f64dde75055cf8842fc4db2171eff45)
1*843e1988Sjohnlev /*
2*843e1988Sjohnlev  * CDDL HEADER START
3*843e1988Sjohnlev  *
4*843e1988Sjohnlev  * The contents of this file are subject to the terms of the
5*843e1988Sjohnlev  * Common Development and Distribution License (the "License").
6*843e1988Sjohnlev  * You may not use this file except in compliance with the License.
7*843e1988Sjohnlev  *
8*843e1988Sjohnlev  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*843e1988Sjohnlev  * or http://www.opensolaris.org/os/licensing.
10*843e1988Sjohnlev  * See the License for the specific language governing permissions
11*843e1988Sjohnlev  * and limitations under the License.
12*843e1988Sjohnlev  *
13*843e1988Sjohnlev  * When distributing Covered Code, include this CDDL HEADER in each
14*843e1988Sjohnlev  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*843e1988Sjohnlev  * If applicable, add the following below this CDDL HEADER, with the
16*843e1988Sjohnlev  * fields enclosed by brackets "[]" replaced with your own identifying
17*843e1988Sjohnlev  * information: Portions Copyright [yyyy] [name of copyright owner]
18*843e1988Sjohnlev  *
19*843e1988Sjohnlev  * CDDL HEADER END
20*843e1988Sjohnlev  */
21*843e1988Sjohnlev 
22*843e1988Sjohnlev /*
23*843e1988Sjohnlev  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24*843e1988Sjohnlev  * Use is subject to license terms.
25*843e1988Sjohnlev  */
26*843e1988Sjohnlev 
27*843e1988Sjohnlev #pragma ident	"%Z%%M%	%I%	%E% SMI"
28*843e1988Sjohnlev 
29*843e1988Sjohnlev #include <sys/types.h>
30*843e1988Sjohnlev #include <sys/clock.h>
31*843e1988Sjohnlev #include <sys/psm.h>
32*843e1988Sjohnlev #include <sys/archsystm.h>
33*843e1988Sjohnlev #include <sys/machsystm.h>
34*843e1988Sjohnlev #include <sys/compress.h>
35*843e1988Sjohnlev #include <sys/modctl.h>
36*843e1988Sjohnlev #include <sys/trap.h>
37*843e1988Sjohnlev #include <sys/panic.h>
38*843e1988Sjohnlev #include <sys/regset.h>
39*843e1988Sjohnlev #include <sys/frame.h>
40*843e1988Sjohnlev #include <sys/kobj.h>
41*843e1988Sjohnlev #include <sys/apic.h>
42*843e1988Sjohnlev #include <sys/dumphdr.h>
43*843e1988Sjohnlev #include <sys/mem.h>
44*843e1988Sjohnlev #include <sys/x86_archext.h>
45*843e1988Sjohnlev #include <sys/xpv_panic.h>
46*843e1988Sjohnlev #include <sys/boot_console.h>
47*843e1988Sjohnlev #include <sys/bootsvcs.h>
48*843e1988Sjohnlev #include <sys/consdev.h>
49*843e1988Sjohnlev #include <vm/hat_pte.h>
50*843e1988Sjohnlev #include <vm/hat_i86.h>
51*843e1988Sjohnlev 
/* XXX: need to add a PAE version too, if we ever support both PAE and non */
/* Path of the Xen symbol file; the 64-bit build uses the amd64 directory. */
#if defined(__i386)
#define	XPV_FILENAME	"/boot/xen-syms"
#else
#define	XPV_FILENAME	"/boot/amd64/xen-syms"
#endif
/* Module name used for the fake Xen module. */
#define	XPV_MODNAME	"xpv"

/* Incremented on entry to xpv_do_panic(); non-zero once a Xen panic began. */
int xpv_panicking = 0;

struct module *xpv_module;
struct modctl *xpv_modctl;

/*
 * Round x up to the next multiple of a.  An alignment of 0 returns x
 * unchanged.  The mask arithmetic assumes a is a power of two.
 */
#define	ALIGN(x, a)	((a) == 0 ? (uintptr_t)(x) : \
	(((uintptr_t)(x) + (uintptr_t)(a) - 1l) & ~((uintptr_t)(a) - 1l)))

/* Pointer to the xpv_panic_info structure handed to us by Xen.  */
static struct panic_info *xpv_panic_info = NULL;

/* Timer support */
/* Shift applied when scaling TSC ticks to nanoseconds. */
#define	NSEC_SHIFT 5
/* IDT vector on which the APIC timer interrupt is delivered. */
#define	T_XPV_TIMER	0xd1
#define	XPV_TIMER_INTERVAL	1000	/* 1000 microseconds */
/* Base of the local APIC register window (taken from the panic_info). */
static uint32_t *xpv_apicadr = NULL;
/* TSC-to-nanosecond multiplier, set by xpv_panic_time_init(). */
static uint_t	nsec_scale;

/* IDT support */
#pragma	align	16(xpv_panic_idt)
static gate_desc_t	xpv_panic_idt[NIDT];	/* interrupt descriptor table */

/* Xen pagetables mapped into our HAT's ptable windows */
static pfn_t ptable_pfn[MAX_NUM_LEVEL];

/* Number of MMU_PAGESIZE pages we're adding to the Solaris dump */
static int xpv_dump_pages;

/*
 * Some commonly used values that we don't want to recompute over and over.
 */
/* Number of entries in a page table at each level. */
static int xpv_panic_nptes[MAX_NUM_LEVEL];
/* Value of %cr3 captured at panic time. */
static ulong_t xpv_panic_cr3;
/* Upper bound of the Xen virtual address range. */
static uintptr_t xpv_end;

/* Output routine used for panic messages; defaults to the console printer. */
static void xpv_panic_console_print(const char *fmt, ...);
static void (*xpv_panic_printf)(const char *, ...) = xpv_panic_console_print;

/* Scratch buffer into which console messages are formatted. */
#define	CONSOLE_BUF_SIZE	256
static char console_buffer[CONSOLE_BUF_SIZE];
/* B_TRUE when console output goes through the polled I/O putchar routine. */
static boolean_t use_polledio;
101*843e1988Sjohnlev 
102*843e1988Sjohnlev static void
103*843e1988Sjohnlev xpv_panic_putc(int m)
104*843e1988Sjohnlev {
105*843e1988Sjohnlev 	struct cons_polledio *c = cons_polledio;
106*843e1988Sjohnlev 
107*843e1988Sjohnlev 	/* This really shouldn't happen */
108*843e1988Sjohnlev 	if (console == CONS_HYPERVISOR)
109*843e1988Sjohnlev 		return;
110*843e1988Sjohnlev 
111*843e1988Sjohnlev 	if (use_polledio == B_TRUE)
112*843e1988Sjohnlev 		c->cons_polledio_putchar(c->cons_polledio_argument, m);
113*843e1988Sjohnlev 	else
114*843e1988Sjohnlev 		bcons_putchar(m);
115*843e1988Sjohnlev }
116*843e1988Sjohnlev 
117*843e1988Sjohnlev static void
118*843e1988Sjohnlev xpv_panic_puts(char *msg)
119*843e1988Sjohnlev {
120*843e1988Sjohnlev 	char *m;
121*843e1988Sjohnlev 
122*843e1988Sjohnlev 	dump_timeleft = dump_timeout;
123*843e1988Sjohnlev 	for (m = msg; *m; m++)
124*843e1988Sjohnlev 		xpv_panic_putc((int)*m);
125*843e1988Sjohnlev }
126*843e1988Sjohnlev 
127*843e1988Sjohnlev static void
128*843e1988Sjohnlev xpv_panic_console_print(const char *fmt, ...)
129*843e1988Sjohnlev {
130*843e1988Sjohnlev 	va_list ap;
131*843e1988Sjohnlev 
132*843e1988Sjohnlev 	va_start(ap, fmt);
133*843e1988Sjohnlev 	(void) vsnprintf(console_buffer, sizeof (console_buffer), fmt, ap);
134*843e1988Sjohnlev 	va_end(ap);
135*843e1988Sjohnlev 
136*843e1988Sjohnlev 	xpv_panic_puts(console_buffer);
137*843e1988Sjohnlev }
138*843e1988Sjohnlev 
139*843e1988Sjohnlev static void
140*843e1988Sjohnlev xpv_panic_map(int level, pfn_t pfn)
141*843e1988Sjohnlev {
142*843e1988Sjohnlev 	x86pte_t pte, *pteptr;
143*843e1988Sjohnlev 
144*843e1988Sjohnlev 	/*
145*843e1988Sjohnlev 	 * The provided pfn represents a level 'level' page table.  Map it
146*843e1988Sjohnlev 	 * into the 'level' slot in the list of page table windows.
147*843e1988Sjohnlev 	 */
148*843e1988Sjohnlev 	pteptr = (x86pte_t *)PWIN_PTE_VA(level);
149*843e1988Sjohnlev 	pte = pfn_to_pa(pfn) | PT_VALID;
150*843e1988Sjohnlev 
151*843e1988Sjohnlev 	XPV_ALLOW_PAGETABLE_UPDATES();
152*843e1988Sjohnlev 	if (mmu.pae_hat)
153*843e1988Sjohnlev 		*pteptr = pte;
154*843e1988Sjohnlev 	else
155*843e1988Sjohnlev 		*(x86pte32_t *)pteptr = pte;
156*843e1988Sjohnlev 	XPV_DISALLOW_PAGETABLE_UPDATES();
157*843e1988Sjohnlev 
158*843e1988Sjohnlev 	mmu_tlbflush_entry(PWIN_VA(level));
159*843e1988Sjohnlev }
160*843e1988Sjohnlev 
/*
 * Walk the page tables to find the pfn mapped by the given va.
 *
 * On entry *vaddr is the address at which to start searching.  The walk
 * moves forward from there until it finds a mapped page, skipping the
 * APIC page and the range [pi_ram_start, pi_ram_end) described by the
 * panic_info structure.
 *
 * On success *vaddr is updated to the address of the page that was found
 * and the page's MFN is returned with PFN_IS_FOREIGN_MFN set.  If the
 * search leaves the range [*vaddr, xpv_end) without finding a mapping,
 * PFN_INVALID is returned and *vaddr is left unchanged.
 */
static pfn_t
xpv_va_walk(uintptr_t *vaddr)
{
	int l, idx;
	pfn_t pfn;
	x86pte_t pte;
	x86pte_t *ptep;
	uintptr_t va = *vaddr;
	uintptr_t scan_va;
	caddr_t ptable_window;
	static pfn_t toplevel_pfn;
	static uintptr_t lastva;	/* va returned by the last success */

	/*
	 * If we do anything other than a simple scan through memory, don't
	 * trust the mapped page tables.
	 */
	if (va != lastva + MMU_PAGESIZE)
		for (l = mmu.max_level; l >= 0; l--)
			ptable_pfn[l] = PFN_INVALID;

	toplevel_pfn = mmu_btop(xpv_panic_cr3);

	/*
	 * The 'va >= *vaddr' half of each range test presumably catches
	 * address wraparound when the scan runs off the top of memory.
	 */
	while (va < xpv_end && va >= *vaddr) {
		/* Find the lowest table with any entry for va */
		pfn = toplevel_pfn;
		for (l = mmu.max_level; l >= 0; l--) {
			/* Remap the window only if it shows a different table */
			if (ptable_pfn[l] != pfn) {
				xpv_panic_map(l, pfn);
				ptable_pfn[l] = pfn;
			}

			/*
			 * Search this pagetable for any mapping to an
			 * address >= va.
			 */
			ptable_window = PWIN_VA(l);
			/*
			 * With PAE the top-level table is addressed at the
			 * page offset held in %cr3.
			 */
			if (l == mmu.max_level && mmu.pae_hat)
				ptable_window +=
				    (xpv_panic_cr3 & MMU_PAGEOFFSET);

			idx = (va >> LEVEL_SHIFT(l)) & (xpv_panic_nptes[l] - 1);
			scan_va = va;
			while (idx < xpv_panic_nptes[l] && scan_va < xpv_end &&
			    scan_va >= *vaddr) {
				ptep = (x86pte_t *)(ptable_window +
				    (idx << mmu.pte_size_shift));
				pte = GET_PTE(ptep);
				if (pte & PTE_VALID)
					break;
				idx++;
				scan_va += mmu.level_size[l];
			}
			va = scan_va;

			/*
			 * See if we've hit the end of the range.
			 */
			if (scan_va >= xpv_end || scan_va < *vaddr) {
				va = scan_va;
				break;
			}

			/*
			 * If there are no valid mappings in this table, we
			 * can skip to the end of the VA range it covers.
			 */
			if (idx == xpv_panic_nptes[l]) {
				va = NEXT_ENTRY_VA(va, l + 1);
				break;
			}

			/*
			 * If this mapping is for a pagetable, we drop down
			 * to the next level in the hierarchy and look for
			 * a mapping in it.
			 */
			pfn = PTE2MFN(pte, l);
			if (!PTE_ISPAGE(pte, l))
				continue;

			/*
			 * The APIC page is magic.  Nothing to see here;
			 * move along.
			 */
			if (((uintptr_t)xpv_apicadr & MMU_PAGEMASK) ==
			    (va & MMU_PAGEMASK)) {
				va += MMU_PAGESIZE;
				break;
			}

			/* We also want to skip the Xen version of KPM */
			if (va >= (uintptr_t)xpv_panic_info->pi_ram_start &&
			    va < (uintptr_t)xpv_panic_info->pi_ram_end) {
				va = (uintptr_t)xpv_panic_info->pi_ram_end;
				break;
			}

			/*
			 * The Xen panic code only handles small pages.  If
			 * this mapping is for a large page, we need to
			 * identify the constituent page that covers the
			 * specific VA we were looking for.
			 */
			if (l > 0) {
				if (l > 1)
					panic("Xen panic can't cope with "
					    "giant pages.");
				idx = (va >> LEVEL_SHIFT(0)) &
				    (xpv_panic_nptes[0] - 1);
				pfn += idx;
			}

			/* Found a page: report its va and remember it. */
			*vaddr = va;
			lastva = va;
			return (pfn | PFN_IS_FOREIGN_MFN);
		}
	}
	return (PFN_INVALID);
}
284*843e1988Sjohnlev 
285*843e1988Sjohnlev /*
286*843e1988Sjohnlev  * Walk through the Xen VA space, finding pages that are mapped in.
287*843e1988Sjohnlev  *
288*843e1988Sjohnlev  * These pages all have MFNs rather than PFNs, meaning they may be outside
289*843e1988Sjohnlev  * the physical address space the kernel knows about, or they may collide
290*843e1988Sjohnlev  * with PFNs the kernel is using.
291*843e1988Sjohnlev  *
292*843e1988Sjohnlev  * The obvious trick of just adding the PFN_IS_FOREIGN_MFN bit to the MFNs
293*843e1988Sjohnlev  * to avoid collisions doesn't work.  The pages need to be written to disk
294*843e1988Sjohnlev  * in PFN-order or savecore gets confused.  We can't allocate memory to
295*843e1988Sjohnlev  * contruct a sorted pfn->VA reverse mapping, so we have to write the pages
296*843e1988Sjohnlev  * to disk in VA order.
297*843e1988Sjohnlev  *
298*843e1988Sjohnlev  * To square this circle, we simply make up PFNs for each of Xen's pages.
299*843e1988Sjohnlev  * We assign each mapped page a fake PFN in ascending order.  These fake
300*843e1988Sjohnlev  * PFNs each have the FOREIGN bit set, ensuring that they fall outside the
301*843e1988Sjohnlev  * range of Solaris PFNs written by the kernel.
302*843e1988Sjohnlev  */
303*843e1988Sjohnlev int
304*843e1988Sjohnlev dump_xpv_addr()
305*843e1988Sjohnlev {
306*843e1988Sjohnlev 	uintptr_t va;
307*843e1988Sjohnlev 	mem_vtop_t mem_vtop;
308*843e1988Sjohnlev 
309*843e1988Sjohnlev 	xpv_dump_pages = 0;
310*843e1988Sjohnlev 	va = xen_virt_start;
311*843e1988Sjohnlev 
312*843e1988Sjohnlev 	while (xpv_va_walk(&va) != PFN_INVALID) {
313*843e1988Sjohnlev 		mem_vtop.m_as = &kas;
314*843e1988Sjohnlev 		mem_vtop.m_va = (void *)va;
315*843e1988Sjohnlev 		mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
316*843e1988Sjohnlev 
317*843e1988Sjohnlev 		dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
318*843e1988Sjohnlev 		xpv_dump_pages++;
319*843e1988Sjohnlev 
320*843e1988Sjohnlev 		va += MMU_PAGESIZE;
321*843e1988Sjohnlev 	}
322*843e1988Sjohnlev 
323*843e1988Sjohnlev 	/*
324*843e1988Sjohnlev 	 * Add the shared_info page.  This page actually ends up in the
325*843e1988Sjohnlev 	 * dump twice: once for the Xen va and once for the Solaris va.
326*843e1988Sjohnlev 	 * This isn't ideal, but we don't know the address Xen is using for
327*843e1988Sjohnlev 	 * the page, so we can't share it.
328*843e1988Sjohnlev 	 */
329*843e1988Sjohnlev 	mem_vtop.m_as = &kas;
330*843e1988Sjohnlev 	mem_vtop.m_va = HYPERVISOR_shared_info;
331*843e1988Sjohnlev 	mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
332*843e1988Sjohnlev 	dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
333*843e1988Sjohnlev 	xpv_dump_pages++;
334*843e1988Sjohnlev 
335*843e1988Sjohnlev 	return (xpv_dump_pages);
336*843e1988Sjohnlev }
337*843e1988Sjohnlev 
338*843e1988Sjohnlev void
339*843e1988Sjohnlev dump_xpv_pfn()
340*843e1988Sjohnlev {
341*843e1988Sjohnlev 	pfn_t pfn;
342*843e1988Sjohnlev 	int cnt;
343*843e1988Sjohnlev 
344*843e1988Sjohnlev 	for (cnt = 0; cnt < xpv_dump_pages; cnt++) {
345*843e1988Sjohnlev 		pfn = (pfn_t)cnt | PFN_IS_FOREIGN_MFN;
346*843e1988Sjohnlev 		dumpvp_write(&pfn, sizeof (pfn));
347*843e1988Sjohnlev 	}
348*843e1988Sjohnlev }
349*843e1988Sjohnlev 
350*843e1988Sjohnlev int
351*843e1988Sjohnlev dump_xpv_data(void *dump_cbuf)
352*843e1988Sjohnlev {
353*843e1988Sjohnlev 	uintptr_t va;
354*843e1988Sjohnlev 	uint32_t csize;
355*843e1988Sjohnlev 	int cnt = 0;
356*843e1988Sjohnlev 
357*843e1988Sjohnlev 	/*
358*843e1988Sjohnlev 	 * XXX: we should probably run this data through a UE check.  The
359*843e1988Sjohnlev 	 * catch is that the UE code relies on on_trap() and getpfnum()
360*843e1988Sjohnlev 	 * working.
361*843e1988Sjohnlev 	 */
362*843e1988Sjohnlev 	va = xen_virt_start;
363*843e1988Sjohnlev 
364*843e1988Sjohnlev 	while (xpv_va_walk(&va) != PFN_INVALID) {
365*843e1988Sjohnlev 		csize = (uint32_t)compress((void *)va, dump_cbuf, PAGESIZE);
366*843e1988Sjohnlev 		dumpvp_write(&csize, sizeof (uint32_t));
367*843e1988Sjohnlev 		dumpvp_write(dump_cbuf, csize);
368*843e1988Sjohnlev 		if (dump_ioerr) {
369*843e1988Sjohnlev 			dumphdr->dump_flags &= ~DF_COMPLETE;
370*843e1988Sjohnlev 			return (cnt);
371*843e1988Sjohnlev 		}
372*843e1988Sjohnlev 		cnt++;
373*843e1988Sjohnlev 		va += MMU_PAGESIZE;
374*843e1988Sjohnlev 	}
375*843e1988Sjohnlev 
376*843e1988Sjohnlev 	/*
377*843e1988Sjohnlev 	 * Finally, dump the shared_info page
378*843e1988Sjohnlev 	 */
379*843e1988Sjohnlev 	csize = (uint32_t)compress((void *)HYPERVISOR_shared_info, dump_cbuf,
380*843e1988Sjohnlev 	    PAGESIZE);
381*843e1988Sjohnlev 	dumpvp_write(&csize, sizeof (uint32_t));
382*843e1988Sjohnlev 	dumpvp_write(dump_cbuf, csize);
383*843e1988Sjohnlev 	if (dump_ioerr)
384*843e1988Sjohnlev 		dumphdr->dump_flags &= ~DF_COMPLETE;
385*843e1988Sjohnlev 	cnt++;
386*843e1988Sjohnlev 
387*843e1988Sjohnlev 	return (cnt);
388*843e1988Sjohnlev }
389*843e1988Sjohnlev 
390*843e1988Sjohnlev static void *
391*843e1988Sjohnlev showstack(void *fpreg, int xpv_only)
392*843e1988Sjohnlev {
393*843e1988Sjohnlev 	struct frame *fpp;
394*843e1988Sjohnlev 	ulong_t off;
395*843e1988Sjohnlev 	char *sym;
396*843e1988Sjohnlev 	uintptr_t pc, fp, lastfp;
397*843e1988Sjohnlev 	uintptr_t minaddr = min(KERNELBASE, xen_virt_start);
398*843e1988Sjohnlev 
399*843e1988Sjohnlev 	fp = (uintptr_t)fpreg;
400*843e1988Sjohnlev 	if (fp < minaddr) {
401*843e1988Sjohnlev 		xpv_panic_printf("Bad frame ptr: 0x%p\n", fpreg);
402*843e1988Sjohnlev 		return (fpreg);
403*843e1988Sjohnlev 	}
404*843e1988Sjohnlev 
405*843e1988Sjohnlev 	do {
406*843e1988Sjohnlev 		fpp = (struct frame *)fp;
407*843e1988Sjohnlev 		pc = fpp->fr_savpc;
408*843e1988Sjohnlev 
409*843e1988Sjohnlev 		if ((xpv_only != 0) &&
410*843e1988Sjohnlev 		    (fp > xpv_end || fp < xen_virt_start))
411*843e1988Sjohnlev 			break;
412*843e1988Sjohnlev 		if ((sym = kobj_getsymname(pc, &off)) != NULL)
413*843e1988Sjohnlev 			xpv_panic_printf("%08lx %s:%s+%lx\n", fp,
414*843e1988Sjohnlev 			    mod_containing_pc((caddr_t)pc), sym, off);
415*843e1988Sjohnlev 		else if ((pc >= xen_virt_start) && (pc <= xpv_end))
416*843e1988Sjohnlev 			xpv_panic_printf("%08lx 0x%lx (in Xen)\n", fp, pc);
417*843e1988Sjohnlev 		else
418*843e1988Sjohnlev 			xpv_panic_printf("%08lx %lx\n", fp, pc);
419*843e1988Sjohnlev 
420*843e1988Sjohnlev 		lastfp = fp;
421*843e1988Sjohnlev 		fp = fpp->fr_savfp;
422*843e1988Sjohnlev 
423*843e1988Sjohnlev 		/*
424*843e1988Sjohnlev 		 * Xen marks an exception frame by inverting the frame
425*843e1988Sjohnlev 		 * pointer.
426*843e1988Sjohnlev 		 */
427*843e1988Sjohnlev 		if (fp < lastfp) {
428*843e1988Sjohnlev 			if ((~fp > minaddr) && ((~fp) ^ lastfp) < 0xfff)
429*843e1988Sjohnlev 				fp = ~fp;
430*843e1988Sjohnlev 		}
431*843e1988Sjohnlev 	} while (fp > lastfp);
432*843e1988Sjohnlev 	return ((void *)fp);
433*843e1988Sjohnlev }
434*843e1988Sjohnlev 
/*
 * Print the Xen-only portion of the stack that starts at 'fpreg' and
 * return the frame pointer at which the trace stopped.
 */
void *
xpv_traceback(void *fpreg)
{
	void *stopped_at = showstack(fpreg, 1);

	return (stopped_at);
}
440*843e1988Sjohnlev 
441*843e1988Sjohnlev #if defined(__amd64)
442*843e1988Sjohnlev static void
443*843e1988Sjohnlev xpv_panic_hypercall(ulong_t call)
444*843e1988Sjohnlev {
445*843e1988Sjohnlev 	panic("Illegally issued hypercall %d during panic!\n", (int)call);
446*843e1988Sjohnlev }
447*843e1988Sjohnlev #endif
448*843e1988Sjohnlev 
449*843e1988Sjohnlev void
450*843e1988Sjohnlev xpv_die(struct regs *rp)
451*843e1988Sjohnlev {
452*843e1988Sjohnlev 	struct panic_trap_info ti;
453*843e1988Sjohnlev 	struct cregs creg;
454*843e1988Sjohnlev 
455*843e1988Sjohnlev 	ti.trap_regs = rp;
456*843e1988Sjohnlev 	ti.trap_type = rp->r_trapno;
457*843e1988Sjohnlev 
458*843e1988Sjohnlev 	curthread->t_panic_trap = &ti;
459*843e1988Sjohnlev 	if (ti.trap_type == T_PGFLT) {
460*843e1988Sjohnlev 		getcregs(&creg);
461*843e1988Sjohnlev 		ti.trap_addr = (caddr_t)creg.cr_cr2;
462*843e1988Sjohnlev 		panic("Fatal pagefault at 0x%lx.  fault addr=0x%p  rp=0x%p",
463*843e1988Sjohnlev 		    rp->r_pc, ti.trap_addr, rp);
464*843e1988Sjohnlev 	} else {
465*843e1988Sjohnlev 		ti.trap_addr = (caddr_t)rp->r_pc;
466*843e1988Sjohnlev 		panic("Fatal trap %ld at 0x%lx.  rp=0x%p", rp->r_trapno,
467*843e1988Sjohnlev 		    rp->r_pc, rp);
468*843e1988Sjohnlev 	}
469*843e1988Sjohnlev }
470*843e1988Sjohnlev 
471*843e1988Sjohnlev /*
472*843e1988Sjohnlev  * Build IDT to handle a Xen panic
473*843e1988Sjohnlev  */
474*843e1988Sjohnlev static void
475*843e1988Sjohnlev switch_to_xpv_panic_idt()
476*843e1988Sjohnlev {
477*843e1988Sjohnlev 	int i;
478*843e1988Sjohnlev 	desctbr_t idtr;
479*843e1988Sjohnlev 	gate_desc_t *idt = xpv_panic_idt;
480*843e1988Sjohnlev 	selector_t cs = get_cs_register();
481*843e1988Sjohnlev 
482*843e1988Sjohnlev 	for (i = 0; i < 32; i++)
483*843e1988Sjohnlev 		set_gatesegd(&idt[i], &xpv_invaltrap, cs, SDT_SYSIGT, TRP_XPL);
484*843e1988Sjohnlev 
485*843e1988Sjohnlev 	set_gatesegd(&idt[T_ZERODIV], &xpv_div0trap, cs, SDT_SYSIGT, TRP_XPL);
486*843e1988Sjohnlev 	set_gatesegd(&idt[T_SGLSTP], &xpv_dbgtrap, cs, SDT_SYSIGT, TRP_XPL);
487*843e1988Sjohnlev 	set_gatesegd(&idt[T_NMIFLT], &xpv_nmiint, cs, SDT_SYSIGT, TRP_XPL);
488*843e1988Sjohnlev 	set_gatesegd(&idt[T_BOUNDFLT], &xpv_boundstrap, cs, SDT_SYSIGT,
489*843e1988Sjohnlev 	    TRP_XPL);
490*843e1988Sjohnlev 	set_gatesegd(&idt[T_ILLINST], &xpv_invoptrap, cs, SDT_SYSIGT, TRP_XPL);
491*843e1988Sjohnlev 	set_gatesegd(&idt[T_NOEXTFLT], &xpv_ndptrap, cs, SDT_SYSIGT, TRP_XPL);
492*843e1988Sjohnlev 	set_gatesegd(&idt[T_TSSFLT], &xpv_invtsstrap, cs, SDT_SYSIGT, TRP_XPL);
493*843e1988Sjohnlev 	set_gatesegd(&idt[T_SEGFLT], &xpv_segnptrap, cs, SDT_SYSIGT, TRP_XPL);
494*843e1988Sjohnlev 	set_gatesegd(&idt[T_STKFLT], &xpv_stktrap, cs, SDT_SYSIGT, TRP_XPL);
495*843e1988Sjohnlev 	set_gatesegd(&idt[T_GPFLT], &xpv_gptrap, cs, SDT_SYSIGT, TRP_XPL);
496*843e1988Sjohnlev 	set_gatesegd(&idt[T_PGFLT], &xpv_pftrap, cs, SDT_SYSIGT, TRP_XPL);
497*843e1988Sjohnlev 	set_gatesegd(&idt[T_EXTERRFLT], &xpv_ndperr, cs, SDT_SYSIGT, TRP_XPL);
498*843e1988Sjohnlev 	set_gatesegd(&idt[T_ALIGNMENT], &xpv_achktrap, cs, SDT_SYSIGT, TRP_XPL);
499*843e1988Sjohnlev 	set_gatesegd(&idt[T_MCE], &xpv_mcetrap, cs, SDT_SYSIGT, TRP_XPL);
500*843e1988Sjohnlev 	set_gatesegd(&idt[T_SIMDFPE], &xpv_xmtrap, cs, SDT_SYSIGT, TRP_XPL);
501*843e1988Sjohnlev 
502*843e1988Sjohnlev 	/*
503*843e1988Sjohnlev 	 * We have no double fault handler.  Any single fault represents a
504*843e1988Sjohnlev 	 * catastrophic failure for us, so there is no attempt to handle
505*843e1988Sjohnlev 	 * them cleanly: we just print a message and reboot.  If we
506*843e1988Sjohnlev 	 * encounter a second fault while doing that, there is nothing
507*843e1988Sjohnlev 	 * else we can do.
508*843e1988Sjohnlev 	 */
509*843e1988Sjohnlev 
510*843e1988Sjohnlev 	/*
511*843e1988Sjohnlev 	 * Be prepared to absorb any stray device interrupts received
512*843e1988Sjohnlev 	 * while writing the core to disk.
513*843e1988Sjohnlev 	 */
514*843e1988Sjohnlev 	for (i = 33; i < NIDT; i++)
515*843e1988Sjohnlev 		set_gatesegd(&idt[i], &xpv_surprise_intr, cs, SDT_SYSIGT,
516*843e1988Sjohnlev 		    TRP_XPL);
517*843e1988Sjohnlev 
518*843e1988Sjohnlev 	/* The one interrupt we expect to get is from the APIC timer.  */
519*843e1988Sjohnlev 	set_gatesegd(&idt[T_XPV_TIMER], &xpv_timer_trap, cs, SDT_SYSIGT,
520*843e1988Sjohnlev 	    TRP_XPL);
521*843e1988Sjohnlev 
522*843e1988Sjohnlev 	idtr.dtr_base = (uintptr_t)xpv_panic_idt;
523*843e1988Sjohnlev 	idtr.dtr_limit = sizeof (xpv_panic_idt) - 1;
524*843e1988Sjohnlev 	wr_idtr(&idtr);
525*843e1988Sjohnlev 
526*843e1988Sjohnlev #if defined(__amd64)
527*843e1988Sjohnlev 	/* Catch any hypercalls. */
528*843e1988Sjohnlev 	wrmsr(MSR_AMD_LSTAR, (uintptr_t)xpv_panic_hypercall);
529*843e1988Sjohnlev 	wrmsr(MSR_AMD_CSTAR, (uintptr_t)xpv_panic_hypercall);
530*843e1988Sjohnlev #endif
531*843e1988Sjohnlev }
532*843e1988Sjohnlev 
533*843e1988Sjohnlev static void
534*843e1988Sjohnlev xpv_apic_clkinit()
535*843e1988Sjohnlev {
536*843e1988Sjohnlev 	uint_t		apic_ticks = 0;
537*843e1988Sjohnlev 
538*843e1988Sjohnlev 	/*
539*843e1988Sjohnlev 	 * Measure how many APIC ticks there are within a fixed time
540*843e1988Sjohnlev 	 * period.  We're going to be fairly coarse here.  This timer is
541*843e1988Sjohnlev 	 * just being used to detect a stalled panic, so as long as we have
542*843e1988Sjohnlev 	 * the right order of magnitude, everything should be fine.
543*843e1988Sjohnlev 	 */
544*843e1988Sjohnlev 	xpv_apicadr[APIC_SPUR_INT_REG] = AV_UNIT_ENABLE | APIC_SPUR_INTR;
545*843e1988Sjohnlev 	xpv_apicadr[APIC_LOCAL_TIMER] = AV_MASK;
546*843e1988Sjohnlev 	xpv_apicadr[APIC_INT_VECT0] = AV_MASK;	/* local intr reg 0 */
547*843e1988Sjohnlev 
548*843e1988Sjohnlev 	xpv_apicadr[APIC_DIVIDE_REG] = 0;
549*843e1988Sjohnlev 	xpv_apicadr[APIC_INIT_COUNT] = APIC_MAXVAL;
550*843e1988Sjohnlev 	drv_usecwait(XPV_TIMER_INTERVAL);
551*843e1988Sjohnlev 	apic_ticks = APIC_MAXVAL - xpv_apicadr[APIC_CURR_COUNT];
552*843e1988Sjohnlev 
553*843e1988Sjohnlev 	/*
554*843e1988Sjohnlev 	 * apic_ticks now represents roughly how many apic ticks comprise
555*843e1988Sjohnlev 	 * one timeout interval.  Program the timer to send us an interrupt
556*843e1988Sjohnlev 	 * every time that interval expires.
557*843e1988Sjohnlev 	 */
558*843e1988Sjohnlev 	xpv_apicadr[APIC_LOCAL_TIMER] = T_XPV_TIMER | AV_TIME;
559*843e1988Sjohnlev 	xpv_apicadr[APIC_INIT_COUNT] = apic_ticks;
560*843e1988Sjohnlev 	xpv_apicadr[APIC_EOI_REG] = 0;
561*843e1988Sjohnlev }
562*843e1988Sjohnlev 
563*843e1988Sjohnlev void
564*843e1988Sjohnlev xpv_timer_tick(void)
565*843e1988Sjohnlev {
566*843e1988Sjohnlev 	static int ticks = 0;
567*843e1988Sjohnlev 
568*843e1988Sjohnlev 	if (ticks++ >= MICROSEC / XPV_TIMER_INTERVAL) {
569*843e1988Sjohnlev 		ticks = 0;
570*843e1988Sjohnlev 		if (dump_timeleft && (--dump_timeleft == 0))
571*843e1988Sjohnlev 			panic("Xen panic timeout\n");
572*843e1988Sjohnlev 	}
573*843e1988Sjohnlev 	xpv_apicadr[APIC_EOI_REG] = 0;
574*843e1988Sjohnlev }
575*843e1988Sjohnlev 
576*843e1988Sjohnlev void
577*843e1988Sjohnlev xpv_interrupt(void)
578*843e1988Sjohnlev {
579*843e1988Sjohnlev #ifdef	DEBUG
580*843e1988Sjohnlev 	static int cnt = 0;
581*843e1988Sjohnlev 
582*843e1988Sjohnlev 	if (cnt++ < 10)
583*843e1988Sjohnlev 		xpv_panic_printf("Unexpected interrupt received.\n");
584*843e1988Sjohnlev 	if ((cnt < 1000) && ((cnt % 100) == 0))
585*843e1988Sjohnlev 		xpv_panic_printf("%d unexpected interrupts received.\n", cnt);
586*843e1988Sjohnlev #endif
587*843e1988Sjohnlev 
588*843e1988Sjohnlev 	xpv_apicadr[APIC_EOI_REG] = 0;
589*843e1988Sjohnlev }
590*843e1988Sjohnlev 
591*843e1988Sjohnlev /*
592*843e1988Sjohnlev  * Managing time in panic context is trivial.  We only have a single CPU,
593*843e1988Sjohnlev  * we never get rescheduled, we never get suspended.  We just need to
594*843e1988Sjohnlev  * convert clock ticks into nanoseconds.
595*843e1988Sjohnlev  */
596*843e1988Sjohnlev static hrtime_t
597*843e1988Sjohnlev xpv_panic_gethrtime(void)
598*843e1988Sjohnlev {
599*843e1988Sjohnlev 	hrtime_t tsc, hrt;
600*843e1988Sjohnlev 	unsigned int *l = (unsigned int *)&(tsc);
601*843e1988Sjohnlev 
602*843e1988Sjohnlev 	tsc = __rdtsc_insn();
603*843e1988Sjohnlev 	hrt = (mul32(l[1], nsec_scale) << NSEC_SHIFT) +
604*843e1988Sjohnlev 	    (mul32(l[0], nsec_scale) >> (32 - NSEC_SHIFT));
605*843e1988Sjohnlev 
606*843e1988Sjohnlev 	return (hrt);
607*843e1988Sjohnlev }
608*843e1988Sjohnlev 
609*843e1988Sjohnlev static void
610*843e1988Sjohnlev xpv_panic_time_init()
611*843e1988Sjohnlev {
612*843e1988Sjohnlev 	nsec_scale =
613*843e1988Sjohnlev 	    CPU->cpu_m.mcpu_vcpu_info->time.tsc_to_system_mul >> NSEC_SHIFT;
614*843e1988Sjohnlev 
615*843e1988Sjohnlev 	gethrtimef = xpv_panic_gethrtime;
616*843e1988Sjohnlev }
617*843e1988Sjohnlev 
/*
 * Variadic front end to panicsys(): packages the format arguments into a
 * va_list and passes them on with the saved register state 'rp'.
 */
static void
xpv_panicsys(struct regs *rp, char *fmt, ...)
{
	extern void panicsys(const char *, va_list, struct regs *, int);
	va_list args;

	va_start(args, fmt);
	panicsys(fmt, args, rp, 1);
	va_end(args);
}
628*843e1988Sjohnlev 
629*843e1988Sjohnlev void
630*843e1988Sjohnlev xpv_do_panic(void *arg)
631*843e1988Sjohnlev {
632*843e1988Sjohnlev 	struct panic_info *pip = (struct panic_info *)arg;
633*843e1988Sjohnlev 	int l;
634*843e1988Sjohnlev 	struct cregs creg;
635*843e1988Sjohnlev #if defined(__amd64)
636*843e1988Sjohnlev 	extern uintptr_t postbootkernelbase;
637*843e1988Sjohnlev #endif
638*843e1988Sjohnlev 
639*843e1988Sjohnlev 	if (xpv_panicking++ > 0)
640*843e1988Sjohnlev 		panic("multiple calls to xpv_do_panic()");
641*843e1988Sjohnlev 
642*843e1988Sjohnlev 	/*
643*843e1988Sjohnlev 	 * Indicate to the underlying panic framework that a panic has been
644*843e1988Sjohnlev 	 * initiated.  This is ordinarily done as part of vpanic().  Since
645*843e1988Sjohnlev 	 * we already have all the register state saved by the hypervisor,
646*843e1988Sjohnlev 	 * we skip that and jump straight into the panic processing code.
647*843e1988Sjohnlev 	 */
648*843e1988Sjohnlev 	(void) panic_trigger(&panic_quiesce);
649*843e1988Sjohnlev 
650*843e1988Sjohnlev #if defined(__amd64)
651*843e1988Sjohnlev 	/*
652*843e1988Sjohnlev 	 * bzero() and bcopy() get unhappy when asked to operate on
653*843e1988Sjohnlev 	 * addresses outside of the kernel.  At this point Xen is really a
654*843e1988Sjohnlev 	 * part of the kernel, so we update the routines' notion of where
655*843e1988Sjohnlev 	 * the kernel starts.
656*843e1988Sjohnlev 	 */
657*843e1988Sjohnlev 	postbootkernelbase = xen_virt_start;
658*843e1988Sjohnlev #endif
659*843e1988Sjohnlev 
660*843e1988Sjohnlev #if defined(HYPERVISOR_VIRT_END)
661*843e1988Sjohnlev 	xpv_end = HYPERVISOR_VIRT_END;
662*843e1988Sjohnlev #else
663*843e1988Sjohnlev 	xpv_end = (uintptr_t)UINTPTR_MAX - sizeof (uintptr_t);
664*843e1988Sjohnlev #endif
665*843e1988Sjohnlev 
666*843e1988Sjohnlev 	/*
667*843e1988Sjohnlev 	 * If we were redirecting console output to the hypervisor, we have
668*843e1988Sjohnlev 	 * to stop.
669*843e1988Sjohnlev 	 */
670*843e1988Sjohnlev 	use_polledio = B_FALSE;
671*843e1988Sjohnlev 	if (console == CONS_HYPERVISOR) {
672*843e1988Sjohnlev 		bcons_device_change(CONS_HYPERVISOR);
673*843e1988Sjohnlev 	} else if (cons_polledio != NULL &&
674*843e1988Sjohnlev 	    cons_polledio->cons_polledio_putchar != NULL)  {
675*843e1988Sjohnlev 		if (cons_polledio->cons_polledio_enter != NULL)
676*843e1988Sjohnlev 			cons_polledio->cons_polledio_enter(
677*843e1988Sjohnlev 			    cons_polledio->cons_polledio_argument);
678*843e1988Sjohnlev 		use_polledio = 1;
679*843e1988Sjohnlev 	}
680*843e1988Sjohnlev 
681*843e1988Sjohnlev 	/* Make sure we handle all console output from here on. */
682*843e1988Sjohnlev 	sysp->bsvc_putchar = xpv_panic_putc;
683*843e1988Sjohnlev 
684*843e1988Sjohnlev 	/*
685*843e1988Sjohnlev 	 * If we find an unsupported panic_info structure, there's not much
686*843e1988Sjohnlev 	 * we can do other than complain, plow on, and hope for the best.
687*843e1988Sjohnlev 	 */
688*843e1988Sjohnlev 	if (pip->pi_version != PANIC_INFO_VERSION)
689*843e1988Sjohnlev 		xpv_panic_printf("Warning: Xen is using an unsupported "
690*843e1988Sjohnlev 		    "version of the panic_info structure.\n");
691*843e1988Sjohnlev 
692*843e1988Sjohnlev 	xpv_panic_info = pip;
693*843e1988Sjohnlev 
694*843e1988Sjohnlev 	/*
695*843e1988Sjohnlev 	 * Make sure we are running on the Solaris %gs.  The Xen panic code
696*843e1988Sjohnlev 	 * should already have set up the GDT properly.
697*843e1988Sjohnlev 	 */
698*843e1988Sjohnlev 	xpv_panic_resetgs();
699*843e1988Sjohnlev #if defined(__amd64)
700*843e1988Sjohnlev 	wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);
701*843e1988Sjohnlev #endif
702*843e1988Sjohnlev 
703*843e1988Sjohnlev 	xpv_panic_time_init();
704*843e1988Sjohnlev 
705*843e1988Sjohnlev 	/*
706*843e1988Sjohnlev 	 * Switch to our own IDT, avoiding any accidental returns to Xen
707*843e1988Sjohnlev 	 * world.
708*843e1988Sjohnlev 	 */
709*843e1988Sjohnlev 	switch_to_xpv_panic_idt();
710*843e1988Sjohnlev 
711*843e1988Sjohnlev 	/*
712*843e1988Sjohnlev 	 * Initialize the APIC timer, which is used to detect a hung dump
713*843e1988Sjohnlev 	 * attempt.
714*843e1988Sjohnlev 	 */
715*843e1988Sjohnlev 	xpv_apicadr = pip->pi_apic;
716*843e1988Sjohnlev 	xpv_apic_clkinit();
717*843e1988Sjohnlev 
718*843e1988Sjohnlev 	/*
719*843e1988Sjohnlev 	 * Set up a few values that we'll need repeatedly.
720*843e1988Sjohnlev 	 */
721*843e1988Sjohnlev 	getcregs(&creg);
722*843e1988Sjohnlev 	xpv_panic_cr3 = creg.cr_cr3;
723*843e1988Sjohnlev 	for (l = mmu.max_level; l >= 0; l--)
724*843e1988Sjohnlev 		xpv_panic_nptes[l] = mmu.ptes_per_table;
725*843e1988Sjohnlev #ifdef __i386
726*843e1988Sjohnlev 	if (mmu.pae_hat)
727*843e1988Sjohnlev 		xpv_panic_nptes[mmu.max_level] = 4;
728*843e1988Sjohnlev #endif
729*843e1988Sjohnlev 
730*843e1988Sjohnlev 	/* Add the fake Xen module to the module list */
731*843e1988Sjohnlev 	if (xpv_module != NULL) {
732*843e1988Sjohnlev 		extern int last_module_id;
733*843e1988Sjohnlev 
734*843e1988Sjohnlev 		xpv_modctl->mod_id = last_module_id++;
735*843e1988Sjohnlev 		xpv_modctl->mod_next = &modules;
736*843e1988Sjohnlev 		xpv_modctl->mod_prev = modules.mod_prev;
737*843e1988Sjohnlev 		modules.mod_prev->mod_next = xpv_modctl;
738*843e1988Sjohnlev 		modules.mod_prev = xpv_modctl;
739*843e1988Sjohnlev 	}
740*843e1988Sjohnlev 	xpv_panic_printf = printf;
741*843e1988Sjohnlev 	xpv_panicsys((struct regs *)pip->pi_regs, pip->pi_panicstr);
742*843e1988Sjohnlev 	xpv_panic_printf("Failed to reboot following panic.\n");
743*843e1988Sjohnlev 	for (;;)
744*843e1988Sjohnlev 		;
745*843e1988Sjohnlev }
746*843e1988Sjohnlev 
747*843e1988Sjohnlev /*
748*843e1988Sjohnlev  * Set up the necessary data structures to pretend that the Xen hypervisor
749*843e1988Sjohnlev  * is a loadable module, allowing mdb to find the Xen symbols in a crash
750*843e1988Sjohnlev  * dump.  Since these symbols all map to VA space Solaris doesn't normally
751*843e1988Sjohnlev  * have access to, we don't link these structures into the kernel's lists
752*843e1988Sjohnlev  * until/unless we hit a Xen panic.
753*843e1988Sjohnlev  *
754*843e1988Sjohnlev  * The observant reader will note a striking amount of overlap between this
755*843e1988Sjohnlev  * code and that found in krtld.  While it would be handy if we could just
756*843e1988Sjohnlev  * ask krtld to do this work for us, it's not that simple.  Among the
757*843e1988Sjohnlev  * complications: we're not actually loading the text here (grub did it at
758*843e1988Sjohnlev  * boot), the .text section is writable, there are no relocations to do,
759*843e1988Sjohnlev  * none of the module text/data is in readable memory, etc.  Training krtld
760*843e1988Sjohnlev  * to deal with this weird module is as complicated, and more risky, than
761*843e1988Sjohnlev  * reimplementing the necessary subset of it here.
762*843e1988Sjohnlev  */
763*843e1988Sjohnlev static void
764*843e1988Sjohnlev init_xen_module()
765*843e1988Sjohnlev {
766*843e1988Sjohnlev 	struct _buf *file = NULL;
767*843e1988Sjohnlev 	struct module *mp;
768*843e1988Sjohnlev 	struct modctl *mcp;
769*843e1988Sjohnlev 	int i, shn;
770*843e1988Sjohnlev 	Shdr *shp, *ctf_shp;
771*843e1988Sjohnlev 	char *names = NULL;
772*843e1988Sjohnlev 	size_t n, namesize, text_align, data_align;
773*843e1988Sjohnlev #if defined(__amd64)
774*843e1988Sjohnlev 	const char machine = EM_AMD64;
775*843e1988Sjohnlev #else
776*843e1988Sjohnlev 	const char machine = EM_386;
777*843e1988Sjohnlev #endif
778*843e1988Sjohnlev 
779*843e1988Sjohnlev 	/* Allocate and init the module structure */
780*843e1988Sjohnlev 	mp = kmem_zalloc(sizeof (*mp), KM_SLEEP);
781*843e1988Sjohnlev 	mp->filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
782*843e1988Sjohnlev 	(void) strcpy(mp->filename, XPV_FILENAME);
783*843e1988Sjohnlev 
784*843e1988Sjohnlev 	/* Allocate and init the modctl structure */
785*843e1988Sjohnlev 	mcp = kmem_zalloc(sizeof (*mcp), KM_SLEEP);
786*843e1988Sjohnlev 	mcp->mod_modname = kobj_zalloc(strlen(XPV_MODNAME) + 1, KM_SLEEP);
787*843e1988Sjohnlev 	(void) strcpy(mcp->mod_modname, XPV_MODNAME);
788*843e1988Sjohnlev 	mcp->mod_filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
789*843e1988Sjohnlev 	(void) strcpy(mcp->mod_filename, XPV_FILENAME);
790*843e1988Sjohnlev 	mcp->mod_inprogress_thread = (kthread_id_t)-1;
791*843e1988Sjohnlev 	mcp->mod_ref = 1;
792*843e1988Sjohnlev 	mcp->mod_loaded = 1;
793*843e1988Sjohnlev 	mcp->mod_loadcnt = 1;
794*843e1988Sjohnlev 	mcp->mod_mp = mp;
795*843e1988Sjohnlev 
796*843e1988Sjohnlev 	/*
797*843e1988Sjohnlev 	 * Try to open a Xen image that hasn't had its symbol and CTF
798*843e1988Sjohnlev 	 * information stripped off.
799*843e1988Sjohnlev 	 */
800*843e1988Sjohnlev 	file = kobj_open_file(XPV_FILENAME);
801*843e1988Sjohnlev 	if (file == (struct _buf *)-1) {
802*843e1988Sjohnlev 		file = NULL;
803*843e1988Sjohnlev 		goto err;
804*843e1988Sjohnlev 	}
805*843e1988Sjohnlev 
806*843e1988Sjohnlev 	/*
807*843e1988Sjohnlev 	 * Read the header and ensure that this is an ELF file for the
808*843e1988Sjohnlev 	 * proper ISA.  If it's not, somebody has done something very
809*843e1988Sjohnlev 	 * stupid.  Why bother?  See Mencken.
810*843e1988Sjohnlev 	 */
811*843e1988Sjohnlev 	if (kobj_read_file(file, (char *)&mp->hdr, sizeof (mp->hdr), 0) < 0)
812*843e1988Sjohnlev 		goto err;
813*843e1988Sjohnlev 	for (i = 0; i < SELFMAG; i++)
814*843e1988Sjohnlev 		if (mp->hdr.e_ident[i] != ELFMAG[i])
815*843e1988Sjohnlev 			goto err;
816*843e1988Sjohnlev 	if ((mp->hdr.e_ident[EI_DATA] != ELFDATA2LSB) ||
817*843e1988Sjohnlev 	    (mp->hdr.e_machine != machine))
818*843e1988Sjohnlev 		goto err;
819*843e1988Sjohnlev 
820*843e1988Sjohnlev 	/* Read in the section headers */
821*843e1988Sjohnlev 	n = mp->hdr.e_shentsize * mp->hdr.e_shnum;
822*843e1988Sjohnlev 	mp->shdrs = kmem_zalloc(n, KM_SLEEP);
823*843e1988Sjohnlev 	if (kobj_read_file(file, mp->shdrs, n, mp->hdr.e_shoff) < 0)
824*843e1988Sjohnlev 		goto err;
825*843e1988Sjohnlev 
826*843e1988Sjohnlev 	/* Read the section names */
827*843e1988Sjohnlev 	shp = (Shdr *)(mp->shdrs + mp->hdr.e_shstrndx * mp->hdr.e_shentsize);
828*843e1988Sjohnlev 	namesize = shp->sh_size;
829*843e1988Sjohnlev 	names = kmem_zalloc(shp->sh_size, KM_SLEEP);
830*843e1988Sjohnlev 	if (kobj_read_file(file, names, shp->sh_size, shp->sh_offset) < 0)
831*843e1988Sjohnlev 		goto err;
832*843e1988Sjohnlev 
833*843e1988Sjohnlev 	/*
834*843e1988Sjohnlev 	 * Fill in the text and data size fields.
835*843e1988Sjohnlev 	 */
836*843e1988Sjohnlev 	ctf_shp = NULL;
837*843e1988Sjohnlev 	text_align = data_align = 0;
838*843e1988Sjohnlev 	for (shn = 1; shn < mp->hdr.e_shnum; shn++) {
839*843e1988Sjohnlev 		shp = (Shdr *)(mp->shdrs + shn * mp->hdr.e_shentsize);
840*843e1988Sjohnlev 
841*843e1988Sjohnlev 		/* Sanity check the offset of the section name */
842*843e1988Sjohnlev 		if (shp->sh_name >= namesize)
843*843e1988Sjohnlev 			continue;
844*843e1988Sjohnlev 
845*843e1988Sjohnlev 		/* If we find the symtab section, remember it for later. */
846*843e1988Sjohnlev 		if (shp->sh_type == SHT_SYMTAB) {
847*843e1988Sjohnlev 			mp->symtbl_section = shn;
848*843e1988Sjohnlev 			mp->symhdr = shp;
849*843e1988Sjohnlev 			continue;
850*843e1988Sjohnlev 		}
851*843e1988Sjohnlev 
852*843e1988Sjohnlev 		/* If we find the CTF section, remember it for later. */
853*843e1988Sjohnlev 		if ((shp->sh_size != 0) &&
854*843e1988Sjohnlev 		    (strcmp(names + shp->sh_name, ".SUNW_ctf") == 0)) {
855*843e1988Sjohnlev 			ctf_shp = shp;
856*843e1988Sjohnlev 			continue;
857*843e1988Sjohnlev 		}
858*843e1988Sjohnlev 
859*843e1988Sjohnlev 		if (!(shp->sh_flags & SHF_ALLOC))
860*843e1988Sjohnlev 			continue;
861*843e1988Sjohnlev 
862*843e1988Sjohnlev 		/*
863*843e1988Sjohnlev 		 * Xen marks its text section as writable, so we need to
864*843e1988Sjohnlev 		 * look for the name - not just the flag.
865*843e1988Sjohnlev 		 */
866*843e1988Sjohnlev 		if ((strcmp(&names[shp->sh_name], ".text") != NULL) &&
867*843e1988Sjohnlev 		    (shp->sh_flags & SHF_WRITE) != 0) {
868*843e1988Sjohnlev 			if (shp->sh_addralign > data_align)
869*843e1988Sjohnlev 				data_align = shp->sh_addralign;
870*843e1988Sjohnlev 			mp->data_size = ALIGN(mp->data_size, data_align);
871*843e1988Sjohnlev 			mp->data_size += ALIGN(shp->sh_size, 8);
872*843e1988Sjohnlev 			if (mp->data == NULL || mp->data > (char *)shp->sh_addr)
873*843e1988Sjohnlev 				mp->data = (char *)shp->sh_addr;
874*843e1988Sjohnlev 		} else {
875*843e1988Sjohnlev 			if (shp->sh_addralign > text_align)
876*843e1988Sjohnlev 				text_align = shp->sh_addralign;
877*843e1988Sjohnlev 			mp->text_size = ALIGN(mp->text_size, text_align);
878*843e1988Sjohnlev 			mp->text_size += ALIGN(shp->sh_size, 8);
879*843e1988Sjohnlev 			if (mp->text == NULL || mp->text > (char *)shp->sh_addr)
880*843e1988Sjohnlev 				mp->text = (char *)shp->sh_addr;
881*843e1988Sjohnlev 		}
882*843e1988Sjohnlev 	}
883*843e1988Sjohnlev 	kmem_free(names, namesize);
884*843e1988Sjohnlev 	names = NULL;
885*843e1988Sjohnlev 	mcp->mod_text = mp->text;
886*843e1988Sjohnlev 	mcp->mod_text_size = mp->text_size;
887*843e1988Sjohnlev 
888*843e1988Sjohnlev 	/*
889*843e1988Sjohnlev 	 * If we have symbol table and string table sections, read them in
890*843e1988Sjohnlev 	 * now.  If we don't, we just plow on.  We'll still get a valid
891*843e1988Sjohnlev 	 * core dump, but finding anything useful will be just a bit
892*843e1988Sjohnlev 	 * harder.
893*843e1988Sjohnlev 	 *
894*843e1988Sjohnlev 	 * Note: we don't bother with a hash table.  We'll never do a
895*843e1988Sjohnlev 	 * symbol lookup unless we crash, and then mdb creates its own.  We
896*843e1988Sjohnlev 	 * also don't try to perform any relocations.  Xen should be loaded
897*843e1988Sjohnlev 	 * exactly where the ELF file indicates, and the symbol information
898*843e1988Sjohnlev 	 * in the file should be complete and correct already.  Static
899*843e1988Sjohnlev 	 * linking ain't all bad.
900*843e1988Sjohnlev 	 */
901*843e1988Sjohnlev 	if ((mp->symhdr != NULL) && (mp->symhdr->sh_link < mp->hdr.e_shnum)) {
902*843e1988Sjohnlev 		mp->strhdr = (Shdr *)
903*843e1988Sjohnlev 		    (mp->shdrs + mp->symhdr->sh_link * mp->hdr.e_shentsize);
904*843e1988Sjohnlev 		mp->nsyms = mp->symhdr->sh_size / mp->symhdr->sh_entsize;
905*843e1988Sjohnlev 
906*843e1988Sjohnlev 		/* Allocate space for the symbol table and strings.  */
907*843e1988Sjohnlev 		mp->symsize = mp->symhdr->sh_size +
908*843e1988Sjohnlev 		    mp->nsyms * sizeof (symid_t) + mp->strhdr->sh_size;
909*843e1988Sjohnlev 		mp->symspace = kmem_zalloc(mp->symsize, KM_SLEEP);
910*843e1988Sjohnlev 		mp->symtbl = mp->symspace;
911*843e1988Sjohnlev 		mp->strings = (char *)(mp->symtbl + mp->symhdr->sh_size);
912*843e1988Sjohnlev 
913*843e1988Sjohnlev 		if ((kobj_read_file(file, mp->symtbl,
914*843e1988Sjohnlev 		    mp->symhdr->sh_size, mp->symhdr->sh_offset) < 0) ||
915*843e1988Sjohnlev 		    (kobj_read_file(file, mp->strings,
916*843e1988Sjohnlev 		    mp->strhdr->sh_size, mp->strhdr->sh_offset) < 0))
917*843e1988Sjohnlev 			goto err;
918*843e1988Sjohnlev 	}
919*843e1988Sjohnlev 
920*843e1988Sjohnlev 	/*
921*843e1988Sjohnlev 	 * Read in the CTF section
922*843e1988Sjohnlev 	 */
923*843e1988Sjohnlev 	if ((ctf_shp != NULL) && ((moddebug & MODDEBUG_NOCTF) == 0)) {
924*843e1988Sjohnlev 		mp->ctfdata = kmem_zalloc(shp->sh_size, KM_SLEEP);
925*843e1988Sjohnlev 		mp->ctfsize = ctf_shp->sh_size;
926*843e1988Sjohnlev 		if (kobj_read_file(file, mp->ctfdata, mp->ctfsize,
927*843e1988Sjohnlev 		    ctf_shp->sh_offset) < 0)
928*843e1988Sjohnlev 			goto err;
929*843e1988Sjohnlev 	}
930*843e1988Sjohnlev 
931*843e1988Sjohnlev 	kobj_close_file(file);
932*843e1988Sjohnlev 
933*843e1988Sjohnlev 	xpv_module = mp;
934*843e1988Sjohnlev 	xpv_modctl = mcp;
935*843e1988Sjohnlev 	return;
936*843e1988Sjohnlev 
937*843e1988Sjohnlev err:
938*843e1988Sjohnlev 	cmn_err(CE_WARN, "Failed to initialize xpv module.");
939*843e1988Sjohnlev 	if (file != NULL)
940*843e1988Sjohnlev 		kobj_close_file(file);
941*843e1988Sjohnlev 
942*843e1988Sjohnlev 	kmem_free(mp->filename, strlen(XPV_FILENAME) + 1);
943*843e1988Sjohnlev 	if (mp->shdrs != NULL)
944*843e1988Sjohnlev 		kmem_free(mp->shdrs, mp->hdr.e_shentsize * mp->hdr.e_shnum);
945*843e1988Sjohnlev 	if (mp->symspace != NULL)
946*843e1988Sjohnlev 		kmem_free(mp->symspace, mp->symsize);
947*843e1988Sjohnlev 	if (mp->ctfdata != NULL)
948*843e1988Sjohnlev 		kmem_free(mp->ctfdata, mp->ctfsize);
949*843e1988Sjohnlev 	kmem_free(mp, sizeof (*mp));
950*843e1988Sjohnlev 	kmem_free(mcp->mod_filename, strlen(XPV_FILENAME) + 1);
951*843e1988Sjohnlev 	kmem_free(mcp->mod_modname, strlen(XPV_MODNAME) + 1);
952*843e1988Sjohnlev 	kmem_free(mcp, sizeof (*mcp));
953*843e1988Sjohnlev 	if (names != NULL)
954*843e1988Sjohnlev 		kmem_free(names, namesize);
955*843e1988Sjohnlev }
956*843e1988Sjohnlev 
957*843e1988Sjohnlev void
958*843e1988Sjohnlev xpv_panic_init()
959*843e1988Sjohnlev {
960*843e1988Sjohnlev 	xen_platform_op_t op;
961*843e1988Sjohnlev 	int i;
962*843e1988Sjohnlev 
963*843e1988Sjohnlev 	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
964*843e1988Sjohnlev 
965*843e1988Sjohnlev 	for (i = 0; i < mmu.num_level; i++)
966*843e1988Sjohnlev 		ptable_pfn[i] = PFN_INVALID;
967*843e1988Sjohnlev 
968*843e1988Sjohnlev 	/* Let Xen know where to jump if/when it panics. */
969*843e1988Sjohnlev 	op.cmd = XENPF_panic_init;
970*843e1988Sjohnlev 	op.interface_version = XENPF_INTERFACE_VERSION;
971*843e1988Sjohnlev 	op.u.panic_init.panic_addr = (unsigned long)xpv_panic_hdlr;
972*843e1988Sjohnlev 
973*843e1988Sjohnlev 	(void) HYPERVISOR_platform_op(&op);
974*843e1988Sjohnlev 
975*843e1988Sjohnlev 	init_xen_module();
976*843e1988Sjohnlev }
977