xref: /illumos-gate/usr/src/uts/i86xpv/os/xpv_panic.c (revision c64c5389d6d65f1a8915fd0ff67288100f518172)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2012 Gary Mills
23  * Copyright 2016 PALO, Richard.
24  *
25  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
26  *
27  * Copyright 2018 Joyent, Inc.
28  */
29 
30 #include <sys/types.h>
31 #include <sys/clock.h>
32 #include <sys/psm.h>
33 #include <sys/archsystm.h>
34 #include <sys/machsystm.h>
35 #include <sys/compress.h>
36 #include <sys/modctl.h>
37 #include <sys/trap.h>
38 #include <sys/panic.h>
39 #include <sys/regset.h>
40 #include <sys/frame.h>
41 #include <sys/kobj.h>
42 #include <sys/apic.h>
43 #include <sys/apic_timer.h>
44 #include <sys/dumphdr.h>
45 #include <sys/mem.h>
46 #include <sys/x86_archext.h>
47 #include <sys/xpv_panic.h>
48 #include <sys/boot_console.h>
49 #include <sys/bootsvcs.h>
50 #include <sys/consdev.h>
51 #include <vm/hat_pte.h>
52 #include <vm/hat_i86.h>
53 
/* XXX: need to add a PAE version too, if we ever support both PAE and non */
/* Path of the Xen image that init_xen_module() reads symbols and CTF from. */
#if defined(__i386)
#define	XPV_FILENAME	"/boot/xen-syms"
#else
#define	XPV_FILENAME	"/boot/amd64/xen-syms"
#endif
/* Module name stored in the fake modctl built by init_xen_module(). */
#define	XPV_MODNAME	"xpv"

/* Incremented on every entry to xpv_do_panic(); non-zero once it has run. */
int xpv_panicking = 0;

/*
 * Fake module and modctl describing the hypervisor.  Both are set together
 * at the end of a successful init_xen_module() and are otherwise left unset.
 */
struct module *xpv_module;
struct modctl *xpv_modctl;

/*
 * Round x up to a multiple of a.  An alignment of 0 returns x unchanged.
 * The mask arithmetic only rounds correctly when a is a power of two.
 */
#define	ALIGN(x, a)	((a) == 0 ? (uintptr_t)(x) : \
	(((uintptr_t)(x) + (uintptr_t)(a) - 1l) & ~((uintptr_t)(a) - 1l)))

/* Pointer to the xpv_panic_info structure handed to us by Xen.  */
static struct panic_info *xpv_panic_info = NULL;

/* Timer support */
/* Shift applied when scaling TSC ticks; see xpv_panic_gethrtime(). */
#define	NSEC_SHIFT 5
/* IDT vector that the APIC timer is programmed to deliver. */
#define	T_XPV_TIMER	0xd1
#define	XPV_TIMER_INTERVAL	1000	/* 1000 microseconds */
/* APIC register window; taken from pi_apic in xpv_do_panic(). */
static uint32_t *xpv_apicadr = NULL;
/* TSC multiplier computed by xpv_panic_time_init(). */
static uint_t	nsec_scale;

/* IDT support */
#pragma	align	16(xpv_panic_idt)
static gate_desc_t	xpv_panic_idt[NIDT];	/* interrupt descriptor table */

/* Xen pagetables mapped into our HAT's ptable windows */
static pfn_t ptable_pfn[MAX_NUM_LEVEL];

/* Number of MMU_PAGESIZE pages we're adding to the Solaris dump */
static int xpv_dump_pages;

/*
 * There are up to two large swathes of RAM that we don't want to include
 * in the dump: those that comprise the Xen version of segkpm.  On 32-bit
 * systems there is no such region of memory.  On 64-bit systems, there
 * should be just a single contiguous region that corresponds to all of
 * physical memory.  The tricky bit is that Xen's heap sometimes lives in
 * the middle of their segkpm, and is mapped using only kpm-like addresses.
 * In that case, we need to skip the swathes before and after Xen's heap.
 */
uintptr_t kpm1_low = 0;
uintptr_t kpm1_high = 0;
uintptr_t kpm2_low = 0;
uintptr_t kpm2_high = 0;

/*
 * Some commonly used values that we don't want to recompute over and over.
 * All three are filled in by xpv_do_panic().
 */
static int xpv_panic_nptes[MAX_NUM_LEVEL];	/* entries per table, by level */
static ulong_t xpv_panic_cr3;			/* %cr3 at panic time */
static uintptr_t xpv_end;			/* upper bound of Xen's VA range */

/*
 * Output routine used by the code in this file.  It starts out as our own
 * console printer and is switched to printf() at the end of xpv_do_panic().
 */
static void xpv_panic_console_print(const char *fmt, ...);
static void (*xpv_panic_printf)(const char *, ...) = xpv_panic_console_print;

/* Formatting buffer used by xpv_panic_console_print(). */
#define	CONSOLE_BUF_SIZE	256
static char console_buffer[CONSOLE_BUF_SIZE];
/* Set by xpv_do_panic(); selects the output path in xpv_panic_putc(). */
static boolean_t use_polledio;

/*
 * Pointers to machine check panic info (if any).
 */
xpv_mca_panic_data_t *xpv_mca_panic_data = NULL;
122 
123 static void
124 xpv_panic_putc(int m)
125 {
126 	struct cons_polledio *c = cons_polledio;
127 
128 	/* This really shouldn't happen */
129 	if (boot_console_type(NULL) == CONS_HYPERVISOR)
130 		return;
131 
132 	if (use_polledio == B_TRUE)
133 		c->cons_polledio_putchar(c->cons_polledio_argument, m);
134 	else
135 		bcons_putchar(m);
136 }
137 
138 static void
139 xpv_panic_puts(char *msg)
140 {
141 	char *m;
142 
143 	dump_timeleft = dump_timeout;
144 	for (m = msg; *m; m++)
145 		xpv_panic_putc((int)*m);
146 }
147 
148 static void
149 xpv_panic_console_print(const char *fmt, ...)
150 {
151 	va_list ap;
152 
153 	va_start(ap, fmt);
154 	(void) vsnprintf(console_buffer, sizeof (console_buffer), fmt, ap);
155 	va_end(ap);
156 
157 	xpv_panic_puts(console_buffer);
158 }
159 
160 static void
161 xpv_panic_map(int level, pfn_t pfn)
162 {
163 	x86pte_t pte, *pteptr;
164 
165 	/*
166 	 * The provided pfn represents a level 'level' page table.  Map it
167 	 * into the 'level' slot in the list of page table windows.
168 	 */
169 	pteptr = (x86pte_t *)PWIN_PTE_VA(level);
170 	pte = pfn_to_pa(pfn) | PT_VALID;
171 
172 	XPV_ALLOW_PAGETABLE_UPDATES();
173 	if (mmu.pae_hat)
174 		*pteptr = pte;
175 	else
176 		*(x86pte32_t *)pteptr = pte;
177 	XPV_DISALLOW_PAGETABLE_UPDATES();
178 
179 	mmu_flush_tlb_page((uintptr_t)PWIN_VA(level));
180 }
181 
/*
 * Walk the page tables to find the pfn mapped by the given va.
 *
 * The search starts at *vaddr and moves upward through the page tables
 * rooted at xpv_panic_cr3 until it finds a valid page mapping below
 * xpv_end.  The APIC page and the two kpm-like regions are skipped.
 *
 * On success *vaddr is updated to the VA that was found and the MFN
 * backing it is returned with PFN_IS_FOREIGN_MFN set.  If no mapping is
 * found in the range, PFN_INVALID is returned and *vaddr is left alone.
 */
static pfn_t
xpv_va_walk(uintptr_t *vaddr)
{
	int l, idx;
	pfn_t pfn;
	x86pte_t pte;
	x86pte_t *ptep;
	uintptr_t va = *vaddr;
	uintptr_t scan_va;
	caddr_t ptable_window;
	static pfn_t toplevel_pfn;
	static uintptr_t lastva;	/* VA returned by the previous call */

	pte = 0;
	/*
	 * If we do anything other than a simple scan through memory, don't
	 * trust the mapped page tables.
	 */
	if (va != lastva + MMU_PAGESIZE)
		for (l = mmu.max_level; l >= 0; l--)
			ptable_pfn[l] = PFN_INVALID;

	toplevel_pfn = mmu_btop(xpv_panic_cr3);

	/*
	 * The "va >= *vaddr" tests here and below stop the walk once va has
	 * moved backwards; presumably this catches address wraparound.
	 */
	while (va < xpv_end && va >= *vaddr) {
		/* Find the lowest table with any entry for va */
		pfn = toplevel_pfn;
		for (l = mmu.max_level; l >= 0; l--) {
			/* Only remap the window if it shows a different table. */
			if (ptable_pfn[l] != pfn) {
				xpv_panic_map(l, pfn);
				ptable_pfn[l] = pfn;
			}

			/*
			 * Search this pagetable for any mapping to an
			 * address >= va.
			 */
			ptable_window = PWIN_VA(l);
			if (l == mmu.max_level && mmu.pae_hat)
				ptable_window +=
				    (xpv_panic_cr3 & MMU_PAGEOFFSET);

			idx = (va >> LEVEL_SHIFT(l)) & (xpv_panic_nptes[l] - 1);
			scan_va = va;
			while (idx < xpv_panic_nptes[l] && scan_va < xpv_end &&
			    scan_va >= *vaddr) {
				ptep = (x86pte_t *)(ptable_window +
				    (idx << mmu.pte_size_shift));
				pte = GET_PTE(ptep);
				if (pte & PTE_VALID)
					break;
				idx++;
				scan_va += mmu.level_size[l];
			}

			/*
			 * If there are no valid mappings in this table, we
			 * can skip to the end of the VA range it covers.
			 */
			if (idx == xpv_panic_nptes[l]) {
				va = NEXT_ENTRY_VA(va, l + 1);
				break;
			}

			va = scan_va;
			/*
			 * See if we've hit the end of the range.
			 */
			if (va >= xpv_end || va < *vaddr)
				break;

			/*
			 * If this mapping is for a pagetable, we drop down
			 * to the next level in the hierarchy and look for
			 * a mapping in it.
			 */
			pfn = PTE2MFN(pte, l);
			if (!PTE_ISPAGE(pte, l))
				continue;

			/*
			 * The APIC page is magic.  Nothing to see here;
			 * move along.
			 */
			if (((uintptr_t)xpv_apicadr & MMU_PAGEMASK) ==
			    (va & MMU_PAGEMASK)) {
				va += MMU_PAGESIZE;
				break;
			}

			/*
			 * See if the address is within one of the two
			 * kpm-like regions we want to skip.
			 */
			if (va >= kpm1_low && va < kpm1_high) {
				va = kpm1_high;
				break;
			}
			if (va >= kpm2_low && va < kpm2_high) {
				va = kpm2_high;
				break;
			}

			/*
			 * The Xen panic code only handles small pages.  If
			 * this mapping is for a large page, we need to
			 * identify the constituent page that covers the
			 * specific VA we were looking for.
			 */
			if (l > 0) {
				if (l > 1)
					panic("Xen panic can't cope with "
					    "giant pages.");
				idx = (va >> LEVEL_SHIFT(0)) &
				    (xpv_panic_nptes[0] - 1);
				pfn += idx;
			}

			/* Found a page: report its VA and remember it. */
			*vaddr = va;
			lastva = va;
			return (pfn | PFN_IS_FOREIGN_MFN);
		}
	}
	return (PFN_INVALID);
}
310 
311 /*
312  * Walk through the Xen VA space, finding pages that are mapped in.
313  *
314  * These pages all have MFNs rather than PFNs, meaning they may be outside
315  * the physical address space the kernel knows about, or they may collide
316  * with PFNs the kernel is using.
317  *
318  * The obvious trick of just adding the PFN_IS_FOREIGN_MFN bit to the MFNs
319  * to avoid collisions doesn't work.  The pages need to be written to disk
320  * in PFN-order or savecore gets confused.  We can't allocate memory to
321  * contruct a sorted pfn->VA reverse mapping, so we have to write the pages
322  * to disk in VA order.
323  *
324  * To square this circle, we simply make up PFNs for each of Xen's pages.
325  * We assign each mapped page a fake PFN in ascending order.  These fake
326  * PFNs each have the FOREIGN bit set, ensuring that they fall outside the
327  * range of Solaris PFNs written by the kernel.
328  */
329 int
330 dump_xpv_addr()
331 {
332 	uintptr_t va;
333 	mem_vtop_t mem_vtop;
334 
335 	xpv_dump_pages = 0;
336 	va = xen_virt_start;
337 
338 	while (xpv_va_walk(&va) != PFN_INVALID) {
339 		mem_vtop.m_as = &kas;
340 		mem_vtop.m_va = (void *)va;
341 		mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
342 
343 		dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
344 		xpv_dump_pages++;
345 
346 		va += MMU_PAGESIZE;
347 	}
348 
349 	/*
350 	 * Add the shared_info page.  This page actually ends up in the
351 	 * dump twice: once for the Xen va and once for the Solaris va.
352 	 * This isn't ideal, but we don't know the address Xen is using for
353 	 * the page, so we can't share it.
354 	 */
355 	mem_vtop.m_as = &kas;
356 	mem_vtop.m_va = HYPERVISOR_shared_info;
357 	mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
358 	dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
359 	xpv_dump_pages++;
360 
361 	return (xpv_dump_pages);
362 }
363 
364 void
365 dump_xpv_pfn()
366 {
367 	pfn_t pfn;
368 	int cnt;
369 
370 	for (cnt = 0; cnt < xpv_dump_pages; cnt++) {
371 		pfn = (pfn_t)cnt | PFN_IS_FOREIGN_MFN;
372 		dumpvp_write(&pfn, sizeof (pfn));
373 	}
374 }
375 
376 int
377 dump_xpv_data(void *dump_cbuf)
378 {
379 	uintptr_t va;
380 	uint32_t csize;
381 	int cnt = 0;
382 
383 	/*
384 	 * XXX: we should probably run this data through a UE check.  The
385 	 * catch is that the UE code relies on on_trap() and getpfnum()
386 	 * working.
387 	 */
388 	va = xen_virt_start;
389 
390 	while (xpv_va_walk(&va) != PFN_INVALID) {
391 		csize = (uint32_t)compress((void *)va, dump_cbuf, PAGESIZE);
392 		dumpvp_write(&csize, sizeof (uint32_t));
393 		dumpvp_write(dump_cbuf, csize);
394 		if (dump_ioerr) {
395 			dumphdr->dump_flags &= ~DF_COMPLETE;
396 			return (cnt);
397 		}
398 		cnt++;
399 		va += MMU_PAGESIZE;
400 	}
401 
402 	/*
403 	 * Finally, dump the shared_info page
404 	 */
405 	csize = (uint32_t)compress((void *)HYPERVISOR_shared_info, dump_cbuf,
406 	    PAGESIZE);
407 	dumpvp_write(&csize, sizeof (uint32_t));
408 	dumpvp_write(dump_cbuf, csize);
409 	if (dump_ioerr)
410 		dumphdr->dump_flags &= ~DF_COMPLETE;
411 	cnt++;
412 
413 	return (cnt);
414 }
415 
416 static void *
417 showstack(void *fpreg, int xpv_only)
418 {
419 	struct frame *fpp;
420 	ulong_t off;
421 	char *sym;
422 	uintptr_t pc, fp, lastfp;
423 	uintptr_t minaddr = min(KERNELBASE, xen_virt_start);
424 
425 	fp = (uintptr_t)fpreg;
426 	if (fp < minaddr) {
427 		xpv_panic_printf("Bad frame ptr: 0x%p\n", fpreg);
428 		return (fpreg);
429 	}
430 
431 	do {
432 		fpp = (struct frame *)fp;
433 		pc = fpp->fr_savpc;
434 
435 		if ((xpv_only != 0) &&
436 		    (fp > xpv_end || fp < xen_virt_start))
437 			break;
438 		if ((sym = kobj_getsymname(pc, &off)) != NULL)
439 			xpv_panic_printf("%08lx %s:%s+%lx\n", fp,
440 			    mod_containing_pc((caddr_t)pc), sym, off);
441 		else if ((pc >= xen_virt_start) && (pc <= xpv_end))
442 			xpv_panic_printf("%08lx 0x%lx (in Xen)\n", fp, pc);
443 		else
444 			xpv_panic_printf("%08lx %lx\n", fp, pc);
445 
446 		lastfp = fp;
447 		fp = fpp->fr_savfp;
448 
449 		/*
450 		 * Xen marks an exception frame by inverting the frame
451 		 * pointer.
452 		 */
453 		if (fp < lastfp) {
454 			if ((~fp > minaddr) && ((~fp) ^ lastfp) < 0xfff)
455 				fp = ~fp;
456 		}
457 	} while (fp > lastfp);
458 	return ((void *)fp);
459 }
460 
/*
 * Print the Xen-only portion of a stack trace starting at 'fpreg' and
 * return the frame pointer at which showstack() stopped.
 */
void *
xpv_traceback(void *fpreg)
{
	void *stopped_at = showstack(fpreg, 1);

	return (stopped_at);
}
466 
467 #if defined(__amd64)
468 static void
469 xpv_panic_hypercall(ulong_t call)
470 {
471 	panic("Illegally issued hypercall %d during panic!\n", (int)call);
472 }
473 #endif
474 
475 void
476 xpv_die(struct regs *rp)
477 {
478 	struct panic_trap_info ti;
479 	struct cregs creg;
480 
481 	ti.trap_regs = rp;
482 	ti.trap_type = rp->r_trapno;
483 
484 	curthread->t_panic_trap = &ti;
485 	if (ti.trap_type == T_PGFLT) {
486 		getcregs(&creg);
487 		ti.trap_addr = (caddr_t)creg.cr_cr2;
488 		panic("Fatal pagefault at 0x%lx.  fault addr=0x%p  rp=0x%p",
489 		    rp->r_pc, (void *)ti.trap_addr, (void *)rp);
490 	} else {
491 		ti.trap_addr = (caddr_t)rp->r_pc;
492 		panic("Fatal trap %ld at 0x%lx.  rp=0x%p", rp->r_trapno,
493 		    rp->r_pc, (void *)rp);
494 	}
495 }
496 
497 /*
498  * Build IDT to handle a Xen panic
499  */
500 static void
501 switch_to_xpv_panic_idt()
502 {
503 	int i;
504 	desctbr_t idtr;
505 	gate_desc_t *idt = xpv_panic_idt;
506 	selector_t cs = get_cs_register();
507 
508 	for (i = 0; i < 32; i++)
509 		set_gatesegd(&idt[i], &xpv_invaltrap, cs, SDT_SYSIGT, TRP_XPL,
510 		    0);
511 
512 	set_gatesegd(&idt[T_ZERODIV], &xpv_div0trap, cs, SDT_SYSIGT, TRP_XPL,
513 	    0);
514 	set_gatesegd(&idt[T_SGLSTP], &xpv_dbgtrap, cs, SDT_SYSIGT, TRP_XPL, 0);
515 	set_gatesegd(&idt[T_NMIFLT], &xpv_nmiint, cs, SDT_SYSIGT, TRP_XPL, 0);
516 	set_gatesegd(&idt[T_BOUNDFLT], &xpv_boundstrap, cs, SDT_SYSIGT,
517 	    TRP_XPL, 0);
518 	set_gatesegd(&idt[T_ILLINST], &xpv_invoptrap, cs, SDT_SYSIGT, TRP_XPL,
519 	    0);
520 	set_gatesegd(&idt[T_NOEXTFLT], &xpv_ndptrap, cs, SDT_SYSIGT, TRP_XPL,
521 	    0);
522 	set_gatesegd(&idt[T_TSSFLT], &xpv_invtsstrap, cs, SDT_SYSIGT, TRP_XPL,
523 	    0);
524 	set_gatesegd(&idt[T_SEGFLT], &xpv_segnptrap, cs, SDT_SYSIGT, TRP_XPL,
525 	    0);
526 	set_gatesegd(&idt[T_STKFLT], &xpv_stktrap, cs, SDT_SYSIGT, TRP_XPL, 0);
527 	set_gatesegd(&idt[T_GPFLT], &xpv_gptrap, cs, SDT_SYSIGT, TRP_XPL, 0);
528 	set_gatesegd(&idt[T_PGFLT], &xpv_pftrap, cs, SDT_SYSIGT, TRP_XPL, 0);
529 	set_gatesegd(&idt[T_EXTERRFLT], &xpv_ndperr, cs, SDT_SYSIGT, TRP_XPL,
530 	    0);
531 	set_gatesegd(&idt[T_ALIGNMENT], &xpv_achktrap, cs, SDT_SYSIGT, TRP_XPL,
532 	    0);
533 	set_gatesegd(&idt[T_MCE], &xpv_mcetrap, cs, SDT_SYSIGT, TRP_XPL, 0);
534 	set_gatesegd(&idt[T_SIMDFPE], &xpv_xmtrap, cs, SDT_SYSIGT, TRP_XPL, 0);
535 
536 	/*
537 	 * We have no double fault handler.  Any single fault represents a
538 	 * catastrophic failure for us, so there is no attempt to handle
539 	 * them cleanly: we just print a message and reboot.  If we
540 	 * encounter a second fault while doing that, there is nothing
541 	 * else we can do.
542 	 */
543 
544 	/*
545 	 * Be prepared to absorb any stray device interrupts received
546 	 * while writing the core to disk.
547 	 */
548 	for (i = 33; i < NIDT; i++)
549 		set_gatesegd(&idt[i], &xpv_surprise_intr, cs, SDT_SYSIGT,
550 		    TRP_XPL, 0);
551 
552 	/* The one interrupt we expect to get is from the APIC timer.  */
553 	set_gatesegd(&idt[T_XPV_TIMER], &xpv_timer_trap, cs, SDT_SYSIGT,
554 	    TRP_XPL, 0);
555 
556 	idtr.dtr_base = (uintptr_t)xpv_panic_idt;
557 	idtr.dtr_limit = sizeof (xpv_panic_idt) - 1;
558 	wr_idtr(&idtr);
559 
560 #if defined(__amd64)
561 	/* Catch any hypercalls. */
562 	wrmsr(MSR_AMD_LSTAR, (uintptr_t)xpv_panic_hypercall);
563 	wrmsr(MSR_AMD_CSTAR, (uintptr_t)xpv_panic_hypercall);
564 #endif
565 }
566 
567 static void
568 xpv_apic_clkinit()
569 {
570 	uint_t		apic_ticks = 0;
571 
572 	/*
573 	 * Measure how many APIC ticks there are within a fixed time
574 	 * period.  We're going to be fairly coarse here.  This timer is
575 	 * just being used to detect a stalled panic, so as long as we have
576 	 * the right order of magnitude, everything should be fine.
577 	 */
578 	xpv_apicadr[APIC_SPUR_INT_REG] = AV_UNIT_ENABLE | APIC_SPUR_INTR;
579 	xpv_apicadr[APIC_LOCAL_TIMER] = AV_MASK;
580 	xpv_apicadr[APIC_INT_VECT0] = AV_MASK;	/* local intr reg 0 */
581 
582 	xpv_apicadr[APIC_DIVIDE_REG] = 0;
583 	xpv_apicadr[APIC_INIT_COUNT] = APIC_MAXVAL;
584 	drv_usecwait(XPV_TIMER_INTERVAL);
585 	apic_ticks = APIC_MAXVAL - xpv_apicadr[APIC_CURR_COUNT];
586 
587 	/*
588 	 * apic_ticks now represents roughly how many apic ticks comprise
589 	 * one timeout interval.  Program the timer to send us an interrupt
590 	 * every time that interval expires.
591 	 */
592 	xpv_apicadr[APIC_LOCAL_TIMER] = T_XPV_TIMER | AV_PERIODIC;
593 	xpv_apicadr[APIC_INIT_COUNT] = apic_ticks;
594 	xpv_apicadr[APIC_EOI_REG] = 0;
595 }
596 
597 void
598 xpv_timer_tick(void)
599 {
600 	static int ticks = 0;
601 
602 	if (ticks++ >= MICROSEC / XPV_TIMER_INTERVAL) {
603 		ticks = 0;
604 		if (dump_timeleft && (--dump_timeleft == 0))
605 			panic("Xen panic timeout\n");
606 	}
607 	xpv_apicadr[APIC_EOI_REG] = 0;
608 }
609 
610 void
611 xpv_interrupt(void)
612 {
613 #ifdef	DEBUG
614 	static int cnt = 0;
615 
616 	if (cnt++ < 10)
617 		xpv_panic_printf("Unexpected interrupt received.\n");
618 	if ((cnt < 1000) && ((cnt % 100) == 0))
619 		xpv_panic_printf("%d unexpected interrupts received.\n", cnt);
620 #endif
621 
622 	xpv_apicadr[APIC_EOI_REG] = 0;
623 }
624 
625 /*
626  * Managing time in panic context is trivial.  We only have a single CPU,
627  * we never get rescheduled, we never get suspended.  We just need to
628  * convert clock ticks into nanoseconds.
629  */
630 static hrtime_t
631 xpv_panic_gethrtime(void)
632 {
633 	hrtime_t tsc, hrt;
634 	unsigned int *l = (unsigned int *)&(tsc);
635 
636 	tsc = __rdtsc_insn();
637 	hrt = (mul32(l[1], nsec_scale) << NSEC_SHIFT) +
638 	    (mul32(l[0], nsec_scale) >> (32 - NSEC_SHIFT));
639 
640 	return (hrt);
641 }
642 
643 static void
644 xpv_panic_time_init()
645 {
646 	nsec_scale =
647 	    CPU->cpu_m.mcpu_vcpu_info->time.tsc_to_system_mul >> NSEC_SHIFT;
648 
649 	gethrtimef = xpv_panic_gethrtime;
650 }
651 
/*
 * Varargs front end to panicsys(): packages the caller's arguments into a
 * va_list and passes them on together with the saved register set.  The
 * final panicsys() argument is always 1.
 */
static void
xpv_panicsys(struct regs *rp, char *fmt, ...)
{
	extern void panicsys(const char *, va_list, struct regs *, int);
	va_list args;

	va_start(args, fmt);
	panicsys(fmt, args, rp, 1);
	va_end(args);
}
662 
663 void
664 xpv_do_panic(void *arg)
665 {
666 	struct panic_info *pip = (struct panic_info *)arg;
667 	int l;
668 	struct cregs creg;
669 #if defined(__amd64)
670 	extern uintptr_t postbootkernelbase;
671 #endif
672 
673 	if (xpv_panicking++ > 0)
674 		panic("multiple calls to xpv_do_panic()");
675 
676 	/*
677 	 * Indicate to the underlying panic framework that a panic has been
678 	 * initiated.  This is ordinarily done as part of vpanic().  Since
679 	 * we already have all the register state saved by the hypervisor,
680 	 * we skip that and jump straight into the panic processing code.
681 	 *
682 	 * XXX If another thread grabs and wins the panic_quiesce trigger
683 	 * then we'll have two threads in panicsys believing they are in
684 	 * charge of the panic attempt!
685 	 */
686 	(void) panic_trigger(&panic_quiesce);
687 
688 #if defined(__amd64)
689 	/*
690 	 * bzero() and bcopy() get unhappy when asked to operate on
691 	 * addresses outside of the kernel.  At this point Xen is really a
692 	 * part of the kernel, so we update the routines' notion of where
693 	 * the kernel starts.
694 	 */
695 	postbootkernelbase = xen_virt_start;
696 #endif
697 
698 #if defined(HYPERVISOR_VIRT_END)
699 	xpv_end = HYPERVISOR_VIRT_END;
700 #else
701 	xpv_end = (uintptr_t)UINTPTR_MAX - sizeof (uintptr_t);
702 #endif
703 
704 	/*
705 	 * If we were redirecting console output to the hypervisor, we have
706 	 * to stop.
707 	 */
708 	use_polledio = B_FALSE;
709 	if (boot_console_type(NULL) == CONS_HYPERVISOR) {
710 		bcons_device_change(CONS_HYPERVISOR);
711 	} else if (cons_polledio != NULL &&
712 	    cons_polledio->cons_polledio_putchar != NULL)  {
713 		if (cons_polledio->cons_polledio_enter != NULL)
714 			cons_polledio->cons_polledio_enter(
715 			    cons_polledio->cons_polledio_argument);
716 		use_polledio = 1;
717 	}
718 
719 	/* Make sure we handle all console output from here on. */
720 	sysp->bsvc_putchar = xpv_panic_putc;
721 
722 	/*
723 	 * If we find an unsupported panic_info structure, there's not much
724 	 * we can do other than complain, plow on, and hope for the best.
725 	 */
726 	if (pip->pi_version != PANIC_INFO_VERSION)
727 		xpv_panic_printf("Warning: Xen is using an unsupported "
728 		    "version of the panic_info structure.\n");
729 
730 	xpv_panic_info = pip;
731 
732 #if defined(__amd64)
733 	kpm1_low = (uintptr_t)xpv_panic_info->pi_ram_start;
734 	if (xpv_panic_info->pi_xen_start == NULL) {
735 		kpm1_high = (uintptr_t)xpv_panic_info->pi_ram_end;
736 	} else {
737 		kpm1_high = (uintptr_t)xpv_panic_info->pi_xen_start;
738 		kpm2_low = (uintptr_t)xpv_panic_info->pi_xen_end;
739 		kpm2_high = (uintptr_t)xpv_panic_info->pi_ram_end;
740 	}
741 #endif
742 
743 	/*
744 	 * Make sure we are running on the Solaris %gs.  The Xen panic code
745 	 * should already have set up the GDT properly.
746 	 */
747 	xpv_panic_resetgs();
748 #if defined(__amd64)
749 	wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);
750 #endif
751 
752 	xpv_panic_time_init();
753 
754 	/*
755 	 * Switch to our own IDT, avoiding any accidental returns to Xen
756 	 * world.
757 	 */
758 	switch_to_xpv_panic_idt();
759 
760 	/*
761 	 * Initialize the APIC timer, which is used to detect a hung dump
762 	 * attempt.
763 	 */
764 	xpv_apicadr = pip->pi_apic;
765 	xpv_apic_clkinit();
766 
767 	/*
768 	 * Set up a few values that we'll need repeatedly.
769 	 */
770 	getcregs(&creg);
771 	xpv_panic_cr3 = creg.cr_cr3;
772 	for (l = mmu.max_level; l >= 0; l--)
773 		xpv_panic_nptes[l] = mmu.ptes_per_table;
774 #ifdef __i386
775 	if (mmu.pae_hat)
776 		xpv_panic_nptes[mmu.max_level] = 4;
777 #endif
778 
779 	/* Add the fake Xen module to the module list */
780 	if (xpv_module != NULL) {
781 		extern int last_module_id;
782 
783 		xpv_modctl->mod_id = last_module_id++;
784 		xpv_modctl->mod_next = &modules;
785 		xpv_modctl->mod_prev = modules.mod_prev;
786 		modules.mod_prev->mod_next = xpv_modctl;
787 		modules.mod_prev = xpv_modctl;
788 	}
789 
790 	if (pip->pi_mca.mpd_magic == MCA_PANICDATA_MAGIC)
791 		xpv_mca_panic_data = &pip->pi_mca;
792 
793 	xpv_panic_printf = printf;
794 	xpv_panicsys((struct regs *)pip->pi_regs, pip->pi_panicstr);
795 	xpv_panic_printf("Failed to reboot following panic.\n");
796 	for (;;)
797 		;
798 }
799 
800 /*
801  * Set up the necessary data structures to pretend that the Xen hypervisor
802  * is a loadable module, allowing mdb to find the Xen symbols in a crash
803  * dump.  Since these symbols all map to VA space Solaris doesn't normally
804  * have access to, we don't link these structures into the kernel's lists
805  * until/unless we hit a Xen panic.
806  *
807  * The observant reader will note a striking amount of overlap between this
808  * code and that found in krtld.  While it would be handy if we could just
809  * ask krtld to do this work for us, it's not that simple.  Among the
810  * complications: we're not actually loading the text here (grub did it at
811  * boot), the .text section is writable, there are no relocations to do,
812  * none of the module text/data is in readable memory, etc.  Training krtld
813  * to deal with this weird module is as complicated, and more risky, than
814  * reimplementing the necessary subset of it here.
815  */
816 static void
817 init_xen_module()
818 {
819 	struct _buf *file = NULL;
820 	struct module *mp;
821 	struct modctl *mcp;
822 	int i, shn;
823 	Shdr *shp, *ctf_shp;
824 	char *names = NULL;
825 	size_t n, namesize, text_align, data_align;
826 #if defined(__amd64)
827 	const char machine = EM_AMD64;
828 #else
829 	const char machine = EM_386;
830 #endif
831 
832 	/* Allocate and init the module structure */
833 	mp = kmem_zalloc(sizeof (*mp), KM_SLEEP);
834 	mp->filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
835 	(void) strcpy(mp->filename, XPV_FILENAME);
836 
837 	/* Allocate and init the modctl structure */
838 	mcp = kmem_zalloc(sizeof (*mcp), KM_SLEEP);
839 	mcp->mod_modname = kobj_zalloc(strlen(XPV_MODNAME) + 1, KM_SLEEP);
840 	(void) strcpy(mcp->mod_modname, XPV_MODNAME);
841 	mcp->mod_filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
842 	(void) strcpy(mcp->mod_filename, XPV_FILENAME);
843 	mcp->mod_inprogress_thread = (kthread_id_t)-1;
844 	mcp->mod_ref = 1;
845 	mcp->mod_loaded = 1;
846 	mcp->mod_loadcnt = 1;
847 	mcp->mod_mp = mp;
848 
849 	/*
850 	 * Try to open a Xen image that hasn't had its symbol and CTF
851 	 * information stripped off.
852 	 */
853 	file = kobj_open_file(XPV_FILENAME);
854 	if (file == (struct _buf *)-1) {
855 		file = NULL;
856 		goto err;
857 	}
858 
859 	/*
860 	 * Read the header and ensure that this is an ELF file for the
861 	 * proper ISA.  If it's not, somebody has done something very
862 	 * stupid.  Why bother?  See Mencken.
863 	 */
864 	if (kobj_read_file(file, (char *)&mp->hdr, sizeof (mp->hdr), 0) < 0)
865 		goto err;
866 	for (i = 0; i < SELFMAG; i++)
867 		if (mp->hdr.e_ident[i] != ELFMAG[i])
868 			goto err;
869 	if ((mp->hdr.e_ident[EI_DATA] != ELFDATA2LSB) ||
870 	    (mp->hdr.e_machine != machine))
871 		goto err;
872 
873 	/* Read in the section headers */
874 	n = mp->hdr.e_shentsize * mp->hdr.e_shnum;
875 	mp->shdrs = kmem_zalloc(n, KM_SLEEP);
876 	if (kobj_read_file(file, mp->shdrs, n, mp->hdr.e_shoff) < 0)
877 		goto err;
878 
879 	/* Read the section names */
880 	shp = (Shdr *)(mp->shdrs + mp->hdr.e_shstrndx * mp->hdr.e_shentsize);
881 	namesize = shp->sh_size;
882 	names = kmem_zalloc(shp->sh_size, KM_SLEEP);
883 	if (kobj_read_file(file, names, shp->sh_size, shp->sh_offset) < 0)
884 		goto err;
885 
886 	/*
887 	 * Fill in the text and data size fields.
888 	 */
889 	ctf_shp = NULL;
890 	text_align = data_align = 0;
891 	for (shn = 1; shn < mp->hdr.e_shnum; shn++) {
892 		shp = (Shdr *)(mp->shdrs + shn * mp->hdr.e_shentsize);
893 
894 		/* Sanity check the offset of the section name */
895 		if (shp->sh_name >= namesize)
896 			continue;
897 
898 		/* If we find the symtab section, remember it for later. */
899 		if (shp->sh_type == SHT_SYMTAB) {
900 			mp->symtbl_section = shn;
901 			mp->symhdr = shp;
902 			continue;
903 		}
904 
905 		/* If we find the CTF section, remember it for later. */
906 		if ((shp->sh_size != 0) &&
907 		    (strcmp(names + shp->sh_name, ".SUNW_ctf") == 0)) {
908 			ctf_shp = shp;
909 			continue;
910 		}
911 
912 		if (!(shp->sh_flags & SHF_ALLOC))
913 			continue;
914 
915 		/*
916 		 * Xen marks its text section as writable, so we need to
917 		 * look for the name - not just the flag.
918 		 */
919 		if ((strcmp(&names[shp->sh_name], ".text") != 0) &&
920 		    (shp->sh_flags & SHF_WRITE) != 0) {
921 			if (shp->sh_addralign > data_align)
922 				data_align = shp->sh_addralign;
923 			mp->data_size = ALIGN(mp->data_size, data_align);
924 			mp->data_size += ALIGN(shp->sh_size, 8);
925 			if (mp->data == NULL || mp->data > (char *)shp->sh_addr)
926 				mp->data = (char *)shp->sh_addr;
927 		} else {
928 			if (shp->sh_addralign > text_align)
929 				text_align = shp->sh_addralign;
930 			mp->text_size = ALIGN(mp->text_size, text_align);
931 			mp->text_size += ALIGN(shp->sh_size, 8);
932 			if (mp->text == NULL || mp->text > (char *)shp->sh_addr)
933 				mp->text = (char *)shp->sh_addr;
934 		}
935 	}
936 	kmem_free(names, namesize);
937 	names = NULL;
938 	shp = NULL;
939 	mcp->mod_text = mp->text;
940 	mcp->mod_text_size = mp->text_size;
941 
942 	/*
943 	 * If we have symbol table and string table sections, read them in
944 	 * now.  If we don't, we just plow on.  We'll still get a valid
945 	 * core dump, but finding anything useful will be just a bit
946 	 * harder.
947 	 *
948 	 * Note: we don't bother with a hash table.  We'll never do a
949 	 * symbol lookup unless we crash, and then mdb creates its own.  We
950 	 * also don't try to perform any relocations.  Xen should be loaded
951 	 * exactly where the ELF file indicates, and the symbol information
952 	 * in the file should be complete and correct already.  Static
953 	 * linking ain't all bad.
954 	 */
955 	if ((mp->symhdr != NULL) && (mp->symhdr->sh_link < mp->hdr.e_shnum)) {
956 		mp->strhdr = (Shdr *)
957 		    (mp->shdrs + mp->symhdr->sh_link * mp->hdr.e_shentsize);
958 		mp->nsyms = mp->symhdr->sh_size / mp->symhdr->sh_entsize;
959 
960 		/* Allocate space for the symbol table and strings.  */
961 		mp->symsize = mp->symhdr->sh_size +
962 		    mp->nsyms * sizeof (symid_t) + mp->strhdr->sh_size;
963 		mp->symspace = kmem_zalloc(mp->symsize, KM_SLEEP);
964 		mp->symtbl = mp->symspace;
965 		mp->strings = (char *)(mp->symtbl + mp->symhdr->sh_size);
966 
967 		if ((kobj_read_file(file, mp->symtbl,
968 		    mp->symhdr->sh_size, mp->symhdr->sh_offset) < 0) ||
969 		    (kobj_read_file(file, mp->strings,
970 		    mp->strhdr->sh_size, mp->strhdr->sh_offset) < 0))
971 			goto err;
972 	}
973 
974 	/*
975 	 * Read in the CTF section
976 	 */
977 	if ((ctf_shp != NULL) && ((moddebug & MODDEBUG_NOCTF) == 0)) {
978 		mp->ctfdata = kmem_zalloc(ctf_shp->sh_size, KM_SLEEP);
979 		mp->ctfsize = ctf_shp->sh_size;
980 		if (kobj_read_file(file, mp->ctfdata, mp->ctfsize,
981 		    ctf_shp->sh_offset) < 0)
982 			goto err;
983 	}
984 
985 	kobj_close_file(file);
986 
987 	xpv_module = mp;
988 	xpv_modctl = mcp;
989 	return;
990 
991 err:
992 	cmn_err(CE_WARN, "Failed to initialize xpv module.");
993 	if (file != NULL)
994 		kobj_close_file(file);
995 
996 	kmem_free(mp->filename, strlen(XPV_FILENAME) + 1);
997 	if (mp->shdrs != NULL)
998 		kmem_free(mp->shdrs, mp->hdr.e_shentsize * mp->hdr.e_shnum);
999 	if (mp->symspace != NULL)
1000 		kmem_free(mp->symspace, mp->symsize);
1001 	if (mp->ctfdata != NULL)
1002 		kmem_free(mp->ctfdata, mp->ctfsize);
1003 	kmem_free(mp, sizeof (*mp));
1004 	kmem_free(mcp->mod_filename, strlen(XPV_FILENAME) + 1);
1005 	kmem_free(mcp->mod_modname, strlen(XPV_MODNAME) + 1);
1006 	kmem_free(mcp, sizeof (*mcp));
1007 	if (names != NULL)
1008 		kmem_free(names, namesize);
1009 }
1010 
1011 void
1012 xpv_panic_init()
1013 {
1014 	xen_platform_op_t op;
1015 	int i;
1016 
1017 	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
1018 
1019 	for (i = 0; i < mmu.num_level; i++)
1020 		ptable_pfn[i] = PFN_INVALID;
1021 
1022 	/* Let Xen know where to jump if/when it panics. */
1023 	op.cmd = XENPF_panic_init;
1024 	op.interface_version = XENPF_INTERFACE_VERSION;
1025 	op.u.panic_init.panic_addr = (unsigned long)xpv_panic_hdlr;
1026 
1027 	(void) HYPERVISOR_platform_op(&op);
1028 
1029 	init_xen_module();
1030 }
1031