xref: /titanic_50/usr/src/uts/i86xpv/os/xpv_panic.c (revision ff3124eff995e6cd8ebd8c6543648e0670920034)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/clock.h>
31 #include <sys/psm.h>
32 #include <sys/archsystm.h>
33 #include <sys/machsystm.h>
34 #include <sys/compress.h>
35 #include <sys/modctl.h>
36 #include <sys/trap.h>
37 #include <sys/panic.h>
38 #include <sys/regset.h>
39 #include <sys/frame.h>
40 #include <sys/kobj.h>
41 #include <sys/apic.h>
42 #include <sys/dumphdr.h>
43 #include <sys/mem.h>
44 #include <sys/x86_archext.h>
45 #include <sys/xpv_panic.h>
46 #include <sys/boot_console.h>
47 #include <sys/bootsvcs.h>
48 #include <sys/consdev.h>
49 #include <vm/hat_pte.h>
50 #include <vm/hat_i86.h>
51 
/*
 * Name of the fake module that represents Xen, and the path of the
 * unstripped Xen image its symbols are read from.
 *
 * XXX: need to add a PAE version too, if we ever support both PAE and non
 */
#define	XPV_MODNAME	"xpv"
#if !defined(__i386)
#define	XPV_FILENAME	"/boot/amd64/xen-syms"
#else
#define	XPV_FILENAME	"/boot/xen-syms"
#endif

/* Incremented on every entry to xpv_do_panic(); non-zero once Xen panics. */
int xpv_panicking = 0;

/* The fake Xen module and its modctl, published by init_xen_module(). */
struct modctl *xpv_modctl;
struct module *xpv_module;
64 
/*
 * Round 'x' up to the next multiple of 'a' and yield it as a uintptr_t.
 * An alignment of 0 means "no alignment" and yields 'x' unchanged.  The
 * mask arithmetic only produces a multiple of 'a' when 'a' is a power of
 * two.
 */
#define	ALIGN(x, a)	\
	((a) == 0 ? (uintptr_t)(x) : \
	(((uintptr_t)(x) + ((uintptr_t)(a) - 1l)) & ~((uintptr_t)(a) - 1l)))
67 
68 /* Pointer to the xpv_panic_info structure handed to us by Xen.  */
69 static struct panic_info *xpv_panic_info = NULL;
70 
71 /* Timer support */
72 #define	NSEC_SHIFT 5
73 #define	T_XPV_TIMER	0xd1
74 #define	XPV_TIMER_INTERVAL	1000	/* 1000 microseconds */
75 static uint32_t *xpv_apicadr = NULL;
76 static uint_t	nsec_scale;
77 
78 /* IDT support */
79 #pragma	align	16(xpv_panic_idt)
80 static gate_desc_t	xpv_panic_idt[NIDT];	/* interrupt descriptor table */
81 
82 /* Xen pagetables mapped into our HAT's ptable windows */
83 static pfn_t ptable_pfn[MAX_NUM_LEVEL];
84 
85 /* Number of MMU_PAGESIZE pages we're adding to the Solaris dump */
86 static int xpv_dump_pages;
87 
88 /*
89  * There are up to two large swathes of RAM that we don't want to include
90  * in the dump: those that comprise the Xen version of segkpm.  On 32-bit
91  * systems there is no such region of memory.  On 64-bit systems, there
92  * should be just a single contiguous region that corresponds to all of
93  * physical memory.  The tricky bit is that Xen's heap sometimes lives in
94  * the middle of their segkpm, and is mapped using only kpm-like addresses.
95  * In that case, we need to skip the swathes before and after Xen's heap.
96  */
97 uintptr_t kpm1_low = 0;
98 uintptr_t kpm1_high = 0;
99 uintptr_t kpm2_low = 0;
100 uintptr_t kpm2_high = 0;
101 
102 /*
103  * Some commonly used values that we don't want to recompute over and over.
104  */
105 static int xpv_panic_nptes[MAX_NUM_LEVEL];
106 static ulong_t xpv_panic_cr3;
107 static uintptr_t xpv_end;
108 
109 static void xpv_panic_console_print(const char *fmt, ...);
110 static void (*xpv_panic_printf)(const char *, ...) = xpv_panic_console_print;
111 
112 #define	CONSOLE_BUF_SIZE	256
113 static char console_buffer[CONSOLE_BUF_SIZE];
114 static boolean_t use_polledio;
115 
116 static void
117 xpv_panic_putc(int m)
118 {
119 	struct cons_polledio *c = cons_polledio;
120 
121 	/* This really shouldn't happen */
122 	if (console == CONS_HYPERVISOR)
123 		return;
124 
125 	if (use_polledio == B_TRUE)
126 		c->cons_polledio_putchar(c->cons_polledio_argument, m);
127 	else
128 		bcons_putchar(m);
129 }
130 
131 static void
132 xpv_panic_puts(char *msg)
133 {
134 	char *m;
135 
136 	dump_timeleft = dump_timeout;
137 	for (m = msg; *m; m++)
138 		xpv_panic_putc((int)*m);
139 }
140 
141 static void
142 xpv_panic_console_print(const char *fmt, ...)
143 {
144 	va_list ap;
145 
146 	va_start(ap, fmt);
147 	(void) vsnprintf(console_buffer, sizeof (console_buffer), fmt, ap);
148 	va_end(ap);
149 
150 	xpv_panic_puts(console_buffer);
151 }
152 
153 static void
154 xpv_panic_map(int level, pfn_t pfn)
155 {
156 	x86pte_t pte, *pteptr;
157 
158 	/*
159 	 * The provided pfn represents a level 'level' page table.  Map it
160 	 * into the 'level' slot in the list of page table windows.
161 	 */
162 	pteptr = (x86pte_t *)PWIN_PTE_VA(level);
163 	pte = pfn_to_pa(pfn) | PT_VALID;
164 
165 	XPV_ALLOW_PAGETABLE_UPDATES();
166 	if (mmu.pae_hat)
167 		*pteptr = pte;
168 	else
169 		*(x86pte32_t *)pteptr = pte;
170 	XPV_DISALLOW_PAGETABLE_UPDATES();
171 
172 	mmu_tlbflush_entry(PWIN_VA(level));
173 }
174 
175 /*
176  * Walk the page tables to find the pfn mapped by the given va.
177  */
178 static pfn_t
179 xpv_va_walk(uintptr_t *vaddr)
180 {
181 	int l, idx;
182 	pfn_t pfn;
183 	x86pte_t pte;
184 	x86pte_t *ptep;
185 	uintptr_t va = *vaddr;
186 	uintptr_t scan_va;
187 	caddr_t ptable_window;
188 	static pfn_t toplevel_pfn;
189 	static uintptr_t lastva;
190 
191 	/*
192 	 * If we do anything other than a simple scan through memory, don't
193 	 * trust the mapped page tables.
194 	 */
195 	if (va != lastva + MMU_PAGESIZE)
196 		for (l = mmu.max_level; l >= 0; l--)
197 			ptable_pfn[l] = PFN_INVALID;
198 
199 	toplevel_pfn = mmu_btop(xpv_panic_cr3);
200 
201 	while (va < xpv_end && va >= *vaddr) {
202 		/* Find the lowest table with any entry for va */
203 		pfn = toplevel_pfn;
204 		for (l = mmu.max_level; l >= 0; l--) {
205 			if (ptable_pfn[l] != pfn) {
206 				xpv_panic_map(l, pfn);
207 				ptable_pfn[l] = pfn;
208 			}
209 
210 			/*
211 			 * Search this pagetable for any mapping to an
212 			 * address >= va.
213 			 */
214 			ptable_window = PWIN_VA(l);
215 			if (l == mmu.max_level && mmu.pae_hat)
216 				ptable_window +=
217 				    (xpv_panic_cr3 & MMU_PAGEOFFSET);
218 
219 			idx = (va >> LEVEL_SHIFT(l)) & (xpv_panic_nptes[l] - 1);
220 			scan_va = va;
221 			while (idx < xpv_panic_nptes[l] && scan_va < xpv_end &&
222 			    scan_va >= *vaddr) {
223 				ptep = (x86pte_t *)(ptable_window +
224 				    (idx << mmu.pte_size_shift));
225 				pte = GET_PTE(ptep);
226 				if (pte & PTE_VALID)
227 					break;
228 				idx++;
229 				scan_va += mmu.level_size[l];
230 			}
231 
232 			/*
233 			 * If there are no valid mappings in this table, we
234 			 * can skip to the end of the VA range it covers.
235 			 */
236 			if (idx == xpv_panic_nptes[l]) {
237 				va = NEXT_ENTRY_VA(va, l + 1);
238 				break;
239 			}
240 
241 			va = scan_va;
242 			/*
243 			 * See if we've hit the end of the range.
244 			 */
245 			if (va >= xpv_end || va < *vaddr)
246 				break;
247 
248 			/*
249 			 * If this mapping is for a pagetable, we drop down
250 			 * to the next level in the hierarchy and look for
251 			 * a mapping in it.
252 			 */
253 			pfn = PTE2MFN(pte, l);
254 			if (!PTE_ISPAGE(pte, l))
255 				continue;
256 
257 			/*
258 			 * The APIC page is magic.  Nothing to see here;
259 			 * move along.
260 			 */
261 			if (((uintptr_t)xpv_apicadr & MMU_PAGEMASK) ==
262 			    (va & MMU_PAGEMASK)) {
263 				va += MMU_PAGESIZE;
264 				break;
265 			}
266 
267 			/*
268 			 * See if the address is within one of the two
269 			 * kpm-like regions we want to skip.
270 			 */
271 			if (va >= kpm1_low && va < kpm1_high) {
272 				va = kpm1_high;
273 				break;
274 			}
275 			if (va >= kpm2_low && va < kpm2_high) {
276 				va = kpm2_high;
277 				break;
278 			}
279 
280 			/*
281 			 * The Xen panic code only handles small pages.  If
282 			 * this mapping is for a large page, we need to
283 			 * identify the consituent page that covers the
284 			 * specific VA we were looking for.
285 			 */
286 			if (l > 0) {
287 				if (l > 1)
288 					panic("Xen panic can't cope with "
289 					    "giant pages.");
290 				idx = (va >> LEVEL_SHIFT(0)) &
291 				    (xpv_panic_nptes[0] - 1);
292 				pfn += idx;
293 			}
294 
295 			*vaddr = va;
296 			lastva = va;
297 			return (pfn | PFN_IS_FOREIGN_MFN);
298 		}
299 	}
300 	return (PFN_INVALID);
301 }
302 
303 /*
304  * Walk through the Xen VA space, finding pages that are mapped in.
305  *
306  * These pages all have MFNs rather than PFNs, meaning they may be outside
307  * the physical address space the kernel knows about, or they may collide
308  * with PFNs the kernel is using.
309  *
310  * The obvious trick of just adding the PFN_IS_FOREIGN_MFN bit to the MFNs
311  * to avoid collisions doesn't work.  The pages need to be written to disk
312  * in PFN-order or savecore gets confused.  We can't allocate memory to
313  * construct a sorted pfn->VA reverse mapping, so we have to write the pages
314  * to disk in VA order.
315  *
316  * To square this circle, we simply make up PFNs for each of Xen's pages.
317  * We assign each mapped page a fake PFN in ascending order.  These fake
318  * PFNs each have the FOREIGN bit set, ensuring that they fall outside the
319  * range of Solaris PFNs written by the kernel.
320  */
321 int
322 dump_xpv_addr()
323 {
324 	uintptr_t va;
325 	mem_vtop_t mem_vtop;
326 
327 	xpv_dump_pages = 0;
328 	va = xen_virt_start;
329 
330 	while (xpv_va_walk(&va) != PFN_INVALID) {
331 		mem_vtop.m_as = &kas;
332 		mem_vtop.m_va = (void *)va;
333 		mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
334 
335 		dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
336 		xpv_dump_pages++;
337 
338 		va += MMU_PAGESIZE;
339 	}
340 
341 	/*
342 	 * Add the shared_info page.  This page actually ends up in the
343 	 * dump twice: once for the Xen va and once for the Solaris va.
344 	 * This isn't ideal, but we don't know the address Xen is using for
345 	 * the page, so we can't share it.
346 	 */
347 	mem_vtop.m_as = &kas;
348 	mem_vtop.m_va = HYPERVISOR_shared_info;
349 	mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
350 	dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
351 	xpv_dump_pages++;
352 
353 	return (xpv_dump_pages);
354 }
355 
356 void
357 dump_xpv_pfn()
358 {
359 	pfn_t pfn;
360 	int cnt;
361 
362 	for (cnt = 0; cnt < xpv_dump_pages; cnt++) {
363 		pfn = (pfn_t)cnt | PFN_IS_FOREIGN_MFN;
364 		dumpvp_write(&pfn, sizeof (pfn));
365 	}
366 }
367 
368 int
369 dump_xpv_data(void *dump_cbuf)
370 {
371 	uintptr_t va;
372 	uint32_t csize;
373 	int cnt = 0;
374 
375 	/*
376 	 * XXX: we should probably run this data through a UE check.  The
377 	 * catch is that the UE code relies on on_trap() and getpfnum()
378 	 * working.
379 	 */
380 	va = xen_virt_start;
381 
382 	while (xpv_va_walk(&va) != PFN_INVALID) {
383 		csize = (uint32_t)compress((void *)va, dump_cbuf, PAGESIZE);
384 		dumpvp_write(&csize, sizeof (uint32_t));
385 		dumpvp_write(dump_cbuf, csize);
386 		if (dump_ioerr) {
387 			dumphdr->dump_flags &= ~DF_COMPLETE;
388 			return (cnt);
389 		}
390 		cnt++;
391 		va += MMU_PAGESIZE;
392 	}
393 
394 	/*
395 	 * Finally, dump the shared_info page
396 	 */
397 	csize = (uint32_t)compress((void *)HYPERVISOR_shared_info, dump_cbuf,
398 	    PAGESIZE);
399 	dumpvp_write(&csize, sizeof (uint32_t));
400 	dumpvp_write(dump_cbuf, csize);
401 	if (dump_ioerr)
402 		dumphdr->dump_flags &= ~DF_COMPLETE;
403 	cnt++;
404 
405 	return (cnt);
406 }
407 
408 static void *
409 showstack(void *fpreg, int xpv_only)
410 {
411 	struct frame *fpp;
412 	ulong_t off;
413 	char *sym;
414 	uintptr_t pc, fp, lastfp;
415 	uintptr_t minaddr = min(KERNELBASE, xen_virt_start);
416 
417 	fp = (uintptr_t)fpreg;
418 	if (fp < minaddr) {
419 		xpv_panic_printf("Bad frame ptr: 0x%p\n", fpreg);
420 		return (fpreg);
421 	}
422 
423 	do {
424 		fpp = (struct frame *)fp;
425 		pc = fpp->fr_savpc;
426 
427 		if ((xpv_only != 0) &&
428 		    (fp > xpv_end || fp < xen_virt_start))
429 			break;
430 		if ((sym = kobj_getsymname(pc, &off)) != NULL)
431 			xpv_panic_printf("%08lx %s:%s+%lx\n", fp,
432 			    mod_containing_pc((caddr_t)pc), sym, off);
433 		else if ((pc >= xen_virt_start) && (pc <= xpv_end))
434 			xpv_panic_printf("%08lx 0x%lx (in Xen)\n", fp, pc);
435 		else
436 			xpv_panic_printf("%08lx %lx\n", fp, pc);
437 
438 		lastfp = fp;
439 		fp = fpp->fr_savfp;
440 
441 		/*
442 		 * Xen marks an exception frame by inverting the frame
443 		 * pointer.
444 		 */
445 		if (fp < lastfp) {
446 			if ((~fp > minaddr) && ((~fp) ^ lastfp) < 0xfff)
447 				fp = ~fp;
448 		}
449 	} while (fp > lastfp);
450 	return ((void *)fp);
451 }
452 
/*
 * Print a stack trace starting at 'fpreg', limited to frames that lie
 * within Xen's address range (showstack() with xpv_only set).  Returns
 * the frame pointer at which the trace stopped.
 */
void *
xpv_traceback(void *fpreg)
{
	void *stopped_at = showstack(fpreg, 1);

	return (stopped_at);
}
458 
459 #if defined(__amd64)
460 static void
461 xpv_panic_hypercall(ulong_t call)
462 {
463 	panic("Illegally issued hypercall %d during panic!\n", (int)call);
464 }
465 #endif
466 
467 void
468 xpv_die(struct regs *rp)
469 {
470 	struct panic_trap_info ti;
471 	struct cregs creg;
472 
473 	ti.trap_regs = rp;
474 	ti.trap_type = rp->r_trapno;
475 
476 	curthread->t_panic_trap = &ti;
477 	if (ti.trap_type == T_PGFLT) {
478 		getcregs(&creg);
479 		ti.trap_addr = (caddr_t)creg.cr_cr2;
480 		panic("Fatal pagefault at 0x%lx.  fault addr=0x%p  rp=0x%p",
481 		    rp->r_pc, (void *)ti.trap_addr, (void *)rp);
482 	} else {
483 		ti.trap_addr = (caddr_t)rp->r_pc;
484 		panic("Fatal trap %ld at 0x%lx.  rp=0x%p", rp->r_trapno,
485 		    rp->r_pc, (void *)rp);
486 	}
487 }
488 
489 /*
490  * Build IDT to handle a Xen panic
491  */
492 static void
493 switch_to_xpv_panic_idt()
494 {
495 	int i;
496 	desctbr_t idtr;
497 	gate_desc_t *idt = xpv_panic_idt;
498 	selector_t cs = get_cs_register();
499 
500 	for (i = 0; i < 32; i++)
501 		set_gatesegd(&idt[i], &xpv_invaltrap, cs, SDT_SYSIGT, TRP_XPL);
502 
503 	set_gatesegd(&idt[T_ZERODIV], &xpv_div0trap, cs, SDT_SYSIGT, TRP_XPL);
504 	set_gatesegd(&idt[T_SGLSTP], &xpv_dbgtrap, cs, SDT_SYSIGT, TRP_XPL);
505 	set_gatesegd(&idt[T_NMIFLT], &xpv_nmiint, cs, SDT_SYSIGT, TRP_XPL);
506 	set_gatesegd(&idt[T_BOUNDFLT], &xpv_boundstrap, cs, SDT_SYSIGT,
507 	    TRP_XPL);
508 	set_gatesegd(&idt[T_ILLINST], &xpv_invoptrap, cs, SDT_SYSIGT, TRP_XPL);
509 	set_gatesegd(&idt[T_NOEXTFLT], &xpv_ndptrap, cs, SDT_SYSIGT, TRP_XPL);
510 	set_gatesegd(&idt[T_TSSFLT], &xpv_invtsstrap, cs, SDT_SYSIGT, TRP_XPL);
511 	set_gatesegd(&idt[T_SEGFLT], &xpv_segnptrap, cs, SDT_SYSIGT, TRP_XPL);
512 	set_gatesegd(&idt[T_STKFLT], &xpv_stktrap, cs, SDT_SYSIGT, TRP_XPL);
513 	set_gatesegd(&idt[T_GPFLT], &xpv_gptrap, cs, SDT_SYSIGT, TRP_XPL);
514 	set_gatesegd(&idt[T_PGFLT], &xpv_pftrap, cs, SDT_SYSIGT, TRP_XPL);
515 	set_gatesegd(&idt[T_EXTERRFLT], &xpv_ndperr, cs, SDT_SYSIGT, TRP_XPL);
516 	set_gatesegd(&idt[T_ALIGNMENT], &xpv_achktrap, cs, SDT_SYSIGT, TRP_XPL);
517 	set_gatesegd(&idt[T_MCE], &xpv_mcetrap, cs, SDT_SYSIGT, TRP_XPL);
518 	set_gatesegd(&idt[T_SIMDFPE], &xpv_xmtrap, cs, SDT_SYSIGT, TRP_XPL);
519 
520 	/*
521 	 * We have no double fault handler.  Any single fault represents a
522 	 * catastrophic failure for us, so there is no attempt to handle
523 	 * them cleanly: we just print a message and reboot.  If we
524 	 * encounter a second fault while doing that, there is nothing
525 	 * else we can do.
526 	 */
527 
528 	/*
529 	 * Be prepared to absorb any stray device interrupts received
530 	 * while writing the core to disk.
531 	 */
532 	for (i = 33; i < NIDT; i++)
533 		set_gatesegd(&idt[i], &xpv_surprise_intr, cs, SDT_SYSIGT,
534 		    TRP_XPL);
535 
536 	/* The one interrupt we expect to get is from the APIC timer.  */
537 	set_gatesegd(&idt[T_XPV_TIMER], &xpv_timer_trap, cs, SDT_SYSIGT,
538 	    TRP_XPL);
539 
540 	idtr.dtr_base = (uintptr_t)xpv_panic_idt;
541 	idtr.dtr_limit = sizeof (xpv_panic_idt) - 1;
542 	wr_idtr(&idtr);
543 
544 #if defined(__amd64)
545 	/* Catch any hypercalls. */
546 	wrmsr(MSR_AMD_LSTAR, (uintptr_t)xpv_panic_hypercall);
547 	wrmsr(MSR_AMD_CSTAR, (uintptr_t)xpv_panic_hypercall);
548 #endif
549 }
550 
551 static void
552 xpv_apic_clkinit()
553 {
554 	uint_t		apic_ticks = 0;
555 
556 	/*
557 	 * Measure how many APIC ticks there are within a fixed time
558 	 * period.  We're going to be fairly coarse here.  This timer is
559 	 * just being used to detect a stalled panic, so as long as we have
560 	 * the right order of magnitude, everything should be fine.
561 	 */
562 	xpv_apicadr[APIC_SPUR_INT_REG] = AV_UNIT_ENABLE | APIC_SPUR_INTR;
563 	xpv_apicadr[APIC_LOCAL_TIMER] = AV_MASK;
564 	xpv_apicadr[APIC_INT_VECT0] = AV_MASK;	/* local intr reg 0 */
565 
566 	xpv_apicadr[APIC_DIVIDE_REG] = 0;
567 	xpv_apicadr[APIC_INIT_COUNT] = APIC_MAXVAL;
568 	drv_usecwait(XPV_TIMER_INTERVAL);
569 	apic_ticks = APIC_MAXVAL - xpv_apicadr[APIC_CURR_COUNT];
570 
571 	/*
572 	 * apic_ticks now represents roughly how many apic ticks comprise
573 	 * one timeout interval.  Program the timer to send us an interrupt
574 	 * every time that interval expires.
575 	 */
576 	xpv_apicadr[APIC_LOCAL_TIMER] = T_XPV_TIMER | AV_TIME;
577 	xpv_apicadr[APIC_INIT_COUNT] = apic_ticks;
578 	xpv_apicadr[APIC_EOI_REG] = 0;
579 }
580 
581 void
582 xpv_timer_tick(void)
583 {
584 	static int ticks = 0;
585 
586 	if (ticks++ >= MICROSEC / XPV_TIMER_INTERVAL) {
587 		ticks = 0;
588 		if (dump_timeleft && (--dump_timeleft == 0))
589 			panic("Xen panic timeout\n");
590 	}
591 	xpv_apicadr[APIC_EOI_REG] = 0;
592 }
593 
594 void
595 xpv_interrupt(void)
596 {
597 #ifdef	DEBUG
598 	static int cnt = 0;
599 
600 	if (cnt++ < 10)
601 		xpv_panic_printf("Unexpected interrupt received.\n");
602 	if ((cnt < 1000) && ((cnt % 100) == 0))
603 		xpv_panic_printf("%d unexpected interrupts received.\n", cnt);
604 #endif
605 
606 	xpv_apicadr[APIC_EOI_REG] = 0;
607 }
608 
609 /*
610  * Managing time in panic context is trivial.  We only have a single CPU,
611  * we never get rescheduled, we never get suspended.  We just need to
612  * convert clock ticks into nanoseconds.
613  */
614 static hrtime_t
615 xpv_panic_gethrtime(void)
616 {
617 	hrtime_t tsc, hrt;
618 	unsigned int *l = (unsigned int *)&(tsc);
619 
620 	tsc = __rdtsc_insn();
621 	hrt = (mul32(l[1], nsec_scale) << NSEC_SHIFT) +
622 	    (mul32(l[0], nsec_scale) >> (32 - NSEC_SHIFT));
623 
624 	return (hrt);
625 }
626 
627 static void
628 xpv_panic_time_init()
629 {
630 	nsec_scale =
631 	    CPU->cpu_m.mcpu_vcpu_info->time.tsc_to_system_mul >> NSEC_SHIFT;
632 
633 	gethrtimef = xpv_panic_gethrtime;
634 }
635 
636 static void
637 xpv_panicsys(struct regs *rp, char *fmt, ...)
638 {
639 	extern void panicsys(const char *, va_list, struct regs *, int);
640 	va_list alist;
641 
642 	va_start(alist, fmt);
643 	panicsys(fmt, alist, rp, 1);
644 	va_end(alist);
645 }
646 
647 void
648 xpv_do_panic(void *arg)
649 {
650 	struct panic_info *pip = (struct panic_info *)arg;
651 	int l;
652 	struct cregs creg;
653 #if defined(__amd64)
654 	extern uintptr_t postbootkernelbase;
655 #endif
656 
657 	if (xpv_panicking++ > 0)
658 		panic("multiple calls to xpv_do_panic()");
659 
660 	/*
661 	 * Indicate to the underlying panic framework that a panic has been
662 	 * initiated.  This is ordinarily done as part of vpanic().  Since
663 	 * we already have all the register state saved by the hypervisor,
664 	 * we skip that and jump straight into the panic processing code.
665 	 */
666 	(void) panic_trigger(&panic_quiesce);
667 
668 #if defined(__amd64)
669 	/*
670 	 * bzero() and bcopy() get unhappy when asked to operate on
671 	 * addresses outside of the kernel.  At this point Xen is really a
672 	 * part of the kernel, so we update the routines' notion of where
673 	 * the kernel starts.
674 	 */
675 	postbootkernelbase = xen_virt_start;
676 #endif
677 
678 #if defined(HYPERVISOR_VIRT_END)
679 	xpv_end = HYPERVISOR_VIRT_END;
680 #else
681 	xpv_end = (uintptr_t)UINTPTR_MAX - sizeof (uintptr_t);
682 #endif
683 
684 	/*
685 	 * If we were redirecting console output to the hypervisor, we have
686 	 * to stop.
687 	 */
688 	use_polledio = B_FALSE;
689 	if (console == CONS_HYPERVISOR) {
690 		bcons_device_change(CONS_HYPERVISOR);
691 	} else if (cons_polledio != NULL &&
692 	    cons_polledio->cons_polledio_putchar != NULL)  {
693 		if (cons_polledio->cons_polledio_enter != NULL)
694 			cons_polledio->cons_polledio_enter(
695 			    cons_polledio->cons_polledio_argument);
696 		use_polledio = 1;
697 	}
698 
699 	/* Make sure we handle all console output from here on. */
700 	sysp->bsvc_putchar = xpv_panic_putc;
701 
702 	/*
703 	 * If we find an unsupported panic_info structure, there's not much
704 	 * we can do other than complain, plow on, and hope for the best.
705 	 */
706 	if (pip->pi_version != PANIC_INFO_VERSION)
707 		xpv_panic_printf("Warning: Xen is using an unsupported "
708 		    "version of the panic_info structure.\n");
709 
710 	xpv_panic_info = pip;
711 
712 #if defined(__amd64)
713 	kpm1_low = (uintptr_t)xpv_panic_info->pi_ram_start;
714 	if (xpv_panic_info->pi_xen_start == NULL) {
715 		kpm1_high = (uintptr_t)xpv_panic_info->pi_ram_end;
716 	} else {
717 		kpm1_high = (uintptr_t)xpv_panic_info->pi_xen_start;
718 		kpm2_low = (uintptr_t)xpv_panic_info->pi_xen_end;
719 		kpm2_high = (uintptr_t)xpv_panic_info->pi_ram_end;
720 	}
721 #endif
722 
723 	/*
724 	 * Make sure we are running on the Solaris %gs.  The Xen panic code
725 	 * should already have set up the GDT properly.
726 	 */
727 	xpv_panic_resetgs();
728 #if defined(__amd64)
729 	wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);
730 #endif
731 
732 	xpv_panic_time_init();
733 
734 	/*
735 	 * Switch to our own IDT, avoiding any accidental returns to Xen
736 	 * world.
737 	 */
738 	switch_to_xpv_panic_idt();
739 
740 	/*
741 	 * Initialize the APIC timer, which is used to detect a hung dump
742 	 * attempt.
743 	 */
744 	xpv_apicadr = pip->pi_apic;
745 	xpv_apic_clkinit();
746 
747 	/*
748 	 * Set up a few values that we'll need repeatedly.
749 	 */
750 	getcregs(&creg);
751 	xpv_panic_cr3 = creg.cr_cr3;
752 	for (l = mmu.max_level; l >= 0; l--)
753 		xpv_panic_nptes[l] = mmu.ptes_per_table;
754 #ifdef __i386
755 	if (mmu.pae_hat)
756 		xpv_panic_nptes[mmu.max_level] = 4;
757 #endif
758 
759 	/* Add the fake Xen module to the module list */
760 	if (xpv_module != NULL) {
761 		extern int last_module_id;
762 
763 		xpv_modctl->mod_id = last_module_id++;
764 		xpv_modctl->mod_next = &modules;
765 		xpv_modctl->mod_prev = modules.mod_prev;
766 		modules.mod_prev->mod_next = xpv_modctl;
767 		modules.mod_prev = xpv_modctl;
768 	}
769 	xpv_panic_printf = printf;
770 	xpv_panicsys((struct regs *)pip->pi_regs, pip->pi_panicstr);
771 	xpv_panic_printf("Failed to reboot following panic.\n");
772 	for (;;)
773 		;
774 }
775 
776 /*
777  * Set up the necessary data structures to pretend that the Xen hypervisor
778  * is a loadable module, allowing mdb to find the Xen symbols in a crash
779  * dump.  Since these symbols all map to VA space Solaris doesn't normally
780  * have access to, we don't link these structures into the kernel's lists
781  * until/unless we hit a Xen panic.
782  *
783  * The observant reader will note a striking amount of overlap between this
784  * code and that found in krtld.  While it would be handy if we could just
785  * ask krtld to do this work for us, it's not that simple.  Among the
786  * complications: we're not actually loading the text here (grub did it at
787  * boot), the .text section is writable, there are no relocations to do,
788  * none of the module text/data is in readable memory, etc.  Training krtld
789  * to deal with this weird module is as complicated, and more risky, than
790  * reimplementing the necessary subset of it here.
791  */
792 static void
793 init_xen_module()
794 {
795 	struct _buf *file = NULL;
796 	struct module *mp;
797 	struct modctl *mcp;
798 	int i, shn;
799 	Shdr *shp, *ctf_shp;
800 	char *names = NULL;
801 	size_t n, namesize, text_align, data_align;
802 #if defined(__amd64)
803 	const char machine = EM_AMD64;
804 #else
805 	const char machine = EM_386;
806 #endif
807 
808 	/* Allocate and init the module structure */
809 	mp = kmem_zalloc(sizeof (*mp), KM_SLEEP);
810 	mp->filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
811 	(void) strcpy(mp->filename, XPV_FILENAME);
812 
813 	/* Allocate and init the modctl structure */
814 	mcp = kmem_zalloc(sizeof (*mcp), KM_SLEEP);
815 	mcp->mod_modname = kobj_zalloc(strlen(XPV_MODNAME) + 1, KM_SLEEP);
816 	(void) strcpy(mcp->mod_modname, XPV_MODNAME);
817 	mcp->mod_filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
818 	(void) strcpy(mcp->mod_filename, XPV_FILENAME);
819 	mcp->mod_inprogress_thread = (kthread_id_t)-1;
820 	mcp->mod_ref = 1;
821 	mcp->mod_loaded = 1;
822 	mcp->mod_loadcnt = 1;
823 	mcp->mod_mp = mp;
824 
825 	/*
826 	 * Try to open a Xen image that hasn't had its symbol and CTF
827 	 * information stripped off.
828 	 */
829 	file = kobj_open_file(XPV_FILENAME);
830 	if (file == (struct _buf *)-1) {
831 		file = NULL;
832 		goto err;
833 	}
834 
835 	/*
836 	 * Read the header and ensure that this is an ELF file for the
837 	 * proper ISA.  If it's not, somebody has done something very
838 	 * stupid.  Why bother?  See Mencken.
839 	 */
840 	if (kobj_read_file(file, (char *)&mp->hdr, sizeof (mp->hdr), 0) < 0)
841 		goto err;
842 	for (i = 0; i < SELFMAG; i++)
843 		if (mp->hdr.e_ident[i] != ELFMAG[i])
844 			goto err;
845 	if ((mp->hdr.e_ident[EI_DATA] != ELFDATA2LSB) ||
846 	    (mp->hdr.e_machine != machine))
847 		goto err;
848 
849 	/* Read in the section headers */
850 	n = mp->hdr.e_shentsize * mp->hdr.e_shnum;
851 	mp->shdrs = kmem_zalloc(n, KM_SLEEP);
852 	if (kobj_read_file(file, mp->shdrs, n, mp->hdr.e_shoff) < 0)
853 		goto err;
854 
855 	/* Read the section names */
856 	shp = (Shdr *)(mp->shdrs + mp->hdr.e_shstrndx * mp->hdr.e_shentsize);
857 	namesize = shp->sh_size;
858 	names = kmem_zalloc(shp->sh_size, KM_SLEEP);
859 	if (kobj_read_file(file, names, shp->sh_size, shp->sh_offset) < 0)
860 		goto err;
861 
862 	/*
863 	 * Fill in the text and data size fields.
864 	 */
865 	ctf_shp = NULL;
866 	text_align = data_align = 0;
867 	for (shn = 1; shn < mp->hdr.e_shnum; shn++) {
868 		shp = (Shdr *)(mp->shdrs + shn * mp->hdr.e_shentsize);
869 
870 		/* Sanity check the offset of the section name */
871 		if (shp->sh_name >= namesize)
872 			continue;
873 
874 		/* If we find the symtab section, remember it for later. */
875 		if (shp->sh_type == SHT_SYMTAB) {
876 			mp->symtbl_section = shn;
877 			mp->symhdr = shp;
878 			continue;
879 		}
880 
881 		/* If we find the CTF section, remember it for later. */
882 		if ((shp->sh_size != 0) &&
883 		    (strcmp(names + shp->sh_name, ".SUNW_ctf") == 0)) {
884 			ctf_shp = shp;
885 			continue;
886 		}
887 
888 		if (!(shp->sh_flags & SHF_ALLOC))
889 			continue;
890 
891 		/*
892 		 * Xen marks its text section as writable, so we need to
893 		 * look for the name - not just the flag.
894 		 */
895 		if ((strcmp(&names[shp->sh_name], ".text") != NULL) &&
896 		    (shp->sh_flags & SHF_WRITE) != 0) {
897 			if (shp->sh_addralign > data_align)
898 				data_align = shp->sh_addralign;
899 			mp->data_size = ALIGN(mp->data_size, data_align);
900 			mp->data_size += ALIGN(shp->sh_size, 8);
901 			if (mp->data == NULL || mp->data > (char *)shp->sh_addr)
902 				mp->data = (char *)shp->sh_addr;
903 		} else {
904 			if (shp->sh_addralign > text_align)
905 				text_align = shp->sh_addralign;
906 			mp->text_size = ALIGN(mp->text_size, text_align);
907 			mp->text_size += ALIGN(shp->sh_size, 8);
908 			if (mp->text == NULL || mp->text > (char *)shp->sh_addr)
909 				mp->text = (char *)shp->sh_addr;
910 		}
911 	}
912 	kmem_free(names, namesize);
913 	names = NULL;
914 	shp = NULL;
915 	mcp->mod_text = mp->text;
916 	mcp->mod_text_size = mp->text_size;
917 
918 	/*
919 	 * If we have symbol table and string table sections, read them in
920 	 * now.  If we don't, we just plow on.  We'll still get a valid
921 	 * core dump, but finding anything useful will be just a bit
922 	 * harder.
923 	 *
924 	 * Note: we don't bother with a hash table.  We'll never do a
925 	 * symbol lookup unless we crash, and then mdb creates its own.  We
926 	 * also don't try to perform any relocations.  Xen should be loaded
927 	 * exactly where the ELF file indicates, and the symbol information
928 	 * in the file should be complete and correct already.  Static
929 	 * linking ain't all bad.
930 	 */
931 	if ((mp->symhdr != NULL) && (mp->symhdr->sh_link < mp->hdr.e_shnum)) {
932 		mp->strhdr = (Shdr *)
933 		    (mp->shdrs + mp->symhdr->sh_link * mp->hdr.e_shentsize);
934 		mp->nsyms = mp->symhdr->sh_size / mp->symhdr->sh_entsize;
935 
936 		/* Allocate space for the symbol table and strings.  */
937 		mp->symsize = mp->symhdr->sh_size +
938 		    mp->nsyms * sizeof (symid_t) + mp->strhdr->sh_size;
939 		mp->symspace = kmem_zalloc(mp->symsize, KM_SLEEP);
940 		mp->symtbl = mp->symspace;
941 		mp->strings = (char *)(mp->symtbl + mp->symhdr->sh_size);
942 
943 		if ((kobj_read_file(file, mp->symtbl,
944 		    mp->symhdr->sh_size, mp->symhdr->sh_offset) < 0) ||
945 		    (kobj_read_file(file, mp->strings,
946 		    mp->strhdr->sh_size, mp->strhdr->sh_offset) < 0))
947 			goto err;
948 	}
949 
950 	/*
951 	 * Read in the CTF section
952 	 */
953 	if ((ctf_shp != NULL) && ((moddebug & MODDEBUG_NOCTF) == 0)) {
954 		mp->ctfdata = kmem_zalloc(ctf_shp->sh_size, KM_SLEEP);
955 		mp->ctfsize = ctf_shp->sh_size;
956 		if (kobj_read_file(file, mp->ctfdata, mp->ctfsize,
957 		    ctf_shp->sh_offset) < 0)
958 			goto err;
959 	}
960 
961 	kobj_close_file(file);
962 
963 	xpv_module = mp;
964 	xpv_modctl = mcp;
965 	return;
966 
967 err:
968 	cmn_err(CE_WARN, "Failed to initialize xpv module.");
969 	if (file != NULL)
970 		kobj_close_file(file);
971 
972 	kmem_free(mp->filename, strlen(XPV_FILENAME) + 1);
973 	if (mp->shdrs != NULL)
974 		kmem_free(mp->shdrs, mp->hdr.e_shentsize * mp->hdr.e_shnum);
975 	if (mp->symspace != NULL)
976 		kmem_free(mp->symspace, mp->symsize);
977 	if (mp->ctfdata != NULL)
978 		kmem_free(mp->ctfdata, mp->ctfsize);
979 	kmem_free(mp, sizeof (*mp));
980 	kmem_free(mcp->mod_filename, strlen(XPV_FILENAME) + 1);
981 	kmem_free(mcp->mod_modname, strlen(XPV_MODNAME) + 1);
982 	kmem_free(mcp, sizeof (*mcp));
983 	if (names != NULL)
984 		kmem_free(names, namesize);
985 }
986 
987 void
988 xpv_panic_init()
989 {
990 	xen_platform_op_t op;
991 	int i;
992 
993 	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
994 
995 	for (i = 0; i < mmu.num_level; i++)
996 		ptable_pfn[i] = PFN_INVALID;
997 
998 	/* Let Xen know where to jump if/when it panics. */
999 	op.cmd = XENPF_panic_init;
1000 	op.interface_version = XENPF_INTERFACE_VERSION;
1001 	op.u.panic_init.panic_addr = (unsigned long)xpv_panic_hdlr;
1002 
1003 	(void) HYPERVISOR_platform_op(&op);
1004 
1005 	init_xen_module();
1006 }
1007