1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2012 Gary Mills
23 *
24 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
25 */
26
27 #include <sys/types.h>
28 #include <sys/clock.h>
29 #include <sys/psm.h>
30 #include <sys/archsystm.h>
31 #include <sys/machsystm.h>
32 #include <sys/compress.h>
33 #include <sys/modctl.h>
34 #include <sys/trap.h>
35 #include <sys/panic.h>
36 #include <sys/regset.h>
37 #include <sys/frame.h>
38 #include <sys/kobj.h>
39 #include <sys/apic.h>
40 #include <sys/apic_timer.h>
41 #include <sys/dumphdr.h>
42 #include <sys/mem.h>
43 #include <sys/x86_archext.h>
44 #include <sys/xpv_panic.h>
45 #include <sys/boot_console.h>
46 #include <sys/bootsvcs.h>
47 #include <sys/consdev.h>
48 #include <vm/hat_pte.h>
49 #include <vm/hat_i86.h>
50
/*
 * Path of the Xen image that init_xen_module() opens to read symbol and
 * CTF information.
 */
/* XXX: need to add a PAE version too, if we ever support both PAE and non */
#if defined(__i386)
#define XPV_FILENAME "/boot/xen-syms"
#else
#define XPV_FILENAME "/boot/amd64/xen-syms"
#endif
/* Module name given to the fake Xen module built by init_xen_module(). */
#define XPV_MODNAME "xpv"

/* Incremented on entry to xpv_do_panic(); nonzero once a Xen panic began. */
int xpv_panicking = 0;

/*
 * Fake module and modctl describing Xen.  Set by init_xen_module() on
 * success and linked into the module list by xpv_do_panic().
 */
struct module *xpv_module;
struct modctl *xpv_modctl;

/*
 * Round x up to a multiple of a.  An alignment of 0 returns x unchanged.
 * The mask arithmetic only yields a multiple of a when a is a power of two.
 */
#define ALIGN(x, a) ((a) == 0 ? (uintptr_t)(x) : \
    (((uintptr_t)(x) + (uintptr_t)(a) - 1l) & ~((uintptr_t)(a) - 1l)))

/* Pointer to the xpv_panic_info structure handed to us by Xen. */
static struct panic_info *xpv_panic_info = NULL;

/* Timer support */
/* Shift used when deriving nsec_scale and when scaling TSC values. */
#define NSEC_SHIFT 5
/* IDT vector programmed into the local APIC timer entry. */
#define T_XPV_TIMER 0xd1
#define XPV_TIMER_INTERVAL 1000 /* 1000 microseconds */
/* Local APIC register window; taken from pi_apic in xpv_do_panic(). */
static uint32_t *xpv_apicadr = NULL;
/* tsc_to_system_mul >> NSEC_SHIFT; set by xpv_panic_time_init(). */
static uint_t nsec_scale;

/* IDT support */
#pragma align 16(xpv_panic_idt)
static gate_desc_t xpv_panic_idt[NIDT]; /* interrupt descriptor table */

/* Xen pagetables mapped into our HAT's ptable windows */
static pfn_t ptable_pfn[MAX_NUM_LEVEL];

/* Number of MMU_PAGESIZE pages we're adding to the Solaris dump */
static int xpv_dump_pages;

/*
 * There are up to two large swathes of RAM that we don't want to include
 * in the dump: those that comprise the Xen version of segkpm. On 32-bit
 * systems there is no such region of memory. On 64-bit systems, there
 * should be just a single contiguous region that corresponds to all of
 * physical memory. The tricky bit is that Xen's heap sometimes lives in
 * the middle of their segkpm, and is mapped using only kpm-like addresses.
 * In that case, we need to skip the swathes before and after Xen's heap.
 */
uintptr_t kpm1_low = 0;
uintptr_t kpm1_high = 0;
uintptr_t kpm2_low = 0;
uintptr_t kpm2_high = 0;

/*
 * Some commonly used values that we don't want to recompute over and over.
 * All three are filled in by xpv_do_panic().
 */
static int xpv_panic_nptes[MAX_NUM_LEVEL];
static ulong_t xpv_panic_cr3;
static uintptr_t xpv_end;

/*
 * Output routine used by this file.  It starts out as the private console
 * printer and is switched to printf by xpv_do_panic() just before it
 * enters panicsys().
 */
static void xpv_panic_console_print(const char *fmt, ...);
static void (*xpv_panic_printf)(const char *, ...) = xpv_panic_console_print;

/* Scratch buffer in which xpv_panic_console_print() formats messages. */
#define CONSOLE_BUF_SIZE 256
static char console_buffer[CONSOLE_BUF_SIZE];
/* Set when xpv_panic_putc() should use the cons_polledio putchar routine. */
static boolean_t use_polledio;

/*
 * Pointers to machine check panic info (if any).
 */
xpv_mca_panic_data_t *xpv_mca_panic_data = NULL;
119
120 static void
xpv_panic_putc(int m)121 xpv_panic_putc(int m)
122 {
123 struct cons_polledio *c = cons_polledio;
124
125 /* This really shouldn't happen */
126 if (boot_console_type(NULL) == CONS_HYPERVISOR)
127 return;
128
129 if (use_polledio == B_TRUE)
130 c->cons_polledio_putchar(c->cons_polledio_argument, m);
131 else
132 bcons_putchar(m);
133 }
134
135 static void
xpv_panic_puts(char * msg)136 xpv_panic_puts(char *msg)
137 {
138 char *m;
139
140 dump_timeleft = dump_timeout;
141 for (m = msg; *m; m++)
142 xpv_panic_putc((int)*m);
143 }
144
145 static void
xpv_panic_console_print(const char * fmt,...)146 xpv_panic_console_print(const char *fmt, ...)
147 {
148 va_list ap;
149
150 va_start(ap, fmt);
151 (void) vsnprintf(console_buffer, sizeof (console_buffer), fmt, ap);
152 va_end(ap);
153
154 xpv_panic_puts(console_buffer);
155 }
156
157 static void
xpv_panic_map(int level,pfn_t pfn)158 xpv_panic_map(int level, pfn_t pfn)
159 {
160 x86pte_t pte, *pteptr;
161
162 /*
163 * The provided pfn represents a level 'level' page table. Map it
164 * into the 'level' slot in the list of page table windows.
165 */
166 pteptr = (x86pte_t *)PWIN_PTE_VA(level);
167 pte = pfn_to_pa(pfn) | PT_VALID;
168
169 XPV_ALLOW_PAGETABLE_UPDATES();
170 if (mmu.pae_hat)
171 *pteptr = pte;
172 else
173 *(x86pte32_t *)pteptr = pte;
174 XPV_DISALLOW_PAGETABLE_UPDATES();
175
176 mmu_tlbflush_entry(PWIN_VA(level));
177 }
178
/*
 * Walk the page tables to find the pfn mapped by the given va.
 *
 * Starting at *vaddr, scan upward through the page tables rooted at
 * xpv_panic_cr3 looking for the next valid page mapping below xpv_end.
 * The local APIC page and the two kpm-like regions (kpm1_*, kpm2_*) are
 * skipped.
 *
 * On success *vaddr is updated to the VA of the page that was found and
 * that page's MFN, tagged with PFN_IS_FOREIGN_MFN, is returned.
 * PFN_INVALID is returned once the scan reaches xpv_end or the VA wraps
 * around below the starting address.
 *
 * Page tables are viewed through the HAT's page table windows.
 * ptable_pfn[] remembers which table is mapped at each level so that a
 * sequential scan (va == lastva + MMU_PAGESIZE) can reuse the mappings
 * left behind by the previous call.
 */
static pfn_t
xpv_va_walk(uintptr_t *vaddr)
{
	int l, idx;
	pfn_t pfn;
	x86pte_t pte;
	x86pte_t *ptep;
	uintptr_t va = *vaddr;
	uintptr_t scan_va;
	caddr_t ptable_window;
	static pfn_t toplevel_pfn;
	static uintptr_t lastva;	/* VA returned by the previous call */

	/*
	 * If we do anything other than a simple scan through memory, don't
	 * trust the mapped page tables.
	 */
	if (va != lastva + MMU_PAGESIZE)
		for (l = mmu.max_level; l >= 0; l--)
			ptable_pfn[l] = PFN_INVALID;

	toplevel_pfn = mmu_btop(xpv_panic_cr3);

	/* The second test catches va wrapping around past the top of VA. */
	while (va < xpv_end && va >= *vaddr) {
		/* Find the lowest table with any entry for va */
		pfn = toplevel_pfn;
		for (l = mmu.max_level; l >= 0; l--) {
			/* Only remap the window if it shows a different table. */
			if (ptable_pfn[l] != pfn) {
				xpv_panic_map(l, pfn);
				ptable_pfn[l] = pfn;
			}

			/*
			 * Search this pagetable for any mapping to an
			 * address >= va.
			 */
			ptable_window = PWIN_VA(l);
			/*
			 * For a PAE HAT the top-level table is located at
			 * cr3's offset within the mapped page.
			 */
			if (l == mmu.max_level && mmu.pae_hat)
				ptable_window +=
				    (xpv_panic_cr3 & MMU_PAGEOFFSET);

			idx = (va >> LEVEL_SHIFT(l)) & (xpv_panic_nptes[l] - 1);
			scan_va = va;
			while (idx < xpv_panic_nptes[l] && scan_va < xpv_end &&
			    scan_va >= *vaddr) {
				ptep = (x86pte_t *)(ptable_window +
				    (idx << mmu.pte_size_shift));
				pte = GET_PTE(ptep);
				if (pte & PTE_VALID)
					break;
				idx++;
				scan_va += mmu.level_size[l];
			}

			/*
			 * If there are no valid mappings in this table, we
			 * can skip to the end of the VA range it covers.
			 */
			if (idx == xpv_panic_nptes[l]) {
				va = NEXT_ENTRY_VA(va, l + 1);
				break;
			}

			va = scan_va;
			/*
			 * See if we've hit the end of the range.
			 */
			if (va >= xpv_end || va < *vaddr)
				break;

			/*
			 * If this mapping is for a pagetable, we drop down
			 * to the next level in the hierarchy and look for
			 * a mapping in it.
			 */
			pfn = PTE2MFN(pte, l);
			if (!PTE_ISPAGE(pte, l))
				continue;

			/*
			 * The APIC page is magic. Nothing to see here;
			 * move along.
			 */
			if (((uintptr_t)xpv_apicadr & MMU_PAGEMASK) ==
			    (va & MMU_PAGEMASK)) {
				va += MMU_PAGESIZE;
				break;
			}

			/*
			 * See if the address is within one of the two
			 * kpm-like regions we want to skip.
			 */
			if (va >= kpm1_low && va < kpm1_high) {
				va = kpm1_high;
				break;
			}
			if (va >= kpm2_low && va < kpm2_high) {
				va = kpm2_high;
				break;
			}

			/*
			 * The Xen panic code only handles small pages. If
			 * this mapping is for a large page, we need to
			 * identify the constituent page that covers the
			 * specific VA we were looking for.
			 */
			if (l > 0) {
				if (l > 1)
					panic("Xen panic can't cope with "
					    "giant pages.");
				idx = (va >> LEVEL_SHIFT(0)) &
				    (xpv_panic_nptes[0] - 1);
				pfn += idx;
			}

			/* Found a page: report its VA and remember it. */
			*vaddr = va;
			lastva = va;
			return (pfn | PFN_IS_FOREIGN_MFN);
		}
	}
	return (PFN_INVALID);
}
306
307 /*
308 * Walk through the Xen VA space, finding pages that are mapped in.
309 *
310 * These pages all have MFNs rather than PFNs, meaning they may be outside
311 * the physical address space the kernel knows about, or they may collide
312 * with PFNs the kernel is using.
313 *
314 * The obvious trick of just adding the PFN_IS_FOREIGN_MFN bit to the MFNs
315 * to avoid collisions doesn't work. The pages need to be written to disk
316 * in PFN-order or savecore gets confused. We can't allocate memory to
317 * contruct a sorted pfn->VA reverse mapping, so we have to write the pages
318 * to disk in VA order.
319 *
320 * To square this circle, we simply make up PFNs for each of Xen's pages.
321 * We assign each mapped page a fake PFN in ascending order. These fake
322 * PFNs each have the FOREIGN bit set, ensuring that they fall outside the
323 * range of Solaris PFNs written by the kernel.
324 */
325 int
dump_xpv_addr()326 dump_xpv_addr()
327 {
328 uintptr_t va;
329 mem_vtop_t mem_vtop;
330
331 xpv_dump_pages = 0;
332 va = xen_virt_start;
333
334 while (xpv_va_walk(&va) != PFN_INVALID) {
335 mem_vtop.m_as = &kas;
336 mem_vtop.m_va = (void *)va;
337 mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
338
339 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
340 xpv_dump_pages++;
341
342 va += MMU_PAGESIZE;
343 }
344
345 /*
346 * Add the shared_info page. This page actually ends up in the
347 * dump twice: once for the Xen va and once for the Solaris va.
348 * This isn't ideal, but we don't know the address Xen is using for
349 * the page, so we can't share it.
350 */
351 mem_vtop.m_as = &kas;
352 mem_vtop.m_va = HYPERVISOR_shared_info;
353 mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
354 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
355 xpv_dump_pages++;
356
357 return (xpv_dump_pages);
358 }
359
360 void
dump_xpv_pfn()361 dump_xpv_pfn()
362 {
363 pfn_t pfn;
364 int cnt;
365
366 for (cnt = 0; cnt < xpv_dump_pages; cnt++) {
367 pfn = (pfn_t)cnt | PFN_IS_FOREIGN_MFN;
368 dumpvp_write(&pfn, sizeof (pfn));
369 }
370 }
371
372 int
dump_xpv_data(void * dump_cbuf)373 dump_xpv_data(void *dump_cbuf)
374 {
375 uintptr_t va;
376 uint32_t csize;
377 int cnt = 0;
378
379 /*
380 * XXX: we should probably run this data through a UE check. The
381 * catch is that the UE code relies on on_trap() and getpfnum()
382 * working.
383 */
384 va = xen_virt_start;
385
386 while (xpv_va_walk(&va) != PFN_INVALID) {
387 csize = (uint32_t)compress((void *)va, dump_cbuf, PAGESIZE);
388 dumpvp_write(&csize, sizeof (uint32_t));
389 dumpvp_write(dump_cbuf, csize);
390 if (dump_ioerr) {
391 dumphdr->dump_flags &= ~DF_COMPLETE;
392 return (cnt);
393 }
394 cnt++;
395 va += MMU_PAGESIZE;
396 }
397
398 /*
399 * Finally, dump the shared_info page
400 */
401 csize = (uint32_t)compress((void *)HYPERVISOR_shared_info, dump_cbuf,
402 PAGESIZE);
403 dumpvp_write(&csize, sizeof (uint32_t));
404 dumpvp_write(dump_cbuf, csize);
405 if (dump_ioerr)
406 dumphdr->dump_flags &= ~DF_COMPLETE;
407 cnt++;
408
409 return (cnt);
410 }
411
412 static void *
showstack(void * fpreg,int xpv_only)413 showstack(void *fpreg, int xpv_only)
414 {
415 struct frame *fpp;
416 ulong_t off;
417 char *sym;
418 uintptr_t pc, fp, lastfp;
419 uintptr_t minaddr = min(KERNELBASE, xen_virt_start);
420
421 fp = (uintptr_t)fpreg;
422 if (fp < minaddr) {
423 xpv_panic_printf("Bad frame ptr: 0x%p\n", fpreg);
424 return (fpreg);
425 }
426
427 do {
428 fpp = (struct frame *)fp;
429 pc = fpp->fr_savpc;
430
431 if ((xpv_only != 0) &&
432 (fp > xpv_end || fp < xen_virt_start))
433 break;
434 if ((sym = kobj_getsymname(pc, &off)) != NULL)
435 xpv_panic_printf("%08lx %s:%s+%lx\n", fp,
436 mod_containing_pc((caddr_t)pc), sym, off);
437 else if ((pc >= xen_virt_start) && (pc <= xpv_end))
438 xpv_panic_printf("%08lx 0x%lx (in Xen)\n", fp, pc);
439 else
440 xpv_panic_printf("%08lx %lx\n", fp, pc);
441
442 lastfp = fp;
443 fp = fpp->fr_savfp;
444
445 /*
446 * Xen marks an exception frame by inverting the frame
447 * pointer.
448 */
449 if (fp < lastfp) {
450 if ((~fp > minaddr) && ((~fp) ^ lastfp) < 0xfff)
451 fp = ~fp;
452 }
453 } while (fp > lastfp);
454 return ((void *)fp);
455 }
456
/*
 * Print the Xen portion of the stack that starts at fpreg.  Returns the
 * frame pointer at which showstack() stopped walking.
 */
void *
xpv_traceback(void *fpreg)
{
	void *stopped_at;

	stopped_at = showstack(fpreg, 1);
	return (stopped_at);
}
462
463 #if defined(__amd64)
464 static void
xpv_panic_hypercall(ulong_t call)465 xpv_panic_hypercall(ulong_t call)
466 {
467 panic("Illegally issued hypercall %d during panic!\n", (int)call);
468 }
469 #endif
470
471 void
xpv_die(struct regs * rp)472 xpv_die(struct regs *rp)
473 {
474 struct panic_trap_info ti;
475 struct cregs creg;
476
477 ti.trap_regs = rp;
478 ti.trap_type = rp->r_trapno;
479
480 curthread->t_panic_trap = &ti;
481 if (ti.trap_type == T_PGFLT) {
482 getcregs(&creg);
483 ti.trap_addr = (caddr_t)creg.cr_cr2;
484 panic("Fatal pagefault at 0x%lx. fault addr=0x%p rp=0x%p",
485 rp->r_pc, (void *)ti.trap_addr, (void *)rp);
486 } else {
487 ti.trap_addr = (caddr_t)rp->r_pc;
488 panic("Fatal trap %ld at 0x%lx. rp=0x%p", rp->r_trapno,
489 rp->r_pc, (void *)rp);
490 }
491 }
492
493 /*
494 * Build IDT to handle a Xen panic
495 */
496 static void
switch_to_xpv_panic_idt()497 switch_to_xpv_panic_idt()
498 {
499 int i;
500 desctbr_t idtr;
501 gate_desc_t *idt = xpv_panic_idt;
502 selector_t cs = get_cs_register();
503
504 for (i = 0; i < 32; i++)
505 set_gatesegd(&idt[i], &xpv_invaltrap, cs, SDT_SYSIGT, TRP_XPL,
506 0);
507
508 set_gatesegd(&idt[T_ZERODIV], &xpv_div0trap, cs, SDT_SYSIGT, TRP_XPL,
509 0);
510 set_gatesegd(&idt[T_SGLSTP], &xpv_dbgtrap, cs, SDT_SYSIGT, TRP_XPL, 0);
511 set_gatesegd(&idt[T_NMIFLT], &xpv_nmiint, cs, SDT_SYSIGT, TRP_XPL, 0);
512 set_gatesegd(&idt[T_BOUNDFLT], &xpv_boundstrap, cs, SDT_SYSIGT,
513 TRP_XPL, 0);
514 set_gatesegd(&idt[T_ILLINST], &xpv_invoptrap, cs, SDT_SYSIGT, TRP_XPL,
515 0);
516 set_gatesegd(&idt[T_NOEXTFLT], &xpv_ndptrap, cs, SDT_SYSIGT, TRP_XPL,
517 0);
518 set_gatesegd(&idt[T_TSSFLT], &xpv_invtsstrap, cs, SDT_SYSIGT, TRP_XPL,
519 0);
520 set_gatesegd(&idt[T_SEGFLT], &xpv_segnptrap, cs, SDT_SYSIGT, TRP_XPL,
521 0);
522 set_gatesegd(&idt[T_STKFLT], &xpv_stktrap, cs, SDT_SYSIGT, TRP_XPL, 0);
523 set_gatesegd(&idt[T_GPFLT], &xpv_gptrap, cs, SDT_SYSIGT, TRP_XPL, 0);
524 set_gatesegd(&idt[T_PGFLT], &xpv_pftrap, cs, SDT_SYSIGT, TRP_XPL, 0);
525 set_gatesegd(&idt[T_EXTERRFLT], &xpv_ndperr, cs, SDT_SYSIGT, TRP_XPL,
526 0);
527 set_gatesegd(&idt[T_ALIGNMENT], &xpv_achktrap, cs, SDT_SYSIGT, TRP_XPL,
528 0);
529 set_gatesegd(&idt[T_MCE], &xpv_mcetrap, cs, SDT_SYSIGT, TRP_XPL, 0);
530 set_gatesegd(&idt[T_SIMDFPE], &xpv_xmtrap, cs, SDT_SYSIGT, TRP_XPL, 0);
531
532 /*
533 * We have no double fault handler. Any single fault represents a
534 * catastrophic failure for us, so there is no attempt to handle
535 * them cleanly: we just print a message and reboot. If we
536 * encounter a second fault while doing that, there is nothing
537 * else we can do.
538 */
539
540 /*
541 * Be prepared to absorb any stray device interrupts received
542 * while writing the core to disk.
543 */
544 for (i = 33; i < NIDT; i++)
545 set_gatesegd(&idt[i], &xpv_surprise_intr, cs, SDT_SYSIGT,
546 TRP_XPL, 0);
547
548 /* The one interrupt we expect to get is from the APIC timer. */
549 set_gatesegd(&idt[T_XPV_TIMER], &xpv_timer_trap, cs, SDT_SYSIGT,
550 TRP_XPL, 0);
551
552 idtr.dtr_base = (uintptr_t)xpv_panic_idt;
553 idtr.dtr_limit = sizeof (xpv_panic_idt) - 1;
554 wr_idtr(&idtr);
555
556 #if defined(__amd64)
557 /* Catch any hypercalls. */
558 wrmsr(MSR_AMD_LSTAR, (uintptr_t)xpv_panic_hypercall);
559 wrmsr(MSR_AMD_CSTAR, (uintptr_t)xpv_panic_hypercall);
560 #endif
561 }
562
563 static void
xpv_apic_clkinit()564 xpv_apic_clkinit()
565 {
566 uint_t apic_ticks = 0;
567
568 /*
569 * Measure how many APIC ticks there are within a fixed time
570 * period. We're going to be fairly coarse here. This timer is
571 * just being used to detect a stalled panic, so as long as we have
572 * the right order of magnitude, everything should be fine.
573 */
574 xpv_apicadr[APIC_SPUR_INT_REG] = AV_UNIT_ENABLE | APIC_SPUR_INTR;
575 xpv_apicadr[APIC_LOCAL_TIMER] = AV_MASK;
576 xpv_apicadr[APIC_INT_VECT0] = AV_MASK; /* local intr reg 0 */
577
578 xpv_apicadr[APIC_DIVIDE_REG] = 0;
579 xpv_apicadr[APIC_INIT_COUNT] = APIC_MAXVAL;
580 drv_usecwait(XPV_TIMER_INTERVAL);
581 apic_ticks = APIC_MAXVAL - xpv_apicadr[APIC_CURR_COUNT];
582
583 /*
584 * apic_ticks now represents roughly how many apic ticks comprise
585 * one timeout interval. Program the timer to send us an interrupt
586 * every time that interval expires.
587 */
588 xpv_apicadr[APIC_LOCAL_TIMER] = T_XPV_TIMER | AV_PERIODIC;
589 xpv_apicadr[APIC_INIT_COUNT] = apic_ticks;
590 xpv_apicadr[APIC_EOI_REG] = 0;
591 }
592
593 void
xpv_timer_tick(void)594 xpv_timer_tick(void)
595 {
596 static int ticks = 0;
597
598 if (ticks++ >= MICROSEC / XPV_TIMER_INTERVAL) {
599 ticks = 0;
600 if (dump_timeleft && (--dump_timeleft == 0))
601 panic("Xen panic timeout\n");
602 }
603 xpv_apicadr[APIC_EOI_REG] = 0;
604 }
605
606 void
xpv_interrupt(void)607 xpv_interrupt(void)
608 {
609 #ifdef DEBUG
610 static int cnt = 0;
611
612 if (cnt++ < 10)
613 xpv_panic_printf("Unexpected interrupt received.\n");
614 if ((cnt < 1000) && ((cnt % 100) == 0))
615 xpv_panic_printf("%d unexpected interrupts received.\n", cnt);
616 #endif
617
618 xpv_apicadr[APIC_EOI_REG] = 0;
619 }
620
621 /*
622 * Managing time in panic context is trivial. We only have a single CPU,
623 * we never get rescheduled, we never get suspended. We just need to
624 * convert clock ticks into nanoseconds.
625 */
626 static hrtime_t
xpv_panic_gethrtime(void)627 xpv_panic_gethrtime(void)
628 {
629 hrtime_t tsc, hrt;
630 unsigned int *l = (unsigned int *)&(tsc);
631
632 tsc = __rdtsc_insn();
633 hrt = (mul32(l[1], nsec_scale) << NSEC_SHIFT) +
634 (mul32(l[0], nsec_scale) >> (32 - NSEC_SHIFT));
635
636 return (hrt);
637 }
638
639 static void
xpv_panic_time_init()640 xpv_panic_time_init()
641 {
642 nsec_scale =
643 CPU->cpu_m.mcpu_vcpu_info->time.tsc_to_system_mul >> NSEC_SHIFT;
644
645 gethrtimef = xpv_panic_gethrtime;
646 }
647
/*
 * Variadic front end to panicsys(): bundles the arguments following fmt
 * into a va_list and passes them on together with the register state rp.
 */
static void
xpv_panicsys(struct regs *rp, char *fmt, ...)
{
	extern void panicsys(const char *, va_list, struct regs *, int);
	va_list args;

	va_start(args, fmt);
	panicsys(fmt, args, rp, 1);
	va_end(args);
}
658
659 void
xpv_do_panic(void * arg)660 xpv_do_panic(void *arg)
661 {
662 struct panic_info *pip = (struct panic_info *)arg;
663 int l;
664 struct cregs creg;
665 #if defined(__amd64)
666 extern uintptr_t postbootkernelbase;
667 #endif
668
669 if (xpv_panicking++ > 0)
670 panic("multiple calls to xpv_do_panic()");
671
672 /*
673 * Indicate to the underlying panic framework that a panic has been
674 * initiated. This is ordinarily done as part of vpanic(). Since
675 * we already have all the register state saved by the hypervisor,
676 * we skip that and jump straight into the panic processing code.
677 *
678 * XXX If another thread grabs and wins the panic_quiesce trigger
679 * then we'll have two threads in panicsys believing they are in
680 * charge of the panic attempt!
681 */
682 (void) panic_trigger(&panic_quiesce);
683
684 #if defined(__amd64)
685 /*
686 * bzero() and bcopy() get unhappy when asked to operate on
687 * addresses outside of the kernel. At this point Xen is really a
688 * part of the kernel, so we update the routines' notion of where
689 * the kernel starts.
690 */
691 postbootkernelbase = xen_virt_start;
692 #endif
693
694 #if defined(HYPERVISOR_VIRT_END)
695 xpv_end = HYPERVISOR_VIRT_END;
696 #else
697 xpv_end = (uintptr_t)UINTPTR_MAX - sizeof (uintptr_t);
698 #endif
699
700 /*
701 * If we were redirecting console output to the hypervisor, we have
702 * to stop.
703 */
704 use_polledio = B_FALSE;
705 if (boot_console_type(NULL) == CONS_HYPERVISOR) {
706 bcons_device_change(CONS_HYPERVISOR);
707 } else if (cons_polledio != NULL &&
708 cons_polledio->cons_polledio_putchar != NULL) {
709 if (cons_polledio->cons_polledio_enter != NULL)
710 cons_polledio->cons_polledio_enter(
711 cons_polledio->cons_polledio_argument);
712 use_polledio = 1;
713 }
714
715 /* Make sure we handle all console output from here on. */
716 sysp->bsvc_putchar = xpv_panic_putc;
717
718 /*
719 * If we find an unsupported panic_info structure, there's not much
720 * we can do other than complain, plow on, and hope for the best.
721 */
722 if (pip->pi_version != PANIC_INFO_VERSION)
723 xpv_panic_printf("Warning: Xen is using an unsupported "
724 "version of the panic_info structure.\n");
725
726 xpv_panic_info = pip;
727
728 #if defined(__amd64)
729 kpm1_low = (uintptr_t)xpv_panic_info->pi_ram_start;
730 if (xpv_panic_info->pi_xen_start == NULL) {
731 kpm1_high = (uintptr_t)xpv_panic_info->pi_ram_end;
732 } else {
733 kpm1_high = (uintptr_t)xpv_panic_info->pi_xen_start;
734 kpm2_low = (uintptr_t)xpv_panic_info->pi_xen_end;
735 kpm2_high = (uintptr_t)xpv_panic_info->pi_ram_end;
736 }
737 #endif
738
739 /*
740 * Make sure we are running on the Solaris %gs. The Xen panic code
741 * should already have set up the GDT properly.
742 */
743 xpv_panic_resetgs();
744 #if defined(__amd64)
745 wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);
746 #endif
747
748 xpv_panic_time_init();
749
750 /*
751 * Switch to our own IDT, avoiding any accidental returns to Xen
752 * world.
753 */
754 switch_to_xpv_panic_idt();
755
756 /*
757 * Initialize the APIC timer, which is used to detect a hung dump
758 * attempt.
759 */
760 xpv_apicadr = pip->pi_apic;
761 xpv_apic_clkinit();
762
763 /*
764 * Set up a few values that we'll need repeatedly.
765 */
766 getcregs(&creg);
767 xpv_panic_cr3 = creg.cr_cr3;
768 for (l = mmu.max_level; l >= 0; l--)
769 xpv_panic_nptes[l] = mmu.ptes_per_table;
770 #ifdef __i386
771 if (mmu.pae_hat)
772 xpv_panic_nptes[mmu.max_level] = 4;
773 #endif
774
775 /* Add the fake Xen module to the module list */
776 if (xpv_module != NULL) {
777 extern int last_module_id;
778
779 xpv_modctl->mod_id = last_module_id++;
780 xpv_modctl->mod_next = &modules;
781 xpv_modctl->mod_prev = modules.mod_prev;
782 modules.mod_prev->mod_next = xpv_modctl;
783 modules.mod_prev = xpv_modctl;
784 }
785
786 if (pip->pi_mca.mpd_magic == MCA_PANICDATA_MAGIC)
787 xpv_mca_panic_data = &pip->pi_mca;
788
789 xpv_panic_printf = printf;
790 xpv_panicsys((struct regs *)pip->pi_regs, pip->pi_panicstr);
791 xpv_panic_printf("Failed to reboot following panic.\n");
792 for (;;)
793 ;
794 }
795
796 /*
797 * Set up the necessary data structures to pretend that the Xen hypervisor
798 * is a loadable module, allowing mdb to find the Xen symbols in a crash
799 * dump. Since these symbols all map to VA space Solaris doesn't normally
800 * have access to, we don't link these structures into the kernel's lists
801 * until/unless we hit a Xen panic.
802 *
803 * The observant reader will note a striking amount of overlap between this
804 * code and that found in krtld. While it would be handy if we could just
805 * ask krtld to do this work for us, it's not that simple. Among the
806 * complications: we're not actually loading the text here (grub did it at
807 * boot), the .text section is writable, there are no relocations to do,
808 * none of the module text/data is in readable memory, etc. Training krtld
809 * to deal with this weird module is as complicated, and more risky, than
810 * reimplementing the necessary subset of it here.
811 */
812 static void
init_xen_module()813 init_xen_module()
814 {
815 struct _buf *file = NULL;
816 struct module *mp;
817 struct modctl *mcp;
818 int i, shn;
819 Shdr *shp, *ctf_shp;
820 char *names = NULL;
821 size_t n, namesize, text_align, data_align;
822 #if defined(__amd64)
823 const char machine = EM_AMD64;
824 #else
825 const char machine = EM_386;
826 #endif
827
828 /* Allocate and init the module structure */
829 mp = kmem_zalloc(sizeof (*mp), KM_SLEEP);
830 mp->filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
831 (void) strcpy(mp->filename, XPV_FILENAME);
832
833 /* Allocate and init the modctl structure */
834 mcp = kmem_zalloc(sizeof (*mcp), KM_SLEEP);
835 mcp->mod_modname = kobj_zalloc(strlen(XPV_MODNAME) + 1, KM_SLEEP);
836 (void) strcpy(mcp->mod_modname, XPV_MODNAME);
837 mcp->mod_filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
838 (void) strcpy(mcp->mod_filename, XPV_FILENAME);
839 mcp->mod_inprogress_thread = (kthread_id_t)-1;
840 mcp->mod_ref = 1;
841 mcp->mod_loaded = 1;
842 mcp->mod_loadcnt = 1;
843 mcp->mod_mp = mp;
844
845 /*
846 * Try to open a Xen image that hasn't had its symbol and CTF
847 * information stripped off.
848 */
849 file = kobj_open_file(XPV_FILENAME);
850 if (file == (struct _buf *)-1) {
851 file = NULL;
852 goto err;
853 }
854
855 /*
856 * Read the header and ensure that this is an ELF file for the
857 * proper ISA. If it's not, somebody has done something very
858 * stupid. Why bother? See Mencken.
859 */
860 if (kobj_read_file(file, (char *)&mp->hdr, sizeof (mp->hdr), 0) < 0)
861 goto err;
862 for (i = 0; i < SELFMAG; i++)
863 if (mp->hdr.e_ident[i] != ELFMAG[i])
864 goto err;
865 if ((mp->hdr.e_ident[EI_DATA] != ELFDATA2LSB) ||
866 (mp->hdr.e_machine != machine))
867 goto err;
868
869 /* Read in the section headers */
870 n = mp->hdr.e_shentsize * mp->hdr.e_shnum;
871 mp->shdrs = kmem_zalloc(n, KM_SLEEP);
872 if (kobj_read_file(file, mp->shdrs, n, mp->hdr.e_shoff) < 0)
873 goto err;
874
875 /* Read the section names */
876 shp = (Shdr *)(mp->shdrs + mp->hdr.e_shstrndx * mp->hdr.e_shentsize);
877 namesize = shp->sh_size;
878 names = kmem_zalloc(shp->sh_size, KM_SLEEP);
879 if (kobj_read_file(file, names, shp->sh_size, shp->sh_offset) < 0)
880 goto err;
881
882 /*
883 * Fill in the text and data size fields.
884 */
885 ctf_shp = NULL;
886 text_align = data_align = 0;
887 for (shn = 1; shn < mp->hdr.e_shnum; shn++) {
888 shp = (Shdr *)(mp->shdrs + shn * mp->hdr.e_shentsize);
889
890 /* Sanity check the offset of the section name */
891 if (shp->sh_name >= namesize)
892 continue;
893
894 /* If we find the symtab section, remember it for later. */
895 if (shp->sh_type == SHT_SYMTAB) {
896 mp->symtbl_section = shn;
897 mp->symhdr = shp;
898 continue;
899 }
900
901 /* If we find the CTF section, remember it for later. */
902 if ((shp->sh_size != 0) &&
903 (strcmp(names + shp->sh_name, ".SUNW_ctf") == 0)) {
904 ctf_shp = shp;
905 continue;
906 }
907
908 if (!(shp->sh_flags & SHF_ALLOC))
909 continue;
910
911 /*
912 * Xen marks its text section as writable, so we need to
913 * look for the name - not just the flag.
914 */
915 if ((strcmp(&names[shp->sh_name], ".text") != NULL) &&
916 (shp->sh_flags & SHF_WRITE) != 0) {
917 if (shp->sh_addralign > data_align)
918 data_align = shp->sh_addralign;
919 mp->data_size = ALIGN(mp->data_size, data_align);
920 mp->data_size += ALIGN(shp->sh_size, 8);
921 if (mp->data == NULL || mp->data > (char *)shp->sh_addr)
922 mp->data = (char *)shp->sh_addr;
923 } else {
924 if (shp->sh_addralign > text_align)
925 text_align = shp->sh_addralign;
926 mp->text_size = ALIGN(mp->text_size, text_align);
927 mp->text_size += ALIGN(shp->sh_size, 8);
928 if (mp->text == NULL || mp->text > (char *)shp->sh_addr)
929 mp->text = (char *)shp->sh_addr;
930 }
931 }
932 kmem_free(names, namesize);
933 names = NULL;
934 shp = NULL;
935 mcp->mod_text = mp->text;
936 mcp->mod_text_size = mp->text_size;
937
938 /*
939 * If we have symbol table and string table sections, read them in
940 * now. If we don't, we just plow on. We'll still get a valid
941 * core dump, but finding anything useful will be just a bit
942 * harder.
943 *
944 * Note: we don't bother with a hash table. We'll never do a
945 * symbol lookup unless we crash, and then mdb creates its own. We
946 * also don't try to perform any relocations. Xen should be loaded
947 * exactly where the ELF file indicates, and the symbol information
948 * in the file should be complete and correct already. Static
949 * linking ain't all bad.
950 */
951 if ((mp->symhdr != NULL) && (mp->symhdr->sh_link < mp->hdr.e_shnum)) {
952 mp->strhdr = (Shdr *)
953 (mp->shdrs + mp->symhdr->sh_link * mp->hdr.e_shentsize);
954 mp->nsyms = mp->symhdr->sh_size / mp->symhdr->sh_entsize;
955
956 /* Allocate space for the symbol table and strings. */
957 mp->symsize = mp->symhdr->sh_size +
958 mp->nsyms * sizeof (symid_t) + mp->strhdr->sh_size;
959 mp->symspace = kmem_zalloc(mp->symsize, KM_SLEEP);
960 mp->symtbl = mp->symspace;
961 mp->strings = (char *)(mp->symtbl + mp->symhdr->sh_size);
962
963 if ((kobj_read_file(file, mp->symtbl,
964 mp->symhdr->sh_size, mp->symhdr->sh_offset) < 0) ||
965 (kobj_read_file(file, mp->strings,
966 mp->strhdr->sh_size, mp->strhdr->sh_offset) < 0))
967 goto err;
968 }
969
970 /*
971 * Read in the CTF section
972 */
973 if ((ctf_shp != NULL) && ((moddebug & MODDEBUG_NOCTF) == 0)) {
974 mp->ctfdata = kmem_zalloc(ctf_shp->sh_size, KM_SLEEP);
975 mp->ctfsize = ctf_shp->sh_size;
976 if (kobj_read_file(file, mp->ctfdata, mp->ctfsize,
977 ctf_shp->sh_offset) < 0)
978 goto err;
979 }
980
981 kobj_close_file(file);
982
983 xpv_module = mp;
984 xpv_modctl = mcp;
985 return;
986
987 err:
988 cmn_err(CE_WARN, "Failed to initialize xpv module.");
989 if (file != NULL)
990 kobj_close_file(file);
991
992 kmem_free(mp->filename, strlen(XPV_FILENAME) + 1);
993 if (mp->shdrs != NULL)
994 kmem_free(mp->shdrs, mp->hdr.e_shentsize * mp->hdr.e_shnum);
995 if (mp->symspace != NULL)
996 kmem_free(mp->symspace, mp->symsize);
997 if (mp->ctfdata != NULL)
998 kmem_free(mp->ctfdata, mp->ctfsize);
999 kmem_free(mp, sizeof (*mp));
1000 kmem_free(mcp->mod_filename, strlen(XPV_FILENAME) + 1);
1001 kmem_free(mcp->mod_modname, strlen(XPV_MODNAME) + 1);
1002 kmem_free(mcp, sizeof (*mcp));
1003 if (names != NULL)
1004 kmem_free(names, namesize);
1005 }
1006
1007 void
xpv_panic_init()1008 xpv_panic_init()
1009 {
1010 xen_platform_op_t op;
1011 int i;
1012
1013 ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
1014
1015 for (i = 0; i < mmu.num_level; i++)
1016 ptable_pfn[i] = PFN_INVALID;
1017
1018 /* Let Xen know where to jump if/when it panics. */
1019 op.cmd = XENPF_panic_init;
1020 op.interface_version = XENPF_INTERFACE_VERSION;
1021 op.u.panic_init.panic_addr = (unsigned long)xpv_panic_hdlr;
1022
1023 (void) HYPERVISOR_platform_op(&op);
1024
1025 init_xen_module();
1026 }
1027