1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/clock.h> 29 #include <sys/psm.h> 30 #include <sys/archsystm.h> 31 #include <sys/machsystm.h> 32 #include <sys/compress.h> 33 #include <sys/modctl.h> 34 #include <sys/trap.h> 35 #include <sys/panic.h> 36 #include <sys/regset.h> 37 #include <sys/frame.h> 38 #include <sys/kobj.h> 39 #include <sys/apic.h> 40 #include <sys/dumphdr.h> 41 #include <sys/mem.h> 42 #include <sys/x86_archext.h> 43 #include <sys/xpv_panic.h> 44 #include <sys/boot_console.h> 45 #include <sys/bootsvcs.h> 46 #include <sys/consdev.h> 47 #include <vm/hat_pte.h> 48 #include <vm/hat_i86.h> 49 50 /* XXX: need to add a PAE version too, if we ever support both PAE and non */ 51 #if defined(__i386) 52 #define XPV_FILENAME "/boot/xen-syms" 53 #else 54 #define XPV_FILENAME "/boot/amd64/xen-syms" 55 #endif 56 #define XPV_MODNAME "xpv" 57 58 int xpv_panicking = 0; 59 60 struct module *xpv_module; 61 struct modctl *xpv_modctl; 62 63 #define ALIGN(x, a) ((a) == 0 ? (uintptr_t)(x) : \ 64 (((uintptr_t)(x) + (uintptr_t)(a) - 1l) & ~((uintptr_t)(a) - 1l))) 65 66 /* Pointer to the xpv_panic_info structure handed to us by Xen. */ 67 static struct panic_info *xpv_panic_info = NULL; 68 69 /* Timer support */ 70 #define NSEC_SHIFT 5 71 #define T_XPV_TIMER 0xd1 72 #define XPV_TIMER_INTERVAL 1000 /* 1000 microseconds */ 73 static uint32_t *xpv_apicadr = NULL; 74 static uint_t nsec_scale; 75 76 /* IDT support */ 77 #pragma align 16(xpv_panic_idt) 78 static gate_desc_t xpv_panic_idt[NIDT]; /* interrupt descriptor table */ 79 80 /* Xen pagetables mapped into our HAT's ptable windows */ 81 static pfn_t ptable_pfn[MAX_NUM_LEVEL]; 82 83 /* Number of MMU_PAGESIZE pages we're adding to the Solaris dump */ 84 static int xpv_dump_pages; 85 86 /* 87 * There are up to two large swathes of RAM that we don't want to include 88 * in the dump: those that comprise the Xen version of segkpm. On 32-bit 89 * systems there is no such region of memory. On 64-bit systems, there 90 * should be just a single contiguous region that corresponds to all of 91 * physical memory. The tricky bit is that Xen's heap sometimes lives in 92 * the middle of their segkpm, and is mapped using only kpm-like addresses. 93 * In that case, we need to skip the swathes before and after Xen's heap. 94 */ 95 uintptr_t kpm1_low = 0; 96 uintptr_t kpm1_high = 0; 97 uintptr_t kpm2_low = 0; 98 uintptr_t kpm2_high = 0; 99 100 /* 101 * Some commonly used values that we don't want to recompute over and over. 102 */ 103 static int xpv_panic_nptes[MAX_NUM_LEVEL]; 104 static ulong_t xpv_panic_cr3; 105 static uintptr_t xpv_end; 106 107 static void xpv_panic_console_print(const char *fmt, ...); 108 static void (*xpv_panic_printf)(const char *, ...) = xpv_panic_console_print; 109 110 #define CONSOLE_BUF_SIZE 256 111 static char console_buffer[CONSOLE_BUF_SIZE]; 112 static boolean_t use_polledio; 113 114 /* 115 * Pointers to machine check panic info (if any). 116 */ 117 xpv_mca_panic_data_t *xpv_mca_panic_data = NULL; 118 119 static void 120 xpv_panic_putc(int m) 121 { 122 struct cons_polledio *c = cons_polledio; 123 124 /* This really shouldn't happen */ 125 if (console == CONS_HYPERVISOR) 126 return; 127 128 if (use_polledio == B_TRUE) 129 c->cons_polledio_putchar(c->cons_polledio_argument, m); 130 else 131 bcons_putchar(m); 132 } 133 134 static void 135 xpv_panic_puts(char *msg) 136 { 137 char *m; 138 139 dump_timeleft = dump_timeout; 140 for (m = msg; *m; m++) 141 xpv_panic_putc((int)*m); 142 } 143 144 static void 145 xpv_panic_console_print(const char *fmt, ...) 146 { 147 va_list ap; 148 149 va_start(ap, fmt); 150 (void) vsnprintf(console_buffer, sizeof (console_buffer), fmt, ap); 151 va_end(ap); 152 153 xpv_panic_puts(console_buffer); 154 } 155 156 static void 157 xpv_panic_map(int level, pfn_t pfn) 158 { 159 x86pte_t pte, *pteptr; 160 161 /* 162 * The provided pfn represents a level 'level' page table. Map it 163 * into the 'level' slot in the list of page table windows. 164 */ 165 pteptr = (x86pte_t *)PWIN_PTE_VA(level); 166 pte = pfn_to_pa(pfn) | PT_VALID; 167 168 XPV_ALLOW_PAGETABLE_UPDATES(); 169 if (mmu.pae_hat) 170 *pteptr = pte; 171 else 172 *(x86pte32_t *)pteptr = pte; 173 XPV_DISALLOW_PAGETABLE_UPDATES(); 174 175 mmu_tlbflush_entry(PWIN_VA(level)); 176 } 177 178 /* 179 * Walk the page tables to find the pfn mapped by the given va. 180 */ 181 static pfn_t 182 xpv_va_walk(uintptr_t *vaddr) 183 { 184 int l, idx; 185 pfn_t pfn; 186 x86pte_t pte; 187 x86pte_t *ptep; 188 uintptr_t va = *vaddr; 189 uintptr_t scan_va; 190 caddr_t ptable_window; 191 static pfn_t toplevel_pfn; 192 static uintptr_t lastva; 193 194 /* 195 * If we do anything other than a simple scan through memory, don't 196 * trust the mapped page tables. 197 */ 198 if (va != lastva + MMU_PAGESIZE) 199 for (l = mmu.max_level; l >= 0; l--) 200 ptable_pfn[l] = PFN_INVALID; 201 202 toplevel_pfn = mmu_btop(xpv_panic_cr3); 203 204 while (va < xpv_end && va >= *vaddr) { 205 /* Find the lowest table with any entry for va */ 206 pfn = toplevel_pfn; 207 for (l = mmu.max_level; l >= 0; l--) { 208 if (ptable_pfn[l] != pfn) { 209 xpv_panic_map(l, pfn); 210 ptable_pfn[l] = pfn; 211 } 212 213 /* 214 * Search this pagetable for any mapping to an 215 * address >= va. 216 */ 217 ptable_window = PWIN_VA(l); 218 if (l == mmu.max_level && mmu.pae_hat) 219 ptable_window += 220 (xpv_panic_cr3 & MMU_PAGEOFFSET); 221 222 idx = (va >> LEVEL_SHIFT(l)) & (xpv_panic_nptes[l] - 1); 223 scan_va = va; 224 while (idx < xpv_panic_nptes[l] && scan_va < xpv_end && 225 scan_va >= *vaddr) { 226 ptep = (x86pte_t *)(ptable_window + 227 (idx << mmu.pte_size_shift)); 228 pte = GET_PTE(ptep); 229 if (pte & PTE_VALID) 230 break; 231 idx++; 232 scan_va += mmu.level_size[l]; 233 } 234 235 /* 236 * If there are no valid mappings in this table, we 237 * can skip to the end of the VA range it covers. 238 */ 239 if (idx == xpv_panic_nptes[l]) { 240 va = NEXT_ENTRY_VA(va, l + 1); 241 break; 242 } 243 244 va = scan_va; 245 /* 246 * See if we've hit the end of the range. 247 */ 248 if (va >= xpv_end || va < *vaddr) 249 break; 250 251 /* 252 * If this mapping is for a pagetable, we drop down 253 * to the next level in the hierarchy and look for 254 * a mapping in it. 255 */ 256 pfn = PTE2MFN(pte, l); 257 if (!PTE_ISPAGE(pte, l)) 258 continue; 259 260 /* 261 * The APIC page is magic. Nothing to see here; 262 * move along. 263 */ 264 if (((uintptr_t)xpv_apicadr & MMU_PAGEMASK) == 265 (va & MMU_PAGEMASK)) { 266 va += MMU_PAGESIZE; 267 break; 268 } 269 270 /* 271 * See if the address is within one of the two 272 * kpm-like regions we want to skip. 273 */ 274 if (va >= kpm1_low && va < kpm1_high) { 275 va = kpm1_high; 276 break; 277 } 278 if (va >= kpm2_low && va < kpm2_high) { 279 va = kpm2_high; 280 break; 281 } 282 283 /* 284 * The Xen panic code only handles small pages. If 285 * this mapping is for a large page, we need to 286 * identify the consituent page that covers the 287 * specific VA we were looking for. 288 */ 289 if (l > 0) { 290 if (l > 1) 291 panic("Xen panic can't cope with " 292 "giant pages."); 293 idx = (va >> LEVEL_SHIFT(0)) & 294 (xpv_panic_nptes[0] - 1); 295 pfn += idx; 296 } 297 298 *vaddr = va; 299 lastva = va; 300 return (pfn | PFN_IS_FOREIGN_MFN); 301 } 302 } 303 return (PFN_INVALID); 304 } 305 306 /* 307 * Walk through the Xen VA space, finding pages that are mapped in. 308 * 309 * These pages all have MFNs rather than PFNs, meaning they may be outside 310 * the physical address space the kernel knows about, or they may collide 311 * with PFNs the kernel is using. 312 * 313 * The obvious trick of just adding the PFN_IS_FOREIGN_MFN bit to the MFNs 314 * to avoid collisions doesn't work. The pages need to be written to disk 315 * in PFN-order or savecore gets confused. We can't allocate memory to 316 * contruct a sorted pfn->VA reverse mapping, so we have to write the pages 317 * to disk in VA order. 318 * 319 * To square this circle, we simply make up PFNs for each of Xen's pages. 320 * We assign each mapped page a fake PFN in ascending order. These fake 321 * PFNs each have the FOREIGN bit set, ensuring that they fall outside the 322 * range of Solaris PFNs written by the kernel. 323 */ 324 int 325 dump_xpv_addr() 326 { 327 uintptr_t va; 328 mem_vtop_t mem_vtop; 329 330 xpv_dump_pages = 0; 331 va = xen_virt_start; 332 333 while (xpv_va_walk(&va) != PFN_INVALID) { 334 mem_vtop.m_as = &kas; 335 mem_vtop.m_va = (void *)va; 336 mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN; 337 338 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t)); 339 xpv_dump_pages++; 340 341 va += MMU_PAGESIZE; 342 } 343 344 /* 345 * Add the shared_info page. This page actually ends up in the 346 * dump twice: once for the Xen va and once for the Solaris va. 347 * This isn't ideal, but we don't know the address Xen is using for 348 * the page, so we can't share it. 349 */ 350 mem_vtop.m_as = &kas; 351 mem_vtop.m_va = HYPERVISOR_shared_info; 352 mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN; 353 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t)); 354 xpv_dump_pages++; 355 356 return (xpv_dump_pages); 357 } 358 359 void 360 dump_xpv_pfn() 361 { 362 pfn_t pfn; 363 int cnt; 364 365 for (cnt = 0; cnt < xpv_dump_pages; cnt++) { 366 pfn = (pfn_t)cnt | PFN_IS_FOREIGN_MFN; 367 dumpvp_write(&pfn, sizeof (pfn)); 368 } 369 } 370 371 int 372 dump_xpv_data(void *dump_cbuf) 373 { 374 uintptr_t va; 375 uint32_t csize; 376 int cnt = 0; 377 378 /* 379 * XXX: we should probably run this data through a UE check. The 380 * catch is that the UE code relies on on_trap() and getpfnum() 381 * working. 382 */ 383 va = xen_virt_start; 384 385 while (xpv_va_walk(&va) != PFN_INVALID) { 386 csize = (uint32_t)compress((void *)va, dump_cbuf, PAGESIZE); 387 dumpvp_write(&csize, sizeof (uint32_t)); 388 dumpvp_write(dump_cbuf, csize); 389 if (dump_ioerr) { 390 dumphdr->dump_flags &= ~DF_COMPLETE; 391 return (cnt); 392 } 393 cnt++; 394 va += MMU_PAGESIZE; 395 } 396 397 /* 398 * Finally, dump the shared_info page 399 */ 400 csize = (uint32_t)compress((void *)HYPERVISOR_shared_info, dump_cbuf, 401 PAGESIZE); 402 dumpvp_write(&csize, sizeof (uint32_t)); 403 dumpvp_write(dump_cbuf, csize); 404 if (dump_ioerr) 405 dumphdr->dump_flags &= ~DF_COMPLETE; 406 cnt++; 407 408 return (cnt); 409 } 410 411 static void * 412 showstack(void *fpreg, int xpv_only) 413 { 414 struct frame *fpp; 415 ulong_t off; 416 char *sym; 417 uintptr_t pc, fp, lastfp; 418 uintptr_t minaddr = min(KERNELBASE, xen_virt_start); 419 420 fp = (uintptr_t)fpreg; 421 if (fp < minaddr) { 422 xpv_panic_printf("Bad frame ptr: 0x%p\n", fpreg); 423 return (fpreg); 424 } 425 426 do { 427 fpp = (struct frame *)fp; 428 pc = fpp->fr_savpc; 429 430 if ((xpv_only != 0) && 431 (fp > xpv_end || fp < xen_virt_start)) 432 break; 433 if ((sym = kobj_getsymname(pc, &off)) != NULL) 434 xpv_panic_printf("%08lx %s:%s+%lx\n", fp, 435 mod_containing_pc((caddr_t)pc), sym, off); 436 else if ((pc >= xen_virt_start) && (pc <= xpv_end)) 437 xpv_panic_printf("%08lx 0x%lx (in Xen)\n", fp, pc); 438 else 439 xpv_panic_printf("%08lx %lx\n", fp, pc); 440 441 lastfp = fp; 442 fp = fpp->fr_savfp; 443 444 /* 445 * Xen marks an exception frame by inverting the frame 446 * pointer. 447 */ 448 if (fp < lastfp) { 449 if ((~fp > minaddr) && ((~fp) ^ lastfp) < 0xfff) 450 fp = ~fp; 451 } 452 } while (fp > lastfp); 453 return ((void *)fp); 454 } 455 456 void * 457 xpv_traceback(void *fpreg) 458 { 459 return (showstack(fpreg, 1)); 460 } 461 462 #if defined(__amd64) 463 static void 464 xpv_panic_hypercall(ulong_t call) 465 { 466 panic("Illegally issued hypercall %d during panic!\n", (int)call); 467 } 468 #endif 469 470 void 471 xpv_die(struct regs *rp) 472 { 473 struct panic_trap_info ti; 474 struct cregs creg; 475 476 ti.trap_regs = rp; 477 ti.trap_type = rp->r_trapno; 478 479 curthread->t_panic_trap = &ti; 480 if (ti.trap_type == T_PGFLT) { 481 getcregs(&creg); 482 ti.trap_addr = (caddr_t)creg.cr_cr2; 483 panic("Fatal pagefault at 0x%lx. fault addr=0x%p rp=0x%p", 484 rp->r_pc, (void *)ti.trap_addr, (void *)rp); 485 } else { 486 ti.trap_addr = (caddr_t)rp->r_pc; 487 panic("Fatal trap %ld at 0x%lx. rp=0x%p", rp->r_trapno, 488 rp->r_pc, (void *)rp); 489 } 490 } 491 492 /* 493 * Build IDT to handle a Xen panic 494 */ 495 static void 496 switch_to_xpv_panic_idt() 497 { 498 int i; 499 desctbr_t idtr; 500 gate_desc_t *idt = xpv_panic_idt; 501 selector_t cs = get_cs_register(); 502 503 for (i = 0; i < 32; i++) 504 set_gatesegd(&idt[i], &xpv_invaltrap, cs, SDT_SYSIGT, TRP_XPL, 505 0); 506 507 set_gatesegd(&idt[T_ZERODIV], &xpv_div0trap, cs, SDT_SYSIGT, TRP_XPL, 508 0); 509 set_gatesegd(&idt[T_SGLSTP], &xpv_dbgtrap, cs, SDT_SYSIGT, TRP_XPL, 0); 510 set_gatesegd(&idt[T_NMIFLT], &xpv_nmiint, cs, SDT_SYSIGT, TRP_XPL, 0); 511 set_gatesegd(&idt[T_BOUNDFLT], &xpv_boundstrap, cs, SDT_SYSIGT, 512 TRP_XPL, 0); 513 set_gatesegd(&idt[T_ILLINST], &xpv_invoptrap, cs, SDT_SYSIGT, TRP_XPL, 514 0); 515 set_gatesegd(&idt[T_NOEXTFLT], &xpv_ndptrap, cs, SDT_SYSIGT, TRP_XPL, 516 0); 517 set_gatesegd(&idt[T_TSSFLT], &xpv_invtsstrap, cs, SDT_SYSIGT, TRP_XPL, 518 0); 519 set_gatesegd(&idt[T_SEGFLT], &xpv_segnptrap, cs, SDT_SYSIGT, TRP_XPL, 520 0); 521 set_gatesegd(&idt[T_STKFLT], &xpv_stktrap, cs, SDT_SYSIGT, TRP_XPL, 0); 522 set_gatesegd(&idt[T_GPFLT], &xpv_gptrap, cs, SDT_SYSIGT, TRP_XPL, 0); 523 set_gatesegd(&idt[T_PGFLT], &xpv_pftrap, cs, SDT_SYSIGT, TRP_XPL, 0); 524 set_gatesegd(&idt[T_EXTERRFLT], &xpv_ndperr, cs, SDT_SYSIGT, TRP_XPL, 525 0); 526 set_gatesegd(&idt[T_ALIGNMENT], &xpv_achktrap, cs, SDT_SYSIGT, TRP_XPL, 527 0); 528 set_gatesegd(&idt[T_MCE], &xpv_mcetrap, cs, SDT_SYSIGT, TRP_XPL, 0); 529 set_gatesegd(&idt[T_SIMDFPE], &xpv_xmtrap, cs, SDT_SYSIGT, TRP_XPL, 0); 530 531 /* 532 * We have no double fault handler. Any single fault represents a 533 * catastrophic failure for us, so there is no attempt to handle 534 * them cleanly: we just print a message and reboot. If we 535 * encounter a second fault while doing that, there is nothing 536 * else we can do. 537 */ 538 539 /* 540 * Be prepared to absorb any stray device interrupts received 541 * while writing the core to disk. 542 */ 543 for (i = 33; i < NIDT; i++) 544 set_gatesegd(&idt[i], &xpv_surprise_intr, cs, SDT_SYSIGT, 545 TRP_XPL, 0); 546 547 /* The one interrupt we expect to get is from the APIC timer. */ 548 set_gatesegd(&idt[T_XPV_TIMER], &xpv_timer_trap, cs, SDT_SYSIGT, 549 TRP_XPL, 0); 550 551 idtr.dtr_base = (uintptr_t)xpv_panic_idt; 552 idtr.dtr_limit = sizeof (xpv_panic_idt) - 1; 553 wr_idtr(&idtr); 554 555 #if defined(__amd64) 556 /* Catch any hypercalls. */ 557 wrmsr(MSR_AMD_LSTAR, (uintptr_t)xpv_panic_hypercall); 558 wrmsr(MSR_AMD_CSTAR, (uintptr_t)xpv_panic_hypercall); 559 #endif 560 } 561 562 static void 563 xpv_apic_clkinit() 564 { 565 uint_t apic_ticks = 0; 566 567 /* 568 * Measure how many APIC ticks there are within a fixed time 569 * period. We're going to be fairly coarse here. This timer is 570 * just being used to detect a stalled panic, so as long as we have 571 * the right order of magnitude, everything should be fine. 572 */ 573 xpv_apicadr[APIC_SPUR_INT_REG] = AV_UNIT_ENABLE | APIC_SPUR_INTR; 574 xpv_apicadr[APIC_LOCAL_TIMER] = AV_MASK; 575 xpv_apicadr[APIC_INT_VECT0] = AV_MASK; /* local intr reg 0 */ 576 577 xpv_apicadr[APIC_DIVIDE_REG] = 0; 578 xpv_apicadr[APIC_INIT_COUNT] = APIC_MAXVAL; 579 drv_usecwait(XPV_TIMER_INTERVAL); 580 apic_ticks = APIC_MAXVAL - xpv_apicadr[APIC_CURR_COUNT]; 581 582 /* 583 * apic_ticks now represents roughly how many apic ticks comprise 584 * one timeout interval. Program the timer to send us an interrupt 585 * every time that interval expires. 586 */ 587 xpv_apicadr[APIC_LOCAL_TIMER] = T_XPV_TIMER | AV_TIME; 588 xpv_apicadr[APIC_INIT_COUNT] = apic_ticks; 589 xpv_apicadr[APIC_EOI_REG] = 0; 590 } 591 592 void 593 xpv_timer_tick(void) 594 { 595 static int ticks = 0; 596 597 if (ticks++ >= MICROSEC / XPV_TIMER_INTERVAL) { 598 ticks = 0; 599 if (dump_timeleft && (--dump_timeleft == 0)) 600 panic("Xen panic timeout\n"); 601 } 602 xpv_apicadr[APIC_EOI_REG] = 0; 603 } 604 605 void 606 xpv_interrupt(void) 607 { 608 #ifdef DEBUG 609 static int cnt = 0; 610 611 if (cnt++ < 10) 612 xpv_panic_printf("Unexpected interrupt received.\n"); 613 if ((cnt < 1000) && ((cnt % 100) == 0)) 614 xpv_panic_printf("%d unexpected interrupts received.\n", cnt); 615 #endif 616 617 xpv_apicadr[APIC_EOI_REG] = 0; 618 } 619 620 /* 621 * Managing time in panic context is trivial. We only have a single CPU, 622 * we never get rescheduled, we never get suspended. We just need to 623 * convert clock ticks into nanoseconds. 624 */ 625 static hrtime_t 626 xpv_panic_gethrtime(void) 627 { 628 hrtime_t tsc, hrt; 629 unsigned int *l = (unsigned int *)&(tsc); 630 631 tsc = __rdtsc_insn(); 632 hrt = (mul32(l[1], nsec_scale) << NSEC_SHIFT) + 633 (mul32(l[0], nsec_scale) >> (32 - NSEC_SHIFT)); 634 635 return (hrt); 636 } 637 638 static void 639 xpv_panic_time_init() 640 { 641 nsec_scale = 642 CPU->cpu_m.mcpu_vcpu_info->time.tsc_to_system_mul >> NSEC_SHIFT; 643 644 gethrtimef = xpv_panic_gethrtime; 645 } 646 647 static void 648 xpv_panicsys(struct regs *rp, char *fmt, ...) 649 { 650 extern void panicsys(const char *, va_list, struct regs *, int); 651 va_list alist; 652 653 va_start(alist, fmt); 654 panicsys(fmt, alist, rp, 1); 655 va_end(alist); 656 } 657 658 void 659 xpv_do_panic(void *arg) 660 { 661 struct panic_info *pip = (struct panic_info *)arg; 662 int l; 663 struct cregs creg; 664 #if defined(__amd64) 665 extern uintptr_t postbootkernelbase; 666 #endif 667 668 if (xpv_panicking++ > 0) 669 panic("multiple calls to xpv_do_panic()"); 670 671 /* 672 * Indicate to the underlying panic framework that a panic has been 673 * initiated. This is ordinarily done as part of vpanic(). Since 674 * we already have all the register state saved by the hypervisor, 675 * we skip that and jump straight into the panic processing code. 676 * 677 * XXX If another thread grabs and wins the panic_quiesce trigger 678 * then we'll have two threads in panicsys believing they are in 679 * charge of the panic attempt! 680 */ 681 (void) panic_trigger(&panic_quiesce); 682 683 #if defined(__amd64) 684 /* 685 * bzero() and bcopy() get unhappy when asked to operate on 686 * addresses outside of the kernel. At this point Xen is really a 687 * part of the kernel, so we update the routines' notion of where 688 * the kernel starts. 689 */ 690 postbootkernelbase = xen_virt_start; 691 #endif 692 693 #if defined(HYPERVISOR_VIRT_END) 694 xpv_end = HYPERVISOR_VIRT_END; 695 #else 696 xpv_end = (uintptr_t)UINTPTR_MAX - sizeof (uintptr_t); 697 #endif 698 699 /* 700 * If we were redirecting console output to the hypervisor, we have 701 * to stop. 702 */ 703 use_polledio = B_FALSE; 704 if (console == CONS_HYPERVISOR) { 705 bcons_device_change(CONS_HYPERVISOR); 706 } else if (cons_polledio != NULL && 707 cons_polledio->cons_polledio_putchar != NULL) { 708 if (cons_polledio->cons_polledio_enter != NULL) 709 cons_polledio->cons_polledio_enter( 710 cons_polledio->cons_polledio_argument); 711 use_polledio = 1; 712 } 713 714 /* Make sure we handle all console output from here on. */ 715 sysp->bsvc_putchar = xpv_panic_putc; 716 717 /* 718 * If we find an unsupported panic_info structure, there's not much 719 * we can do other than complain, plow on, and hope for the best. 720 */ 721 if (pip->pi_version != PANIC_INFO_VERSION) 722 xpv_panic_printf("Warning: Xen is using an unsupported " 723 "version of the panic_info structure.\n"); 724 725 xpv_panic_info = pip; 726 727 #if defined(__amd64) 728 kpm1_low = (uintptr_t)xpv_panic_info->pi_ram_start; 729 if (xpv_panic_info->pi_xen_start == NULL) { 730 kpm1_high = (uintptr_t)xpv_panic_info->pi_ram_end; 731 } else { 732 kpm1_high = (uintptr_t)xpv_panic_info->pi_xen_start; 733 kpm2_low = (uintptr_t)xpv_panic_info->pi_xen_end; 734 kpm2_high = (uintptr_t)xpv_panic_info->pi_ram_end; 735 } 736 #endif 737 738 /* 739 * Make sure we are running on the Solaris %gs. The Xen panic code 740 * should already have set up the GDT properly. 741 */ 742 xpv_panic_resetgs(); 743 #if defined(__amd64) 744 wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]); 745 #endif 746 747 xpv_panic_time_init(); 748 749 /* 750 * Switch to our own IDT, avoiding any accidental returns to Xen 751 * world. 752 */ 753 switch_to_xpv_panic_idt(); 754 755 /* 756 * Initialize the APIC timer, which is used to detect a hung dump 757 * attempt. 758 */ 759 xpv_apicadr = pip->pi_apic; 760 xpv_apic_clkinit(); 761 762 /* 763 * Set up a few values that we'll need repeatedly. 764 */ 765 getcregs(&creg); 766 xpv_panic_cr3 = creg.cr_cr3; 767 for (l = mmu.max_level; l >= 0; l--) 768 xpv_panic_nptes[l] = mmu.ptes_per_table; 769 #ifdef __i386 770 if (mmu.pae_hat) 771 xpv_panic_nptes[mmu.max_level] = 4; 772 #endif 773 774 /* Add the fake Xen module to the module list */ 775 if (xpv_module != NULL) { 776 extern int last_module_id; 777 778 xpv_modctl->mod_id = last_module_id++; 779 xpv_modctl->mod_next = &modules; 780 xpv_modctl->mod_prev = modules.mod_prev; 781 modules.mod_prev->mod_next = xpv_modctl; 782 modules.mod_prev = xpv_modctl; 783 } 784 785 if (pip->pi_mca.mpd_magic == MCA_PANICDATA_MAGIC) 786 xpv_mca_panic_data = &pip->pi_mca; 787 788 xpv_panic_printf = printf; 789 xpv_panicsys((struct regs *)pip->pi_regs, pip->pi_panicstr); 790 xpv_panic_printf("Failed to reboot following panic.\n"); 791 for (;;) 792 ; 793 } 794 795 /* 796 * Set up the necessary data structures to pretend that the Xen hypervisor 797 * is a loadable module, allowing mdb to find the Xen symbols in a crash 798 * dump. Since these symbols all map to VA space Solaris doesn't normally 799 * have access to, we don't link these structures into the kernel's lists 800 * until/unless we hit a Xen panic. 801 * 802 * The observant reader will note a striking amount of overlap between this 803 * code and that found in krtld. While it would be handy if we could just 804 * ask krtld to do this work for us, it's not that simple. Among the 805 * complications: we're not actually loading the text here (grub did it at 806 * boot), the .text section is writable, there are no relocations to do, 807 * none of the module text/data is in readable memory, etc. Training krtld 808 * to deal with this weird module is as complicated, and more risky, than 809 * reimplementing the necessary subset of it here. 810 */ 811 static void 812 init_xen_module() 813 { 814 struct _buf *file = NULL; 815 struct module *mp; 816 struct modctl *mcp; 817 int i, shn; 818 Shdr *shp, *ctf_shp; 819 char *names = NULL; 820 size_t n, namesize, text_align, data_align; 821 #if defined(__amd64) 822 const char machine = EM_AMD64; 823 #else 824 const char machine = EM_386; 825 #endif 826 827 /* Allocate and init the module structure */ 828 mp = kmem_zalloc(sizeof (*mp), KM_SLEEP); 829 mp->filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP); 830 (void) strcpy(mp->filename, XPV_FILENAME); 831 832 /* Allocate and init the modctl structure */ 833 mcp = kmem_zalloc(sizeof (*mcp), KM_SLEEP); 834 mcp->mod_modname = kobj_zalloc(strlen(XPV_MODNAME) + 1, KM_SLEEP); 835 (void) strcpy(mcp->mod_modname, XPV_MODNAME); 836 mcp->mod_filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP); 837 (void) strcpy(mcp->mod_filename, XPV_FILENAME); 838 mcp->mod_inprogress_thread = (kthread_id_t)-1; 839 mcp->mod_ref = 1; 840 mcp->mod_loaded = 1; 841 mcp->mod_loadcnt = 1; 842 mcp->mod_mp = mp; 843 844 /* 845 * Try to open a Xen image that hasn't had its symbol and CTF 846 * information stripped off. 847 */ 848 file = kobj_open_file(XPV_FILENAME); 849 if (file == (struct _buf *)-1) { 850 file = NULL; 851 goto err; 852 } 853 854 /* 855 * Read the header and ensure that this is an ELF file for the 856 * proper ISA. If it's not, somebody has done something very 857 * stupid. Why bother? See Mencken. 858 */ 859 if (kobj_read_file(file, (char *)&mp->hdr, sizeof (mp->hdr), 0) < 0) 860 goto err; 861 for (i = 0; i < SELFMAG; i++) 862 if (mp->hdr.e_ident[i] != ELFMAG[i]) 863 goto err; 864 if ((mp->hdr.e_ident[EI_DATA] != ELFDATA2LSB) || 865 (mp->hdr.e_machine != machine)) 866 goto err; 867 868 /* Read in the section headers */ 869 n = mp->hdr.e_shentsize * mp->hdr.e_shnum; 870 mp->shdrs = kmem_zalloc(n, KM_SLEEP); 871 if (kobj_read_file(file, mp->shdrs, n, mp->hdr.e_shoff) < 0) 872 goto err; 873 874 /* Read the section names */ 875 shp = (Shdr *)(mp->shdrs + mp->hdr.e_shstrndx * mp->hdr.e_shentsize); 876 namesize = shp->sh_size; 877 names = kmem_zalloc(shp->sh_size, KM_SLEEP); 878 if (kobj_read_file(file, names, shp->sh_size, shp->sh_offset) < 0) 879 goto err; 880 881 /* 882 * Fill in the text and data size fields. 883 */ 884 ctf_shp = NULL; 885 text_align = data_align = 0; 886 for (shn = 1; shn < mp->hdr.e_shnum; shn++) { 887 shp = (Shdr *)(mp->shdrs + shn * mp->hdr.e_shentsize); 888 889 /* Sanity check the offset of the section name */ 890 if (shp->sh_name >= namesize) 891 continue; 892 893 /* If we find the symtab section, remember it for later. */ 894 if (shp->sh_type == SHT_SYMTAB) { 895 mp->symtbl_section = shn; 896 mp->symhdr = shp; 897 continue; 898 } 899 900 /* If we find the CTF section, remember it for later. */ 901 if ((shp->sh_size != 0) && 902 (strcmp(names + shp->sh_name, ".SUNW_ctf") == 0)) { 903 ctf_shp = shp; 904 continue; 905 } 906 907 if (!(shp->sh_flags & SHF_ALLOC)) 908 continue; 909 910 /* 911 * Xen marks its text section as writable, so we need to 912 * look for the name - not just the flag. 913 */ 914 if ((strcmp(&names[shp->sh_name], ".text") != NULL) && 915 (shp->sh_flags & SHF_WRITE) != 0) { 916 if (shp->sh_addralign > data_align) 917 data_align = shp->sh_addralign; 918 mp->data_size = ALIGN(mp->data_size, data_align); 919 mp->data_size += ALIGN(shp->sh_size, 8); 920 if (mp->data == NULL || mp->data > (char *)shp->sh_addr) 921 mp->data = (char *)shp->sh_addr; 922 } else { 923 if (shp->sh_addralign > text_align) 924 text_align = shp->sh_addralign; 925 mp->text_size = ALIGN(mp->text_size, text_align); 926 mp->text_size += ALIGN(shp->sh_size, 8); 927 if (mp->text == NULL || mp->text > (char *)shp->sh_addr) 928 mp->text = (char *)shp->sh_addr; 929 } 930 } 931 kmem_free(names, namesize); 932 names = NULL; 933 shp = NULL; 934 mcp->mod_text = mp->text; 935 mcp->mod_text_size = mp->text_size; 936 937 /* 938 * If we have symbol table and string table sections, read them in 939 * now. If we don't, we just plow on. We'll still get a valid 940 * core dump, but finding anything useful will be just a bit 941 * harder. 942 * 943 * Note: we don't bother with a hash table. We'll never do a 944 * symbol lookup unless we crash, and then mdb creates its own. We 945 * also don't try to perform any relocations. Xen should be loaded 946 * exactly where the ELF file indicates, and the symbol information 947 * in the file should be complete and correct already. Static 948 * linking ain't all bad. 949 */ 950 if ((mp->symhdr != NULL) && (mp->symhdr->sh_link < mp->hdr.e_shnum)) { 951 mp->strhdr = (Shdr *) 952 (mp->shdrs + mp->symhdr->sh_link * mp->hdr.e_shentsize); 953 mp->nsyms = mp->symhdr->sh_size / mp->symhdr->sh_entsize; 954 955 /* Allocate space for the symbol table and strings. */ 956 mp->symsize = mp->symhdr->sh_size + 957 mp->nsyms * sizeof (symid_t) + mp->strhdr->sh_size; 958 mp->symspace = kmem_zalloc(mp->symsize, KM_SLEEP); 959 mp->symtbl = mp->symspace; 960 mp->strings = (char *)(mp->symtbl + mp->symhdr->sh_size); 961 962 if ((kobj_read_file(file, mp->symtbl, 963 mp->symhdr->sh_size, mp->symhdr->sh_offset) < 0) || 964 (kobj_read_file(file, mp->strings, 965 mp->strhdr->sh_size, mp->strhdr->sh_offset) < 0)) 966 goto err; 967 } 968 969 /* 970 * Read in the CTF section 971 */ 972 if ((ctf_shp != NULL) && ((moddebug & MODDEBUG_NOCTF) == 0)) { 973 mp->ctfdata = kmem_zalloc(ctf_shp->sh_size, KM_SLEEP); 974 mp->ctfsize = ctf_shp->sh_size; 975 if (kobj_read_file(file, mp->ctfdata, mp->ctfsize, 976 ctf_shp->sh_offset) < 0) 977 goto err; 978 } 979 980 kobj_close_file(file); 981 982 xpv_module = mp; 983 xpv_modctl = mcp; 984 return; 985 986 err: 987 cmn_err(CE_WARN, "Failed to initialize xpv module."); 988 if (file != NULL) 989 kobj_close_file(file); 990 991 kmem_free(mp->filename, strlen(XPV_FILENAME) + 1); 992 if (mp->shdrs != NULL) 993 kmem_free(mp->shdrs, mp->hdr.e_shentsize * mp->hdr.e_shnum); 994 if (mp->symspace != NULL) 995 kmem_free(mp->symspace, mp->symsize); 996 if (mp->ctfdata != NULL) 997 kmem_free(mp->ctfdata, mp->ctfsize); 998 kmem_free(mp, sizeof (*mp)); 999 kmem_free(mcp->mod_filename, strlen(XPV_FILENAME) + 1); 1000 kmem_free(mcp->mod_modname, strlen(XPV_MODNAME) + 1); 1001 kmem_free(mcp, sizeof (*mcp)); 1002 if (names != NULL) 1003 kmem_free(names, namesize); 1004 } 1005 1006 void 1007 xpv_panic_init() 1008 { 1009 xen_platform_op_t op; 1010 int i; 1011 1012 ASSERT(DOMAIN_IS_INITDOMAIN(xen_info)); 1013 1014 for (i = 0; i < mmu.num_level; i++) 1015 ptable_pfn[i] = PFN_INVALID; 1016 1017 /* Let Xen know where to jump if/when it panics. */ 1018 op.cmd = XENPF_panic_init; 1019 op.interface_version = XENPF_INTERFACE_VERSION; 1020 op.u.panic_init.panic_addr = (unsigned long)xpv_panic_hdlr; 1021 1022 (void) HYPERVISOR_platform_op(&op); 1023 1024 init_xen_module(); 1025 } 1026