/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012 Gary Mills
 *
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/clock.h>
#include <sys/psm.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/compress.h>
#include <sys/modctl.h>
#include <sys/trap.h>
#include <sys/panic.h>
#include <sys/regset.h>
#include <sys/frame.h>
#include <sys/kobj.h>
#include <sys/apic.h>
#include <sys/apic_timer.h>
#include <sys/dumphdr.h>
#include <sys/mem.h>
#include <sys/x86_archext.h>
#include <sys/xpv_panic.h>
#include <sys/boot_console.h>
#include <sys/bootsvcs.h>
#include <sys/consdev.h>
#include <vm/hat_pte.h>
#include <vm/hat_i86.h>

/*
 * Path of the unstripped Xen image that init_xen_module() opens, and the
 * module name the fake modctl is given.
 *
 * XXX: need to add a PAE version too, if we ever support both PAE and
 * non-PAE.
 */
#if defined(__i386)
#define	XPV_FILENAME	"/boot/xen-syms"
#else
#define	XPV_FILENAME	"/boot/amd64/xen-syms"
#endif
#define	XPV_MODNAME	"xpv"

/*
 * Incremented on entry to xpv_do_panic(); that routine panics if it finds
 * the counter already non-zero.
 */
int xpv_panicking = 0;

/*
 * Fake module/modctl pair describing the Xen image.  Both are published by
 * init_xen_module() on success and stay NULL otherwise; xpv_do_panic() links
 * xpv_modctl into the kernel's module list when xpv_module is set.
 */
struct module *xpv_module;
struct modctl *xpv_modctl;

/*
 * Round x up to a multiple of a.  An alignment of 0 yields x unchanged.
 * The mask arithmetic only rounds correctly when a is a power of two.
 */
#define	ALIGN(x, a)	((a) == 0 ? (uintptr_t)(x) : \
	(((uintptr_t)(x) + (uintptr_t)(a) - 1l) & ~((uintptr_t)(a) - 1l)))

/* Pointer to the xpv_panic_info structure handed to us by Xen. */
static struct panic_info *xpv_panic_info = NULL;

/*
 * Timer support
 *
 * NSEC_SHIFT is the shift applied to Xen's tsc_to_system_mul to form
 * nsec_scale (xpv_panic_time_init()) and compensated for again in
 * xpv_panic_gethrtime().  T_XPV_TIMER is the IDT vector programmed into the
 * APIC local timer.  xpv_apicadr is set from pi_apic in xpv_do_panic() and is
 * indexed with the APIC_* register constants.
 */
#define	NSEC_SHIFT	5
#define	T_XPV_TIMER	0xd1
#define	XPV_TIMER_INTERVAL	1000	/* 1000 microseconds */
static uint32_t *xpv_apicadr = NULL;
static uint_t nsec_scale;

/* IDT support */
#pragma align 16(xpv_panic_idt)
static gate_desc_t xpv_panic_idt[NIDT];	/* interrupt descriptor table */

/* Xen pagetables mapped into our HAT's ptable windows */
static pfn_t ptable_pfn[MAX_NUM_LEVEL];

/* Number of MMU_PAGESIZE pages we're adding to the Solaris dump */
static int xpv_dump_pages;

/*
 * There are up to two large swathes of RAM that we don't want to include
 * in the dump: those that comprise the Xen version of segkpm.  On 32-bit
 * systems there is no such region of memory.  On 64-bit systems, there
 * should be just a single contiguous region that corresponds to all of
 * physical memory.  The tricky bit is that Xen's heap sometimes lives in
 * the middle of their segkpm, and is mapped using only kpm-like addresses.
 * In that case, we need to skip the swathes before and after Xen's heap.
 */
uintptr_t kpm1_low = 0;
uintptr_t kpm1_high = 0;
uintptr_t kpm2_low = 0;
uintptr_t kpm2_high = 0;

/*
 * Some commonly used values that we don't want to recompute over and over.
 * All three are filled in by xpv_do_panic().
 */
static int xpv_panic_nptes[MAX_NUM_LEVEL];
static ulong_t xpv_panic_cr3;
static uintptr_t xpv_end;

/*
 * Output routine used by this file.  It starts out as our own console
 * printer and is switched to printf() by xpv_do_panic() just before it
 * calls into the common panic code.
 */
static void xpv_panic_console_print(const char *fmt, ...);
static void (*xpv_panic_printf)(const char *, ...) = xpv_panic_console_print;

/* Scratch buffer that xpv_panic_console_print() formats into. */
#define	CONSOLE_BUF_SIZE	256
static char console_buffer[CONSOLE_BUF_SIZE];

/*
 * When B_TRUE, xpv_panic_putc() writes through cons_polledio rather than
 * bcons_putchar().
 */
static boolean_t use_polledio;

/*
 * Pointers to machine check panic info (if any).
117 */ 118 xpv_mca_panic_data_t *xpv_mca_panic_data = NULL; 119 120 static void 121 xpv_panic_putc(int m) 122 { 123 struct cons_polledio *c = cons_polledio; 124 125 /* This really shouldn't happen */ 126 if (boot_console_type(NULL) == CONS_HYPERVISOR) 127 return; 128 129 if (use_polledio == B_TRUE) 130 c->cons_polledio_putchar(c->cons_polledio_argument, m); 131 else 132 bcons_putchar(m); 133 } 134 135 static void 136 xpv_panic_puts(char *msg) 137 { 138 char *m; 139 140 dump_timeleft = dump_timeout; 141 for (m = msg; *m; m++) 142 xpv_panic_putc((int)*m); 143 } 144 145 static void 146 xpv_panic_console_print(const char *fmt, ...) 147 { 148 va_list ap; 149 150 va_start(ap, fmt); 151 (void) vsnprintf(console_buffer, sizeof (console_buffer), fmt, ap); 152 va_end(ap); 153 154 xpv_panic_puts(console_buffer); 155 } 156 157 static void 158 xpv_panic_map(int level, pfn_t pfn) 159 { 160 x86pte_t pte, *pteptr; 161 162 /* 163 * The provided pfn represents a level 'level' page table. Map it 164 * into the 'level' slot in the list of page table windows. 165 */ 166 pteptr = (x86pte_t *)PWIN_PTE_VA(level); 167 pte = pfn_to_pa(pfn) | PT_VALID; 168 169 XPV_ALLOW_PAGETABLE_UPDATES(); 170 if (mmu.pae_hat) 171 *pteptr = pte; 172 else 173 *(x86pte32_t *)pteptr = pte; 174 XPV_DISALLOW_PAGETABLE_UPDATES(); 175 176 mmu_tlbflush_entry(PWIN_VA(level)); 177 } 178 179 /* 180 * Walk the page tables to find the pfn mapped by the given va. 181 */ 182 static pfn_t 183 xpv_va_walk(uintptr_t *vaddr) 184 { 185 int l, idx; 186 pfn_t pfn; 187 x86pte_t pte; 188 x86pte_t *ptep; 189 uintptr_t va = *vaddr; 190 uintptr_t scan_va; 191 caddr_t ptable_window; 192 static pfn_t toplevel_pfn; 193 static uintptr_t lastva; 194 195 /* 196 * If we do anything other than a simple scan through memory, don't 197 * trust the mapped page tables. 
198 */ 199 if (va != lastva + MMU_PAGESIZE) 200 for (l = mmu.max_level; l >= 0; l--) 201 ptable_pfn[l] = PFN_INVALID; 202 203 toplevel_pfn = mmu_btop(xpv_panic_cr3); 204 205 while (va < xpv_end && va >= *vaddr) { 206 /* Find the lowest table with any entry for va */ 207 pfn = toplevel_pfn; 208 for (l = mmu.max_level; l >= 0; l--) { 209 if (ptable_pfn[l] != pfn) { 210 xpv_panic_map(l, pfn); 211 ptable_pfn[l] = pfn; 212 } 213 214 /* 215 * Search this pagetable for any mapping to an 216 * address >= va. 217 */ 218 ptable_window = PWIN_VA(l); 219 if (l == mmu.max_level && mmu.pae_hat) 220 ptable_window += 221 (xpv_panic_cr3 & MMU_PAGEOFFSET); 222 223 idx = (va >> LEVEL_SHIFT(l)) & (xpv_panic_nptes[l] - 1); 224 scan_va = va; 225 while (idx < xpv_panic_nptes[l] && scan_va < xpv_end && 226 scan_va >= *vaddr) { 227 ptep = (x86pte_t *)(ptable_window + 228 (idx << mmu.pte_size_shift)); 229 pte = GET_PTE(ptep); 230 if (pte & PTE_VALID) 231 break; 232 idx++; 233 scan_va += mmu.level_size[l]; 234 } 235 236 /* 237 * If there are no valid mappings in this table, we 238 * can skip to the end of the VA range it covers. 239 */ 240 if (idx == xpv_panic_nptes[l]) { 241 va = NEXT_ENTRY_VA(va, l + 1); 242 break; 243 } 244 245 va = scan_va; 246 /* 247 * See if we've hit the end of the range. 248 */ 249 if (va >= xpv_end || va < *vaddr) 250 break; 251 252 /* 253 * If this mapping is for a pagetable, we drop down 254 * to the next level in the hierarchy and look for 255 * a mapping in it. 256 */ 257 pfn = PTE2MFN(pte, l); 258 if (!PTE_ISPAGE(pte, l)) 259 continue; 260 261 /* 262 * The APIC page is magic. Nothing to see here; 263 * move along. 264 */ 265 if (((uintptr_t)xpv_apicadr & MMU_PAGEMASK) == 266 (va & MMU_PAGEMASK)) { 267 va += MMU_PAGESIZE; 268 break; 269 } 270 271 /* 272 * See if the address is within one of the two 273 * kpm-like regions we want to skip. 
274 */ 275 if (va >= kpm1_low && va < kpm1_high) { 276 va = kpm1_high; 277 break; 278 } 279 if (va >= kpm2_low && va < kpm2_high) { 280 va = kpm2_high; 281 break; 282 } 283 284 /* 285 * The Xen panic code only handles small pages. If 286 * this mapping is for a large page, we need to 287 * identify the consituent page that covers the 288 * specific VA we were looking for. 289 */ 290 if (l > 0) { 291 if (l > 1) 292 panic("Xen panic can't cope with " 293 "giant pages."); 294 idx = (va >> LEVEL_SHIFT(0)) & 295 (xpv_panic_nptes[0] - 1); 296 pfn += idx; 297 } 298 299 *vaddr = va; 300 lastva = va; 301 return (pfn | PFN_IS_FOREIGN_MFN); 302 } 303 } 304 return (PFN_INVALID); 305 } 306 307 /* 308 * Walk through the Xen VA space, finding pages that are mapped in. 309 * 310 * These pages all have MFNs rather than PFNs, meaning they may be outside 311 * the physical address space the kernel knows about, or they may collide 312 * with PFNs the kernel is using. 313 * 314 * The obvious trick of just adding the PFN_IS_FOREIGN_MFN bit to the MFNs 315 * to avoid collisions doesn't work. The pages need to be written to disk 316 * in PFN-order or savecore gets confused. We can't allocate memory to 317 * contruct a sorted pfn->VA reverse mapping, so we have to write the pages 318 * to disk in VA order. 319 * 320 * To square this circle, we simply make up PFNs for each of Xen's pages. 321 * We assign each mapped page a fake PFN in ascending order. These fake 322 * PFNs each have the FOREIGN bit set, ensuring that they fall outside the 323 * range of Solaris PFNs written by the kernel. 
324 */ 325 int 326 dump_xpv_addr() 327 { 328 uintptr_t va; 329 mem_vtop_t mem_vtop; 330 331 xpv_dump_pages = 0; 332 va = xen_virt_start; 333 334 while (xpv_va_walk(&va) != PFN_INVALID) { 335 mem_vtop.m_as = &kas; 336 mem_vtop.m_va = (void *)va; 337 mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN; 338 339 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t)); 340 xpv_dump_pages++; 341 342 va += MMU_PAGESIZE; 343 } 344 345 /* 346 * Add the shared_info page. This page actually ends up in the 347 * dump twice: once for the Xen va and once for the Solaris va. 348 * This isn't ideal, but we don't know the address Xen is using for 349 * the page, so we can't share it. 350 */ 351 mem_vtop.m_as = &kas; 352 mem_vtop.m_va = HYPERVISOR_shared_info; 353 mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN; 354 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t)); 355 xpv_dump_pages++; 356 357 return (xpv_dump_pages); 358 } 359 360 void 361 dump_xpv_pfn() 362 { 363 pfn_t pfn; 364 int cnt; 365 366 for (cnt = 0; cnt < xpv_dump_pages; cnt++) { 367 pfn = (pfn_t)cnt | PFN_IS_FOREIGN_MFN; 368 dumpvp_write(&pfn, sizeof (pfn)); 369 } 370 } 371 372 int 373 dump_xpv_data(void *dump_cbuf) 374 { 375 uintptr_t va; 376 uint32_t csize; 377 int cnt = 0; 378 379 /* 380 * XXX: we should probably run this data through a UE check. The 381 * catch is that the UE code relies on on_trap() and getpfnum() 382 * working. 
383 */ 384 va = xen_virt_start; 385 386 while (xpv_va_walk(&va) != PFN_INVALID) { 387 csize = (uint32_t)compress((void *)va, dump_cbuf, PAGESIZE); 388 dumpvp_write(&csize, sizeof (uint32_t)); 389 dumpvp_write(dump_cbuf, csize); 390 if (dump_ioerr) { 391 dumphdr->dump_flags &= ~DF_COMPLETE; 392 return (cnt); 393 } 394 cnt++; 395 va += MMU_PAGESIZE; 396 } 397 398 /* 399 * Finally, dump the shared_info page 400 */ 401 csize = (uint32_t)compress((void *)HYPERVISOR_shared_info, dump_cbuf, 402 PAGESIZE); 403 dumpvp_write(&csize, sizeof (uint32_t)); 404 dumpvp_write(dump_cbuf, csize); 405 if (dump_ioerr) 406 dumphdr->dump_flags &= ~DF_COMPLETE; 407 cnt++; 408 409 return (cnt); 410 } 411 412 static void * 413 showstack(void *fpreg, int xpv_only) 414 { 415 struct frame *fpp; 416 ulong_t off; 417 char *sym; 418 uintptr_t pc, fp, lastfp; 419 uintptr_t minaddr = min(KERNELBASE, xen_virt_start); 420 421 fp = (uintptr_t)fpreg; 422 if (fp < minaddr) { 423 xpv_panic_printf("Bad frame ptr: 0x%p\n", fpreg); 424 return (fpreg); 425 } 426 427 do { 428 fpp = (struct frame *)fp; 429 pc = fpp->fr_savpc; 430 431 if ((xpv_only != 0) && 432 (fp > xpv_end || fp < xen_virt_start)) 433 break; 434 if ((sym = kobj_getsymname(pc, &off)) != NULL) 435 xpv_panic_printf("%08lx %s:%s+%lx\n", fp, 436 mod_containing_pc((caddr_t)pc), sym, off); 437 else if ((pc >= xen_virt_start) && (pc <= xpv_end)) 438 xpv_panic_printf("%08lx 0x%lx (in Xen)\n", fp, pc); 439 else 440 xpv_panic_printf("%08lx %lx\n", fp, pc); 441 442 lastfp = fp; 443 fp = fpp->fr_savfp; 444 445 /* 446 * Xen marks an exception frame by inverting the frame 447 * pointer. 
448 */ 449 if (fp < lastfp) { 450 if ((~fp > minaddr) && ((~fp) ^ lastfp) < 0xfff) 451 fp = ~fp; 452 } 453 } while (fp > lastfp); 454 return ((void *)fp); 455 } 456 457 void * 458 xpv_traceback(void *fpreg) 459 { 460 return (showstack(fpreg, 1)); 461 } 462 463 #if defined(__amd64) 464 static void 465 xpv_panic_hypercall(ulong_t call) 466 { 467 panic("Illegally issued hypercall %d during panic!\n", (int)call); 468 } 469 #endif 470 471 void 472 xpv_die(struct regs *rp) 473 { 474 struct panic_trap_info ti; 475 struct cregs creg; 476 477 ti.trap_regs = rp; 478 ti.trap_type = rp->r_trapno; 479 480 curthread->t_panic_trap = &ti; 481 if (ti.trap_type == T_PGFLT) { 482 getcregs(&creg); 483 ti.trap_addr = (caddr_t)creg.cr_cr2; 484 panic("Fatal pagefault at 0x%lx. fault addr=0x%p rp=0x%p", 485 rp->r_pc, (void *)ti.trap_addr, (void *)rp); 486 } else { 487 ti.trap_addr = (caddr_t)rp->r_pc; 488 panic("Fatal trap %ld at 0x%lx. rp=0x%p", rp->r_trapno, 489 rp->r_pc, (void *)rp); 490 } 491 } 492 493 /* 494 * Build IDT to handle a Xen panic 495 */ 496 static void 497 switch_to_xpv_panic_idt() 498 { 499 int i; 500 desctbr_t idtr; 501 gate_desc_t *idt = xpv_panic_idt; 502 selector_t cs = get_cs_register(); 503 504 for (i = 0; i < 32; i++) 505 set_gatesegd(&idt[i], &xpv_invaltrap, cs, SDT_SYSIGT, TRP_XPL, 506 0); 507 508 set_gatesegd(&idt[T_ZERODIV], &xpv_div0trap, cs, SDT_SYSIGT, TRP_XPL, 509 0); 510 set_gatesegd(&idt[T_SGLSTP], &xpv_dbgtrap, cs, SDT_SYSIGT, TRP_XPL, 0); 511 set_gatesegd(&idt[T_NMIFLT], &xpv_nmiint, cs, SDT_SYSIGT, TRP_XPL, 0); 512 set_gatesegd(&idt[T_BOUNDFLT], &xpv_boundstrap, cs, SDT_SYSIGT, 513 TRP_XPL, 0); 514 set_gatesegd(&idt[T_ILLINST], &xpv_invoptrap, cs, SDT_SYSIGT, TRP_XPL, 515 0); 516 set_gatesegd(&idt[T_NOEXTFLT], &xpv_ndptrap, cs, SDT_SYSIGT, TRP_XPL, 517 0); 518 set_gatesegd(&idt[T_TSSFLT], &xpv_invtsstrap, cs, SDT_SYSIGT, TRP_XPL, 519 0); 520 set_gatesegd(&idt[T_SEGFLT], &xpv_segnptrap, cs, SDT_SYSIGT, TRP_XPL, 521 0); 522 set_gatesegd(&idt[T_STKFLT], 
&xpv_stktrap, cs, SDT_SYSIGT, TRP_XPL, 0); 523 set_gatesegd(&idt[T_GPFLT], &xpv_gptrap, cs, SDT_SYSIGT, TRP_XPL, 0); 524 set_gatesegd(&idt[T_PGFLT], &xpv_pftrap, cs, SDT_SYSIGT, TRP_XPL, 0); 525 set_gatesegd(&idt[T_EXTERRFLT], &xpv_ndperr, cs, SDT_SYSIGT, TRP_XPL, 526 0); 527 set_gatesegd(&idt[T_ALIGNMENT], &xpv_achktrap, cs, SDT_SYSIGT, TRP_XPL, 528 0); 529 set_gatesegd(&idt[T_MCE], &xpv_mcetrap, cs, SDT_SYSIGT, TRP_XPL, 0); 530 set_gatesegd(&idt[T_SIMDFPE], &xpv_xmtrap, cs, SDT_SYSIGT, TRP_XPL, 0); 531 532 /* 533 * We have no double fault handler. Any single fault represents a 534 * catastrophic failure for us, so there is no attempt to handle 535 * them cleanly: we just print a message and reboot. If we 536 * encounter a second fault while doing that, there is nothing 537 * else we can do. 538 */ 539 540 /* 541 * Be prepared to absorb any stray device interrupts received 542 * while writing the core to disk. 543 */ 544 for (i = 33; i < NIDT; i++) 545 set_gatesegd(&idt[i], &xpv_surprise_intr, cs, SDT_SYSIGT, 546 TRP_XPL, 0); 547 548 /* The one interrupt we expect to get is from the APIC timer. */ 549 set_gatesegd(&idt[T_XPV_TIMER], &xpv_timer_trap, cs, SDT_SYSIGT, 550 TRP_XPL, 0); 551 552 idtr.dtr_base = (uintptr_t)xpv_panic_idt; 553 idtr.dtr_limit = sizeof (xpv_panic_idt) - 1; 554 wr_idtr(&idtr); 555 556 #if defined(__amd64) 557 /* Catch any hypercalls. */ 558 wrmsr(MSR_AMD_LSTAR, (uintptr_t)xpv_panic_hypercall); 559 wrmsr(MSR_AMD_CSTAR, (uintptr_t)xpv_panic_hypercall); 560 #endif 561 } 562 563 static void 564 xpv_apic_clkinit() 565 { 566 uint_t apic_ticks = 0; 567 568 /* 569 * Measure how many APIC ticks there are within a fixed time 570 * period. We're going to be fairly coarse here. This timer is 571 * just being used to detect a stalled panic, so as long as we have 572 * the right order of magnitude, everything should be fine. 
573 */ 574 xpv_apicadr[APIC_SPUR_INT_REG] = AV_UNIT_ENABLE | APIC_SPUR_INTR; 575 xpv_apicadr[APIC_LOCAL_TIMER] = AV_MASK; 576 xpv_apicadr[APIC_INT_VECT0] = AV_MASK; /* local intr reg 0 */ 577 578 xpv_apicadr[APIC_DIVIDE_REG] = 0; 579 xpv_apicadr[APIC_INIT_COUNT] = APIC_MAXVAL; 580 drv_usecwait(XPV_TIMER_INTERVAL); 581 apic_ticks = APIC_MAXVAL - xpv_apicadr[APIC_CURR_COUNT]; 582 583 /* 584 * apic_ticks now represents roughly how many apic ticks comprise 585 * one timeout interval. Program the timer to send us an interrupt 586 * every time that interval expires. 587 */ 588 xpv_apicadr[APIC_LOCAL_TIMER] = T_XPV_TIMER | AV_PERIODIC; 589 xpv_apicadr[APIC_INIT_COUNT] = apic_ticks; 590 xpv_apicadr[APIC_EOI_REG] = 0; 591 } 592 593 void 594 xpv_timer_tick(void) 595 { 596 static int ticks = 0; 597 598 if (ticks++ >= MICROSEC / XPV_TIMER_INTERVAL) { 599 ticks = 0; 600 if (dump_timeleft && (--dump_timeleft == 0)) 601 panic("Xen panic timeout\n"); 602 } 603 xpv_apicadr[APIC_EOI_REG] = 0; 604 } 605 606 void 607 xpv_interrupt(void) 608 { 609 #ifdef DEBUG 610 static int cnt = 0; 611 612 if (cnt++ < 10) 613 xpv_panic_printf("Unexpected interrupt received.\n"); 614 if ((cnt < 1000) && ((cnt % 100) == 0)) 615 xpv_panic_printf("%d unexpected interrupts received.\n", cnt); 616 #endif 617 618 xpv_apicadr[APIC_EOI_REG] = 0; 619 } 620 621 /* 622 * Managing time in panic context is trivial. We only have a single CPU, 623 * we never get rescheduled, we never get suspended. We just need to 624 * convert clock ticks into nanoseconds. 
625 */ 626 static hrtime_t 627 xpv_panic_gethrtime(void) 628 { 629 hrtime_t tsc, hrt; 630 unsigned int *l = (unsigned int *)&(tsc); 631 632 tsc = __rdtsc_insn(); 633 hrt = (mul32(l[1], nsec_scale) << NSEC_SHIFT) + 634 (mul32(l[0], nsec_scale) >> (32 - NSEC_SHIFT)); 635 636 return (hrt); 637 } 638 639 static void 640 xpv_panic_time_init() 641 { 642 nsec_scale = 643 CPU->cpu_m.mcpu_vcpu_info->time.tsc_to_system_mul >> NSEC_SHIFT; 644 645 gethrtimef = xpv_panic_gethrtime; 646 } 647 648 static void 649 xpv_panicsys(struct regs *rp, char *fmt, ...) 650 { 651 extern void panicsys(const char *, va_list, struct regs *, int); 652 va_list alist; 653 654 va_start(alist, fmt); 655 panicsys(fmt, alist, rp, 1); 656 va_end(alist); 657 } 658 659 void 660 xpv_do_panic(void *arg) 661 { 662 struct panic_info *pip = (struct panic_info *)arg; 663 int l; 664 struct cregs creg; 665 #if defined(__amd64) 666 extern uintptr_t postbootkernelbase; 667 #endif 668 669 if (xpv_panicking++ > 0) 670 panic("multiple calls to xpv_do_panic()"); 671 672 /* 673 * Indicate to the underlying panic framework that a panic has been 674 * initiated. This is ordinarily done as part of vpanic(). Since 675 * we already have all the register state saved by the hypervisor, 676 * we skip that and jump straight into the panic processing code. 677 * 678 * XXX If another thread grabs and wins the panic_quiesce trigger 679 * then we'll have two threads in panicsys believing they are in 680 * charge of the panic attempt! 681 */ 682 (void) panic_trigger(&panic_quiesce); 683 684 #if defined(__amd64) 685 /* 686 * bzero() and bcopy() get unhappy when asked to operate on 687 * addresses outside of the kernel. At this point Xen is really a 688 * part of the kernel, so we update the routines' notion of where 689 * the kernel starts. 
690 */ 691 postbootkernelbase = xen_virt_start; 692 #endif 693 694 #if defined(HYPERVISOR_VIRT_END) 695 xpv_end = HYPERVISOR_VIRT_END; 696 #else 697 xpv_end = (uintptr_t)UINTPTR_MAX - sizeof (uintptr_t); 698 #endif 699 700 /* 701 * If we were redirecting console output to the hypervisor, we have 702 * to stop. 703 */ 704 use_polledio = B_FALSE; 705 if (boot_console_type(NULL) == CONS_HYPERVISOR) { 706 bcons_device_change(CONS_HYPERVISOR); 707 } else if (cons_polledio != NULL && 708 cons_polledio->cons_polledio_putchar != NULL) { 709 if (cons_polledio->cons_polledio_enter != NULL) 710 cons_polledio->cons_polledio_enter( 711 cons_polledio->cons_polledio_argument); 712 use_polledio = 1; 713 } 714 715 /* Make sure we handle all console output from here on. */ 716 sysp->bsvc_putchar = xpv_panic_putc; 717 718 /* 719 * If we find an unsupported panic_info structure, there's not much 720 * we can do other than complain, plow on, and hope for the best. 721 */ 722 if (pip->pi_version != PANIC_INFO_VERSION) 723 xpv_panic_printf("Warning: Xen is using an unsupported " 724 "version of the panic_info structure.\n"); 725 726 xpv_panic_info = pip; 727 728 #if defined(__amd64) 729 kpm1_low = (uintptr_t)xpv_panic_info->pi_ram_start; 730 if (xpv_panic_info->pi_xen_start == NULL) { 731 kpm1_high = (uintptr_t)xpv_panic_info->pi_ram_end; 732 } else { 733 kpm1_high = (uintptr_t)xpv_panic_info->pi_xen_start; 734 kpm2_low = (uintptr_t)xpv_panic_info->pi_xen_end; 735 kpm2_high = (uintptr_t)xpv_panic_info->pi_ram_end; 736 } 737 #endif 738 739 /* 740 * Make sure we are running on the Solaris %gs. The Xen panic code 741 * should already have set up the GDT properly. 742 */ 743 xpv_panic_resetgs(); 744 #if defined(__amd64) 745 wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]); 746 #endif 747 748 xpv_panic_time_init(); 749 750 /* 751 * Switch to our own IDT, avoiding any accidental returns to Xen 752 * world. 
753 */ 754 switch_to_xpv_panic_idt(); 755 756 /* 757 * Initialize the APIC timer, which is used to detect a hung dump 758 * attempt. 759 */ 760 xpv_apicadr = pip->pi_apic; 761 xpv_apic_clkinit(); 762 763 /* 764 * Set up a few values that we'll need repeatedly. 765 */ 766 getcregs(&creg); 767 xpv_panic_cr3 = creg.cr_cr3; 768 for (l = mmu.max_level; l >= 0; l--) 769 xpv_panic_nptes[l] = mmu.ptes_per_table; 770 #ifdef __i386 771 if (mmu.pae_hat) 772 xpv_panic_nptes[mmu.max_level] = 4; 773 #endif 774 775 /* Add the fake Xen module to the module list */ 776 if (xpv_module != NULL) { 777 extern int last_module_id; 778 779 xpv_modctl->mod_id = last_module_id++; 780 xpv_modctl->mod_next = &modules; 781 xpv_modctl->mod_prev = modules.mod_prev; 782 modules.mod_prev->mod_next = xpv_modctl; 783 modules.mod_prev = xpv_modctl; 784 } 785 786 if (pip->pi_mca.mpd_magic == MCA_PANICDATA_MAGIC) 787 xpv_mca_panic_data = &pip->pi_mca; 788 789 xpv_panic_printf = printf; 790 xpv_panicsys((struct regs *)pip->pi_regs, pip->pi_panicstr); 791 xpv_panic_printf("Failed to reboot following panic.\n"); 792 for (;;) 793 ; 794 } 795 796 /* 797 * Set up the necessary data structures to pretend that the Xen hypervisor 798 * is a loadable module, allowing mdb to find the Xen symbols in a crash 799 * dump. Since these symbols all map to VA space Solaris doesn't normally 800 * have access to, we don't link these structures into the kernel's lists 801 * until/unless we hit a Xen panic. 802 * 803 * The observant reader will note a striking amount of overlap between this 804 * code and that found in krtld. While it would be handy if we could just 805 * ask krtld to do this work for us, it's not that simple. Among the 806 * complications: we're not actually loading the text here (grub did it at 807 * boot), the .text section is writable, there are no relocations to do, 808 * none of the module text/data is in readable memory, etc. 
Training krtld 809 * to deal with this weird module is as complicated, and more risky, than 810 * reimplementing the necessary subset of it here. 811 */ 812 static void 813 init_xen_module() 814 { 815 struct _buf *file = NULL; 816 struct module *mp; 817 struct modctl *mcp; 818 int i, shn; 819 Shdr *shp, *ctf_shp; 820 char *names = NULL; 821 size_t n, namesize, text_align, data_align; 822 #if defined(__amd64) 823 const char machine = EM_AMD64; 824 #else 825 const char machine = EM_386; 826 #endif 827 828 /* Allocate and init the module structure */ 829 mp = kmem_zalloc(sizeof (*mp), KM_SLEEP); 830 mp->filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP); 831 (void) strcpy(mp->filename, XPV_FILENAME); 832 833 /* Allocate and init the modctl structure */ 834 mcp = kmem_zalloc(sizeof (*mcp), KM_SLEEP); 835 mcp->mod_modname = kobj_zalloc(strlen(XPV_MODNAME) + 1, KM_SLEEP); 836 (void) strcpy(mcp->mod_modname, XPV_MODNAME); 837 mcp->mod_filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP); 838 (void) strcpy(mcp->mod_filename, XPV_FILENAME); 839 mcp->mod_inprogress_thread = (kthread_id_t)-1; 840 mcp->mod_ref = 1; 841 mcp->mod_loaded = 1; 842 mcp->mod_loadcnt = 1; 843 mcp->mod_mp = mp; 844 845 /* 846 * Try to open a Xen image that hasn't had its symbol and CTF 847 * information stripped off. 848 */ 849 file = kobj_open_file(XPV_FILENAME); 850 if (file == (struct _buf *)-1) { 851 file = NULL; 852 goto err; 853 } 854 855 /* 856 * Read the header and ensure that this is an ELF file for the 857 * proper ISA. If it's not, somebody has done something very 858 * stupid. Why bother? See Mencken. 
859 */ 860 if (kobj_read_file(file, (char *)&mp->hdr, sizeof (mp->hdr), 0) < 0) 861 goto err; 862 for (i = 0; i < SELFMAG; i++) 863 if (mp->hdr.e_ident[i] != ELFMAG[i]) 864 goto err; 865 if ((mp->hdr.e_ident[EI_DATA] != ELFDATA2LSB) || 866 (mp->hdr.e_machine != machine)) 867 goto err; 868 869 /* Read in the section headers */ 870 n = mp->hdr.e_shentsize * mp->hdr.e_shnum; 871 mp->shdrs = kmem_zalloc(n, KM_SLEEP); 872 if (kobj_read_file(file, mp->shdrs, n, mp->hdr.e_shoff) < 0) 873 goto err; 874 875 /* Read the section names */ 876 shp = (Shdr *)(mp->shdrs + mp->hdr.e_shstrndx * mp->hdr.e_shentsize); 877 namesize = shp->sh_size; 878 names = kmem_zalloc(shp->sh_size, KM_SLEEP); 879 if (kobj_read_file(file, names, shp->sh_size, shp->sh_offset) < 0) 880 goto err; 881 882 /* 883 * Fill in the text and data size fields. 884 */ 885 ctf_shp = NULL; 886 text_align = data_align = 0; 887 for (shn = 1; shn < mp->hdr.e_shnum; shn++) { 888 shp = (Shdr *)(mp->shdrs + shn * mp->hdr.e_shentsize); 889 890 /* Sanity check the offset of the section name */ 891 if (shp->sh_name >= namesize) 892 continue; 893 894 /* If we find the symtab section, remember it for later. */ 895 if (shp->sh_type == SHT_SYMTAB) { 896 mp->symtbl_section = shn; 897 mp->symhdr = shp; 898 continue; 899 } 900 901 /* If we find the CTF section, remember it for later. */ 902 if ((shp->sh_size != 0) && 903 (strcmp(names + shp->sh_name, ".SUNW_ctf") == 0)) { 904 ctf_shp = shp; 905 continue; 906 } 907 908 if (!(shp->sh_flags & SHF_ALLOC)) 909 continue; 910 911 /* 912 * Xen marks its text section as writable, so we need to 913 * look for the name - not just the flag. 
914 */ 915 if ((strcmp(&names[shp->sh_name], ".text") != NULL) && 916 (shp->sh_flags & SHF_WRITE) != 0) { 917 if (shp->sh_addralign > data_align) 918 data_align = shp->sh_addralign; 919 mp->data_size = ALIGN(mp->data_size, data_align); 920 mp->data_size += ALIGN(shp->sh_size, 8); 921 if (mp->data == NULL || mp->data > (char *)shp->sh_addr) 922 mp->data = (char *)shp->sh_addr; 923 } else { 924 if (shp->sh_addralign > text_align) 925 text_align = shp->sh_addralign; 926 mp->text_size = ALIGN(mp->text_size, text_align); 927 mp->text_size += ALIGN(shp->sh_size, 8); 928 if (mp->text == NULL || mp->text > (char *)shp->sh_addr) 929 mp->text = (char *)shp->sh_addr; 930 } 931 } 932 kmem_free(names, namesize); 933 names = NULL; 934 shp = NULL; 935 mcp->mod_text = mp->text; 936 mcp->mod_text_size = mp->text_size; 937 938 /* 939 * If we have symbol table and string table sections, read them in 940 * now. If we don't, we just plow on. We'll still get a valid 941 * core dump, but finding anything useful will be just a bit 942 * harder. 943 * 944 * Note: we don't bother with a hash table. We'll never do a 945 * symbol lookup unless we crash, and then mdb creates its own. We 946 * also don't try to perform any relocations. Xen should be loaded 947 * exactly where the ELF file indicates, and the symbol information 948 * in the file should be complete and correct already. Static 949 * linking ain't all bad. 950 */ 951 if ((mp->symhdr != NULL) && (mp->symhdr->sh_link < mp->hdr.e_shnum)) { 952 mp->strhdr = (Shdr *) 953 (mp->shdrs + mp->symhdr->sh_link * mp->hdr.e_shentsize); 954 mp->nsyms = mp->symhdr->sh_size / mp->symhdr->sh_entsize; 955 956 /* Allocate space for the symbol table and strings. 
*/ 957 mp->symsize = mp->symhdr->sh_size + 958 mp->nsyms * sizeof (symid_t) + mp->strhdr->sh_size; 959 mp->symspace = kmem_zalloc(mp->symsize, KM_SLEEP); 960 mp->symtbl = mp->symspace; 961 mp->strings = (char *)(mp->symtbl + mp->symhdr->sh_size); 962 963 if ((kobj_read_file(file, mp->symtbl, 964 mp->symhdr->sh_size, mp->symhdr->sh_offset) < 0) || 965 (kobj_read_file(file, mp->strings, 966 mp->strhdr->sh_size, mp->strhdr->sh_offset) < 0)) 967 goto err; 968 } 969 970 /* 971 * Read in the CTF section 972 */ 973 if ((ctf_shp != NULL) && ((moddebug & MODDEBUG_NOCTF) == 0)) { 974 mp->ctfdata = kmem_zalloc(ctf_shp->sh_size, KM_SLEEP); 975 mp->ctfsize = ctf_shp->sh_size; 976 if (kobj_read_file(file, mp->ctfdata, mp->ctfsize, 977 ctf_shp->sh_offset) < 0) 978 goto err; 979 } 980 981 kobj_close_file(file); 982 983 xpv_module = mp; 984 xpv_modctl = mcp; 985 return; 986 987 err: 988 cmn_err(CE_WARN, "Failed to initialize xpv module."); 989 if (file != NULL) 990 kobj_close_file(file); 991 992 kmem_free(mp->filename, strlen(XPV_FILENAME) + 1); 993 if (mp->shdrs != NULL) 994 kmem_free(mp->shdrs, mp->hdr.e_shentsize * mp->hdr.e_shnum); 995 if (mp->symspace != NULL) 996 kmem_free(mp->symspace, mp->symsize); 997 if (mp->ctfdata != NULL) 998 kmem_free(mp->ctfdata, mp->ctfsize); 999 kmem_free(mp, sizeof (*mp)); 1000 kmem_free(mcp->mod_filename, strlen(XPV_FILENAME) + 1); 1001 kmem_free(mcp->mod_modname, strlen(XPV_MODNAME) + 1); 1002 kmem_free(mcp, sizeof (*mcp)); 1003 if (names != NULL) 1004 kmem_free(names, namesize); 1005 } 1006 1007 void 1008 xpv_panic_init() 1009 { 1010 xen_platform_op_t op; 1011 int i; 1012 1013 ASSERT(DOMAIN_IS_INITDOMAIN(xen_info)); 1014 1015 for (i = 0; i < mmu.num_level; i++) 1016 ptable_pfn[i] = PFN_INVALID; 1017 1018 /* Let Xen know where to jump if/when it panics. 
*/ 1019 op.cmd = XENPF_panic_init; 1020 op.interface_version = XENPF_INTERFACE_VERSION; 1021 op.u.panic_init.panic_addr = (unsigned long)xpv_panic_hdlr; 1022 1023 (void) HYPERVISOR_platform_op(&op); 1024 1025 init_xen_module(); 1026 } 1027