1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/clock.h> 31 #include <sys/psm.h> 32 #include <sys/archsystm.h> 33 #include <sys/machsystm.h> 34 #include <sys/compress.h> 35 #include <sys/modctl.h> 36 #include <sys/trap.h> 37 #include <sys/panic.h> 38 #include <sys/regset.h> 39 #include <sys/frame.h> 40 #include <sys/kobj.h> 41 #include <sys/apic.h> 42 #include <sys/dumphdr.h> 43 #include <sys/mem.h> 44 #include <sys/x86_archext.h> 45 #include <sys/xpv_panic.h> 46 #include <sys/boot_console.h> 47 #include <sys/bootsvcs.h> 48 #include <sys/consdev.h> 49 #include <vm/hat_pte.h> 50 #include <vm/hat_i86.h> 51 52 /* XXX: need to add a PAE version too, if we ever support both PAE and non */ 53 #if defined(__i386) 54 #define XPV_FILENAME "/boot/xen-syms" 55 #else 56 #define XPV_FILENAME "/boot/amd64/xen-syms" 57 #endif 58 #define XPV_MODNAME "xpv" 59 60 int xpv_panicking = 0; 61 62 struct module *xpv_module; 63 struct modctl *xpv_modctl; 64 65 #define ALIGN(x, a) ((a) == 0 ? (uintptr_t)(x) : \ 66 (((uintptr_t)(x) + (uintptr_t)(a) - 1l) & ~((uintptr_t)(a) - 1l))) 67 68 /* Pointer to the xpv_panic_info structure handed to us by Xen. */ 69 static struct panic_info *xpv_panic_info = NULL; 70 71 /* Timer support */ 72 #define NSEC_SHIFT 5 73 #define T_XPV_TIMER 0xd1 74 #define XPV_TIMER_INTERVAL 1000 /* 1000 microseconds */ 75 static uint32_t *xpv_apicadr = NULL; 76 static uint_t nsec_scale; 77 78 /* IDT support */ 79 #pragma align 16(xpv_panic_idt) 80 static gate_desc_t xpv_panic_idt[NIDT]; /* interrupt descriptor table */ 81 82 /* Xen pagetables mapped into our HAT's ptable windows */ 83 static pfn_t ptable_pfn[MAX_NUM_LEVEL]; 84 85 /* Number of MMU_PAGESIZE pages we're adding to the Solaris dump */ 86 static int xpv_dump_pages; 87 88 /* 89 * Some commonly used values that we don't want to recompute over and over. 90 */ 91 static int xpv_panic_nptes[MAX_NUM_LEVEL]; 92 static ulong_t xpv_panic_cr3; 93 static uintptr_t xpv_end; 94 95 static void xpv_panic_console_print(const char *fmt, ...); 96 static void (*xpv_panic_printf)(const char *, ...) = xpv_panic_console_print; 97 98 #define CONSOLE_BUF_SIZE 256 99 static char console_buffer[CONSOLE_BUF_SIZE]; 100 static boolean_t use_polledio; 101 102 static void 103 xpv_panic_putc(int m) 104 { 105 struct cons_polledio *c = cons_polledio; 106 107 /* This really shouldn't happen */ 108 if (console == CONS_HYPERVISOR) 109 return; 110 111 if (use_polledio == B_TRUE) 112 c->cons_polledio_putchar(c->cons_polledio_argument, m); 113 else 114 bcons_putchar(m); 115 } 116 117 static void 118 xpv_panic_puts(char *msg) 119 { 120 char *m; 121 122 dump_timeleft = dump_timeout; 123 for (m = msg; *m; m++) 124 xpv_panic_putc((int)*m); 125 } 126 127 static void 128 xpv_panic_console_print(const char *fmt, ...) 129 { 130 va_list ap; 131 132 va_start(ap, fmt); 133 (void) vsnprintf(console_buffer, sizeof (console_buffer), fmt, ap); 134 va_end(ap); 135 136 xpv_panic_puts(console_buffer); 137 } 138 139 static void 140 xpv_panic_map(int level, pfn_t pfn) 141 { 142 x86pte_t pte, *pteptr; 143 144 /* 145 * The provided pfn represents a level 'level' page table. Map it 146 * into the 'level' slot in the list of page table windows. 147 */ 148 pteptr = (x86pte_t *)PWIN_PTE_VA(level); 149 pte = pfn_to_pa(pfn) | PT_VALID; 150 151 XPV_ALLOW_PAGETABLE_UPDATES(); 152 if (mmu.pae_hat) 153 *pteptr = pte; 154 else 155 *(x86pte32_t *)pteptr = pte; 156 XPV_DISALLOW_PAGETABLE_UPDATES(); 157 158 mmu_tlbflush_entry(PWIN_VA(level)); 159 } 160 161 /* 162 * Walk the page tables to find the pfn mapped by the given va. 163 */ 164 static pfn_t 165 xpv_va_walk(uintptr_t *vaddr) 166 { 167 int l, idx; 168 pfn_t pfn; 169 x86pte_t pte; 170 x86pte_t *ptep; 171 uintptr_t va = *vaddr; 172 uintptr_t scan_va; 173 caddr_t ptable_window; 174 static pfn_t toplevel_pfn; 175 static uintptr_t lastva; 176 177 /* 178 * If we do anything other than a simple scan through memory, don't 179 * trust the mapped page tables. 180 */ 181 if (va != lastva + MMU_PAGESIZE) 182 for (l = mmu.max_level; l >= 0; l--) 183 ptable_pfn[l] = PFN_INVALID; 184 185 toplevel_pfn = mmu_btop(xpv_panic_cr3); 186 187 while (va < xpv_end && va >= *vaddr) { 188 /* Find the lowest table with any entry for va */ 189 pfn = toplevel_pfn; 190 for (l = mmu.max_level; l >= 0; l--) { 191 if (ptable_pfn[l] != pfn) { 192 xpv_panic_map(l, pfn); 193 ptable_pfn[l] = pfn; 194 } 195 196 /* 197 * Search this pagetable for any mapping to an 198 * address >= va. 199 */ 200 ptable_window = PWIN_VA(l); 201 if (l == mmu.max_level && mmu.pae_hat) 202 ptable_window += 203 (xpv_panic_cr3 & MMU_PAGEOFFSET); 204 205 idx = (va >> LEVEL_SHIFT(l)) & (xpv_panic_nptes[l] - 1); 206 scan_va = va; 207 while (idx < xpv_panic_nptes[l] && scan_va < xpv_end && 208 scan_va >= *vaddr) { 209 ptep = (x86pte_t *)(ptable_window + 210 (idx << mmu.pte_size_shift)); 211 pte = GET_PTE(ptep); 212 if (pte & PTE_VALID) 213 break; 214 idx++; 215 scan_va += mmu.level_size[l]; 216 } 217 va = scan_va; 218 219 /* 220 * See if we've hit the end of the range. 221 */ 222 if (scan_va >= xpv_end || scan_va < *vaddr) { 223 va = scan_va; 224 break; 225 } 226 227 /* 228 * If there are no valid mappings in this table, we 229 * can skip to the end of the VA range it covers. 230 */ 231 if (idx == xpv_panic_nptes[l]) { 232 va = NEXT_ENTRY_VA(va, l + 1); 233 break; 234 } 235 236 /* 237 * If this mapping is for a pagetable, we drop down 238 * to the next level in the hierarchy and look for 239 * a mapping in it. 240 */ 241 pfn = PTE2MFN(pte, l); 242 if (!PTE_ISPAGE(pte, l)) 243 continue; 244 245 /* 246 * The APIC page is magic. Nothing to see here; 247 * move along. 248 */ 249 if (((uintptr_t)xpv_apicadr & MMU_PAGEMASK) == 250 (va & MMU_PAGEMASK)) { 251 va += MMU_PAGESIZE; 252 break; 253 } 254 255 /* We also want to skip the Xen version of KPM */ 256 if (va >= (uintptr_t)xpv_panic_info->pi_ram_start && 257 va < (uintptr_t)xpv_panic_info->pi_ram_end) { 258 va = (uintptr_t)xpv_panic_info->pi_ram_end; 259 break; 260 } 261 262 /* 263 * The Xen panic code only handles small pages. If 264 * this mapping is for a large page, we need to 265 * identify the consituent page that covers the 266 * specific VA we were looking for. 267 */ 268 if (l > 0) { 269 if (l > 1) 270 panic("Xen panic can't cope with " 271 "giant pages."); 272 idx = (va >> LEVEL_SHIFT(0)) & 273 (xpv_panic_nptes[0] - 1); 274 pfn += idx; 275 } 276 277 *vaddr = va; 278 lastva = va; 279 return (pfn | PFN_IS_FOREIGN_MFN); 280 } 281 } 282 return (PFN_INVALID); 283 } 284 285 /* 286 * Walk through the Xen VA space, finding pages that are mapped in. 287 * 288 * These pages all have MFNs rather than PFNs, meaning they may be outside 289 * the physical address space the kernel knows about, or they may collide 290 * with PFNs the kernel is using. 291 * 292 * The obvious trick of just adding the PFN_IS_FOREIGN_MFN bit to the MFNs 293 * to avoid collisions doesn't work. The pages need to be written to disk 294 * in PFN-order or savecore gets confused. We can't allocate memory to 295 * contruct a sorted pfn->VA reverse mapping, so we have to write the pages 296 * to disk in VA order. 297 * 298 * To square this circle, we simply make up PFNs for each of Xen's pages. 299 * We assign each mapped page a fake PFN in ascending order. These fake 300 * PFNs each have the FOREIGN bit set, ensuring that they fall outside the 301 * range of Solaris PFNs written by the kernel. 302 */ 303 int 304 dump_xpv_addr() 305 { 306 uintptr_t va; 307 mem_vtop_t mem_vtop; 308 309 xpv_dump_pages = 0; 310 va = xen_virt_start; 311 312 while (xpv_va_walk(&va) != PFN_INVALID) { 313 mem_vtop.m_as = &kas; 314 mem_vtop.m_va = (void *)va; 315 mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN; 316 317 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t)); 318 xpv_dump_pages++; 319 320 va += MMU_PAGESIZE; 321 } 322 323 /* 324 * Add the shared_info page. This page actually ends up in the 325 * dump twice: once for the Xen va and once for the Solaris va. 326 * This isn't ideal, but we don't know the address Xen is using for 327 * the page, so we can't share it. 328 */ 329 mem_vtop.m_as = &kas; 330 mem_vtop.m_va = HYPERVISOR_shared_info; 331 mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN; 332 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t)); 333 xpv_dump_pages++; 334 335 return (xpv_dump_pages); 336 } 337 338 void 339 dump_xpv_pfn() 340 { 341 pfn_t pfn; 342 int cnt; 343 344 for (cnt = 0; cnt < xpv_dump_pages; cnt++) { 345 pfn = (pfn_t)cnt | PFN_IS_FOREIGN_MFN; 346 dumpvp_write(&pfn, sizeof (pfn)); 347 } 348 } 349 350 int 351 dump_xpv_data(void *dump_cbuf) 352 { 353 uintptr_t va; 354 uint32_t csize; 355 int cnt = 0; 356 357 /* 358 * XXX: we should probably run this data through a UE check. The 359 * catch is that the UE code relies on on_trap() and getpfnum() 360 * working. 361 */ 362 va = xen_virt_start; 363 364 while (xpv_va_walk(&va) != PFN_INVALID) { 365 csize = (uint32_t)compress((void *)va, dump_cbuf, PAGESIZE); 366 dumpvp_write(&csize, sizeof (uint32_t)); 367 dumpvp_write(dump_cbuf, csize); 368 if (dump_ioerr) { 369 dumphdr->dump_flags &= ~DF_COMPLETE; 370 return (cnt); 371 } 372 cnt++; 373 va += MMU_PAGESIZE; 374 } 375 376 /* 377 * Finally, dump the shared_info page 378 */ 379 csize = (uint32_t)compress((void *)HYPERVISOR_shared_info, dump_cbuf, 380 PAGESIZE); 381 dumpvp_write(&csize, sizeof (uint32_t)); 382 dumpvp_write(dump_cbuf, csize); 383 if (dump_ioerr) 384 dumphdr->dump_flags &= ~DF_COMPLETE; 385 cnt++; 386 387 return (cnt); 388 } 389 390 static void * 391 showstack(void *fpreg, int xpv_only) 392 { 393 struct frame *fpp; 394 ulong_t off; 395 char *sym; 396 uintptr_t pc, fp, lastfp; 397 uintptr_t minaddr = min(KERNELBASE, xen_virt_start); 398 399 fp = (uintptr_t)fpreg; 400 if (fp < minaddr) { 401 xpv_panic_printf("Bad frame ptr: 0x%p\n", fpreg); 402 return (fpreg); 403 } 404 405 do { 406 fpp = (struct frame *)fp; 407 pc = fpp->fr_savpc; 408 409 if ((xpv_only != 0) && 410 (fp > xpv_end || fp < xen_virt_start)) 411 break; 412 if ((sym = kobj_getsymname(pc, &off)) != NULL) 413 xpv_panic_printf("%08lx %s:%s+%lx\n", fp, 414 mod_containing_pc((caddr_t)pc), sym, off); 415 else if ((pc >= xen_virt_start) && (pc <= xpv_end)) 416 xpv_panic_printf("%08lx 0x%lx (in Xen)\n", fp, pc); 417 else 418 xpv_panic_printf("%08lx %lx\n", fp, pc); 419 420 lastfp = fp; 421 fp = fpp->fr_savfp; 422 423 /* 424 * Xen marks an exception frame by inverting the frame 425 * pointer. 426 */ 427 if (fp < lastfp) { 428 if ((~fp > minaddr) && ((~fp) ^ lastfp) < 0xfff) 429 fp = ~fp; 430 } 431 } while (fp > lastfp); 432 return ((void *)fp); 433 } 434 435 void * 436 xpv_traceback(void *fpreg) 437 { 438 return (showstack(fpreg, 1)); 439 } 440 441 #if defined(__amd64) 442 static void 443 xpv_panic_hypercall(ulong_t call) 444 { 445 panic("Illegally issued hypercall %d during panic!\n", (int)call); 446 } 447 #endif 448 449 void 450 xpv_die(struct regs *rp) 451 { 452 struct panic_trap_info ti; 453 struct cregs creg; 454 455 ti.trap_regs = rp; 456 ti.trap_type = rp->r_trapno; 457 458 curthread->t_panic_trap = &ti; 459 if (ti.trap_type == T_PGFLT) { 460 getcregs(&creg); 461 ti.trap_addr = (caddr_t)creg.cr_cr2; 462 panic("Fatal pagefault at 0x%lx. fault addr=0x%p rp=0x%p", 463 rp->r_pc, ti.trap_addr, rp); 464 } else { 465 ti.trap_addr = (caddr_t)rp->r_pc; 466 panic("Fatal trap %ld at 0x%lx. rp=0x%p", rp->r_trapno, 467 rp->r_pc, rp); 468 } 469 } 470 471 /* 472 * Build IDT to handle a Xen panic 473 */ 474 static void 475 switch_to_xpv_panic_idt() 476 { 477 int i; 478 desctbr_t idtr; 479 gate_desc_t *idt = xpv_panic_idt; 480 selector_t cs = get_cs_register(); 481 482 for (i = 0; i < 32; i++) 483 set_gatesegd(&idt[i], &xpv_invaltrap, cs, SDT_SYSIGT, TRP_XPL); 484 485 set_gatesegd(&idt[T_ZERODIV], &xpv_div0trap, cs, SDT_SYSIGT, TRP_XPL); 486 set_gatesegd(&idt[T_SGLSTP], &xpv_dbgtrap, cs, SDT_SYSIGT, TRP_XPL); 487 set_gatesegd(&idt[T_NMIFLT], &xpv_nmiint, cs, SDT_SYSIGT, TRP_XPL); 488 set_gatesegd(&idt[T_BOUNDFLT], &xpv_boundstrap, cs, SDT_SYSIGT, 489 TRP_XPL); 490 set_gatesegd(&idt[T_ILLINST], &xpv_invoptrap, cs, SDT_SYSIGT, TRP_XPL); 491 set_gatesegd(&idt[T_NOEXTFLT], &xpv_ndptrap, cs, SDT_SYSIGT, TRP_XPL); 492 set_gatesegd(&idt[T_TSSFLT], &xpv_invtsstrap, cs, SDT_SYSIGT, TRP_XPL); 493 set_gatesegd(&idt[T_SEGFLT], &xpv_segnptrap, cs, SDT_SYSIGT, TRP_XPL); 494 set_gatesegd(&idt[T_STKFLT], &xpv_stktrap, cs, SDT_SYSIGT, TRP_XPL); 495 set_gatesegd(&idt[T_GPFLT], &xpv_gptrap, cs, SDT_SYSIGT, TRP_XPL); 496 set_gatesegd(&idt[T_PGFLT], &xpv_pftrap, cs, SDT_SYSIGT, TRP_XPL); 497 set_gatesegd(&idt[T_EXTERRFLT], &xpv_ndperr, cs, SDT_SYSIGT, TRP_XPL); 498 set_gatesegd(&idt[T_ALIGNMENT], &xpv_achktrap, cs, SDT_SYSIGT, TRP_XPL); 499 set_gatesegd(&idt[T_MCE], &xpv_mcetrap, cs, SDT_SYSIGT, TRP_XPL); 500 set_gatesegd(&idt[T_SIMDFPE], &xpv_xmtrap, cs, SDT_SYSIGT, TRP_XPL); 501 502 /* 503 * We have no double fault handler. Any single fault represents a 504 * catastrophic failure for us, so there is no attempt to handle 505 * them cleanly: we just print a message and reboot. If we 506 * encounter a second fault while doing that, there is nothing 507 * else we can do. 508 */ 509 510 /* 511 * Be prepared to absorb any stray device interrupts received 512 * while writing the core to disk. 513 */ 514 for (i = 33; i < NIDT; i++) 515 set_gatesegd(&idt[i], &xpv_surprise_intr, cs, SDT_SYSIGT, 516 TRP_XPL); 517 518 /* The one interrupt we expect to get is from the APIC timer. */ 519 set_gatesegd(&idt[T_XPV_TIMER], &xpv_timer_trap, cs, SDT_SYSIGT, 520 TRP_XPL); 521 522 idtr.dtr_base = (uintptr_t)xpv_panic_idt; 523 idtr.dtr_limit = sizeof (xpv_panic_idt) - 1; 524 wr_idtr(&idtr); 525 526 #if defined(__amd64) 527 /* Catch any hypercalls. */ 528 wrmsr(MSR_AMD_LSTAR, (uintptr_t)xpv_panic_hypercall); 529 wrmsr(MSR_AMD_CSTAR, (uintptr_t)xpv_panic_hypercall); 530 #endif 531 } 532 533 static void 534 xpv_apic_clkinit() 535 { 536 uint_t apic_ticks = 0; 537 538 /* 539 * Measure how many APIC ticks there are within a fixed time 540 * period. We're going to be fairly coarse here. This timer is 541 * just being used to detect a stalled panic, so as long as we have 542 * the right order of magnitude, everything should be fine. 543 */ 544 xpv_apicadr[APIC_SPUR_INT_REG] = AV_UNIT_ENABLE | APIC_SPUR_INTR; 545 xpv_apicadr[APIC_LOCAL_TIMER] = AV_MASK; 546 xpv_apicadr[APIC_INT_VECT0] = AV_MASK; /* local intr reg 0 */ 547 548 xpv_apicadr[APIC_DIVIDE_REG] = 0; 549 xpv_apicadr[APIC_INIT_COUNT] = APIC_MAXVAL; 550 drv_usecwait(XPV_TIMER_INTERVAL); 551 apic_ticks = APIC_MAXVAL - xpv_apicadr[APIC_CURR_COUNT]; 552 553 /* 554 * apic_ticks now represents roughly how many apic ticks comprise 555 * one timeout interval. Program the timer to send us an interrupt 556 * every time that interval expires. 557 */ 558 xpv_apicadr[APIC_LOCAL_TIMER] = T_XPV_TIMER | AV_TIME; 559 xpv_apicadr[APIC_INIT_COUNT] = apic_ticks; 560 xpv_apicadr[APIC_EOI_REG] = 0; 561 } 562 563 void 564 xpv_timer_tick(void) 565 { 566 static int ticks = 0; 567 568 if (ticks++ >= MICROSEC / XPV_TIMER_INTERVAL) { 569 ticks = 0; 570 if (dump_timeleft && (--dump_timeleft == 0)) 571 panic("Xen panic timeout\n"); 572 } 573 xpv_apicadr[APIC_EOI_REG] = 0; 574 } 575 576 void 577 xpv_interrupt(void) 578 { 579 #ifdef DEBUG 580 static int cnt = 0; 581 582 if (cnt++ < 10) 583 xpv_panic_printf("Unexpected interrupt received.\n"); 584 if ((cnt < 1000) && ((cnt % 100) == 0)) 585 xpv_panic_printf("%d unexpected interrupts received.\n", cnt); 586 #endif 587 588 xpv_apicadr[APIC_EOI_REG] = 0; 589 } 590 591 /* 592 * Managing time in panic context is trivial. We only have a single CPU, 593 * we never get rescheduled, we never get suspended. We just need to 594 * convert clock ticks into nanoseconds. 595 */ 596 static hrtime_t 597 xpv_panic_gethrtime(void) 598 { 599 hrtime_t tsc, hrt; 600 unsigned int *l = (unsigned int *)&(tsc); 601 602 tsc = __rdtsc_insn(); 603 hrt = (mul32(l[1], nsec_scale) << NSEC_SHIFT) + 604 (mul32(l[0], nsec_scale) >> (32 - NSEC_SHIFT)); 605 606 return (hrt); 607 } 608 609 static void 610 xpv_panic_time_init() 611 { 612 nsec_scale = 613 CPU->cpu_m.mcpu_vcpu_info->time.tsc_to_system_mul >> NSEC_SHIFT; 614 615 gethrtimef = xpv_panic_gethrtime; 616 } 617 618 static void 619 xpv_panicsys(struct regs *rp, char *fmt, ...) 620 { 621 extern void panicsys(const char *, va_list, struct regs *, int); 622 va_list alist; 623 624 va_start(alist, fmt); 625 panicsys(fmt, alist, rp, 1); 626 va_end(alist); 627 } 628 629 void 630 xpv_do_panic(void *arg) 631 { 632 struct panic_info *pip = (struct panic_info *)arg; 633 int l; 634 struct cregs creg; 635 #if defined(__amd64) 636 extern uintptr_t postbootkernelbase; 637 #endif 638 639 if (xpv_panicking++ > 0) 640 panic("multiple calls to xpv_do_panic()"); 641 642 /* 643 * Indicate to the underlying panic framework that a panic has been 644 * initiated. This is ordinarily done as part of vpanic(). Since 645 * we already have all the register state saved by the hypervisor, 646 * we skip that and jump straight into the panic processing code. 647 */ 648 (void) panic_trigger(&panic_quiesce); 649 650 #if defined(__amd64) 651 /* 652 * bzero() and bcopy() get unhappy when asked to operate on 653 * addresses outside of the kernel. At this point Xen is really a 654 * part of the kernel, so we update the routines' notion of where 655 * the kernel starts. 656 */ 657 postbootkernelbase = xen_virt_start; 658 #endif 659 660 #if defined(HYPERVISOR_VIRT_END) 661 xpv_end = HYPERVISOR_VIRT_END; 662 #else 663 xpv_end = (uintptr_t)UINTPTR_MAX - sizeof (uintptr_t); 664 #endif 665 666 /* 667 * If we were redirecting console output to the hypervisor, we have 668 * to stop. 669 */ 670 use_polledio = B_FALSE; 671 if (console == CONS_HYPERVISOR) { 672 bcons_device_change(CONS_HYPERVISOR); 673 } else if (cons_polledio != NULL && 674 cons_polledio->cons_polledio_putchar != NULL) { 675 if (cons_polledio->cons_polledio_enter != NULL) 676 cons_polledio->cons_polledio_enter( 677 cons_polledio->cons_polledio_argument); 678 use_polledio = 1; 679 } 680 681 /* Make sure we handle all console output from here on. */ 682 sysp->bsvc_putchar = xpv_panic_putc; 683 684 /* 685 * If we find an unsupported panic_info structure, there's not much 686 * we can do other than complain, plow on, and hope for the best. 687 */ 688 if (pip->pi_version != PANIC_INFO_VERSION) 689 xpv_panic_printf("Warning: Xen is using an unsupported " 690 "version of the panic_info structure.\n"); 691 692 xpv_panic_info = pip; 693 694 /* 695 * Make sure we are running on the Solaris %gs. The Xen panic code 696 * should already have set up the GDT properly. 697 */ 698 xpv_panic_resetgs(); 699 #if defined(__amd64) 700 wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]); 701 #endif 702 703 xpv_panic_time_init(); 704 705 /* 706 * Switch to our own IDT, avoiding any accidental returns to Xen 707 * world. 708 */ 709 switch_to_xpv_panic_idt(); 710 711 /* 712 * Initialize the APIC timer, which is used to detect a hung dump 713 * attempt. 714 */ 715 xpv_apicadr = pip->pi_apic; 716 xpv_apic_clkinit(); 717 718 /* 719 * Set up a few values that we'll need repeatedly. 720 */ 721 getcregs(&creg); 722 xpv_panic_cr3 = creg.cr_cr3; 723 for (l = mmu.max_level; l >= 0; l--) 724 xpv_panic_nptes[l] = mmu.ptes_per_table; 725 #ifdef __i386 726 if (mmu.pae_hat) 727 xpv_panic_nptes[mmu.max_level] = 4; 728 #endif 729 730 /* Add the fake Xen module to the module list */ 731 if (xpv_module != NULL) { 732 extern int last_module_id; 733 734 xpv_modctl->mod_id = last_module_id++; 735 xpv_modctl->mod_next = &modules; 736 xpv_modctl->mod_prev = modules.mod_prev; 737 modules.mod_prev->mod_next = xpv_modctl; 738 modules.mod_prev = xpv_modctl; 739 } 740 xpv_panic_printf = printf; 741 xpv_panicsys((struct regs *)pip->pi_regs, pip->pi_panicstr); 742 xpv_panic_printf("Failed to reboot following panic.\n"); 743 for (;;) 744 ; 745 } 746 747 /* 748 * Set up the necessary data structures to pretend that the Xen hypervisor 749 * is a loadable module, allowing mdb to find the Xen symbols in a crash 750 * dump. Since these symbols all map to VA space Solaris doesn't normally 751 * have access to, we don't link these structures into the kernel's lists 752 * until/unless we hit a Xen panic. 753 * 754 * The observant reader will note a striking amount of overlap between this 755 * code and that found in krtld. While it would be handy if we could just 756 * ask krtld to do this work for us, it's not that simple. Among the 757 * complications: we're not actually loading the text here (grub did it at 758 * boot), the .text section is writable, there are no relocations to do, 759 * none of the module text/data is in readable memory, etc. Training krtld 760 * to deal with this weird module is as complicated, and more risky, than 761 * reimplementing the necessary subset of it here. 762 */ 763 static void 764 init_xen_module() 765 { 766 struct _buf *file = NULL; 767 struct module *mp; 768 struct modctl *mcp; 769 int i, shn; 770 Shdr *shp, *ctf_shp; 771 char *names = NULL; 772 size_t n, namesize, text_align, data_align; 773 #if defined(__amd64) 774 const char machine = EM_AMD64; 775 #else 776 const char machine = EM_386; 777 #endif 778 779 /* Allocate and init the module structure */ 780 mp = kmem_zalloc(sizeof (*mp), KM_SLEEP); 781 mp->filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP); 782 (void) strcpy(mp->filename, XPV_FILENAME); 783 784 /* Allocate and init the modctl structure */ 785 mcp = kmem_zalloc(sizeof (*mcp), KM_SLEEP); 786 mcp->mod_modname = kobj_zalloc(strlen(XPV_MODNAME) + 1, KM_SLEEP); 787 (void) strcpy(mcp->mod_modname, XPV_MODNAME); 788 mcp->mod_filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP); 789 (void) strcpy(mcp->mod_filename, XPV_FILENAME); 790 mcp->mod_inprogress_thread = (kthread_id_t)-1; 791 mcp->mod_ref = 1; 792 mcp->mod_loaded = 1; 793 mcp->mod_loadcnt = 1; 794 mcp->mod_mp = mp; 795 796 /* 797 * Try to open a Xen image that hasn't had its symbol and CTF 798 * information stripped off. 799 */ 800 file = kobj_open_file(XPV_FILENAME); 801 if (file == (struct _buf *)-1) { 802 file = NULL; 803 goto err; 804 } 805 806 /* 807 * Read the header and ensure that this is an ELF file for the 808 * proper ISA. If it's not, somebody has done something very 809 * stupid. Why bother? See Mencken. 810 */ 811 if (kobj_read_file(file, (char *)&mp->hdr, sizeof (mp->hdr), 0) < 0) 812 goto err; 813 for (i = 0; i < SELFMAG; i++) 814 if (mp->hdr.e_ident[i] != ELFMAG[i]) 815 goto err; 816 if ((mp->hdr.e_ident[EI_DATA] != ELFDATA2LSB) || 817 (mp->hdr.e_machine != machine)) 818 goto err; 819 820 /* Read in the section headers */ 821 n = mp->hdr.e_shentsize * mp->hdr.e_shnum; 822 mp->shdrs = kmem_zalloc(n, KM_SLEEP); 823 if (kobj_read_file(file, mp->shdrs, n, mp->hdr.e_shoff) < 0) 824 goto err; 825 826 /* Read the section names */ 827 shp = (Shdr *)(mp->shdrs + mp->hdr.e_shstrndx * mp->hdr.e_shentsize); 828 namesize = shp->sh_size; 829 names = kmem_zalloc(shp->sh_size, KM_SLEEP); 830 if (kobj_read_file(file, names, shp->sh_size, shp->sh_offset) < 0) 831 goto err; 832 833 /* 834 * Fill in the text and data size fields. 835 */ 836 ctf_shp = NULL; 837 text_align = data_align = 0; 838 for (shn = 1; shn < mp->hdr.e_shnum; shn++) { 839 shp = (Shdr *)(mp->shdrs + shn * mp->hdr.e_shentsize); 840 841 /* Sanity check the offset of the section name */ 842 if (shp->sh_name >= namesize) 843 continue; 844 845 /* If we find the symtab section, remember it for later. */ 846 if (shp->sh_type == SHT_SYMTAB) { 847 mp->symtbl_section = shn; 848 mp->symhdr = shp; 849 continue; 850 } 851 852 /* If we find the CTF section, remember it for later. */ 853 if ((shp->sh_size != 0) && 854 (strcmp(names + shp->sh_name, ".SUNW_ctf") == 0)) { 855 ctf_shp = shp; 856 continue; 857 } 858 859 if (!(shp->sh_flags & SHF_ALLOC)) 860 continue; 861 862 /* 863 * Xen marks its text section as writable, so we need to 864 * look for the name - not just the flag. 865 */ 866 if ((strcmp(&names[shp->sh_name], ".text") != NULL) && 867 (shp->sh_flags & SHF_WRITE) != 0) { 868 if (shp->sh_addralign > data_align) 869 data_align = shp->sh_addralign; 870 mp->data_size = ALIGN(mp->data_size, data_align); 871 mp->data_size += ALIGN(shp->sh_size, 8); 872 if (mp->data == NULL || mp->data > (char *)shp->sh_addr) 873 mp->data = (char *)shp->sh_addr; 874 } else { 875 if (shp->sh_addralign > text_align) 876 text_align = shp->sh_addralign; 877 mp->text_size = ALIGN(mp->text_size, text_align); 878 mp->text_size += ALIGN(shp->sh_size, 8); 879 if (mp->text == NULL || mp->text > (char *)shp->sh_addr) 880 mp->text = (char *)shp->sh_addr; 881 } 882 } 883 kmem_free(names, namesize); 884 names = NULL; 885 shp = NULL; 886 mcp->mod_text = mp->text; 887 mcp->mod_text_size = mp->text_size; 888 889 /* 890 * If we have symbol table and string table sections, read them in 891 * now. If we don't, we just plow on. We'll still get a valid 892 * core dump, but finding anything useful will be just a bit 893 * harder. 894 * 895 * Note: we don't bother with a hash table. We'll never do a 896 * symbol lookup unless we crash, and then mdb creates its own. We 897 * also don't try to perform any relocations. Xen should be loaded 898 * exactly where the ELF file indicates, and the symbol information 899 * in the file should be complete and correct already. Static 900 * linking ain't all bad. 901 */ 902 if ((mp->symhdr != NULL) && (mp->symhdr->sh_link < mp->hdr.e_shnum)) { 903 mp->strhdr = (Shdr *) 904 (mp->shdrs + mp->symhdr->sh_link * mp->hdr.e_shentsize); 905 mp->nsyms = mp->symhdr->sh_size / mp->symhdr->sh_entsize; 906 907 /* Allocate space for the symbol table and strings. */ 908 mp->symsize = mp->symhdr->sh_size + 909 mp->nsyms * sizeof (symid_t) + mp->strhdr->sh_size; 910 mp->symspace = kmem_zalloc(mp->symsize, KM_SLEEP); 911 mp->symtbl = mp->symspace; 912 mp->strings = (char *)(mp->symtbl + mp->symhdr->sh_size); 913 914 if ((kobj_read_file(file, mp->symtbl, 915 mp->symhdr->sh_size, mp->symhdr->sh_offset) < 0) || 916 (kobj_read_file(file, mp->strings, 917 mp->strhdr->sh_size, mp->strhdr->sh_offset) < 0)) 918 goto err; 919 } 920 921 /* 922 * Read in the CTF section 923 */ 924 if ((ctf_shp != NULL) && ((moddebug & MODDEBUG_NOCTF) == 0)) { 925 mp->ctfdata = kmem_zalloc(ctf_shp->sh_size, KM_SLEEP); 926 mp->ctfsize = ctf_shp->sh_size; 927 if (kobj_read_file(file, mp->ctfdata, mp->ctfsize, 928 ctf_shp->sh_offset) < 0) 929 goto err; 930 } 931 932 kobj_close_file(file); 933 934 xpv_module = mp; 935 xpv_modctl = mcp; 936 return; 937 938 err: 939 cmn_err(CE_WARN, "Failed to initialize xpv module."); 940 if (file != NULL) 941 kobj_close_file(file); 942 943 kmem_free(mp->filename, strlen(XPV_FILENAME) + 1); 944 if (mp->shdrs != NULL) 945 kmem_free(mp->shdrs, mp->hdr.e_shentsize * mp->hdr.e_shnum); 946 if (mp->symspace != NULL) 947 kmem_free(mp->symspace, mp->symsize); 948 if (mp->ctfdata != NULL) 949 kmem_free(mp->ctfdata, mp->ctfsize); 950 kmem_free(mp, sizeof (*mp)); 951 kmem_free(mcp->mod_filename, strlen(XPV_FILENAME) + 1); 952 kmem_free(mcp->mod_modname, strlen(XPV_MODNAME) + 1); 953 kmem_free(mcp, sizeof (*mcp)); 954 if (names != NULL) 955 kmem_free(names, namesize); 956 } 957 958 void 959 xpv_panic_init() 960 { 961 xen_platform_op_t op; 962 int i; 963 964 ASSERT(DOMAIN_IS_INITDOMAIN(xen_info)); 965 966 for (i = 0; i < mmu.num_level; i++) 967 ptable_pfn[i] = PFN_INVALID; 968 969 /* Let Xen know where to jump if/when it panics. */ 970 op.cmd = XENPF_panic_init; 971 op.interface_version = XENPF_INTERFACE_VERSION; 972 op.u.panic_init.panic_addr = (unsigned long)xpv_panic_hdlr; 973 974 (void) HYPERVISOR_platform_op(&op); 975 976 init_xen_module(); 977 } 978