/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012 Gary Mills
 * Copyright 2016 PALO, Richard.
 *
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright 2018 Joyent, Inc.
 */

#include <sys/types.h>
#include <sys/clock.h>
#include <sys/psm.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/compress.h>
#include <sys/modctl.h>
#include <sys/trap.h>
#include <sys/panic.h>
#include <sys/regset.h>
#include <sys/frame.h>
#include <sys/kobj.h>
#include <sys/apic.h>
#include <sys/apic_timer.h>
#include <sys/dumphdr.h>
#include <sys/mem.h>
#include <sys/x86_archext.h>
#include <sys/xpv_panic.h>
#include <sys/boot_console.h>
#include <sys/bootsvcs.h>
#include <sys/consdev.h>
#include <vm/hat_pte.h>
#include <vm/hat_i86.h>

/*
 * XPV_FILENAME is the Xen image that init_xen_module() opens to read
 * section headers, symbols and CTF data. XPV_MODNAME is the module name
 * recorded in the fake modctl built there.
 */
/* XXX: need to add a PAE version too, if we ever support both PAE and non */
#if defined(__i386)
#define	XPV_FILENAME	"/boot/xen-syms"
#else
#define	XPV_FILENAME	"/boot/amd64/xen-syms"
#endif
#define	XPV_MODNAME	"xpv"

/*
 * Incremented on every entry to xpv_do_panic(); a second entry triggers a
 * nested panic() there.
 */
int xpv_panicking = 0;

/*
 * Fake module/modctl describing the hypervisor. Both are set by
 * init_xen_module() only when it succeeds, and are linked into the
 * kernel's module list by xpv_do_panic().
 */
struct module *xpv_module;
struct modctl *xpv_modctl;

/*
 * Round x up to a multiple of a. An alignment of 0 leaves x unchanged.
 * The mask arithmetic assumes a is a power of two.
 */
#define	ALIGN(x, a)	((a) == 0 ? (uintptr_t)(x) : \
	(((uintptr_t)(x) + (uintptr_t)(a) - 1l) & ~((uintptr_t)(a) - 1l)))

/* Pointer to the xpv_panic_info structure handed to us by Xen. */
static struct panic_info *xpv_panic_info = NULL;

/*
 * Timer support
 *
 * NSEC_SHIFT is the scaling shift used by xpv_panic_time_init() and
 * xpv_panic_gethrtime(). T_XPV_TIMER is the IDT vector programmed into the
 * APIC local timer by xpv_apic_clkinit(). xpv_apicadr is taken from the
 * panic_info structure in xpv_do_panic(), and nsec_scale is computed in
 * xpv_panic_time_init().
 */
#define	NSEC_SHIFT 5
#define	T_XPV_TIMER	0xd1
#define	XPV_TIMER_INTERVAL	1000	/* 1000 microseconds */
static uint32_t *xpv_apicadr = NULL;
static uint_t	nsec_scale;

/* IDT support: the table is populated by switch_to_xpv_panic_idt(). */
#pragma	align	16(xpv_panic_idt)
static gate_desc_t	xpv_panic_idt[NIDT];	/* interrupt descriptor table */

/* Xen pagetables mapped into our HAT's ptable windows */
static pfn_t ptable_pfn[MAX_NUM_LEVEL];

/* Number of MMU_PAGESIZE pages we're adding to the Solaris dump */
static int xpv_dump_pages;

/*
 * There are up to two large swathes of RAM that we don't want to include
 * in the dump: those that comprise the Xen version of segkpm. On 32-bit
 * systems there is no such region of memory.
On 64-bit systems, there
 * should be just a single contiguous region that corresponds to all of
 * physical memory. The tricky bit is that Xen's heap sometimes lives in
 * the middle of their segkpm, and is mapped using only kpm-like addresses.
 * In that case, we need to skip the swathes before and after Xen's heap.
 *
 * The four bounds below are filled in by xpv_do_panic() (amd64 only) and
 * consulted by xpv_va_walk().
 */
uintptr_t kpm1_low = 0;
uintptr_t kpm1_high = 0;
uintptr_t kpm2_low = 0;
uintptr_t kpm2_high = 0;

/*
 * Some commonly used values that we don't want to recompute over and over.
 * All three are initialized in xpv_do_panic().
 */
static int xpv_panic_nptes[MAX_NUM_LEVEL];
static ulong_t xpv_panic_cr3;
static uintptr_t xpv_end;

/*
 * Output routine used by the panic code. It starts out pointing at our
 * private console printer and is switched to printf() by xpv_do_panic()
 * just before panicsys() is entered.
 */
static void xpv_panic_console_print(const char *fmt, ...);
static void (*xpv_panic_printf)(const char *, ...) = xpv_panic_console_print;

/*
 * Scratch buffer for xpv_panic_console_print(); formatted messages longer
 * than this are truncated by vsnprintf().
 */
#define	CONSOLE_BUF_SIZE	256
static char console_buffer[CONSOLE_BUF_SIZE];
/* Set in xpv_do_panic() when a polled-I/O putchar routine is available. */
static boolean_t use_polledio;

/*
 * Pointers to machine check panic info (if any).
 */
xpv_mca_panic_data_t *xpv_mca_panic_data = NULL;

/*
 * Emit a single character on the console.
 *
 * Nothing is written while the boot console is still the hypervisor
 * console. Otherwise the character goes to the polled-I/O putchar routine
 * when use_polledio is set, and to bcons_putchar() when it is not.
 */
static void
xpv_panic_putc(int m)
{
	struct cons_polledio *c = cons_polledio;

	/* This really shouldn't happen */
	if (boot_console_type(NULL) == CONS_HYPERVISOR)
		return;

	if (use_polledio == B_TRUE)
		c->cons_polledio_putchar(c->cons_polledio_argument, m);
	else
		bcons_putchar(m);
}

/*
 * Write a NUL-terminated string to the console one character at a time.
 * Each call also resets dump_timeleft to dump_timeout.
 */
static void
xpv_panic_puts(char *msg)
{
	char *m;

	dump_timeleft = dump_timeout;
	for (m = msg; *m; m++)
		xpv_panic_putc((int)*m);
}

/*
 * printf-style console output. The message is formatted into the shared
 * static console_buffer and then written with xpv_panic_puts().
 */
static void
xpv_panic_console_print(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	(void) vsnprintf(console_buffer, sizeof (console_buffer), fmt, ap);
	va_end(ap);

	xpv_panic_puts(console_buffer);
}

/*
 * Map the page table page 'pfn' into the page table window for 'level',
 * then flush the TLB entry for that window's virtual address.
 */
static void
xpv_panic_map(int level, pfn_t pfn)
{
	x86pte_t pte, *pteptr;

	/*
	 * The provided pfn represents a level 'level' page table. Map it
	 * into the 'level' slot in the list of page table windows.
	 */
	pteptr = (x86pte_t *)PWIN_PTE_VA(level);
	pte = pfn_to_pa(pfn) | PT_VALID;

	XPV_ALLOW_PAGETABLE_UPDATES();
	/* The PTE is stored at the width the HAT is using. */
	if (mmu.pae_hat)
		*pteptr = pte;
	else
		*(x86pte32_t *)pteptr = pte;
	XPV_DISALLOW_PAGETABLE_UPDATES();

	mmu_flush_tlb_page((uintptr_t)PWIN_VA(level));
}

/*
 * Walk the page tables to find the pfn mapped by the given va.
 *
 * The search starts at *vaddr and moves upward until a mapped page is
 * found, the address reaches xpv_end, or the address wraps below the
 * starting point. The APIC page and the two kpm-like regions are skipped.
 *
 * On success *vaddr is updated to the VA of the page that was found and
 * the page's frame number is returned with PFN_IS_FOREIGN_MFN set. If no
 * mapped page is found, PFN_INVALID is returned and *vaddr is unchanged.
 */
static pfn_t
xpv_va_walk(uintptr_t *vaddr)
{
	int l, idx;
	pfn_t pfn;
	x86pte_t pte;
	x86pte_t *ptep;
	uintptr_t va = *vaddr;
	uintptr_t scan_va;
	caddr_t ptable_window;
	static pfn_t toplevel_pfn;
	static uintptr_t lastva;

	pte = 0;
	/*
	 * If we do anything other than a simple scan through memory, don't
	 * trust the mapped page tables.
	 */
	if (va != lastva + MMU_PAGESIZE)
		for (l = mmu.max_level; l >= 0; l--)
			ptable_pfn[l] = PFN_INVALID;

	toplevel_pfn = mmu_btop(xpv_panic_cr3);

	while (va < xpv_end && va >= *vaddr) {
		/* Find the lowest table with any entry for va */
		pfn = toplevel_pfn;
		for (l = mmu.max_level; l >= 0; l--) {
			/* Remap the window only when the table changed. */
			if (ptable_pfn[l] != pfn) {
				xpv_panic_map(l, pfn);
				ptable_pfn[l] = pfn;
			}

			/*
			 * Search this pagetable for any mapping to an
			 * address >= va.
			 */
			ptable_window = PWIN_VA(l);
			if (l == mmu.max_level && mmu.pae_hat)
				ptable_window +=
				    (xpv_panic_cr3 & MMU_PAGEOFFSET);

			idx = (va >> LEVEL_SHIFT(l)) & (xpv_panic_nptes[l] - 1);
			scan_va = va;
			while (idx < xpv_panic_nptes[l] && scan_va < xpv_end &&
			    scan_va >= *vaddr) {
				ptep = (x86pte_t *)(ptable_window +
				    (idx << mmu.pte_size_shift));
				pte = GET_PTE(ptep);
				if (pte & PTE_VALID)
					break;
				idx++;
				scan_va += mmu.level_size[l];
			}

			/*
			 * If there are no valid mappings in this table, we
			 * can skip to the end of the VA range it covers.
			 */
			if (idx == xpv_panic_nptes[l]) {
				va = NEXT_ENTRY_VA(va, l + 1);
				break;
			}

			va = scan_va;
			/*
			 * See if we've hit the end of the range.
			 */
			if (va >= xpv_end || va < *vaddr)
				break;

			/*
			 * If this mapping is for a pagetable, we drop down
			 * to the next level in the hierarchy and look for
			 * a mapping in it.
			 */
			pfn = PTE2MFN(pte, l);
			if (!PTE_ISPAGE(pte, l))
				continue;

			/*
			 * The APIC page is magic. Nothing to see here;
			 * move along.
			 */
			if (((uintptr_t)xpv_apicadr & MMU_PAGEMASK) ==
			    (va & MMU_PAGEMASK)) {
				va += MMU_PAGESIZE;
				break;
			}

			/*
			 * See if the address is within one of the two
			 * kpm-like regions we want to skip.
			 */
			if (va >= kpm1_low && va < kpm1_high) {
				va = kpm1_high;
				break;
			}
			if (va >= kpm2_low && va < kpm2_high) {
				va = kpm2_high;
				break;
			}

			/*
			 * The Xen panic code only handles small pages. If
			 * this mapping is for a large page, we need to
			 * identify the constituent page that covers the
			 * specific VA we were looking for.
			 */
			if (l > 0) {
				if (l > 1)
					panic("Xen panic can't cope with "
					    "giant pages.");
				idx = (va >> LEVEL_SHIFT(0)) &
				    (xpv_panic_nptes[0] - 1);
				pfn += idx;
			}

			/* Remember where we stopped for the next call. */
			*vaddr = va;
			lastva = va;
			return (pfn | PFN_IS_FOREIGN_MFN);
		}
	}
	return (PFN_INVALID);
}

/*
 * Walk through the Xen VA space, finding pages that are mapped in.
 *
 * These pages all have MFNs rather than PFNs, meaning they may be outside
 * the physical address space the kernel knows about, or they may collide
 * with PFNs the kernel is using.
 *
 * The obvious trick of just adding the PFN_IS_FOREIGN_MFN bit to the MFNs
 * to avoid collisions doesn't work. The pages need to be written to disk
 * in PFN-order or savecore gets confused.
We can't allocate memory to 321 * contruct a sorted pfn->VA reverse mapping, so we have to write the pages 322 * to disk in VA order. 323 * 324 * To square this circle, we simply make up PFNs for each of Xen's pages. 325 * We assign each mapped page a fake PFN in ascending order. These fake 326 * PFNs each have the FOREIGN bit set, ensuring that they fall outside the 327 * range of Solaris PFNs written by the kernel. 328 */ 329 int 330 dump_xpv_addr() 331 { 332 uintptr_t va; 333 mem_vtop_t mem_vtop; 334 335 xpv_dump_pages = 0; 336 va = xen_virt_start; 337 338 while (xpv_va_walk(&va) != PFN_INVALID) { 339 mem_vtop.m_as = &kas; 340 mem_vtop.m_va = (void *)va; 341 mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN; 342 343 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t)); 344 xpv_dump_pages++; 345 346 va += MMU_PAGESIZE; 347 } 348 349 /* 350 * Add the shared_info page. This page actually ends up in the 351 * dump twice: once for the Xen va and once for the Solaris va. 352 * This isn't ideal, but we don't know the address Xen is using for 353 * the page, so we can't share it. 354 */ 355 mem_vtop.m_as = &kas; 356 mem_vtop.m_va = HYPERVISOR_shared_info; 357 mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN; 358 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t)); 359 xpv_dump_pages++; 360 361 return (xpv_dump_pages); 362 } 363 364 void 365 dump_xpv_pfn() 366 { 367 pfn_t pfn; 368 int cnt; 369 370 for (cnt = 0; cnt < xpv_dump_pages; cnt++) { 371 pfn = (pfn_t)cnt | PFN_IS_FOREIGN_MFN; 372 dumpvp_write(&pfn, sizeof (pfn)); 373 } 374 } 375 376 int 377 dump_xpv_data(void *dump_cbuf) 378 { 379 uintptr_t va; 380 uint32_t csize; 381 int cnt = 0; 382 383 /* 384 * XXX: we should probably run this data through a UE check. The 385 * catch is that the UE code relies on on_trap() and getpfnum() 386 * working. 
387 */ 388 va = xen_virt_start; 389 390 while (xpv_va_walk(&va) != PFN_INVALID) { 391 csize = (uint32_t)compress((void *)va, dump_cbuf, PAGESIZE); 392 dumpvp_write(&csize, sizeof (uint32_t)); 393 dumpvp_write(dump_cbuf, csize); 394 if (dump_ioerr) { 395 dumphdr->dump_flags &= ~DF_COMPLETE; 396 return (cnt); 397 } 398 cnt++; 399 va += MMU_PAGESIZE; 400 } 401 402 /* 403 * Finally, dump the shared_info page 404 */ 405 csize = (uint32_t)compress((void *)HYPERVISOR_shared_info, dump_cbuf, 406 PAGESIZE); 407 dumpvp_write(&csize, sizeof (uint32_t)); 408 dumpvp_write(dump_cbuf, csize); 409 if (dump_ioerr) 410 dumphdr->dump_flags &= ~DF_COMPLETE; 411 cnt++; 412 413 return (cnt); 414 } 415 416 static void * 417 showstack(void *fpreg, int xpv_only) 418 { 419 struct frame *fpp; 420 ulong_t off; 421 char *sym; 422 uintptr_t pc, fp, lastfp; 423 uintptr_t minaddr = min(KERNELBASE, xen_virt_start); 424 425 fp = (uintptr_t)fpreg; 426 if (fp < minaddr) { 427 xpv_panic_printf("Bad frame ptr: 0x%p\n", fpreg); 428 return (fpreg); 429 } 430 431 do { 432 fpp = (struct frame *)fp; 433 pc = fpp->fr_savpc; 434 435 if ((xpv_only != 0) && 436 (fp > xpv_end || fp < xen_virt_start)) 437 break; 438 if ((sym = kobj_getsymname(pc, &off)) != NULL) 439 xpv_panic_printf("%08lx %s:%s+%lx\n", fp, 440 mod_containing_pc((caddr_t)pc), sym, off); 441 else if ((pc >= xen_virt_start) && (pc <= xpv_end)) 442 xpv_panic_printf("%08lx 0x%lx (in Xen)\n", fp, pc); 443 else 444 xpv_panic_printf("%08lx %lx\n", fp, pc); 445 446 lastfp = fp; 447 fp = fpp->fr_savfp; 448 449 /* 450 * Xen marks an exception frame by inverting the frame 451 * pointer. 
452 */ 453 if (fp < lastfp) { 454 if ((~fp > minaddr) && ((~fp) ^ lastfp) < 0xfff) 455 fp = ~fp; 456 } 457 } while (fp > lastfp); 458 return ((void *)fp); 459 } 460 461 void * 462 xpv_traceback(void *fpreg) 463 { 464 return (showstack(fpreg, 1)); 465 } 466 467 #if defined(__amd64) 468 static void 469 xpv_panic_hypercall(ulong_t call) 470 { 471 panic("Illegally issued hypercall %d during panic!\n", (int)call); 472 } 473 #endif 474 475 void 476 xpv_die(struct regs *rp) 477 { 478 struct panic_trap_info ti; 479 struct cregs creg; 480 481 ti.trap_regs = rp; 482 ti.trap_type = rp->r_trapno; 483 484 curthread->t_panic_trap = &ti; 485 if (ti.trap_type == T_PGFLT) { 486 getcregs(&creg); 487 ti.trap_addr = (caddr_t)creg.cr_cr2; 488 panic("Fatal pagefault at 0x%lx. fault addr=0x%p rp=0x%p", 489 rp->r_pc, (void *)ti.trap_addr, (void *)rp); 490 } else { 491 ti.trap_addr = (caddr_t)rp->r_pc; 492 panic("Fatal trap %ld at 0x%lx. rp=0x%p", rp->r_trapno, 493 rp->r_pc, (void *)rp); 494 } 495 } 496 497 /* 498 * Build IDT to handle a Xen panic 499 */ 500 static void 501 switch_to_xpv_panic_idt() 502 { 503 int i; 504 desctbr_t idtr; 505 gate_desc_t *idt = xpv_panic_idt; 506 selector_t cs = get_cs_register(); 507 508 for (i = 0; i < 32; i++) 509 set_gatesegd(&idt[i], &xpv_invaltrap, cs, SDT_SYSIGT, TRP_XPL, 510 0); 511 512 set_gatesegd(&idt[T_ZERODIV], &xpv_div0trap, cs, SDT_SYSIGT, TRP_XPL, 513 0); 514 set_gatesegd(&idt[T_SGLSTP], &xpv_dbgtrap, cs, SDT_SYSIGT, TRP_XPL, 0); 515 set_gatesegd(&idt[T_NMIFLT], &xpv_nmiint, cs, SDT_SYSIGT, TRP_XPL, 0); 516 set_gatesegd(&idt[T_BOUNDFLT], &xpv_boundstrap, cs, SDT_SYSIGT, 517 TRP_XPL, 0); 518 set_gatesegd(&idt[T_ILLINST], &xpv_invoptrap, cs, SDT_SYSIGT, TRP_XPL, 519 0); 520 set_gatesegd(&idt[T_NOEXTFLT], &xpv_ndptrap, cs, SDT_SYSIGT, TRP_XPL, 521 0); 522 set_gatesegd(&idt[T_TSSFLT], &xpv_invtsstrap, cs, SDT_SYSIGT, TRP_XPL, 523 0); 524 set_gatesegd(&idt[T_SEGFLT], &xpv_segnptrap, cs, SDT_SYSIGT, TRP_XPL, 525 0); 526 set_gatesegd(&idt[T_STKFLT], 
&xpv_stktrap, cs, SDT_SYSIGT, TRP_XPL, 0); 527 set_gatesegd(&idt[T_GPFLT], &xpv_gptrap, cs, SDT_SYSIGT, TRP_XPL, 0); 528 set_gatesegd(&idt[T_PGFLT], &xpv_pftrap, cs, SDT_SYSIGT, TRP_XPL, 0); 529 set_gatesegd(&idt[T_EXTERRFLT], &xpv_ndperr, cs, SDT_SYSIGT, TRP_XPL, 530 0); 531 set_gatesegd(&idt[T_ALIGNMENT], &xpv_achktrap, cs, SDT_SYSIGT, TRP_XPL, 532 0); 533 set_gatesegd(&idt[T_MCE], &xpv_mcetrap, cs, SDT_SYSIGT, TRP_XPL, 0); 534 set_gatesegd(&idt[T_SIMDFPE], &xpv_xmtrap, cs, SDT_SYSIGT, TRP_XPL, 0); 535 536 /* 537 * We have no double fault handler. Any single fault represents a 538 * catastrophic failure for us, so there is no attempt to handle 539 * them cleanly: we just print a message and reboot. If we 540 * encounter a second fault while doing that, there is nothing 541 * else we can do. 542 */ 543 544 /* 545 * Be prepared to absorb any stray device interrupts received 546 * while writing the core to disk. 547 */ 548 for (i = 33; i < NIDT; i++) 549 set_gatesegd(&idt[i], &xpv_surprise_intr, cs, SDT_SYSIGT, 550 TRP_XPL, 0); 551 552 /* The one interrupt we expect to get is from the APIC timer. */ 553 set_gatesegd(&idt[T_XPV_TIMER], &xpv_timer_trap, cs, SDT_SYSIGT, 554 TRP_XPL, 0); 555 556 idtr.dtr_base = (uintptr_t)xpv_panic_idt; 557 idtr.dtr_limit = sizeof (xpv_panic_idt) - 1; 558 wr_idtr(&idtr); 559 560 #if defined(__amd64) 561 /* Catch any hypercalls. */ 562 wrmsr(MSR_AMD_LSTAR, (uintptr_t)xpv_panic_hypercall); 563 wrmsr(MSR_AMD_CSTAR, (uintptr_t)xpv_panic_hypercall); 564 #endif 565 } 566 567 static void 568 xpv_apic_clkinit() 569 { 570 uint_t apic_ticks = 0; 571 572 /* 573 * Measure how many APIC ticks there are within a fixed time 574 * period. We're going to be fairly coarse here. This timer is 575 * just being used to detect a stalled panic, so as long as we have 576 * the right order of magnitude, everything should be fine. 
577 */ 578 xpv_apicadr[APIC_SPUR_INT_REG] = AV_UNIT_ENABLE | APIC_SPUR_INTR; 579 xpv_apicadr[APIC_LOCAL_TIMER] = AV_MASK; 580 xpv_apicadr[APIC_INT_VECT0] = AV_MASK; /* local intr reg 0 */ 581 582 xpv_apicadr[APIC_DIVIDE_REG] = 0; 583 xpv_apicadr[APIC_INIT_COUNT] = APIC_MAXVAL; 584 drv_usecwait(XPV_TIMER_INTERVAL); 585 apic_ticks = APIC_MAXVAL - xpv_apicadr[APIC_CURR_COUNT]; 586 587 /* 588 * apic_ticks now represents roughly how many apic ticks comprise 589 * one timeout interval. Program the timer to send us an interrupt 590 * every time that interval expires. 591 */ 592 xpv_apicadr[APIC_LOCAL_TIMER] = T_XPV_TIMER | AV_PERIODIC; 593 xpv_apicadr[APIC_INIT_COUNT] = apic_ticks; 594 xpv_apicadr[APIC_EOI_REG] = 0; 595 } 596 597 void 598 xpv_timer_tick(void) 599 { 600 static int ticks = 0; 601 602 if (ticks++ >= MICROSEC / XPV_TIMER_INTERVAL) { 603 ticks = 0; 604 if (dump_timeleft && (--dump_timeleft == 0)) 605 panic("Xen panic timeout\n"); 606 } 607 xpv_apicadr[APIC_EOI_REG] = 0; 608 } 609 610 void 611 xpv_interrupt(void) 612 { 613 #ifdef DEBUG 614 static int cnt = 0; 615 616 if (cnt++ < 10) 617 xpv_panic_printf("Unexpected interrupt received.\n"); 618 if ((cnt < 1000) && ((cnt % 100) == 0)) 619 xpv_panic_printf("%d unexpected interrupts received.\n", cnt); 620 #endif 621 622 xpv_apicadr[APIC_EOI_REG] = 0; 623 } 624 625 /* 626 * Managing time in panic context is trivial. We only have a single CPU, 627 * we never get rescheduled, we never get suspended. We just need to 628 * convert clock ticks into nanoseconds. 
629 */ 630 static hrtime_t 631 xpv_panic_gethrtime(void) 632 { 633 hrtime_t tsc, hrt; 634 unsigned int *l = (unsigned int *)&(tsc); 635 636 tsc = __rdtsc_insn(); 637 hrt = (mul32(l[1], nsec_scale) << NSEC_SHIFT) + 638 (mul32(l[0], nsec_scale) >> (32 - NSEC_SHIFT)); 639 640 return (hrt); 641 } 642 643 static void 644 xpv_panic_time_init() 645 { 646 nsec_scale = 647 CPU->cpu_m.mcpu_vcpu_info->time.tsc_to_system_mul >> NSEC_SHIFT; 648 649 gethrtimef = xpv_panic_gethrtime; 650 } 651 652 static void 653 xpv_panicsys(struct regs *rp, char *fmt, ...) 654 { 655 extern void panicsys(const char *, va_list, struct regs *, int); 656 va_list alist; 657 658 va_start(alist, fmt); 659 panicsys(fmt, alist, rp, 1); 660 va_end(alist); 661 } 662 663 void 664 xpv_do_panic(void *arg) 665 { 666 struct panic_info *pip = (struct panic_info *)arg; 667 int l; 668 struct cregs creg; 669 #if defined(__amd64) 670 extern uintptr_t postbootkernelbase; 671 #endif 672 673 if (xpv_panicking++ > 0) 674 panic("multiple calls to xpv_do_panic()"); 675 676 /* 677 * Indicate to the underlying panic framework that a panic has been 678 * initiated. This is ordinarily done as part of vpanic(). Since 679 * we already have all the register state saved by the hypervisor, 680 * we skip that and jump straight into the panic processing code. 681 * 682 * XXX If another thread grabs and wins the panic_quiesce trigger 683 * then we'll have two threads in panicsys believing they are in 684 * charge of the panic attempt! 685 */ 686 (void) panic_trigger(&panic_quiesce); 687 688 #if defined(__amd64) 689 /* 690 * bzero() and bcopy() get unhappy when asked to operate on 691 * addresses outside of the kernel. At this point Xen is really a 692 * part of the kernel, so we update the routines' notion of where 693 * the kernel starts. 
694 */ 695 postbootkernelbase = xen_virt_start; 696 #endif 697 698 #if defined(HYPERVISOR_VIRT_END) 699 xpv_end = HYPERVISOR_VIRT_END; 700 #else 701 xpv_end = (uintptr_t)UINTPTR_MAX - sizeof (uintptr_t); 702 #endif 703 704 /* 705 * If we were redirecting console output to the hypervisor, we have 706 * to stop. 707 */ 708 use_polledio = B_FALSE; 709 if (boot_console_type(NULL) == CONS_HYPERVISOR) { 710 bcons_device_change(CONS_HYPERVISOR); 711 } else if (cons_polledio != NULL && 712 cons_polledio->cons_polledio_putchar != NULL) { 713 if (cons_polledio->cons_polledio_enter != NULL) 714 cons_polledio->cons_polledio_enter( 715 cons_polledio->cons_polledio_argument); 716 use_polledio = 1; 717 } 718 719 /* Make sure we handle all console output from here on. */ 720 sysp->bsvc_putchar = xpv_panic_putc; 721 722 /* 723 * If we find an unsupported panic_info structure, there's not much 724 * we can do other than complain, plow on, and hope for the best. 725 */ 726 if (pip->pi_version != PANIC_INFO_VERSION) 727 xpv_panic_printf("Warning: Xen is using an unsupported " 728 "version of the panic_info structure.\n"); 729 730 xpv_panic_info = pip; 731 732 #if defined(__amd64) 733 kpm1_low = (uintptr_t)xpv_panic_info->pi_ram_start; 734 if (xpv_panic_info->pi_xen_start == NULL) { 735 kpm1_high = (uintptr_t)xpv_panic_info->pi_ram_end; 736 } else { 737 kpm1_high = (uintptr_t)xpv_panic_info->pi_xen_start; 738 kpm2_low = (uintptr_t)xpv_panic_info->pi_xen_end; 739 kpm2_high = (uintptr_t)xpv_panic_info->pi_ram_end; 740 } 741 #endif 742 743 /* 744 * Make sure we are running on the Solaris %gs. The Xen panic code 745 * should already have set up the GDT properly. 746 */ 747 xpv_panic_resetgs(); 748 #if defined(__amd64) 749 wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]); 750 #endif 751 752 xpv_panic_time_init(); 753 754 /* 755 * Switch to our own IDT, avoiding any accidental returns to Xen 756 * world. 
757 */ 758 switch_to_xpv_panic_idt(); 759 760 /* 761 * Initialize the APIC timer, which is used to detect a hung dump 762 * attempt. 763 */ 764 xpv_apicadr = pip->pi_apic; 765 xpv_apic_clkinit(); 766 767 /* 768 * Set up a few values that we'll need repeatedly. 769 */ 770 getcregs(&creg); 771 xpv_panic_cr3 = creg.cr_cr3; 772 for (l = mmu.max_level; l >= 0; l--) 773 xpv_panic_nptes[l] = mmu.ptes_per_table; 774 #ifdef __i386 775 if (mmu.pae_hat) 776 xpv_panic_nptes[mmu.max_level] = 4; 777 #endif 778 779 /* Add the fake Xen module to the module list */ 780 if (xpv_module != NULL) { 781 extern int last_module_id; 782 783 xpv_modctl->mod_id = last_module_id++; 784 xpv_modctl->mod_next = &modules; 785 xpv_modctl->mod_prev = modules.mod_prev; 786 modules.mod_prev->mod_next = xpv_modctl; 787 modules.mod_prev = xpv_modctl; 788 } 789 790 if (pip->pi_mca.mpd_magic == MCA_PANICDATA_MAGIC) 791 xpv_mca_panic_data = &pip->pi_mca; 792 793 xpv_panic_printf = printf; 794 xpv_panicsys((struct regs *)pip->pi_regs, pip->pi_panicstr); 795 xpv_panic_printf("Failed to reboot following panic.\n"); 796 for (;;) 797 ; 798 } 799 800 /* 801 * Set up the necessary data structures to pretend that the Xen hypervisor 802 * is a loadable module, allowing mdb to find the Xen symbols in a crash 803 * dump. Since these symbols all map to VA space Solaris doesn't normally 804 * have access to, we don't link these structures into the kernel's lists 805 * until/unless we hit a Xen panic. 806 * 807 * The observant reader will note a striking amount of overlap between this 808 * code and that found in krtld. While it would be handy if we could just 809 * ask krtld to do this work for us, it's not that simple. Among the 810 * complications: we're not actually loading the text here (grub did it at 811 * boot), the .text section is writable, there are no relocations to do, 812 * none of the module text/data is in readable memory, etc. 
Training krtld 813 * to deal with this weird module is as complicated, and more risky, than 814 * reimplementing the necessary subset of it here. 815 */ 816 static void 817 init_xen_module() 818 { 819 struct _buf *file = NULL; 820 struct module *mp; 821 struct modctl *mcp; 822 int i, shn; 823 Shdr *shp, *ctf_shp; 824 char *names = NULL; 825 size_t n, namesize, text_align, data_align; 826 #if defined(__amd64) 827 const char machine = EM_AMD64; 828 #else 829 const char machine = EM_386; 830 #endif 831 832 /* Allocate and init the module structure */ 833 mp = kmem_zalloc(sizeof (*mp), KM_SLEEP); 834 mp->filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP); 835 (void) strcpy(mp->filename, XPV_FILENAME); 836 837 /* Allocate and init the modctl structure */ 838 mcp = kmem_zalloc(sizeof (*mcp), KM_SLEEP); 839 mcp->mod_modname = kobj_zalloc(strlen(XPV_MODNAME) + 1, KM_SLEEP); 840 (void) strcpy(mcp->mod_modname, XPV_MODNAME); 841 mcp->mod_filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP); 842 (void) strcpy(mcp->mod_filename, XPV_FILENAME); 843 mcp->mod_inprogress_thread = (kthread_id_t)-1; 844 mcp->mod_ref = 1; 845 mcp->mod_loaded = 1; 846 mcp->mod_loadcnt = 1; 847 mcp->mod_mp = mp; 848 849 /* 850 * Try to open a Xen image that hasn't had its symbol and CTF 851 * information stripped off. 852 */ 853 file = kobj_open_file(XPV_FILENAME); 854 if (file == (struct _buf *)-1) { 855 file = NULL; 856 goto err; 857 } 858 859 /* 860 * Read the header and ensure that this is an ELF file for the 861 * proper ISA. If it's not, somebody has done something very 862 * stupid. Why bother? See Mencken. 
863 */ 864 if (kobj_read_file(file, (char *)&mp->hdr, sizeof (mp->hdr), 0) < 0) 865 goto err; 866 for (i = 0; i < SELFMAG; i++) 867 if (mp->hdr.e_ident[i] != ELFMAG[i]) 868 goto err; 869 if ((mp->hdr.e_ident[EI_DATA] != ELFDATA2LSB) || 870 (mp->hdr.e_machine != machine)) 871 goto err; 872 873 /* Read in the section headers */ 874 n = mp->hdr.e_shentsize * mp->hdr.e_shnum; 875 mp->shdrs = kmem_zalloc(n, KM_SLEEP); 876 if (kobj_read_file(file, mp->shdrs, n, mp->hdr.e_shoff) < 0) 877 goto err; 878 879 /* Read the section names */ 880 shp = (Shdr *)(mp->shdrs + mp->hdr.e_shstrndx * mp->hdr.e_shentsize); 881 namesize = shp->sh_size; 882 names = kmem_zalloc(shp->sh_size, KM_SLEEP); 883 if (kobj_read_file(file, names, shp->sh_size, shp->sh_offset) < 0) 884 goto err; 885 886 /* 887 * Fill in the text and data size fields. 888 */ 889 ctf_shp = NULL; 890 text_align = data_align = 0; 891 for (shn = 1; shn < mp->hdr.e_shnum; shn++) { 892 shp = (Shdr *)(mp->shdrs + shn * mp->hdr.e_shentsize); 893 894 /* Sanity check the offset of the section name */ 895 if (shp->sh_name >= namesize) 896 continue; 897 898 /* If we find the symtab section, remember it for later. */ 899 if (shp->sh_type == SHT_SYMTAB) { 900 mp->symtbl_section = shn; 901 mp->symhdr = shp; 902 continue; 903 } 904 905 /* If we find the CTF section, remember it for later. */ 906 if ((shp->sh_size != 0) && 907 (strcmp(names + shp->sh_name, ".SUNW_ctf") == 0)) { 908 ctf_shp = shp; 909 continue; 910 } 911 912 if (!(shp->sh_flags & SHF_ALLOC)) 913 continue; 914 915 /* 916 * Xen marks its text section as writable, so we need to 917 * look for the name - not just the flag. 
918 */ 919 if ((strcmp(&names[shp->sh_name], ".text") != 0) && 920 (shp->sh_flags & SHF_WRITE) != 0) { 921 if (shp->sh_addralign > data_align) 922 data_align = shp->sh_addralign; 923 mp->data_size = ALIGN(mp->data_size, data_align); 924 mp->data_size += ALIGN(shp->sh_size, 8); 925 if (mp->data == NULL || mp->data > (char *)shp->sh_addr) 926 mp->data = (char *)shp->sh_addr; 927 } else { 928 if (shp->sh_addralign > text_align) 929 text_align = shp->sh_addralign; 930 mp->text_size = ALIGN(mp->text_size, text_align); 931 mp->text_size += ALIGN(shp->sh_size, 8); 932 if (mp->text == NULL || mp->text > (char *)shp->sh_addr) 933 mp->text = (char *)shp->sh_addr; 934 } 935 } 936 kmem_free(names, namesize); 937 names = NULL; 938 shp = NULL; 939 mcp->mod_text = mp->text; 940 mcp->mod_text_size = mp->text_size; 941 942 /* 943 * If we have symbol table and string table sections, read them in 944 * now. If we don't, we just plow on. We'll still get a valid 945 * core dump, but finding anything useful will be just a bit 946 * harder. 947 * 948 * Note: we don't bother with a hash table. We'll never do a 949 * symbol lookup unless we crash, and then mdb creates its own. We 950 * also don't try to perform any relocations. Xen should be loaded 951 * exactly where the ELF file indicates, and the symbol information 952 * in the file should be complete and correct already. Static 953 * linking ain't all bad. 954 */ 955 if ((mp->symhdr != NULL) && (mp->symhdr->sh_link < mp->hdr.e_shnum)) { 956 mp->strhdr = (Shdr *) 957 (mp->shdrs + mp->symhdr->sh_link * mp->hdr.e_shentsize); 958 mp->nsyms = mp->symhdr->sh_size / mp->symhdr->sh_entsize; 959 960 /* Allocate space for the symbol table and strings. 
*/ 961 mp->symsize = mp->symhdr->sh_size + 962 mp->nsyms * sizeof (symid_t) + mp->strhdr->sh_size; 963 mp->symspace = kmem_zalloc(mp->symsize, KM_SLEEP); 964 mp->symtbl = mp->symspace; 965 mp->strings = (char *)(mp->symtbl + mp->symhdr->sh_size); 966 967 if ((kobj_read_file(file, mp->symtbl, 968 mp->symhdr->sh_size, mp->symhdr->sh_offset) < 0) || 969 (kobj_read_file(file, mp->strings, 970 mp->strhdr->sh_size, mp->strhdr->sh_offset) < 0)) 971 goto err; 972 } 973 974 /* 975 * Read in the CTF section 976 */ 977 if ((ctf_shp != NULL) && ((moddebug & MODDEBUG_NOCTF) == 0)) { 978 mp->ctfdata = kmem_zalloc(ctf_shp->sh_size, KM_SLEEP); 979 mp->ctfsize = ctf_shp->sh_size; 980 if (kobj_read_file(file, mp->ctfdata, mp->ctfsize, 981 ctf_shp->sh_offset) < 0) 982 goto err; 983 } 984 985 kobj_close_file(file); 986 987 xpv_module = mp; 988 xpv_modctl = mcp; 989 return; 990 991 err: 992 cmn_err(CE_WARN, "Failed to initialize xpv module."); 993 if (file != NULL) 994 kobj_close_file(file); 995 996 kmem_free(mp->filename, strlen(XPV_FILENAME) + 1); 997 if (mp->shdrs != NULL) 998 kmem_free(mp->shdrs, mp->hdr.e_shentsize * mp->hdr.e_shnum); 999 if (mp->symspace != NULL) 1000 kmem_free(mp->symspace, mp->symsize); 1001 if (mp->ctfdata != NULL) 1002 kmem_free(mp->ctfdata, mp->ctfsize); 1003 kmem_free(mp, sizeof (*mp)); 1004 kmem_free(mcp->mod_filename, strlen(XPV_FILENAME) + 1); 1005 kmem_free(mcp->mod_modname, strlen(XPV_MODNAME) + 1); 1006 kmem_free(mcp, sizeof (*mcp)); 1007 if (names != NULL) 1008 kmem_free(names, namesize); 1009 } 1010 1011 void 1012 xpv_panic_init() 1013 { 1014 xen_platform_op_t op; 1015 int i; 1016 1017 ASSERT(DOMAIN_IS_INITDOMAIN(xen_info)); 1018 1019 for (i = 0; i < mmu.num_level; i++) 1020 ptable_pfn[i] = PFN_INVALID; 1021 1022 /* Let Xen know where to jump if/when it panics. 
*/ 1023 op.cmd = XENPF_panic_init; 1024 op.interface_version = XENPF_INTERFACE_VERSION; 1025 op.u.panic_init.panic_addr = (unsigned long)xpv_panic_hdlr; 1026 1027 (void) HYPERVISOR_platform_op(&op); 1028 1029 init_xen_module(); 1030 } 1031