1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2012 Gary Mills 23 * Copyright 2016 PALO, Richard. 24 * 25 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 26 */ 27 28 #include <sys/types.h> 29 #include <sys/clock.h> 30 #include <sys/psm.h> 31 #include <sys/archsystm.h> 32 #include <sys/machsystm.h> 33 #include <sys/compress.h> 34 #include <sys/modctl.h> 35 #include <sys/trap.h> 36 #include <sys/panic.h> 37 #include <sys/regset.h> 38 #include <sys/frame.h> 39 #include <sys/kobj.h> 40 #include <sys/apic.h> 41 #include <sys/apic_timer.h> 42 #include <sys/dumphdr.h> 43 #include <sys/mem.h> 44 #include <sys/x86_archext.h> 45 #include <sys/xpv_panic.h> 46 #include <sys/boot_console.h> 47 #include <sys/bootsvcs.h> 48 #include <sys/consdev.h> 49 #include <vm/hat_pte.h> 50 #include <vm/hat_i86.h> 51 52 /* XXX: need to add a PAE version too, if we ever support both PAE and non */ 53 #if defined(__i386) 54 #define XPV_FILENAME "/boot/xen-syms" 55 #else 56 #define XPV_FILENAME "/boot/amd64/xen-syms" 57 #endif 58 #define XPV_MODNAME "xpv" 59 60 int xpv_panicking = 0; 61 62 struct module *xpv_module; 63 struct modctl *xpv_modctl; 64 65 #define ALIGN(x, a) ((a) == 0 ? (uintptr_t)(x) : \ 66 (((uintptr_t)(x) + (uintptr_t)(a) - 1l) & ~((uintptr_t)(a) - 1l))) 67 68 /* Pointer to the xpv_panic_info structure handed to us by Xen. */ 69 static struct panic_info *xpv_panic_info = NULL; 70 71 /* Timer support */ 72 #define NSEC_SHIFT 5 73 #define T_XPV_TIMER 0xd1 74 #define XPV_TIMER_INTERVAL 1000 /* 1000 microseconds */ 75 static uint32_t *xpv_apicadr = NULL; 76 static uint_t nsec_scale; 77 78 /* IDT support */ 79 #pragma align 16(xpv_panic_idt) 80 static gate_desc_t xpv_panic_idt[NIDT]; /* interrupt descriptor table */ 81 82 /* Xen pagetables mapped into our HAT's ptable windows */ 83 static pfn_t ptable_pfn[MAX_NUM_LEVEL]; 84 85 /* Number of MMU_PAGESIZE pages we're adding to the Solaris dump */ 86 static int xpv_dump_pages; 87 88 /* 89 * There are up to two large swathes of RAM that we don't want to include 90 * in the dump: those that comprise the Xen version of segkpm. On 32-bit 91 * systems there is no such region of memory. On 64-bit systems, there 92 * should be just a single contiguous region that corresponds to all of 93 * physical memory. The tricky bit is that Xen's heap sometimes lives in 94 * the middle of their segkpm, and is mapped using only kpm-like addresses. 95 * In that case, we need to skip the swathes before and after Xen's heap. 96 */ 97 uintptr_t kpm1_low = 0; 98 uintptr_t kpm1_high = 0; 99 uintptr_t kpm2_low = 0; 100 uintptr_t kpm2_high = 0; 101 102 /* 103 * Some commonly used values that we don't want to recompute over and over. 104 */ 105 static int xpv_panic_nptes[MAX_NUM_LEVEL]; 106 static ulong_t xpv_panic_cr3; 107 static uintptr_t xpv_end; 108 109 static void xpv_panic_console_print(const char *fmt, ...); 110 static void (*xpv_panic_printf)(const char *, ...) = xpv_panic_console_print; 111 112 #define CONSOLE_BUF_SIZE 256 113 static char console_buffer[CONSOLE_BUF_SIZE]; 114 static boolean_t use_polledio; 115 116 /* 117 * Pointers to machine check panic info (if any). 118 */ 119 xpv_mca_panic_data_t *xpv_mca_panic_data = NULL; 120 121 static void 122 xpv_panic_putc(int m) 123 { 124 struct cons_polledio *c = cons_polledio; 125 126 /* This really shouldn't happen */ 127 if (boot_console_type(NULL) == CONS_HYPERVISOR) 128 return; 129 130 if (use_polledio == B_TRUE) 131 c->cons_polledio_putchar(c->cons_polledio_argument, m); 132 else 133 bcons_putchar(m); 134 } 135 136 static void 137 xpv_panic_puts(char *msg) 138 { 139 char *m; 140 141 dump_timeleft = dump_timeout; 142 for (m = msg; *m; m++) 143 xpv_panic_putc((int)*m); 144 } 145 146 static void 147 xpv_panic_console_print(const char *fmt, ...) 148 { 149 va_list ap; 150 151 va_start(ap, fmt); 152 (void) vsnprintf(console_buffer, sizeof (console_buffer), fmt, ap); 153 va_end(ap); 154 155 xpv_panic_puts(console_buffer); 156 } 157 158 static void 159 xpv_panic_map(int level, pfn_t pfn) 160 { 161 x86pte_t pte, *pteptr; 162 163 /* 164 * The provided pfn represents a level 'level' page table. Map it 165 * into the 'level' slot in the list of page table windows. 166 */ 167 pteptr = (x86pte_t *)PWIN_PTE_VA(level); 168 pte = pfn_to_pa(pfn) | PT_VALID; 169 170 XPV_ALLOW_PAGETABLE_UPDATES(); 171 if (mmu.pae_hat) 172 *pteptr = pte; 173 else 174 *(x86pte32_t *)pteptr = pte; 175 XPV_DISALLOW_PAGETABLE_UPDATES(); 176 177 mmu_tlbflush_entry(PWIN_VA(level)); 178 } 179 180 /* 181 * Walk the page tables to find the pfn mapped by the given va. 182 */ 183 static pfn_t 184 xpv_va_walk(uintptr_t *vaddr) 185 { 186 int l, idx; 187 pfn_t pfn; 188 x86pte_t pte; 189 x86pte_t *ptep; 190 uintptr_t va = *vaddr; 191 uintptr_t scan_va; 192 caddr_t ptable_window; 193 static pfn_t toplevel_pfn; 194 static uintptr_t lastva; 195 196 /* 197 * If we do anything other than a simple scan through memory, don't 198 * trust the mapped page tables. 199 */ 200 if (va != lastva + MMU_PAGESIZE) 201 for (l = mmu.max_level; l >= 0; l--) 202 ptable_pfn[l] = PFN_INVALID; 203 204 toplevel_pfn = mmu_btop(xpv_panic_cr3); 205 206 while (va < xpv_end && va >= *vaddr) { 207 /* Find the lowest table with any entry for va */ 208 pfn = toplevel_pfn; 209 for (l = mmu.max_level; l >= 0; l--) { 210 if (ptable_pfn[l] != pfn) { 211 xpv_panic_map(l, pfn); 212 ptable_pfn[l] = pfn; 213 } 214 215 /* 216 * Search this pagetable for any mapping to an 217 * address >= va. 218 */ 219 ptable_window = PWIN_VA(l); 220 if (l == mmu.max_level && mmu.pae_hat) 221 ptable_window += 222 (xpv_panic_cr3 & MMU_PAGEOFFSET); 223 224 idx = (va >> LEVEL_SHIFT(l)) & (xpv_panic_nptes[l] - 1); 225 scan_va = va; 226 while (idx < xpv_panic_nptes[l] && scan_va < xpv_end && 227 scan_va >= *vaddr) { 228 ptep = (x86pte_t *)(ptable_window + 229 (idx << mmu.pte_size_shift)); 230 pte = GET_PTE(ptep); 231 if (pte & PTE_VALID) 232 break; 233 idx++; 234 scan_va += mmu.level_size[l]; 235 } 236 237 /* 238 * If there are no valid mappings in this table, we 239 * can skip to the end of the VA range it covers. 240 */ 241 if (idx == xpv_panic_nptes[l]) { 242 va = NEXT_ENTRY_VA(va, l + 1); 243 break; 244 } 245 246 va = scan_va; 247 /* 248 * See if we've hit the end of the range. 249 */ 250 if (va >= xpv_end || va < *vaddr) 251 break; 252 253 /* 254 * If this mapping is for a pagetable, we drop down 255 * to the next level in the hierarchy and look for 256 * a mapping in it. 257 */ 258 pfn = PTE2MFN(pte, l); 259 if (!PTE_ISPAGE(pte, l)) 260 continue; 261 262 /* 263 * The APIC page is magic. Nothing to see here; 264 * move along. 265 */ 266 if (((uintptr_t)xpv_apicadr & MMU_PAGEMASK) == 267 (va & MMU_PAGEMASK)) { 268 va += MMU_PAGESIZE; 269 break; 270 } 271 272 /* 273 * See if the address is within one of the two 274 * kpm-like regions we want to skip. 275 */ 276 if (va >= kpm1_low && va < kpm1_high) { 277 va = kpm1_high; 278 break; 279 } 280 if (va >= kpm2_low && va < kpm2_high) { 281 va = kpm2_high; 282 break; 283 } 284 285 /* 286 * The Xen panic code only handles small pages. If 287 * this mapping is for a large page, we need to 288 * identify the consituent page that covers the 289 * specific VA we were looking for. 290 */ 291 if (l > 0) { 292 if (l > 1) 293 panic("Xen panic can't cope with " 294 "giant pages."); 295 idx = (va >> LEVEL_SHIFT(0)) & 296 (xpv_panic_nptes[0] - 1); 297 pfn += idx; 298 } 299 300 *vaddr = va; 301 lastva = va; 302 return (pfn | PFN_IS_FOREIGN_MFN); 303 } 304 } 305 return (PFN_INVALID); 306 } 307 308 /* 309 * Walk through the Xen VA space, finding pages that are mapped in. 310 * 311 * These pages all have MFNs rather than PFNs, meaning they may be outside 312 * the physical address space the kernel knows about, or they may collide 313 * with PFNs the kernel is using. 314 * 315 * The obvious trick of just adding the PFN_IS_FOREIGN_MFN bit to the MFNs 316 * to avoid collisions doesn't work. The pages need to be written to disk 317 * in PFN-order or savecore gets confused. We can't allocate memory to 318 * contruct a sorted pfn->VA reverse mapping, so we have to write the pages 319 * to disk in VA order. 320 * 321 * To square this circle, we simply make up PFNs for each of Xen's pages. 322 * We assign each mapped page a fake PFN in ascending order. These fake 323 * PFNs each have the FOREIGN bit set, ensuring that they fall outside the 324 * range of Solaris PFNs written by the kernel. 325 */ 326 int 327 dump_xpv_addr() 328 { 329 uintptr_t va; 330 mem_vtop_t mem_vtop; 331 332 xpv_dump_pages = 0; 333 va = xen_virt_start; 334 335 while (xpv_va_walk(&va) != PFN_INVALID) { 336 mem_vtop.m_as = &kas; 337 mem_vtop.m_va = (void *)va; 338 mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN; 339 340 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t)); 341 xpv_dump_pages++; 342 343 va += MMU_PAGESIZE; 344 } 345 346 /* 347 * Add the shared_info page. This page actually ends up in the 348 * dump twice: once for the Xen va and once for the Solaris va. 349 * This isn't ideal, but we don't know the address Xen is using for 350 * the page, so we can't share it. 351 */ 352 mem_vtop.m_as = &kas; 353 mem_vtop.m_va = HYPERVISOR_shared_info; 354 mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN; 355 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t)); 356 xpv_dump_pages++; 357 358 return (xpv_dump_pages); 359 } 360 361 void 362 dump_xpv_pfn() 363 { 364 pfn_t pfn; 365 int cnt; 366 367 for (cnt = 0; cnt < xpv_dump_pages; cnt++) { 368 pfn = (pfn_t)cnt | PFN_IS_FOREIGN_MFN; 369 dumpvp_write(&pfn, sizeof (pfn)); 370 } 371 } 372 373 int 374 dump_xpv_data(void *dump_cbuf) 375 { 376 uintptr_t va; 377 uint32_t csize; 378 int cnt = 0; 379 380 /* 381 * XXX: we should probably run this data through a UE check. The 382 * catch is that the UE code relies on on_trap() and getpfnum() 383 * working. 384 */ 385 va = xen_virt_start; 386 387 while (xpv_va_walk(&va) != PFN_INVALID) { 388 csize = (uint32_t)compress((void *)va, dump_cbuf, PAGESIZE); 389 dumpvp_write(&csize, sizeof (uint32_t)); 390 dumpvp_write(dump_cbuf, csize); 391 if (dump_ioerr) { 392 dumphdr->dump_flags &= ~DF_COMPLETE; 393 return (cnt); 394 } 395 cnt++; 396 va += MMU_PAGESIZE; 397 } 398 399 /* 400 * Finally, dump the shared_info page 401 */ 402 csize = (uint32_t)compress((void *)HYPERVISOR_shared_info, dump_cbuf, 403 PAGESIZE); 404 dumpvp_write(&csize, sizeof (uint32_t)); 405 dumpvp_write(dump_cbuf, csize); 406 if (dump_ioerr) 407 dumphdr->dump_flags &= ~DF_COMPLETE; 408 cnt++; 409 410 return (cnt); 411 } 412 413 static void * 414 showstack(void *fpreg, int xpv_only) 415 { 416 struct frame *fpp; 417 ulong_t off; 418 char *sym; 419 uintptr_t pc, fp, lastfp; 420 uintptr_t minaddr = min(KERNELBASE, xen_virt_start); 421 422 fp = (uintptr_t)fpreg; 423 if (fp < minaddr) { 424 xpv_panic_printf("Bad frame ptr: 0x%p\n", fpreg); 425 return (fpreg); 426 } 427 428 do { 429 fpp = (struct frame *)fp; 430 pc = fpp->fr_savpc; 431 432 if ((xpv_only != 0) && 433 (fp > xpv_end || fp < xen_virt_start)) 434 break; 435 if ((sym = kobj_getsymname(pc, &off)) != NULL) 436 xpv_panic_printf("%08lx %s:%s+%lx\n", fp, 437 mod_containing_pc((caddr_t)pc), sym, off); 438 else if ((pc >= xen_virt_start) && (pc <= xpv_end)) 439 xpv_panic_printf("%08lx 0x%lx (in Xen)\n", fp, pc); 440 else 441 xpv_panic_printf("%08lx %lx\n", fp, pc); 442 443 lastfp = fp; 444 fp = fpp->fr_savfp; 445 446 /* 447 * Xen marks an exception frame by inverting the frame 448 * pointer. 449 */ 450 if (fp < lastfp) { 451 if ((~fp > minaddr) && ((~fp) ^ lastfp) < 0xfff) 452 fp = ~fp; 453 } 454 } while (fp > lastfp); 455 return ((void *)fp); 456 } 457 458 void * 459 xpv_traceback(void *fpreg) 460 { 461 return (showstack(fpreg, 1)); 462 } 463 464 #if defined(__amd64) 465 static void 466 xpv_panic_hypercall(ulong_t call) 467 { 468 panic("Illegally issued hypercall %d during panic!\n", (int)call); 469 } 470 #endif 471 472 void 473 xpv_die(struct regs *rp) 474 { 475 struct panic_trap_info ti; 476 struct cregs creg; 477 478 ti.trap_regs = rp; 479 ti.trap_type = rp->r_trapno; 480 481 curthread->t_panic_trap = &ti; 482 if (ti.trap_type == T_PGFLT) { 483 getcregs(&creg); 484 ti.trap_addr = (caddr_t)creg.cr_cr2; 485 panic("Fatal pagefault at 0x%lx. fault addr=0x%p rp=0x%p", 486 rp->r_pc, (void *)ti.trap_addr, (void *)rp); 487 } else { 488 ti.trap_addr = (caddr_t)rp->r_pc; 489 panic("Fatal trap %ld at 0x%lx. rp=0x%p", rp->r_trapno, 490 rp->r_pc, (void *)rp); 491 } 492 } 493 494 /* 495 * Build IDT to handle a Xen panic 496 */ 497 static void 498 switch_to_xpv_panic_idt() 499 { 500 int i; 501 desctbr_t idtr; 502 gate_desc_t *idt = xpv_panic_idt; 503 selector_t cs = get_cs_register(); 504 505 for (i = 0; i < 32; i++) 506 set_gatesegd(&idt[i], &xpv_invaltrap, cs, SDT_SYSIGT, TRP_XPL, 507 0); 508 509 set_gatesegd(&idt[T_ZERODIV], &xpv_div0trap, cs, SDT_SYSIGT, TRP_XPL, 510 0); 511 set_gatesegd(&idt[T_SGLSTP], &xpv_dbgtrap, cs, SDT_SYSIGT, TRP_XPL, 0); 512 set_gatesegd(&idt[T_NMIFLT], &xpv_nmiint, cs, SDT_SYSIGT, TRP_XPL, 0); 513 set_gatesegd(&idt[T_BOUNDFLT], &xpv_boundstrap, cs, SDT_SYSIGT, 514 TRP_XPL, 0); 515 set_gatesegd(&idt[T_ILLINST], &xpv_invoptrap, cs, SDT_SYSIGT, TRP_XPL, 516 0); 517 set_gatesegd(&idt[T_NOEXTFLT], &xpv_ndptrap, cs, SDT_SYSIGT, TRP_XPL, 518 0); 519 set_gatesegd(&idt[T_TSSFLT], &xpv_invtsstrap, cs, SDT_SYSIGT, TRP_XPL, 520 0); 521 set_gatesegd(&idt[T_SEGFLT], &xpv_segnptrap, cs, SDT_SYSIGT, TRP_XPL, 522 0); 523 set_gatesegd(&idt[T_STKFLT], &xpv_stktrap, cs, SDT_SYSIGT, TRP_XPL, 0); 524 set_gatesegd(&idt[T_GPFLT], &xpv_gptrap, cs, SDT_SYSIGT, TRP_XPL, 0); 525 set_gatesegd(&idt[T_PGFLT], &xpv_pftrap, cs, SDT_SYSIGT, TRP_XPL, 0); 526 set_gatesegd(&idt[T_EXTERRFLT], &xpv_ndperr, cs, SDT_SYSIGT, TRP_XPL, 527 0); 528 set_gatesegd(&idt[T_ALIGNMENT], &xpv_achktrap, cs, SDT_SYSIGT, TRP_XPL, 529 0); 530 set_gatesegd(&idt[T_MCE], &xpv_mcetrap, cs, SDT_SYSIGT, TRP_XPL, 0); 531 set_gatesegd(&idt[T_SIMDFPE], &xpv_xmtrap, cs, SDT_SYSIGT, TRP_XPL, 0); 532 533 /* 534 * We have no double fault handler. Any single fault represents a 535 * catastrophic failure for us, so there is no attempt to handle 536 * them cleanly: we just print a message and reboot. If we 537 * encounter a second fault while doing that, there is nothing 538 * else we can do. 539 */ 540 541 /* 542 * Be prepared to absorb any stray device interrupts received 543 * while writing the core to disk. 544 */ 545 for (i = 33; i < NIDT; i++) 546 set_gatesegd(&idt[i], &xpv_surprise_intr, cs, SDT_SYSIGT, 547 TRP_XPL, 0); 548 549 /* The one interrupt we expect to get is from the APIC timer. */ 550 set_gatesegd(&idt[T_XPV_TIMER], &xpv_timer_trap, cs, SDT_SYSIGT, 551 TRP_XPL, 0); 552 553 idtr.dtr_base = (uintptr_t)xpv_panic_idt; 554 idtr.dtr_limit = sizeof (xpv_panic_idt) - 1; 555 wr_idtr(&idtr); 556 557 #if defined(__amd64) 558 /* Catch any hypercalls. */ 559 wrmsr(MSR_AMD_LSTAR, (uintptr_t)xpv_panic_hypercall); 560 wrmsr(MSR_AMD_CSTAR, (uintptr_t)xpv_panic_hypercall); 561 #endif 562 } 563 564 static void 565 xpv_apic_clkinit() 566 { 567 uint_t apic_ticks = 0; 568 569 /* 570 * Measure how many APIC ticks there are within a fixed time 571 * period. We're going to be fairly coarse here. This timer is 572 * just being used to detect a stalled panic, so as long as we have 573 * the right order of magnitude, everything should be fine. 574 */ 575 xpv_apicadr[APIC_SPUR_INT_REG] = AV_UNIT_ENABLE | APIC_SPUR_INTR; 576 xpv_apicadr[APIC_LOCAL_TIMER] = AV_MASK; 577 xpv_apicadr[APIC_INT_VECT0] = AV_MASK; /* local intr reg 0 */ 578 579 xpv_apicadr[APIC_DIVIDE_REG] = 0; 580 xpv_apicadr[APIC_INIT_COUNT] = APIC_MAXVAL; 581 drv_usecwait(XPV_TIMER_INTERVAL); 582 apic_ticks = APIC_MAXVAL - xpv_apicadr[APIC_CURR_COUNT]; 583 584 /* 585 * apic_ticks now represents roughly how many apic ticks comprise 586 * one timeout interval. Program the timer to send us an interrupt 587 * every time that interval expires. 588 */ 589 xpv_apicadr[APIC_LOCAL_TIMER] = T_XPV_TIMER | AV_PERIODIC; 590 xpv_apicadr[APIC_INIT_COUNT] = apic_ticks; 591 xpv_apicadr[APIC_EOI_REG] = 0; 592 } 593 594 void 595 xpv_timer_tick(void) 596 { 597 static int ticks = 0; 598 599 if (ticks++ >= MICROSEC / XPV_TIMER_INTERVAL) { 600 ticks = 0; 601 if (dump_timeleft && (--dump_timeleft == 0)) 602 panic("Xen panic timeout\n"); 603 } 604 xpv_apicadr[APIC_EOI_REG] = 0; 605 } 606 607 void 608 xpv_interrupt(void) 609 { 610 #ifdef DEBUG 611 static int cnt = 0; 612 613 if (cnt++ < 10) 614 xpv_panic_printf("Unexpected interrupt received.\n"); 615 if ((cnt < 1000) && ((cnt % 100) == 0)) 616 xpv_panic_printf("%d unexpected interrupts received.\n", cnt); 617 #endif 618 619 xpv_apicadr[APIC_EOI_REG] = 0; 620 } 621 622 /* 623 * Managing time in panic context is trivial. We only have a single CPU, 624 * we never get rescheduled, we never get suspended. We just need to 625 * convert clock ticks into nanoseconds. 626 */ 627 static hrtime_t 628 xpv_panic_gethrtime(void) 629 { 630 hrtime_t tsc, hrt; 631 unsigned int *l = (unsigned int *)&(tsc); 632 633 tsc = __rdtsc_insn(); 634 hrt = (mul32(l[1], nsec_scale) << NSEC_SHIFT) + 635 (mul32(l[0], nsec_scale) >> (32 - NSEC_SHIFT)); 636 637 return (hrt); 638 } 639 640 static void 641 xpv_panic_time_init() 642 { 643 nsec_scale = 644 CPU->cpu_m.mcpu_vcpu_info->time.tsc_to_system_mul >> NSEC_SHIFT; 645 646 gethrtimef = xpv_panic_gethrtime; 647 } 648 649 static void 650 xpv_panicsys(struct regs *rp, char *fmt, ...) 651 { 652 extern void panicsys(const char *, va_list, struct regs *, int); 653 va_list alist; 654 655 va_start(alist, fmt); 656 panicsys(fmt, alist, rp, 1); 657 va_end(alist); 658 } 659 660 void 661 xpv_do_panic(void *arg) 662 { 663 struct panic_info *pip = (struct panic_info *)arg; 664 int l; 665 struct cregs creg; 666 #if defined(__amd64) 667 extern uintptr_t postbootkernelbase; 668 #endif 669 670 if (xpv_panicking++ > 0) 671 panic("multiple calls to xpv_do_panic()"); 672 673 /* 674 * Indicate to the underlying panic framework that a panic has been 675 * initiated. This is ordinarily done as part of vpanic(). Since 676 * we already have all the register state saved by the hypervisor, 677 * we skip that and jump straight into the panic processing code. 678 * 679 * XXX If another thread grabs and wins the panic_quiesce trigger 680 * then we'll have two threads in panicsys believing they are in 681 * charge of the panic attempt! 682 */ 683 (void) panic_trigger(&panic_quiesce); 684 685 #if defined(__amd64) 686 /* 687 * bzero() and bcopy() get unhappy when asked to operate on 688 * addresses outside of the kernel. At this point Xen is really a 689 * part of the kernel, so we update the routines' notion of where 690 * the kernel starts. 691 */ 692 postbootkernelbase = xen_virt_start; 693 #endif 694 695 #if defined(HYPERVISOR_VIRT_END) 696 xpv_end = HYPERVISOR_VIRT_END; 697 #else 698 xpv_end = (uintptr_t)UINTPTR_MAX - sizeof (uintptr_t); 699 #endif 700 701 /* 702 * If we were redirecting console output to the hypervisor, we have 703 * to stop. 704 */ 705 use_polledio = B_FALSE; 706 if (boot_console_type(NULL) == CONS_HYPERVISOR) { 707 bcons_device_change(CONS_HYPERVISOR); 708 } else if (cons_polledio != NULL && 709 cons_polledio->cons_polledio_putchar != NULL) { 710 if (cons_polledio->cons_polledio_enter != NULL) 711 cons_polledio->cons_polledio_enter( 712 cons_polledio->cons_polledio_argument); 713 use_polledio = 1; 714 } 715 716 /* Make sure we handle all console output from here on. */ 717 sysp->bsvc_putchar = xpv_panic_putc; 718 719 /* 720 * If we find an unsupported panic_info structure, there's not much 721 * we can do other than complain, plow on, and hope for the best. 722 */ 723 if (pip->pi_version != PANIC_INFO_VERSION) 724 xpv_panic_printf("Warning: Xen is using an unsupported " 725 "version of the panic_info structure.\n"); 726 727 xpv_panic_info = pip; 728 729 #if defined(__amd64) 730 kpm1_low = (uintptr_t)xpv_panic_info->pi_ram_start; 731 if (xpv_panic_info->pi_xen_start == NULL) { 732 kpm1_high = (uintptr_t)xpv_panic_info->pi_ram_end; 733 } else { 734 kpm1_high = (uintptr_t)xpv_panic_info->pi_xen_start; 735 kpm2_low = (uintptr_t)xpv_panic_info->pi_xen_end; 736 kpm2_high = (uintptr_t)xpv_panic_info->pi_ram_end; 737 } 738 #endif 739 740 /* 741 * Make sure we are running on the Solaris %gs. The Xen panic code 742 * should already have set up the GDT properly. 743 */ 744 xpv_panic_resetgs(); 745 #if defined(__amd64) 746 wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]); 747 #endif 748 749 xpv_panic_time_init(); 750 751 /* 752 * Switch to our own IDT, avoiding any accidental returns to Xen 753 * world. 754 */ 755 switch_to_xpv_panic_idt(); 756 757 /* 758 * Initialize the APIC timer, which is used to detect a hung dump 759 * attempt. 760 */ 761 xpv_apicadr = pip->pi_apic; 762 xpv_apic_clkinit(); 763 764 /* 765 * Set up a few values that we'll need repeatedly. 766 */ 767 getcregs(&creg); 768 xpv_panic_cr3 = creg.cr_cr3; 769 for (l = mmu.max_level; l >= 0; l--) 770 xpv_panic_nptes[l] = mmu.ptes_per_table; 771 #ifdef __i386 772 if (mmu.pae_hat) 773 xpv_panic_nptes[mmu.max_level] = 4; 774 #endif 775 776 /* Add the fake Xen module to the module list */ 777 if (xpv_module != NULL) { 778 extern int last_module_id; 779 780 xpv_modctl->mod_id = last_module_id++; 781 xpv_modctl->mod_next = &modules; 782 xpv_modctl->mod_prev = modules.mod_prev; 783 modules.mod_prev->mod_next = xpv_modctl; 784 modules.mod_prev = xpv_modctl; 785 } 786 787 if (pip->pi_mca.mpd_magic == MCA_PANICDATA_MAGIC) 788 xpv_mca_panic_data = &pip->pi_mca; 789 790 xpv_panic_printf = printf; 791 xpv_panicsys((struct regs *)pip->pi_regs, pip->pi_panicstr); 792 xpv_panic_printf("Failed to reboot following panic.\n"); 793 for (;;) 794 ; 795 } 796 797 /* 798 * Set up the necessary data structures to pretend that the Xen hypervisor 799 * is a loadable module, allowing mdb to find the Xen symbols in a crash 800 * dump. Since these symbols all map to VA space Solaris doesn't normally 801 * have access to, we don't link these structures into the kernel's lists 802 * until/unless we hit a Xen panic. 803 * 804 * The observant reader will note a striking amount of overlap between this 805 * code and that found in krtld. While it would be handy if we could just 806 * ask krtld to do this work for us, it's not that simple. Among the 807 * complications: we're not actually loading the text here (grub did it at 808 * boot), the .text section is writable, there are no relocations to do, 809 * none of the module text/data is in readable memory, etc. Training krtld 810 * to deal with this weird module is as complicated, and more risky, than 811 * reimplementing the necessary subset of it here. 812 */ 813 static void 814 init_xen_module() 815 { 816 struct _buf *file = NULL; 817 struct module *mp; 818 struct modctl *mcp; 819 int i, shn; 820 Shdr *shp, *ctf_shp; 821 char *names = NULL; 822 size_t n, namesize, text_align, data_align; 823 #if defined(__amd64) 824 const char machine = EM_AMD64; 825 #else 826 const char machine = EM_386; 827 #endif 828 829 /* Allocate and init the module structure */ 830 mp = kmem_zalloc(sizeof (*mp), KM_SLEEP); 831 mp->filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP); 832 (void) strcpy(mp->filename, XPV_FILENAME); 833 834 /* Allocate and init the modctl structure */ 835 mcp = kmem_zalloc(sizeof (*mcp), KM_SLEEP); 836 mcp->mod_modname = kobj_zalloc(strlen(XPV_MODNAME) + 1, KM_SLEEP); 837 (void) strcpy(mcp->mod_modname, XPV_MODNAME); 838 mcp->mod_filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP); 839 (void) strcpy(mcp->mod_filename, XPV_FILENAME); 840 mcp->mod_inprogress_thread = (kthread_id_t)-1; 841 mcp->mod_ref = 1; 842 mcp->mod_loaded = 1; 843 mcp->mod_loadcnt = 1; 844 mcp->mod_mp = mp; 845 846 /* 847 * Try to open a Xen image that hasn't had its symbol and CTF 848 * information stripped off. 849 */ 850 file = kobj_open_file(XPV_FILENAME); 851 if (file == (struct _buf *)-1) { 852 file = NULL; 853 goto err; 854 } 855 856 /* 857 * Read the header and ensure that this is an ELF file for the 858 * proper ISA. If it's not, somebody has done something very 859 * stupid. Why bother? See Mencken. 860 */ 861 if (kobj_read_file(file, (char *)&mp->hdr, sizeof (mp->hdr), 0) < 0) 862 goto err; 863 for (i = 0; i < SELFMAG; i++) 864 if (mp->hdr.e_ident[i] != ELFMAG[i]) 865 goto err; 866 if ((mp->hdr.e_ident[EI_DATA] != ELFDATA2LSB) || 867 (mp->hdr.e_machine != machine)) 868 goto err; 869 870 /* Read in the section headers */ 871 n = mp->hdr.e_shentsize * mp->hdr.e_shnum; 872 mp->shdrs = kmem_zalloc(n, KM_SLEEP); 873 if (kobj_read_file(file, mp->shdrs, n, mp->hdr.e_shoff) < 0) 874 goto err; 875 876 /* Read the section names */ 877 shp = (Shdr *)(mp->shdrs + mp->hdr.e_shstrndx * mp->hdr.e_shentsize); 878 namesize = shp->sh_size; 879 names = kmem_zalloc(shp->sh_size, KM_SLEEP); 880 if (kobj_read_file(file, names, shp->sh_size, shp->sh_offset) < 0) 881 goto err; 882 883 /* 884 * Fill in the text and data size fields. 885 */ 886 ctf_shp = NULL; 887 text_align = data_align = 0; 888 for (shn = 1; shn < mp->hdr.e_shnum; shn++) { 889 shp = (Shdr *)(mp->shdrs + shn * mp->hdr.e_shentsize); 890 891 /* Sanity check the offset of the section name */ 892 if (shp->sh_name >= namesize) 893 continue; 894 895 /* If we find the symtab section, remember it for later. */ 896 if (shp->sh_type == SHT_SYMTAB) { 897 mp->symtbl_section = shn; 898 mp->symhdr = shp; 899 continue; 900 } 901 902 /* If we find the CTF section, remember it for later. */ 903 if ((shp->sh_size != 0) && 904 (strcmp(names + shp->sh_name, ".SUNW_ctf") == 0)) { 905 ctf_shp = shp; 906 continue; 907 } 908 909 if (!(shp->sh_flags & SHF_ALLOC)) 910 continue; 911 912 /* 913 * Xen marks its text section as writable, so we need to 914 * look for the name - not just the flag. 915 */ 916 if ((strcmp(&names[shp->sh_name], ".text") != 0) && 917 (shp->sh_flags & SHF_WRITE) != 0) { 918 if (shp->sh_addralign > data_align) 919 data_align = shp->sh_addralign; 920 mp->data_size = ALIGN(mp->data_size, data_align); 921 mp->data_size += ALIGN(shp->sh_size, 8); 922 if (mp->data == NULL || mp->data > (char *)shp->sh_addr) 923 mp->data = (char *)shp->sh_addr; 924 } else { 925 if (shp->sh_addralign > text_align) 926 text_align = shp->sh_addralign; 927 mp->text_size = ALIGN(mp->text_size, text_align); 928 mp->text_size += ALIGN(shp->sh_size, 8); 929 if (mp->text == NULL || mp->text > (char *)shp->sh_addr) 930 mp->text = (char *)shp->sh_addr; 931 } 932 } 933 kmem_free(names, namesize); 934 names = NULL; 935 shp = NULL; 936 mcp->mod_text = mp->text; 937 mcp->mod_text_size = mp->text_size; 938 939 /* 940 * If we have symbol table and string table sections, read them in 941 * now. If we don't, we just plow on. We'll still get a valid 942 * core dump, but finding anything useful will be just a bit 943 * harder. 944 * 945 * Note: we don't bother with a hash table. We'll never do a 946 * symbol lookup unless we crash, and then mdb creates its own. We 947 * also don't try to perform any relocations. Xen should be loaded 948 * exactly where the ELF file indicates, and the symbol information 949 * in the file should be complete and correct already. Static 950 * linking ain't all bad. 951 */ 952 if ((mp->symhdr != NULL) && (mp->symhdr->sh_link < mp->hdr.e_shnum)) { 953 mp->strhdr = (Shdr *) 954 (mp->shdrs + mp->symhdr->sh_link * mp->hdr.e_shentsize); 955 mp->nsyms = mp->symhdr->sh_size / mp->symhdr->sh_entsize; 956 957 /* Allocate space for the symbol table and strings. */ 958 mp->symsize = mp->symhdr->sh_size + 959 mp->nsyms * sizeof (symid_t) + mp->strhdr->sh_size; 960 mp->symspace = kmem_zalloc(mp->symsize, KM_SLEEP); 961 mp->symtbl = mp->symspace; 962 mp->strings = (char *)(mp->symtbl + mp->symhdr->sh_size); 963 964 if ((kobj_read_file(file, mp->symtbl, 965 mp->symhdr->sh_size, mp->symhdr->sh_offset) < 0) || 966 (kobj_read_file(file, mp->strings, 967 mp->strhdr->sh_size, mp->strhdr->sh_offset) < 0)) 968 goto err; 969 } 970 971 /* 972 * Read in the CTF section 973 */ 974 if ((ctf_shp != NULL) && ((moddebug & MODDEBUG_NOCTF) == 0)) { 975 mp->ctfdata = kmem_zalloc(ctf_shp->sh_size, KM_SLEEP); 976 mp->ctfsize = ctf_shp->sh_size; 977 if (kobj_read_file(file, mp->ctfdata, mp->ctfsize, 978 ctf_shp->sh_offset) < 0) 979 goto err; 980 } 981 982 kobj_close_file(file); 983 984 xpv_module = mp; 985 xpv_modctl = mcp; 986 return; 987 988 err: 989 cmn_err(CE_WARN, "Failed to initialize xpv module."); 990 if (file != NULL) 991 kobj_close_file(file); 992 993 kmem_free(mp->filename, strlen(XPV_FILENAME) + 1); 994 if (mp->shdrs != NULL) 995 kmem_free(mp->shdrs, mp->hdr.e_shentsize * mp->hdr.e_shnum); 996 if (mp->symspace != NULL) 997 kmem_free(mp->symspace, mp->symsize); 998 if (mp->ctfdata != NULL) 999 kmem_free(mp->ctfdata, mp->ctfsize); 1000 kmem_free(mp, sizeof (*mp)); 1001 kmem_free(mcp->mod_filename, strlen(XPV_FILENAME) + 1); 1002 kmem_free(mcp->mod_modname, strlen(XPV_MODNAME) + 1); 1003 kmem_free(mcp, sizeof (*mcp)); 1004 if (names != NULL) 1005 kmem_free(names, namesize); 1006 } 1007 1008 void 1009 xpv_panic_init() 1010 { 1011 xen_platform_op_t op; 1012 int i; 1013 1014 ASSERT(DOMAIN_IS_INITDOMAIN(xen_info)); 1015 1016 for (i = 0; i < mmu.num_level; i++) 1017 ptable_pfn[i] = PFN_INVALID; 1018 1019 /* Let Xen know where to jump if/when it panics. */ 1020 op.cmd = XENPF_panic_init; 1021 op.interface_version = XENPF_INTERFACE_VERSION; 1022 op.u.panic_init.panic_addr = (unsigned long)xpv_panic_hdlr; 1023 1024 (void) HYPERVISOR_platform_op(&op); 1025 1026 init_xen_module(); 1027 } 1028