1*843e1988Sjohnlev /* 2*843e1988Sjohnlev * CDDL HEADER START 3*843e1988Sjohnlev * 4*843e1988Sjohnlev * The contents of this file are subject to the terms of the 5*843e1988Sjohnlev * Common Development and Distribution License (the "License"). 6*843e1988Sjohnlev * You may not use this file except in compliance with the License. 7*843e1988Sjohnlev * 8*843e1988Sjohnlev * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9*843e1988Sjohnlev * or http://www.opensolaris.org/os/licensing. 10*843e1988Sjohnlev * See the License for the specific language governing permissions 11*843e1988Sjohnlev * and limitations under the License. 12*843e1988Sjohnlev * 13*843e1988Sjohnlev * When distributing Covered Code, include this CDDL HEADER in each 14*843e1988Sjohnlev * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15*843e1988Sjohnlev * If applicable, add the following below this CDDL HEADER, with the 16*843e1988Sjohnlev * fields enclosed by brackets "[]" replaced with your own identifying 17*843e1988Sjohnlev * information: Portions Copyright [yyyy] [name of copyright owner] 18*843e1988Sjohnlev * 19*843e1988Sjohnlev * CDDL HEADER END 20*843e1988Sjohnlev */ 21*843e1988Sjohnlev 22*843e1988Sjohnlev /* 23*843e1988Sjohnlev * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24*843e1988Sjohnlev * Use is subject to license terms. 25*843e1988Sjohnlev */ 26*843e1988Sjohnlev 27*843e1988Sjohnlev #pragma ident "%Z%%M% %I% %E% SMI" 28*843e1988Sjohnlev 29*843e1988Sjohnlev #include <sys/types.h> 30*843e1988Sjohnlev #include <sys/clock.h> 31*843e1988Sjohnlev #include <sys/psm.h> 32*843e1988Sjohnlev #include <sys/archsystm.h> 33*843e1988Sjohnlev #include <sys/machsystm.h> 34*843e1988Sjohnlev #include <sys/compress.h> 35*843e1988Sjohnlev #include <sys/modctl.h> 36*843e1988Sjohnlev #include <sys/trap.h> 37*843e1988Sjohnlev #include <sys/panic.h> 38*843e1988Sjohnlev #include <sys/regset.h> 39*843e1988Sjohnlev #include <sys/frame.h> 40*843e1988Sjohnlev #include <sys/kobj.h> 41*843e1988Sjohnlev #include <sys/apic.h> 42*843e1988Sjohnlev #include <sys/dumphdr.h> 43*843e1988Sjohnlev #include <sys/mem.h> 44*843e1988Sjohnlev #include <sys/x86_archext.h> 45*843e1988Sjohnlev #include <sys/xpv_panic.h> 46*843e1988Sjohnlev #include <sys/boot_console.h> 47*843e1988Sjohnlev #include <sys/bootsvcs.h> 48*843e1988Sjohnlev #include <sys/consdev.h> 49*843e1988Sjohnlev #include <vm/hat_pte.h> 50*843e1988Sjohnlev #include <vm/hat_i86.h> 51*843e1988Sjohnlev 52*843e1988Sjohnlev /* XXX: need to add a PAE version too, if we ever support both PAE and non */ 53*843e1988Sjohnlev #if defined(__i386) 54*843e1988Sjohnlev #define XPV_FILENAME "/boot/xen-syms" 55*843e1988Sjohnlev #else 56*843e1988Sjohnlev #define XPV_FILENAME "/boot/amd64/xen-syms" 57*843e1988Sjohnlev #endif 58*843e1988Sjohnlev #define XPV_MODNAME "xpv" 59*843e1988Sjohnlev 60*843e1988Sjohnlev int xpv_panicking = 0; 61*843e1988Sjohnlev 62*843e1988Sjohnlev struct module *xpv_module; 63*843e1988Sjohnlev struct modctl *xpv_modctl; 64*843e1988Sjohnlev 65*843e1988Sjohnlev #define ALIGN(x, a) ((a) == 0 ? (uintptr_t)(x) : \ 66*843e1988Sjohnlev (((uintptr_t)(x) + (uintptr_t)(a) - 1l) & ~((uintptr_t)(a) - 1l))) 67*843e1988Sjohnlev 68*843e1988Sjohnlev /* Pointer to the xpv_panic_info structure handed to us by Xen. */ 69*843e1988Sjohnlev static struct panic_info *xpv_panic_info = NULL; 70*843e1988Sjohnlev 71*843e1988Sjohnlev /* Timer support */ 72*843e1988Sjohnlev #define NSEC_SHIFT 5 73*843e1988Sjohnlev #define T_XPV_TIMER 0xd1 74*843e1988Sjohnlev #define XPV_TIMER_INTERVAL 1000 /* 1000 microseconds */ 75*843e1988Sjohnlev static uint32_t *xpv_apicadr = NULL; 76*843e1988Sjohnlev static uint_t nsec_scale; 77*843e1988Sjohnlev 78*843e1988Sjohnlev /* IDT support */ 79*843e1988Sjohnlev #pragma align 16(xpv_panic_idt) 80*843e1988Sjohnlev static gate_desc_t xpv_panic_idt[NIDT]; /* interrupt descriptor table */ 81*843e1988Sjohnlev 82*843e1988Sjohnlev /* Xen pagetables mapped into our HAT's ptable windows */ 83*843e1988Sjohnlev static pfn_t ptable_pfn[MAX_NUM_LEVEL]; 84*843e1988Sjohnlev 85*843e1988Sjohnlev /* Number of MMU_PAGESIZE pages we're adding to the Solaris dump */ 86*843e1988Sjohnlev static int xpv_dump_pages; 87*843e1988Sjohnlev 88*843e1988Sjohnlev /* 89*843e1988Sjohnlev * Some commonly used values that we don't want to recompute over and over. 90*843e1988Sjohnlev */ 91*843e1988Sjohnlev static int xpv_panic_nptes[MAX_NUM_LEVEL]; 92*843e1988Sjohnlev static ulong_t xpv_panic_cr3; 93*843e1988Sjohnlev static uintptr_t xpv_end; 94*843e1988Sjohnlev 95*843e1988Sjohnlev static void xpv_panic_console_print(const char *fmt, ...); 96*843e1988Sjohnlev static void (*xpv_panic_printf)(const char *, ...) = xpv_panic_console_print; 97*843e1988Sjohnlev 98*843e1988Sjohnlev #define CONSOLE_BUF_SIZE 256 99*843e1988Sjohnlev static char console_buffer[CONSOLE_BUF_SIZE]; 100*843e1988Sjohnlev static boolean_t use_polledio; 101*843e1988Sjohnlev 102*843e1988Sjohnlev static void 103*843e1988Sjohnlev xpv_panic_putc(int m) 104*843e1988Sjohnlev { 105*843e1988Sjohnlev struct cons_polledio *c = cons_polledio; 106*843e1988Sjohnlev 107*843e1988Sjohnlev /* This really shouldn't happen */ 108*843e1988Sjohnlev if (console == CONS_HYPERVISOR) 109*843e1988Sjohnlev return; 110*843e1988Sjohnlev 111*843e1988Sjohnlev if (use_polledio == B_TRUE) 112*843e1988Sjohnlev c->cons_polledio_putchar(c->cons_polledio_argument, m); 113*843e1988Sjohnlev else 114*843e1988Sjohnlev bcons_putchar(m); 115*843e1988Sjohnlev } 116*843e1988Sjohnlev 117*843e1988Sjohnlev static void 118*843e1988Sjohnlev xpv_panic_puts(char *msg) 119*843e1988Sjohnlev { 120*843e1988Sjohnlev char *m; 121*843e1988Sjohnlev 122*843e1988Sjohnlev dump_timeleft = dump_timeout; 123*843e1988Sjohnlev for (m = msg; *m; m++) 124*843e1988Sjohnlev xpv_panic_putc((int)*m); 125*843e1988Sjohnlev } 126*843e1988Sjohnlev 127*843e1988Sjohnlev static void 128*843e1988Sjohnlev xpv_panic_console_print(const char *fmt, ...) 129*843e1988Sjohnlev { 130*843e1988Sjohnlev va_list ap; 131*843e1988Sjohnlev 132*843e1988Sjohnlev va_start(ap, fmt); 133*843e1988Sjohnlev (void) vsnprintf(console_buffer, sizeof (console_buffer), fmt, ap); 134*843e1988Sjohnlev va_end(ap); 135*843e1988Sjohnlev 136*843e1988Sjohnlev xpv_panic_puts(console_buffer); 137*843e1988Sjohnlev } 138*843e1988Sjohnlev 139*843e1988Sjohnlev static void 140*843e1988Sjohnlev xpv_panic_map(int level, pfn_t pfn) 141*843e1988Sjohnlev { 142*843e1988Sjohnlev x86pte_t pte, *pteptr; 143*843e1988Sjohnlev 144*843e1988Sjohnlev /* 145*843e1988Sjohnlev * The provided pfn represents a level 'level' page table. Map it 146*843e1988Sjohnlev * into the 'level' slot in the list of page table windows. 147*843e1988Sjohnlev */ 148*843e1988Sjohnlev pteptr = (x86pte_t *)PWIN_PTE_VA(level); 149*843e1988Sjohnlev pte = pfn_to_pa(pfn) | PT_VALID; 150*843e1988Sjohnlev 151*843e1988Sjohnlev XPV_ALLOW_PAGETABLE_UPDATES(); 152*843e1988Sjohnlev if (mmu.pae_hat) 153*843e1988Sjohnlev *pteptr = pte; 154*843e1988Sjohnlev else 155*843e1988Sjohnlev *(x86pte32_t *)pteptr = pte; 156*843e1988Sjohnlev XPV_DISALLOW_PAGETABLE_UPDATES(); 157*843e1988Sjohnlev 158*843e1988Sjohnlev mmu_tlbflush_entry(PWIN_VA(level)); 159*843e1988Sjohnlev } 160*843e1988Sjohnlev 161*843e1988Sjohnlev /* 162*843e1988Sjohnlev * Walk the page tables to find the pfn mapped by the given va. 163*843e1988Sjohnlev */ 164*843e1988Sjohnlev static pfn_t 165*843e1988Sjohnlev xpv_va_walk(uintptr_t *vaddr) 166*843e1988Sjohnlev { 167*843e1988Sjohnlev int l, idx; 168*843e1988Sjohnlev pfn_t pfn; 169*843e1988Sjohnlev x86pte_t pte; 170*843e1988Sjohnlev x86pte_t *ptep; 171*843e1988Sjohnlev uintptr_t va = *vaddr; 172*843e1988Sjohnlev uintptr_t scan_va; 173*843e1988Sjohnlev caddr_t ptable_window; 174*843e1988Sjohnlev static pfn_t toplevel_pfn; 175*843e1988Sjohnlev static uintptr_t lastva; 176*843e1988Sjohnlev 177*843e1988Sjohnlev /* 178*843e1988Sjohnlev * If we do anything other than a simple scan through memory, don't 179*843e1988Sjohnlev * trust the mapped page tables. 180*843e1988Sjohnlev */ 181*843e1988Sjohnlev if (va != lastva + MMU_PAGESIZE) 182*843e1988Sjohnlev for (l = mmu.max_level; l >= 0; l--) 183*843e1988Sjohnlev ptable_pfn[l] = PFN_INVALID; 184*843e1988Sjohnlev 185*843e1988Sjohnlev toplevel_pfn = mmu_btop(xpv_panic_cr3); 186*843e1988Sjohnlev 187*843e1988Sjohnlev while (va < xpv_end && va >= *vaddr) { 188*843e1988Sjohnlev /* Find the lowest table with any entry for va */ 189*843e1988Sjohnlev pfn = toplevel_pfn; 190*843e1988Sjohnlev for (l = mmu.max_level; l >= 0; l--) { 191*843e1988Sjohnlev if (ptable_pfn[l] != pfn) { 192*843e1988Sjohnlev xpv_panic_map(l, pfn); 193*843e1988Sjohnlev ptable_pfn[l] = pfn; 194*843e1988Sjohnlev } 195*843e1988Sjohnlev 196*843e1988Sjohnlev /* 197*843e1988Sjohnlev * Search this pagetable for any mapping to an 198*843e1988Sjohnlev * address >= va. 199*843e1988Sjohnlev */ 200*843e1988Sjohnlev ptable_window = PWIN_VA(l); 201*843e1988Sjohnlev if (l == mmu.max_level && mmu.pae_hat) 202*843e1988Sjohnlev ptable_window += 203*843e1988Sjohnlev (xpv_panic_cr3 & MMU_PAGEOFFSET); 204*843e1988Sjohnlev 205*843e1988Sjohnlev idx = (va >> LEVEL_SHIFT(l)) & (xpv_panic_nptes[l] - 1); 206*843e1988Sjohnlev scan_va = va; 207*843e1988Sjohnlev while (idx < xpv_panic_nptes[l] && scan_va < xpv_end && 208*843e1988Sjohnlev scan_va >= *vaddr) { 209*843e1988Sjohnlev ptep = (x86pte_t *)(ptable_window + 210*843e1988Sjohnlev (idx << mmu.pte_size_shift)); 211*843e1988Sjohnlev pte = GET_PTE(ptep); 212*843e1988Sjohnlev if (pte & PTE_VALID) 213*843e1988Sjohnlev break; 214*843e1988Sjohnlev idx++; 215*843e1988Sjohnlev scan_va += mmu.level_size[l]; 216*843e1988Sjohnlev } 217*843e1988Sjohnlev va = scan_va; 218*843e1988Sjohnlev 219*843e1988Sjohnlev /* 220*843e1988Sjohnlev * See if we've hit the end of the range. 221*843e1988Sjohnlev */ 222*843e1988Sjohnlev if (scan_va >= xpv_end || scan_va < *vaddr) { 223*843e1988Sjohnlev va = scan_va; 224*843e1988Sjohnlev break; 225*843e1988Sjohnlev } 226*843e1988Sjohnlev 227*843e1988Sjohnlev /* 228*843e1988Sjohnlev * If there are no valid mappings in this table, we 229*843e1988Sjohnlev * can skip to the end of the VA range it covers. 230*843e1988Sjohnlev */ 231*843e1988Sjohnlev if (idx == xpv_panic_nptes[l]) { 232*843e1988Sjohnlev va = NEXT_ENTRY_VA(va, l + 1); 233*843e1988Sjohnlev break; 234*843e1988Sjohnlev } 235*843e1988Sjohnlev 236*843e1988Sjohnlev /* 237*843e1988Sjohnlev * If this mapping is for a pagetable, we drop down 238*843e1988Sjohnlev * to the next level in the hierarchy and look for 239*843e1988Sjohnlev * a mapping in it. 240*843e1988Sjohnlev */ 241*843e1988Sjohnlev pfn = PTE2MFN(pte, l); 242*843e1988Sjohnlev if (!PTE_ISPAGE(pte, l)) 243*843e1988Sjohnlev continue; 244*843e1988Sjohnlev 245*843e1988Sjohnlev /* 246*843e1988Sjohnlev * The APIC page is magic. Nothing to see here; 247*843e1988Sjohnlev * move along. 248*843e1988Sjohnlev */ 249*843e1988Sjohnlev if (((uintptr_t)xpv_apicadr & MMU_PAGEMASK) == 250*843e1988Sjohnlev (va & MMU_PAGEMASK)) { 251*843e1988Sjohnlev va += MMU_PAGESIZE; 252*843e1988Sjohnlev break; 253*843e1988Sjohnlev } 254*843e1988Sjohnlev 255*843e1988Sjohnlev /* We also want to skip the Xen version of KPM */ 256*843e1988Sjohnlev if (va >= (uintptr_t)xpv_panic_info->pi_ram_start && 257*843e1988Sjohnlev va < (uintptr_t)xpv_panic_info->pi_ram_end) { 258*843e1988Sjohnlev va = (uintptr_t)xpv_panic_info->pi_ram_end; 259*843e1988Sjohnlev break; 260*843e1988Sjohnlev } 261*843e1988Sjohnlev 262*843e1988Sjohnlev /* 263*843e1988Sjohnlev * The Xen panic code only handles small pages. If 264*843e1988Sjohnlev * this mapping is for a large page, we need to 265*843e1988Sjohnlev * identify the consituent page that covers the 266*843e1988Sjohnlev * specific VA we were looking for. 267*843e1988Sjohnlev */ 268*843e1988Sjohnlev if (l > 0) { 269*843e1988Sjohnlev if (l > 1) 270*843e1988Sjohnlev panic("Xen panic can't cope with " 271*843e1988Sjohnlev "giant pages."); 272*843e1988Sjohnlev idx = (va >> LEVEL_SHIFT(0)) & 273*843e1988Sjohnlev (xpv_panic_nptes[0] - 1); 274*843e1988Sjohnlev pfn += idx; 275*843e1988Sjohnlev } 276*843e1988Sjohnlev 277*843e1988Sjohnlev *vaddr = va; 278*843e1988Sjohnlev lastva = va; 279*843e1988Sjohnlev return (pfn | PFN_IS_FOREIGN_MFN); 280*843e1988Sjohnlev } 281*843e1988Sjohnlev } 282*843e1988Sjohnlev return (PFN_INVALID); 283*843e1988Sjohnlev } 284*843e1988Sjohnlev 285*843e1988Sjohnlev /* 286*843e1988Sjohnlev * Walk through the Xen VA space, finding pages that are mapped in. 287*843e1988Sjohnlev * 288*843e1988Sjohnlev * These pages all have MFNs rather than PFNs, meaning they may be outside 289*843e1988Sjohnlev * the physical address space the kernel knows about, or they may collide 290*843e1988Sjohnlev * with PFNs the kernel is using. 291*843e1988Sjohnlev * 292*843e1988Sjohnlev * The obvious trick of just adding the PFN_IS_FOREIGN_MFN bit to the MFNs 293*843e1988Sjohnlev * to avoid collisions doesn't work. The pages need to be written to disk 294*843e1988Sjohnlev * in PFN-order or savecore gets confused. We can't allocate memory to 295*843e1988Sjohnlev * contruct a sorted pfn->VA reverse mapping, so we have to write the pages 296*843e1988Sjohnlev * to disk in VA order. 297*843e1988Sjohnlev * 298*843e1988Sjohnlev * To square this circle, we simply make up PFNs for each of Xen's pages. 299*843e1988Sjohnlev * We assign each mapped page a fake PFN in ascending order. These fake 300*843e1988Sjohnlev * PFNs each have the FOREIGN bit set, ensuring that they fall outside the 301*843e1988Sjohnlev * range of Solaris PFNs written by the kernel. 302*843e1988Sjohnlev */ 303*843e1988Sjohnlev int 304*843e1988Sjohnlev dump_xpv_addr() 305*843e1988Sjohnlev { 306*843e1988Sjohnlev uintptr_t va; 307*843e1988Sjohnlev mem_vtop_t mem_vtop; 308*843e1988Sjohnlev 309*843e1988Sjohnlev xpv_dump_pages = 0; 310*843e1988Sjohnlev va = xen_virt_start; 311*843e1988Sjohnlev 312*843e1988Sjohnlev while (xpv_va_walk(&va) != PFN_INVALID) { 313*843e1988Sjohnlev mem_vtop.m_as = &kas; 314*843e1988Sjohnlev mem_vtop.m_va = (void *)va; 315*843e1988Sjohnlev mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN; 316*843e1988Sjohnlev 317*843e1988Sjohnlev dumpvp_write(&mem_vtop, sizeof (mem_vtop_t)); 318*843e1988Sjohnlev xpv_dump_pages++; 319*843e1988Sjohnlev 320*843e1988Sjohnlev va += MMU_PAGESIZE; 321*843e1988Sjohnlev } 322*843e1988Sjohnlev 323*843e1988Sjohnlev /* 324*843e1988Sjohnlev * Add the shared_info page. This page actually ends up in the 325*843e1988Sjohnlev * dump twice: once for the Xen va and once for the Solaris va. 326*843e1988Sjohnlev * This isn't ideal, but we don't know the address Xen is using for 327*843e1988Sjohnlev * the page, so we can't share it. 328*843e1988Sjohnlev */ 329*843e1988Sjohnlev mem_vtop.m_as = &kas; 330*843e1988Sjohnlev mem_vtop.m_va = HYPERVISOR_shared_info; 331*843e1988Sjohnlev mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN; 332*843e1988Sjohnlev dumpvp_write(&mem_vtop, sizeof (mem_vtop_t)); 333*843e1988Sjohnlev xpv_dump_pages++; 334*843e1988Sjohnlev 335*843e1988Sjohnlev return (xpv_dump_pages); 336*843e1988Sjohnlev } 337*843e1988Sjohnlev 338*843e1988Sjohnlev void 339*843e1988Sjohnlev dump_xpv_pfn() 340*843e1988Sjohnlev { 341*843e1988Sjohnlev pfn_t pfn; 342*843e1988Sjohnlev int cnt; 343*843e1988Sjohnlev 344*843e1988Sjohnlev for (cnt = 0; cnt < xpv_dump_pages; cnt++) { 345*843e1988Sjohnlev pfn = (pfn_t)cnt | PFN_IS_FOREIGN_MFN; 346*843e1988Sjohnlev dumpvp_write(&pfn, sizeof (pfn)); 347*843e1988Sjohnlev } 348*843e1988Sjohnlev } 349*843e1988Sjohnlev 350*843e1988Sjohnlev int 351*843e1988Sjohnlev dump_xpv_data(void *dump_cbuf) 352*843e1988Sjohnlev { 353*843e1988Sjohnlev uintptr_t va; 354*843e1988Sjohnlev uint32_t csize; 355*843e1988Sjohnlev int cnt = 0; 356*843e1988Sjohnlev 357*843e1988Sjohnlev /* 358*843e1988Sjohnlev * XXX: we should probably run this data through a UE check. The 359*843e1988Sjohnlev * catch is that the UE code relies on on_trap() and getpfnum() 360*843e1988Sjohnlev * working. 361*843e1988Sjohnlev */ 362*843e1988Sjohnlev va = xen_virt_start; 363*843e1988Sjohnlev 364*843e1988Sjohnlev while (xpv_va_walk(&va) != PFN_INVALID) { 365*843e1988Sjohnlev csize = (uint32_t)compress((void *)va, dump_cbuf, PAGESIZE); 366*843e1988Sjohnlev dumpvp_write(&csize, sizeof (uint32_t)); 367*843e1988Sjohnlev dumpvp_write(dump_cbuf, csize); 368*843e1988Sjohnlev if (dump_ioerr) { 369*843e1988Sjohnlev dumphdr->dump_flags &= ~DF_COMPLETE; 370*843e1988Sjohnlev return (cnt); 371*843e1988Sjohnlev } 372*843e1988Sjohnlev cnt++; 373*843e1988Sjohnlev va += MMU_PAGESIZE; 374*843e1988Sjohnlev } 375*843e1988Sjohnlev 376*843e1988Sjohnlev /* 377*843e1988Sjohnlev * Finally, dump the shared_info page 378*843e1988Sjohnlev */ 379*843e1988Sjohnlev csize = (uint32_t)compress((void *)HYPERVISOR_shared_info, dump_cbuf, 380*843e1988Sjohnlev PAGESIZE); 381*843e1988Sjohnlev dumpvp_write(&csize, sizeof (uint32_t)); 382*843e1988Sjohnlev dumpvp_write(dump_cbuf, csize); 383*843e1988Sjohnlev if (dump_ioerr) 384*843e1988Sjohnlev dumphdr->dump_flags &= ~DF_COMPLETE; 385*843e1988Sjohnlev cnt++; 386*843e1988Sjohnlev 387*843e1988Sjohnlev return (cnt); 388*843e1988Sjohnlev } 389*843e1988Sjohnlev 390*843e1988Sjohnlev static void * 391*843e1988Sjohnlev showstack(void *fpreg, int xpv_only) 392*843e1988Sjohnlev { 393*843e1988Sjohnlev struct frame *fpp; 394*843e1988Sjohnlev ulong_t off; 395*843e1988Sjohnlev char *sym; 396*843e1988Sjohnlev uintptr_t pc, fp, lastfp; 397*843e1988Sjohnlev uintptr_t minaddr = min(KERNELBASE, xen_virt_start); 398*843e1988Sjohnlev 399*843e1988Sjohnlev fp = (uintptr_t)fpreg; 400*843e1988Sjohnlev if (fp < minaddr) { 401*843e1988Sjohnlev xpv_panic_printf("Bad frame ptr: 0x%p\n", fpreg); 402*843e1988Sjohnlev return (fpreg); 403*843e1988Sjohnlev } 404*843e1988Sjohnlev 405*843e1988Sjohnlev do { 406*843e1988Sjohnlev fpp = (struct frame *)fp; 407*843e1988Sjohnlev pc = fpp->fr_savpc; 408*843e1988Sjohnlev 409*843e1988Sjohnlev if ((xpv_only != 0) && 410*843e1988Sjohnlev (fp > xpv_end || fp < xen_virt_start)) 411*843e1988Sjohnlev break; 412*843e1988Sjohnlev if ((sym = kobj_getsymname(pc, &off)) != NULL) 413*843e1988Sjohnlev xpv_panic_printf("%08lx %s:%s+%lx\n", fp, 414*843e1988Sjohnlev mod_containing_pc((caddr_t)pc), sym, off); 415*843e1988Sjohnlev else if ((pc >= xen_virt_start) && (pc <= xpv_end)) 416*843e1988Sjohnlev xpv_panic_printf("%08lx 0x%lx (in Xen)\n", fp, pc); 417*843e1988Sjohnlev else 418*843e1988Sjohnlev xpv_panic_printf("%08lx %lx\n", fp, pc); 419*843e1988Sjohnlev 420*843e1988Sjohnlev lastfp = fp; 421*843e1988Sjohnlev fp = fpp->fr_savfp; 422*843e1988Sjohnlev 423*843e1988Sjohnlev /* 424*843e1988Sjohnlev * Xen marks an exception frame by inverting the frame 425*843e1988Sjohnlev * pointer. 426*843e1988Sjohnlev */ 427*843e1988Sjohnlev if (fp < lastfp) { 428*843e1988Sjohnlev if ((~fp > minaddr) && ((~fp) ^ lastfp) < 0xfff) 429*843e1988Sjohnlev fp = ~fp; 430*843e1988Sjohnlev } 431*843e1988Sjohnlev } while (fp > lastfp); 432*843e1988Sjohnlev return ((void *)fp); 433*843e1988Sjohnlev } 434*843e1988Sjohnlev 435*843e1988Sjohnlev void * 436*843e1988Sjohnlev xpv_traceback(void *fpreg) 437*843e1988Sjohnlev { 438*843e1988Sjohnlev return (showstack(fpreg, 1)); 439*843e1988Sjohnlev } 440*843e1988Sjohnlev 441*843e1988Sjohnlev #if defined(__amd64) 442*843e1988Sjohnlev static void 443*843e1988Sjohnlev xpv_panic_hypercall(ulong_t call) 444*843e1988Sjohnlev { 445*843e1988Sjohnlev panic("Illegally issued hypercall %d during panic!\n", (int)call); 446*843e1988Sjohnlev } 447*843e1988Sjohnlev #endif 448*843e1988Sjohnlev 449*843e1988Sjohnlev void 450*843e1988Sjohnlev xpv_die(struct regs *rp) 451*843e1988Sjohnlev { 452*843e1988Sjohnlev struct panic_trap_info ti; 453*843e1988Sjohnlev struct cregs creg; 454*843e1988Sjohnlev 455*843e1988Sjohnlev ti.trap_regs = rp; 456*843e1988Sjohnlev ti.trap_type = rp->r_trapno; 457*843e1988Sjohnlev 458*843e1988Sjohnlev curthread->t_panic_trap = &ti; 459*843e1988Sjohnlev if (ti.trap_type == T_PGFLT) { 460*843e1988Sjohnlev getcregs(&creg); 461*843e1988Sjohnlev ti.trap_addr = (caddr_t)creg.cr_cr2; 462*843e1988Sjohnlev panic("Fatal pagefault at 0x%lx. fault addr=0x%p rp=0x%p", 463*843e1988Sjohnlev rp->r_pc, ti.trap_addr, rp); 464*843e1988Sjohnlev } else { 465*843e1988Sjohnlev ti.trap_addr = (caddr_t)rp->r_pc; 466*843e1988Sjohnlev panic("Fatal trap %ld at 0x%lx. rp=0x%p", rp->r_trapno, 467*843e1988Sjohnlev rp->r_pc, rp); 468*843e1988Sjohnlev } 469*843e1988Sjohnlev } 470*843e1988Sjohnlev 471*843e1988Sjohnlev /* 472*843e1988Sjohnlev * Build IDT to handle a Xen panic 473*843e1988Sjohnlev */ 474*843e1988Sjohnlev static void 475*843e1988Sjohnlev switch_to_xpv_panic_idt() 476*843e1988Sjohnlev { 477*843e1988Sjohnlev int i; 478*843e1988Sjohnlev desctbr_t idtr; 479*843e1988Sjohnlev gate_desc_t *idt = xpv_panic_idt; 480*843e1988Sjohnlev selector_t cs = get_cs_register(); 481*843e1988Sjohnlev 482*843e1988Sjohnlev for (i = 0; i < 32; i++) 483*843e1988Sjohnlev set_gatesegd(&idt[i], &xpv_invaltrap, cs, SDT_SYSIGT, TRP_XPL); 484*843e1988Sjohnlev 485*843e1988Sjohnlev set_gatesegd(&idt[T_ZERODIV], &xpv_div0trap, cs, SDT_SYSIGT, TRP_XPL); 486*843e1988Sjohnlev set_gatesegd(&idt[T_SGLSTP], &xpv_dbgtrap, cs, SDT_SYSIGT, TRP_XPL); 487*843e1988Sjohnlev set_gatesegd(&idt[T_NMIFLT], &xpv_nmiint, cs, SDT_SYSIGT, TRP_XPL); 488*843e1988Sjohnlev set_gatesegd(&idt[T_BOUNDFLT], &xpv_boundstrap, cs, SDT_SYSIGT, 489*843e1988Sjohnlev TRP_XPL); 490*843e1988Sjohnlev set_gatesegd(&idt[T_ILLINST], &xpv_invoptrap, cs, SDT_SYSIGT, TRP_XPL); 491*843e1988Sjohnlev set_gatesegd(&idt[T_NOEXTFLT], &xpv_ndptrap, cs, SDT_SYSIGT, TRP_XPL); 492*843e1988Sjohnlev set_gatesegd(&idt[T_TSSFLT], &xpv_invtsstrap, cs, SDT_SYSIGT, TRP_XPL); 493*843e1988Sjohnlev set_gatesegd(&idt[T_SEGFLT], &xpv_segnptrap, cs, SDT_SYSIGT, TRP_XPL); 494*843e1988Sjohnlev set_gatesegd(&idt[T_STKFLT], &xpv_stktrap, cs, SDT_SYSIGT, TRP_XPL); 495*843e1988Sjohnlev set_gatesegd(&idt[T_GPFLT], &xpv_gptrap, cs, SDT_SYSIGT, TRP_XPL); 496*843e1988Sjohnlev set_gatesegd(&idt[T_PGFLT], &xpv_pftrap, cs, SDT_SYSIGT, TRP_XPL); 497*843e1988Sjohnlev set_gatesegd(&idt[T_EXTERRFLT], &xpv_ndperr, cs, SDT_SYSIGT, TRP_XPL); 498*843e1988Sjohnlev set_gatesegd(&idt[T_ALIGNMENT], &xpv_achktrap, cs, SDT_SYSIGT, TRP_XPL); 499*843e1988Sjohnlev set_gatesegd(&idt[T_MCE], &xpv_mcetrap, cs, SDT_SYSIGT, TRP_XPL); 500*843e1988Sjohnlev set_gatesegd(&idt[T_SIMDFPE], &xpv_xmtrap, cs, SDT_SYSIGT, TRP_XPL); 501*843e1988Sjohnlev 502*843e1988Sjohnlev /* 503*843e1988Sjohnlev * We have no double fault handler. Any single fault represents a 504*843e1988Sjohnlev * catastrophic failure for us, so there is no attempt to handle 505*843e1988Sjohnlev * them cleanly: we just print a message and reboot. If we 506*843e1988Sjohnlev * encounter a second fault while doing that, there is nothing 507*843e1988Sjohnlev * else we can do. 508*843e1988Sjohnlev */ 509*843e1988Sjohnlev 510*843e1988Sjohnlev /* 511*843e1988Sjohnlev * Be prepared to absorb any stray device interrupts received 512*843e1988Sjohnlev * while writing the core to disk. 513*843e1988Sjohnlev */ 514*843e1988Sjohnlev for (i = 33; i < NIDT; i++) 515*843e1988Sjohnlev set_gatesegd(&idt[i], &xpv_surprise_intr, cs, SDT_SYSIGT, 516*843e1988Sjohnlev TRP_XPL); 517*843e1988Sjohnlev 518*843e1988Sjohnlev /* The one interrupt we expect to get is from the APIC timer. */ 519*843e1988Sjohnlev set_gatesegd(&idt[T_XPV_TIMER], &xpv_timer_trap, cs, SDT_SYSIGT, 520*843e1988Sjohnlev TRP_XPL); 521*843e1988Sjohnlev 522*843e1988Sjohnlev idtr.dtr_base = (uintptr_t)xpv_panic_idt; 523*843e1988Sjohnlev idtr.dtr_limit = sizeof (xpv_panic_idt) - 1; 524*843e1988Sjohnlev wr_idtr(&idtr); 525*843e1988Sjohnlev 526*843e1988Sjohnlev #if defined(__amd64) 527*843e1988Sjohnlev /* Catch any hypercalls. */ 528*843e1988Sjohnlev wrmsr(MSR_AMD_LSTAR, (uintptr_t)xpv_panic_hypercall); 529*843e1988Sjohnlev wrmsr(MSR_AMD_CSTAR, (uintptr_t)xpv_panic_hypercall); 530*843e1988Sjohnlev #endif 531*843e1988Sjohnlev } 532*843e1988Sjohnlev 533*843e1988Sjohnlev static void 534*843e1988Sjohnlev xpv_apic_clkinit() 535*843e1988Sjohnlev { 536*843e1988Sjohnlev uint_t apic_ticks = 0; 537*843e1988Sjohnlev 538*843e1988Sjohnlev /* 539*843e1988Sjohnlev * Measure how many APIC ticks there are within a fixed time 540*843e1988Sjohnlev * period. We're going to be fairly coarse here. This timer is 541*843e1988Sjohnlev * just being used to detect a stalled panic, so as long as we have 542*843e1988Sjohnlev * the right order of magnitude, everything should be fine. 543*843e1988Sjohnlev */ 544*843e1988Sjohnlev xpv_apicadr[APIC_SPUR_INT_REG] = AV_UNIT_ENABLE | APIC_SPUR_INTR; 545*843e1988Sjohnlev xpv_apicadr[APIC_LOCAL_TIMER] = AV_MASK; 546*843e1988Sjohnlev xpv_apicadr[APIC_INT_VECT0] = AV_MASK; /* local intr reg 0 */ 547*843e1988Sjohnlev 548*843e1988Sjohnlev xpv_apicadr[APIC_DIVIDE_REG] = 0; 549*843e1988Sjohnlev xpv_apicadr[APIC_INIT_COUNT] = APIC_MAXVAL; 550*843e1988Sjohnlev drv_usecwait(XPV_TIMER_INTERVAL); 551*843e1988Sjohnlev apic_ticks = APIC_MAXVAL - xpv_apicadr[APIC_CURR_COUNT]; 552*843e1988Sjohnlev 553*843e1988Sjohnlev /* 554*843e1988Sjohnlev * apic_ticks now represents roughly how many apic ticks comprise 555*843e1988Sjohnlev * one timeout interval. Program the timer to send us an interrupt 556*843e1988Sjohnlev * every time that interval expires. 557*843e1988Sjohnlev */ 558*843e1988Sjohnlev xpv_apicadr[APIC_LOCAL_TIMER] = T_XPV_TIMER | AV_TIME; 559*843e1988Sjohnlev xpv_apicadr[APIC_INIT_COUNT] = apic_ticks; 560*843e1988Sjohnlev xpv_apicadr[APIC_EOI_REG] = 0; 561*843e1988Sjohnlev } 562*843e1988Sjohnlev 563*843e1988Sjohnlev void 564*843e1988Sjohnlev xpv_timer_tick(void) 565*843e1988Sjohnlev { 566*843e1988Sjohnlev static int ticks = 0; 567*843e1988Sjohnlev 568*843e1988Sjohnlev if (ticks++ >= MICROSEC / XPV_TIMER_INTERVAL) { 569*843e1988Sjohnlev ticks = 0; 570*843e1988Sjohnlev if (dump_timeleft && (--dump_timeleft == 0)) 571*843e1988Sjohnlev panic("Xen panic timeout\n"); 572*843e1988Sjohnlev } 573*843e1988Sjohnlev xpv_apicadr[APIC_EOI_REG] = 0; 574*843e1988Sjohnlev } 575*843e1988Sjohnlev 576*843e1988Sjohnlev void 577*843e1988Sjohnlev xpv_interrupt(void) 578*843e1988Sjohnlev { 579*843e1988Sjohnlev #ifdef DEBUG 580*843e1988Sjohnlev static int cnt = 0; 581*843e1988Sjohnlev 582*843e1988Sjohnlev if (cnt++ < 10) 583*843e1988Sjohnlev xpv_panic_printf("Unexpected interrupt received.\n"); 584*843e1988Sjohnlev if ((cnt < 1000) && ((cnt % 100) == 0)) 585*843e1988Sjohnlev xpv_panic_printf("%d unexpected interrupts received.\n", cnt); 586*843e1988Sjohnlev #endif 587*843e1988Sjohnlev 588*843e1988Sjohnlev xpv_apicadr[APIC_EOI_REG] = 0; 589*843e1988Sjohnlev } 590*843e1988Sjohnlev 591*843e1988Sjohnlev /* 592*843e1988Sjohnlev * Managing time in panic context is trivial. We only have a single CPU, 593*843e1988Sjohnlev * we never get rescheduled, we never get suspended. We just need to 594*843e1988Sjohnlev * convert clock ticks into nanoseconds. 595*843e1988Sjohnlev */ 596*843e1988Sjohnlev static hrtime_t 597*843e1988Sjohnlev xpv_panic_gethrtime(void) 598*843e1988Sjohnlev { 599*843e1988Sjohnlev hrtime_t tsc, hrt; 600*843e1988Sjohnlev unsigned int *l = (unsigned int *)&(tsc); 601*843e1988Sjohnlev 602*843e1988Sjohnlev tsc = __rdtsc_insn(); 603*843e1988Sjohnlev hrt = (mul32(l[1], nsec_scale) << NSEC_SHIFT) + 604*843e1988Sjohnlev (mul32(l[0], nsec_scale) >> (32 - NSEC_SHIFT)); 605*843e1988Sjohnlev 606*843e1988Sjohnlev return (hrt); 607*843e1988Sjohnlev } 608*843e1988Sjohnlev 609*843e1988Sjohnlev static void 610*843e1988Sjohnlev xpv_panic_time_init() 611*843e1988Sjohnlev { 612*843e1988Sjohnlev nsec_scale = 613*843e1988Sjohnlev CPU->cpu_m.mcpu_vcpu_info->time.tsc_to_system_mul >> NSEC_SHIFT; 614*843e1988Sjohnlev 615*843e1988Sjohnlev gethrtimef = xpv_panic_gethrtime; 616*843e1988Sjohnlev } 617*843e1988Sjohnlev 618*843e1988Sjohnlev static void 619*843e1988Sjohnlev xpv_panicsys(struct regs *rp, char *fmt, ...) 620*843e1988Sjohnlev { 621*843e1988Sjohnlev extern void panicsys(const char *, va_list, struct regs *, int); 622*843e1988Sjohnlev va_list alist; 623*843e1988Sjohnlev 624*843e1988Sjohnlev va_start(alist, fmt); 625*843e1988Sjohnlev panicsys(fmt, alist, rp, 1); 626*843e1988Sjohnlev va_end(alist); 627*843e1988Sjohnlev } 628*843e1988Sjohnlev 629*843e1988Sjohnlev void 630*843e1988Sjohnlev xpv_do_panic(void *arg) 631*843e1988Sjohnlev { 632*843e1988Sjohnlev struct panic_info *pip = (struct panic_info *)arg; 633*843e1988Sjohnlev int l; 634*843e1988Sjohnlev struct cregs creg; 635*843e1988Sjohnlev #if defined(__amd64) 636*843e1988Sjohnlev extern uintptr_t postbootkernelbase; 637*843e1988Sjohnlev #endif 638*843e1988Sjohnlev 639*843e1988Sjohnlev if (xpv_panicking++ > 0) 640*843e1988Sjohnlev panic("multiple calls to xpv_do_panic()"); 641*843e1988Sjohnlev 642*843e1988Sjohnlev /* 643*843e1988Sjohnlev * Indicate to the underlying panic framework that a panic has been 644*843e1988Sjohnlev * initiated. This is ordinarily done as part of vpanic(). Since 645*843e1988Sjohnlev * we already have all the register state saved by the hypervisor, 646*843e1988Sjohnlev * we skip that and jump straight into the panic processing code. 647*843e1988Sjohnlev */ 648*843e1988Sjohnlev (void) panic_trigger(&panic_quiesce); 649*843e1988Sjohnlev 650*843e1988Sjohnlev #if defined(__amd64) 651*843e1988Sjohnlev /* 652*843e1988Sjohnlev * bzero() and bcopy() get unhappy when asked to operate on 653*843e1988Sjohnlev * addresses outside of the kernel. At this point Xen is really a 654*843e1988Sjohnlev * part of the kernel, so we update the routines' notion of where 655*843e1988Sjohnlev * the kernel starts. 656*843e1988Sjohnlev */ 657*843e1988Sjohnlev postbootkernelbase = xen_virt_start; 658*843e1988Sjohnlev #endif 659*843e1988Sjohnlev 660*843e1988Sjohnlev #if defined(HYPERVISOR_VIRT_END) 661*843e1988Sjohnlev xpv_end = HYPERVISOR_VIRT_END; 662*843e1988Sjohnlev #else 663*843e1988Sjohnlev xpv_end = (uintptr_t)UINTPTR_MAX - sizeof (uintptr_t); 664*843e1988Sjohnlev #endif 665*843e1988Sjohnlev 666*843e1988Sjohnlev /* 667*843e1988Sjohnlev * If we were redirecting console output to the hypervisor, we have 668*843e1988Sjohnlev * to stop. 669*843e1988Sjohnlev */ 670*843e1988Sjohnlev use_polledio = B_FALSE; 671*843e1988Sjohnlev if (console == CONS_HYPERVISOR) { 672*843e1988Sjohnlev bcons_device_change(CONS_HYPERVISOR); 673*843e1988Sjohnlev } else if (cons_polledio != NULL && 674*843e1988Sjohnlev cons_polledio->cons_polledio_putchar != NULL) { 675*843e1988Sjohnlev if (cons_polledio->cons_polledio_enter != NULL) 676*843e1988Sjohnlev cons_polledio->cons_polledio_enter( 677*843e1988Sjohnlev cons_polledio->cons_polledio_argument); 678*843e1988Sjohnlev use_polledio = 1; 679*843e1988Sjohnlev } 680*843e1988Sjohnlev 681*843e1988Sjohnlev /* Make sure we handle all console output from here on. */ 682*843e1988Sjohnlev sysp->bsvc_putchar = xpv_panic_putc; 683*843e1988Sjohnlev 684*843e1988Sjohnlev /* 685*843e1988Sjohnlev * If we find an unsupported panic_info structure, there's not much 686*843e1988Sjohnlev * we can do other than complain, plow on, and hope for the best. 687*843e1988Sjohnlev */ 688*843e1988Sjohnlev if (pip->pi_version != PANIC_INFO_VERSION) 689*843e1988Sjohnlev xpv_panic_printf("Warning: Xen is using an unsupported " 690*843e1988Sjohnlev "version of the panic_info structure.\n"); 691*843e1988Sjohnlev 692*843e1988Sjohnlev xpv_panic_info = pip; 693*843e1988Sjohnlev 694*843e1988Sjohnlev /* 695*843e1988Sjohnlev * Make sure we are running on the Solaris %gs. The Xen panic code 696*843e1988Sjohnlev * should already have set up the GDT properly. 697*843e1988Sjohnlev */ 698*843e1988Sjohnlev xpv_panic_resetgs(); 699*843e1988Sjohnlev #if defined(__amd64) 700*843e1988Sjohnlev wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]); 701*843e1988Sjohnlev #endif 702*843e1988Sjohnlev 703*843e1988Sjohnlev xpv_panic_time_init(); 704*843e1988Sjohnlev 705*843e1988Sjohnlev /* 706*843e1988Sjohnlev * Switch to our own IDT, avoiding any accidental returns to Xen 707*843e1988Sjohnlev * world. 708*843e1988Sjohnlev */ 709*843e1988Sjohnlev switch_to_xpv_panic_idt(); 710*843e1988Sjohnlev 711*843e1988Sjohnlev /* 712*843e1988Sjohnlev * Initialize the APIC timer, which is used to detect a hung dump 713*843e1988Sjohnlev * attempt. 714*843e1988Sjohnlev */ 715*843e1988Sjohnlev xpv_apicadr = pip->pi_apic; 716*843e1988Sjohnlev xpv_apic_clkinit(); 717*843e1988Sjohnlev 718*843e1988Sjohnlev /* 719*843e1988Sjohnlev * Set up a few values that we'll need repeatedly. 720*843e1988Sjohnlev */ 721*843e1988Sjohnlev getcregs(&creg); 722*843e1988Sjohnlev xpv_panic_cr3 = creg.cr_cr3; 723*843e1988Sjohnlev for (l = mmu.max_level; l >= 0; l--) 724*843e1988Sjohnlev xpv_panic_nptes[l] = mmu.ptes_per_table; 725*843e1988Sjohnlev #ifdef __i386 726*843e1988Sjohnlev if (mmu.pae_hat) 727*843e1988Sjohnlev xpv_panic_nptes[mmu.max_level] = 4; 728*843e1988Sjohnlev #endif 729*843e1988Sjohnlev 730*843e1988Sjohnlev /* Add the fake Xen module to the module list */ 731*843e1988Sjohnlev if (xpv_module != NULL) { 732*843e1988Sjohnlev extern int last_module_id; 733*843e1988Sjohnlev 734*843e1988Sjohnlev xpv_modctl->mod_id = last_module_id++; 735*843e1988Sjohnlev xpv_modctl->mod_next = &modules; 736*843e1988Sjohnlev xpv_modctl->mod_prev = modules.mod_prev; 737*843e1988Sjohnlev modules.mod_prev->mod_next = xpv_modctl; 738*843e1988Sjohnlev modules.mod_prev = xpv_modctl; 739*843e1988Sjohnlev } 740*843e1988Sjohnlev xpv_panic_printf = printf; 741*843e1988Sjohnlev xpv_panicsys((struct regs *)pip->pi_regs, pip->pi_panicstr); 742*843e1988Sjohnlev xpv_panic_printf("Failed to reboot following panic.\n"); 743*843e1988Sjohnlev for (;;) 744*843e1988Sjohnlev ; 745*843e1988Sjohnlev } 746*843e1988Sjohnlev 747*843e1988Sjohnlev /* 748*843e1988Sjohnlev * Set up the necessary data structures to pretend that the Xen hypervisor 749*843e1988Sjohnlev * is a loadable module, allowing mdb to find the Xen symbols in a crash 750*843e1988Sjohnlev * dump. Since these symbols all map to VA space Solaris doesn't normally 751*843e1988Sjohnlev * have access to, we don't link these structures into the kernel's lists 752*843e1988Sjohnlev * until/unless we hit a Xen panic. 753*843e1988Sjohnlev * 754*843e1988Sjohnlev * The observant reader will note a striking amount of overlap between this 755*843e1988Sjohnlev * code and that found in krtld. While it would be handy if we could just 756*843e1988Sjohnlev * ask krtld to do this work for us, it's not that simple. Among the 757*843e1988Sjohnlev * complications: we're not actually loading the text here (grub did it at 758*843e1988Sjohnlev * boot), the .text section is writable, there are no relocations to do, 759*843e1988Sjohnlev * none of the module text/data is in readable memory, etc. Training krtld 760*843e1988Sjohnlev * to deal with this weird module is as complicated, and more risky, than 761*843e1988Sjohnlev * reimplementing the necessary subset of it here. 762*843e1988Sjohnlev */ 763*843e1988Sjohnlev static void 764*843e1988Sjohnlev init_xen_module() 765*843e1988Sjohnlev { 766*843e1988Sjohnlev struct _buf *file = NULL; 767*843e1988Sjohnlev struct module *mp; 768*843e1988Sjohnlev struct modctl *mcp; 769*843e1988Sjohnlev int i, shn; 770*843e1988Sjohnlev Shdr *shp, *ctf_shp; 771*843e1988Sjohnlev char *names = NULL; 772*843e1988Sjohnlev size_t n, namesize, text_align, data_align; 773*843e1988Sjohnlev #if defined(__amd64) 774*843e1988Sjohnlev const char machine = EM_AMD64; 775*843e1988Sjohnlev #else 776*843e1988Sjohnlev const char machine = EM_386; 777*843e1988Sjohnlev #endif 778*843e1988Sjohnlev 779*843e1988Sjohnlev /* Allocate and init the module structure */ 780*843e1988Sjohnlev mp = kmem_zalloc(sizeof (*mp), KM_SLEEP); 781*843e1988Sjohnlev mp->filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP); 782*843e1988Sjohnlev (void) strcpy(mp->filename, XPV_FILENAME); 783*843e1988Sjohnlev 784*843e1988Sjohnlev /* Allocate and init the modctl structure */ 785*843e1988Sjohnlev mcp = kmem_zalloc(sizeof (*mcp), KM_SLEEP); 786*843e1988Sjohnlev mcp->mod_modname = kobj_zalloc(strlen(XPV_MODNAME) + 1, KM_SLEEP); 787*843e1988Sjohnlev (void) strcpy(mcp->mod_modname, XPV_MODNAME); 788*843e1988Sjohnlev mcp->mod_filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP); 789*843e1988Sjohnlev (void) strcpy(mcp->mod_filename, XPV_FILENAME); 790*843e1988Sjohnlev mcp->mod_inprogress_thread = (kthread_id_t)-1; 791*843e1988Sjohnlev mcp->mod_ref = 1; 792*843e1988Sjohnlev mcp->mod_loaded = 1; 793*843e1988Sjohnlev mcp->mod_loadcnt = 1; 794*843e1988Sjohnlev mcp->mod_mp = mp; 795*843e1988Sjohnlev 796*843e1988Sjohnlev /* 797*843e1988Sjohnlev * Try to open a Xen image that hasn't had its symbol and CTF 798*843e1988Sjohnlev * information stripped off. 799*843e1988Sjohnlev */ 800*843e1988Sjohnlev file = kobj_open_file(XPV_FILENAME); 801*843e1988Sjohnlev if (file == (struct _buf *)-1) { 802*843e1988Sjohnlev file = NULL; 803*843e1988Sjohnlev goto err; 804*843e1988Sjohnlev } 805*843e1988Sjohnlev 806*843e1988Sjohnlev /* 807*843e1988Sjohnlev * Read the header and ensure that this is an ELF file for the 808*843e1988Sjohnlev * proper ISA. If it's not, somebody has done something very 809*843e1988Sjohnlev * stupid. Why bother? See Mencken. 810*843e1988Sjohnlev */ 811*843e1988Sjohnlev if (kobj_read_file(file, (char *)&mp->hdr, sizeof (mp->hdr), 0) < 0) 812*843e1988Sjohnlev goto err; 813*843e1988Sjohnlev for (i = 0; i < SELFMAG; i++) 814*843e1988Sjohnlev if (mp->hdr.e_ident[i] != ELFMAG[i]) 815*843e1988Sjohnlev goto err; 816*843e1988Sjohnlev if ((mp->hdr.e_ident[EI_DATA] != ELFDATA2LSB) || 817*843e1988Sjohnlev (mp->hdr.e_machine != machine)) 818*843e1988Sjohnlev goto err; 819*843e1988Sjohnlev 820*843e1988Sjohnlev /* Read in the section headers */ 821*843e1988Sjohnlev n = mp->hdr.e_shentsize * mp->hdr.e_shnum; 822*843e1988Sjohnlev mp->shdrs = kmem_zalloc(n, KM_SLEEP); 823*843e1988Sjohnlev if (kobj_read_file(file, mp->shdrs, n, mp->hdr.e_shoff) < 0) 824*843e1988Sjohnlev goto err; 825*843e1988Sjohnlev 826*843e1988Sjohnlev /* Read the section names */ 827*843e1988Sjohnlev shp = (Shdr *)(mp->shdrs + mp->hdr.e_shstrndx * mp->hdr.e_shentsize); 828*843e1988Sjohnlev namesize = shp->sh_size; 829*843e1988Sjohnlev names = kmem_zalloc(shp->sh_size, KM_SLEEP); 830*843e1988Sjohnlev if (kobj_read_file(file, names, shp->sh_size, shp->sh_offset) < 0) 831*843e1988Sjohnlev goto err; 832*843e1988Sjohnlev 833*843e1988Sjohnlev /* 834*843e1988Sjohnlev * Fill in the text and data size fields. 835*843e1988Sjohnlev */ 836*843e1988Sjohnlev ctf_shp = NULL; 837*843e1988Sjohnlev text_align = data_align = 0; 838*843e1988Sjohnlev for (shn = 1; shn < mp->hdr.e_shnum; shn++) { 839*843e1988Sjohnlev shp = (Shdr *)(mp->shdrs + shn * mp->hdr.e_shentsize); 840*843e1988Sjohnlev 841*843e1988Sjohnlev /* Sanity check the offset of the section name */ 842*843e1988Sjohnlev if (shp->sh_name >= namesize) 843*843e1988Sjohnlev continue; 844*843e1988Sjohnlev 845*843e1988Sjohnlev /* If we find the symtab section, remember it for later. */ 846*843e1988Sjohnlev if (shp->sh_type == SHT_SYMTAB) { 847*843e1988Sjohnlev mp->symtbl_section = shn; 848*843e1988Sjohnlev mp->symhdr = shp; 849*843e1988Sjohnlev continue; 850*843e1988Sjohnlev } 851*843e1988Sjohnlev 852*843e1988Sjohnlev /* If we find the CTF section, remember it for later. */ 853*843e1988Sjohnlev if ((shp->sh_size != 0) && 854*843e1988Sjohnlev (strcmp(names + shp->sh_name, ".SUNW_ctf") == 0)) { 855*843e1988Sjohnlev ctf_shp = shp; 856*843e1988Sjohnlev continue; 857*843e1988Sjohnlev } 858*843e1988Sjohnlev 859*843e1988Sjohnlev if (!(shp->sh_flags & SHF_ALLOC)) 860*843e1988Sjohnlev continue; 861*843e1988Sjohnlev 862*843e1988Sjohnlev /* 863*843e1988Sjohnlev * Xen marks its text section as writable, so we need to 864*843e1988Sjohnlev * look for the name - not just the flag. 865*843e1988Sjohnlev */ 866*843e1988Sjohnlev if ((strcmp(&names[shp->sh_name], ".text") != NULL) && 867*843e1988Sjohnlev (shp->sh_flags & SHF_WRITE) != 0) { 868*843e1988Sjohnlev if (shp->sh_addralign > data_align) 869*843e1988Sjohnlev data_align = shp->sh_addralign; 870*843e1988Sjohnlev mp->data_size = ALIGN(mp->data_size, data_align); 871*843e1988Sjohnlev mp->data_size += ALIGN(shp->sh_size, 8); 872*843e1988Sjohnlev if (mp->data == NULL || mp->data > (char *)shp->sh_addr) 873*843e1988Sjohnlev mp->data = (char *)shp->sh_addr; 874*843e1988Sjohnlev } else { 875*843e1988Sjohnlev if (shp->sh_addralign > text_align) 876*843e1988Sjohnlev text_align = shp->sh_addralign; 877*843e1988Sjohnlev mp->text_size = ALIGN(mp->text_size, text_align); 878*843e1988Sjohnlev mp->text_size += ALIGN(shp->sh_size, 8); 879*843e1988Sjohnlev if (mp->text == NULL || mp->text > (char *)shp->sh_addr) 880*843e1988Sjohnlev mp->text = (char *)shp->sh_addr; 881*843e1988Sjohnlev } 882*843e1988Sjohnlev } 883*843e1988Sjohnlev kmem_free(names, namesize); 884*843e1988Sjohnlev names = NULL; 885*843e1988Sjohnlev mcp->mod_text = mp->text; 886*843e1988Sjohnlev mcp->mod_text_size = mp->text_size; 887*843e1988Sjohnlev 888*843e1988Sjohnlev /* 889*843e1988Sjohnlev * If we have symbol table and string table sections, read them in 890*843e1988Sjohnlev * now. If we don't, we just plow on. We'll still get a valid 891*843e1988Sjohnlev * core dump, but finding anything useful will be just a bit 892*843e1988Sjohnlev * harder. 893*843e1988Sjohnlev * 894*843e1988Sjohnlev * Note: we don't bother with a hash table. We'll never do a 895*843e1988Sjohnlev * symbol lookup unless we crash, and then mdb creates its own. We 896*843e1988Sjohnlev * also don't try to perform any relocations. Xen should be loaded 897*843e1988Sjohnlev * exactly where the ELF file indicates, and the symbol information 898*843e1988Sjohnlev * in the file should be complete and correct already. Static 899*843e1988Sjohnlev * linking ain't all bad. 900*843e1988Sjohnlev */ 901*843e1988Sjohnlev if ((mp->symhdr != NULL) && (mp->symhdr->sh_link < mp->hdr.e_shnum)) { 902*843e1988Sjohnlev mp->strhdr = (Shdr *) 903*843e1988Sjohnlev (mp->shdrs + mp->symhdr->sh_link * mp->hdr.e_shentsize); 904*843e1988Sjohnlev mp->nsyms = mp->symhdr->sh_size / mp->symhdr->sh_entsize; 905*843e1988Sjohnlev 906*843e1988Sjohnlev /* Allocate space for the symbol table and strings. */ 907*843e1988Sjohnlev mp->symsize = mp->symhdr->sh_size + 908*843e1988Sjohnlev mp->nsyms * sizeof (symid_t) + mp->strhdr->sh_size; 909*843e1988Sjohnlev mp->symspace = kmem_zalloc(mp->symsize, KM_SLEEP); 910*843e1988Sjohnlev mp->symtbl = mp->symspace; 911*843e1988Sjohnlev mp->strings = (char *)(mp->symtbl + mp->symhdr->sh_size); 912*843e1988Sjohnlev 913*843e1988Sjohnlev if ((kobj_read_file(file, mp->symtbl, 914*843e1988Sjohnlev mp->symhdr->sh_size, mp->symhdr->sh_offset) < 0) || 915*843e1988Sjohnlev (kobj_read_file(file, mp->strings, 916*843e1988Sjohnlev mp->strhdr->sh_size, mp->strhdr->sh_offset) < 0)) 917*843e1988Sjohnlev goto err; 918*843e1988Sjohnlev } 919*843e1988Sjohnlev 920*843e1988Sjohnlev /* 921*843e1988Sjohnlev * Read in the CTF section 922*843e1988Sjohnlev */ 923*843e1988Sjohnlev if ((ctf_shp != NULL) && ((moddebug & MODDEBUG_NOCTF) == 0)) { 924*843e1988Sjohnlev mp->ctfdata = kmem_zalloc(shp->sh_size, KM_SLEEP); 925*843e1988Sjohnlev mp->ctfsize = ctf_shp->sh_size; 926*843e1988Sjohnlev if (kobj_read_file(file, mp->ctfdata, mp->ctfsize, 927*843e1988Sjohnlev ctf_shp->sh_offset) < 0) 928*843e1988Sjohnlev goto err; 929*843e1988Sjohnlev } 930*843e1988Sjohnlev 931*843e1988Sjohnlev kobj_close_file(file); 932*843e1988Sjohnlev 933*843e1988Sjohnlev xpv_module = mp; 934*843e1988Sjohnlev xpv_modctl = mcp; 935*843e1988Sjohnlev return; 936*843e1988Sjohnlev 937*843e1988Sjohnlev err: 938*843e1988Sjohnlev cmn_err(CE_WARN, "Failed to initialize xpv module."); 939*843e1988Sjohnlev if (file != NULL) 940*843e1988Sjohnlev kobj_close_file(file); 941*843e1988Sjohnlev 942*843e1988Sjohnlev kmem_free(mp->filename, strlen(XPV_FILENAME) + 1); 943*843e1988Sjohnlev if (mp->shdrs != NULL) 944*843e1988Sjohnlev kmem_free(mp->shdrs, mp->hdr.e_shentsize * mp->hdr.e_shnum); 945*843e1988Sjohnlev if (mp->symspace != NULL) 946*843e1988Sjohnlev kmem_free(mp->symspace, mp->symsize); 947*843e1988Sjohnlev if (mp->ctfdata != NULL) 948*843e1988Sjohnlev kmem_free(mp->ctfdata, mp->ctfsize); 949*843e1988Sjohnlev kmem_free(mp, sizeof (*mp)); 950*843e1988Sjohnlev kmem_free(mcp->mod_filename, strlen(XPV_FILENAME) + 1); 951*843e1988Sjohnlev kmem_free(mcp->mod_modname, strlen(XPV_MODNAME) + 1); 952*843e1988Sjohnlev kmem_free(mcp, sizeof (*mcp)); 953*843e1988Sjohnlev if (names != NULL) 954*843e1988Sjohnlev kmem_free(names, namesize); 955*843e1988Sjohnlev } 956*843e1988Sjohnlev 957*843e1988Sjohnlev void 958*843e1988Sjohnlev xpv_panic_init() 959*843e1988Sjohnlev { 960*843e1988Sjohnlev xen_platform_op_t op; 961*843e1988Sjohnlev int i; 962*843e1988Sjohnlev 963*843e1988Sjohnlev ASSERT(DOMAIN_IS_INITDOMAIN(xen_info)); 964*843e1988Sjohnlev 965*843e1988Sjohnlev for (i = 0; i < mmu.num_level; i++) 966*843e1988Sjohnlev ptable_pfn[i] = PFN_INVALID; 967*843e1988Sjohnlev 968*843e1988Sjohnlev /* Let Xen know where to jump if/when it panics. */ 969*843e1988Sjohnlev op.cmd = XENPF_panic_init; 970*843e1988Sjohnlev op.interface_version = XENPF_INTERFACE_VERSION; 971*843e1988Sjohnlev op.u.panic_init.panic_addr = (unsigned long)xpv_panic_hdlr; 972*843e1988Sjohnlev 973*843e1988Sjohnlev (void) HYPERVISOR_platform_op(&op); 974*843e1988Sjohnlev 975*843e1988Sjohnlev init_xen_module(); 976*843e1988Sjohnlev } 977