/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012 Gary Mills
 * Copyright 2016 PALO, Richard.
 *
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright 2018 Joyent, Inc.
28843e1988Sjohnlev */ 29843e1988Sjohnlev 30843e1988Sjohnlev #include <sys/types.h> 31843e1988Sjohnlev #include <sys/clock.h> 32843e1988Sjohnlev #include <sys/psm.h> 33843e1988Sjohnlev #include <sys/archsystm.h> 34843e1988Sjohnlev #include <sys/machsystm.h> 35843e1988Sjohnlev #include <sys/compress.h> 36843e1988Sjohnlev #include <sys/modctl.h> 37843e1988Sjohnlev #include <sys/trap.h> 38843e1988Sjohnlev #include <sys/panic.h> 39843e1988Sjohnlev #include <sys/regset.h> 40843e1988Sjohnlev #include <sys/frame.h> 41843e1988Sjohnlev #include <sys/kobj.h> 42843e1988Sjohnlev #include <sys/apic.h> 4341afdfa7SKrishnendu Sadhukhan - Sun Microsystems #include <sys/apic_timer.h> 44843e1988Sjohnlev #include <sys/dumphdr.h> 45843e1988Sjohnlev #include <sys/mem.h> 46843e1988Sjohnlev #include <sys/x86_archext.h> 47843e1988Sjohnlev #include <sys/xpv_panic.h> 48843e1988Sjohnlev #include <sys/boot_console.h> 49843e1988Sjohnlev #include <sys/bootsvcs.h> 50843e1988Sjohnlev #include <sys/consdev.h> 51843e1988Sjohnlev #include <vm/hat_pte.h> 52843e1988Sjohnlev #include <vm/hat_i86.h> 53843e1988Sjohnlev 54843e1988Sjohnlev /* XXX: need to add a PAE version too, if we ever support both PAE and non */ 55843e1988Sjohnlev #if defined(__i386) 56843e1988Sjohnlev #define XPV_FILENAME "/boot/xen-syms" 57843e1988Sjohnlev #else 58843e1988Sjohnlev #define XPV_FILENAME "/boot/amd64/xen-syms" 59843e1988Sjohnlev #endif 60843e1988Sjohnlev #define XPV_MODNAME "xpv" 61843e1988Sjohnlev 62843e1988Sjohnlev int xpv_panicking = 0; 63843e1988Sjohnlev 64843e1988Sjohnlev struct module *xpv_module; 65843e1988Sjohnlev struct modctl *xpv_modctl; 66843e1988Sjohnlev 67843e1988Sjohnlev #define ALIGN(x, a) ((a) == 0 ? (uintptr_t)(x) : \ 68843e1988Sjohnlev (((uintptr_t)(x) + (uintptr_t)(a) - 1l) & ~((uintptr_t)(a) - 1l))) 69843e1988Sjohnlev 70843e1988Sjohnlev /* Pointer to the xpv_panic_info structure handed to us by Xen. 
*/ 71843e1988Sjohnlev static struct panic_info *xpv_panic_info = NULL; 72843e1988Sjohnlev 73843e1988Sjohnlev /* Timer support */ 74843e1988Sjohnlev #define NSEC_SHIFT 5 75843e1988Sjohnlev #define T_XPV_TIMER 0xd1 76843e1988Sjohnlev #define XPV_TIMER_INTERVAL 1000 /* 1000 microseconds */ 77843e1988Sjohnlev static uint32_t *xpv_apicadr = NULL; 78843e1988Sjohnlev static uint_t nsec_scale; 79843e1988Sjohnlev 80843e1988Sjohnlev /* IDT support */ 81843e1988Sjohnlev #pragma align 16(xpv_panic_idt) 82843e1988Sjohnlev static gate_desc_t xpv_panic_idt[NIDT]; /* interrupt descriptor table */ 83843e1988Sjohnlev 84843e1988Sjohnlev /* Xen pagetables mapped into our HAT's ptable windows */ 85843e1988Sjohnlev static pfn_t ptable_pfn[MAX_NUM_LEVEL]; 86843e1988Sjohnlev 87843e1988Sjohnlev /* Number of MMU_PAGESIZE pages we're adding to the Solaris dump */ 88843e1988Sjohnlev static int xpv_dump_pages; 89843e1988Sjohnlev 90843e1988Sjohnlev /* 91a576ab5bSrab * There are up to two large swathes of RAM that we don't want to include 92a576ab5bSrab * in the dump: those that comprise the Xen version of segkpm. On 32-bit 93a576ab5bSrab * systems there is no such region of memory. On 64-bit systems, there 94a576ab5bSrab * should be just a single contiguous region that corresponds to all of 95a576ab5bSrab * physical memory. The tricky bit is that Xen's heap sometimes lives in 96a576ab5bSrab * the middle of their segkpm, and is mapped using only kpm-like addresses. 97a576ab5bSrab * In that case, we need to skip the swathes before and after Xen's heap. 98a576ab5bSrab */ 99a576ab5bSrab uintptr_t kpm1_low = 0; 100a576ab5bSrab uintptr_t kpm1_high = 0; 101a576ab5bSrab uintptr_t kpm2_low = 0; 102a576ab5bSrab uintptr_t kpm2_high = 0; 103a576ab5bSrab 104a576ab5bSrab /* 105843e1988Sjohnlev * Some commonly used values that we don't want to recompute over and over. 
106843e1988Sjohnlev */ 107843e1988Sjohnlev static int xpv_panic_nptes[MAX_NUM_LEVEL]; 108843e1988Sjohnlev static ulong_t xpv_panic_cr3; 109843e1988Sjohnlev static uintptr_t xpv_end; 110843e1988Sjohnlev 111843e1988Sjohnlev static void xpv_panic_console_print(const char *fmt, ...); 112843e1988Sjohnlev static void (*xpv_panic_printf)(const char *, ...) = xpv_panic_console_print; 113843e1988Sjohnlev 114843e1988Sjohnlev #define CONSOLE_BUF_SIZE 256 115843e1988Sjohnlev static char console_buffer[CONSOLE_BUF_SIZE]; 116843e1988Sjohnlev static boolean_t use_polledio; 117843e1988Sjohnlev 118e4b86885SCheng Sean Ye /* 119e4b86885SCheng Sean Ye * Pointers to machine check panic info (if any). 120e4b86885SCheng Sean Ye */ 121e4b86885SCheng Sean Ye xpv_mca_panic_data_t *xpv_mca_panic_data = NULL; 122e4b86885SCheng Sean Ye 123843e1988Sjohnlev static void 124843e1988Sjohnlev xpv_panic_putc(int m) 125843e1988Sjohnlev { 126843e1988Sjohnlev struct cons_polledio *c = cons_polledio; 127843e1988Sjohnlev 128843e1988Sjohnlev /* This really shouldn't happen */ 1290d928757SGary Mills if (boot_console_type(NULL) == CONS_HYPERVISOR) 130843e1988Sjohnlev return; 131843e1988Sjohnlev 132843e1988Sjohnlev if (use_polledio == B_TRUE) 133843e1988Sjohnlev c->cons_polledio_putchar(c->cons_polledio_argument, m); 134843e1988Sjohnlev else 135843e1988Sjohnlev bcons_putchar(m); 136843e1988Sjohnlev } 137843e1988Sjohnlev 138843e1988Sjohnlev static void 139843e1988Sjohnlev xpv_panic_puts(char *msg) 140843e1988Sjohnlev { 141843e1988Sjohnlev char *m; 142843e1988Sjohnlev 143843e1988Sjohnlev dump_timeleft = dump_timeout; 144843e1988Sjohnlev for (m = msg; *m; m++) 145843e1988Sjohnlev xpv_panic_putc((int)*m); 146843e1988Sjohnlev } 147843e1988Sjohnlev 148843e1988Sjohnlev static void 149843e1988Sjohnlev xpv_panic_console_print(const char *fmt, ...) 
150843e1988Sjohnlev { 151843e1988Sjohnlev va_list ap; 152843e1988Sjohnlev 153843e1988Sjohnlev va_start(ap, fmt); 154843e1988Sjohnlev (void) vsnprintf(console_buffer, sizeof (console_buffer), fmt, ap); 155843e1988Sjohnlev va_end(ap); 156843e1988Sjohnlev 157843e1988Sjohnlev xpv_panic_puts(console_buffer); 158843e1988Sjohnlev } 159843e1988Sjohnlev 160843e1988Sjohnlev static void 161843e1988Sjohnlev xpv_panic_map(int level, pfn_t pfn) 162843e1988Sjohnlev { 163843e1988Sjohnlev x86pte_t pte, *pteptr; 164843e1988Sjohnlev 165843e1988Sjohnlev /* 166843e1988Sjohnlev * The provided pfn represents a level 'level' page table. Map it 167843e1988Sjohnlev * into the 'level' slot in the list of page table windows. 168843e1988Sjohnlev */ 169843e1988Sjohnlev pteptr = (x86pte_t *)PWIN_PTE_VA(level); 170843e1988Sjohnlev pte = pfn_to_pa(pfn) | PT_VALID; 171843e1988Sjohnlev 172843e1988Sjohnlev XPV_ALLOW_PAGETABLE_UPDATES(); 173843e1988Sjohnlev if (mmu.pae_hat) 174843e1988Sjohnlev *pteptr = pte; 175843e1988Sjohnlev else 176843e1988Sjohnlev *(x86pte32_t *)pteptr = pte; 177843e1988Sjohnlev XPV_DISALLOW_PAGETABLE_UPDATES(); 178843e1988Sjohnlev 179*74ecdb51SJohn Levon mmu_flush_tlb_page((uintptr_t)PWIN_VA(level)); 180843e1988Sjohnlev } 181843e1988Sjohnlev 182843e1988Sjohnlev /* 183843e1988Sjohnlev * Walk the page tables to find the pfn mapped by the given va. 
184843e1988Sjohnlev */ 185843e1988Sjohnlev static pfn_t 186843e1988Sjohnlev xpv_va_walk(uintptr_t *vaddr) 187843e1988Sjohnlev { 188843e1988Sjohnlev int l, idx; 189843e1988Sjohnlev pfn_t pfn; 190843e1988Sjohnlev x86pte_t pte; 191843e1988Sjohnlev x86pte_t *ptep; 192843e1988Sjohnlev uintptr_t va = *vaddr; 193843e1988Sjohnlev uintptr_t scan_va; 194843e1988Sjohnlev caddr_t ptable_window; 195843e1988Sjohnlev static pfn_t toplevel_pfn; 196843e1988Sjohnlev static uintptr_t lastva; 197843e1988Sjohnlev 198843e1988Sjohnlev /* 199843e1988Sjohnlev * If we do anything other than a simple scan through memory, don't 200843e1988Sjohnlev * trust the mapped page tables. 201843e1988Sjohnlev */ 202843e1988Sjohnlev if (va != lastva + MMU_PAGESIZE) 203843e1988Sjohnlev for (l = mmu.max_level; l >= 0; l--) 204843e1988Sjohnlev ptable_pfn[l] = PFN_INVALID; 205843e1988Sjohnlev 206843e1988Sjohnlev toplevel_pfn = mmu_btop(xpv_panic_cr3); 207843e1988Sjohnlev 208843e1988Sjohnlev while (va < xpv_end && va >= *vaddr) { 209843e1988Sjohnlev /* Find the lowest table with any entry for va */ 210843e1988Sjohnlev pfn = toplevel_pfn; 211843e1988Sjohnlev for (l = mmu.max_level; l >= 0; l--) { 212843e1988Sjohnlev if (ptable_pfn[l] != pfn) { 213843e1988Sjohnlev xpv_panic_map(l, pfn); 214843e1988Sjohnlev ptable_pfn[l] = pfn; 215843e1988Sjohnlev } 216843e1988Sjohnlev 217843e1988Sjohnlev /* 218843e1988Sjohnlev * Search this pagetable for any mapping to an 219843e1988Sjohnlev * address >= va. 
220843e1988Sjohnlev */ 221843e1988Sjohnlev ptable_window = PWIN_VA(l); 222843e1988Sjohnlev if (l == mmu.max_level && mmu.pae_hat) 223843e1988Sjohnlev ptable_window += 224843e1988Sjohnlev (xpv_panic_cr3 & MMU_PAGEOFFSET); 225843e1988Sjohnlev 226843e1988Sjohnlev idx = (va >> LEVEL_SHIFT(l)) & (xpv_panic_nptes[l] - 1); 227843e1988Sjohnlev scan_va = va; 228843e1988Sjohnlev while (idx < xpv_panic_nptes[l] && scan_va < xpv_end && 229843e1988Sjohnlev scan_va >= *vaddr) { 230843e1988Sjohnlev ptep = (x86pte_t *)(ptable_window + 231843e1988Sjohnlev (idx << mmu.pte_size_shift)); 232843e1988Sjohnlev pte = GET_PTE(ptep); 233843e1988Sjohnlev if (pte & PTE_VALID) 234843e1988Sjohnlev break; 235843e1988Sjohnlev idx++; 236843e1988Sjohnlev scan_va += mmu.level_size[l]; 237843e1988Sjohnlev } 238843e1988Sjohnlev 239843e1988Sjohnlev /* 240843e1988Sjohnlev * If there are no valid mappings in this table, we 241843e1988Sjohnlev * can skip to the end of the VA range it covers. 242843e1988Sjohnlev */ 243843e1988Sjohnlev if (idx == xpv_panic_nptes[l]) { 244843e1988Sjohnlev va = NEXT_ENTRY_VA(va, l + 1); 245843e1988Sjohnlev break; 246843e1988Sjohnlev } 247843e1988Sjohnlev 248a576ab5bSrab va = scan_va; 249a576ab5bSrab /* 250a576ab5bSrab * See if we've hit the end of the range. 251a576ab5bSrab */ 252a576ab5bSrab if (va >= xpv_end || va < *vaddr) 253a576ab5bSrab break; 254a576ab5bSrab 255843e1988Sjohnlev /* 256843e1988Sjohnlev * If this mapping is for a pagetable, we drop down 257843e1988Sjohnlev * to the next level in the hierarchy and look for 258843e1988Sjohnlev * a mapping in it. 259843e1988Sjohnlev */ 260843e1988Sjohnlev pfn = PTE2MFN(pte, l); 261843e1988Sjohnlev if (!PTE_ISPAGE(pte, l)) 262843e1988Sjohnlev continue; 263843e1988Sjohnlev 264843e1988Sjohnlev /* 265843e1988Sjohnlev * The APIC page is magic. Nothing to see here; 266843e1988Sjohnlev * move along. 
267843e1988Sjohnlev */ 268843e1988Sjohnlev if (((uintptr_t)xpv_apicadr & MMU_PAGEMASK) == 269843e1988Sjohnlev (va & MMU_PAGEMASK)) { 270843e1988Sjohnlev va += MMU_PAGESIZE; 271843e1988Sjohnlev break; 272843e1988Sjohnlev } 273843e1988Sjohnlev 274a576ab5bSrab /* 275a576ab5bSrab * See if the address is within one of the two 276a576ab5bSrab * kpm-like regions we want to skip. 277a576ab5bSrab */ 278a576ab5bSrab if (va >= kpm1_low && va < kpm1_high) { 279a576ab5bSrab va = kpm1_high; 280a576ab5bSrab break; 281a576ab5bSrab } 282a576ab5bSrab if (va >= kpm2_low && va < kpm2_high) { 283a576ab5bSrab va = kpm2_high; 284843e1988Sjohnlev break; 285843e1988Sjohnlev } 286843e1988Sjohnlev 287843e1988Sjohnlev /* 288843e1988Sjohnlev * The Xen panic code only handles small pages. If 289843e1988Sjohnlev * this mapping is for a large page, we need to 290843e1988Sjohnlev * identify the consituent page that covers the 291843e1988Sjohnlev * specific VA we were looking for. 292843e1988Sjohnlev */ 293843e1988Sjohnlev if (l > 0) { 294843e1988Sjohnlev if (l > 1) 295843e1988Sjohnlev panic("Xen panic can't cope with " 296843e1988Sjohnlev "giant pages."); 297843e1988Sjohnlev idx = (va >> LEVEL_SHIFT(0)) & 298843e1988Sjohnlev (xpv_panic_nptes[0] - 1); 299843e1988Sjohnlev pfn += idx; 300843e1988Sjohnlev } 301843e1988Sjohnlev 302843e1988Sjohnlev *vaddr = va; 303843e1988Sjohnlev lastva = va; 304843e1988Sjohnlev return (pfn | PFN_IS_FOREIGN_MFN); 305843e1988Sjohnlev } 306843e1988Sjohnlev } 307843e1988Sjohnlev return (PFN_INVALID); 308843e1988Sjohnlev } 309843e1988Sjohnlev 310843e1988Sjohnlev /* 311843e1988Sjohnlev * Walk through the Xen VA space, finding pages that are mapped in. 312843e1988Sjohnlev * 313843e1988Sjohnlev * These pages all have MFNs rather than PFNs, meaning they may be outside 314843e1988Sjohnlev * the physical address space the kernel knows about, or they may collide 315843e1988Sjohnlev * with PFNs the kernel is using. 
316843e1988Sjohnlev * 317843e1988Sjohnlev * The obvious trick of just adding the PFN_IS_FOREIGN_MFN bit to the MFNs 318843e1988Sjohnlev * to avoid collisions doesn't work. The pages need to be written to disk 319843e1988Sjohnlev * in PFN-order or savecore gets confused. We can't allocate memory to 320843e1988Sjohnlev * contruct a sorted pfn->VA reverse mapping, so we have to write the pages 321843e1988Sjohnlev * to disk in VA order. 322843e1988Sjohnlev * 323843e1988Sjohnlev * To square this circle, we simply make up PFNs for each of Xen's pages. 324843e1988Sjohnlev * We assign each mapped page a fake PFN in ascending order. These fake 325843e1988Sjohnlev * PFNs each have the FOREIGN bit set, ensuring that they fall outside the 326843e1988Sjohnlev * range of Solaris PFNs written by the kernel. 327843e1988Sjohnlev */ 328843e1988Sjohnlev int 329843e1988Sjohnlev dump_xpv_addr() 330843e1988Sjohnlev { 331843e1988Sjohnlev uintptr_t va; 332843e1988Sjohnlev mem_vtop_t mem_vtop; 333843e1988Sjohnlev 334843e1988Sjohnlev xpv_dump_pages = 0; 335843e1988Sjohnlev va = xen_virt_start; 336843e1988Sjohnlev 337843e1988Sjohnlev while (xpv_va_walk(&va) != PFN_INVALID) { 338843e1988Sjohnlev mem_vtop.m_as = &kas; 339843e1988Sjohnlev mem_vtop.m_va = (void *)va; 340843e1988Sjohnlev mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN; 341843e1988Sjohnlev 342843e1988Sjohnlev dumpvp_write(&mem_vtop, sizeof (mem_vtop_t)); 343843e1988Sjohnlev xpv_dump_pages++; 344843e1988Sjohnlev 345843e1988Sjohnlev va += MMU_PAGESIZE; 346843e1988Sjohnlev } 347843e1988Sjohnlev 348843e1988Sjohnlev /* 349843e1988Sjohnlev * Add the shared_info page. This page actually ends up in the 350843e1988Sjohnlev * dump twice: once for the Xen va and once for the Solaris va. 351843e1988Sjohnlev * This isn't ideal, but we don't know the address Xen is using for 352843e1988Sjohnlev * the page, so we can't share it. 
353843e1988Sjohnlev */ 354843e1988Sjohnlev mem_vtop.m_as = &kas; 355843e1988Sjohnlev mem_vtop.m_va = HYPERVISOR_shared_info; 356843e1988Sjohnlev mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN; 357843e1988Sjohnlev dumpvp_write(&mem_vtop, sizeof (mem_vtop_t)); 358843e1988Sjohnlev xpv_dump_pages++; 359843e1988Sjohnlev 360843e1988Sjohnlev return (xpv_dump_pages); 361843e1988Sjohnlev } 362843e1988Sjohnlev 363843e1988Sjohnlev void 364843e1988Sjohnlev dump_xpv_pfn() 365843e1988Sjohnlev { 366843e1988Sjohnlev pfn_t pfn; 367843e1988Sjohnlev int cnt; 368843e1988Sjohnlev 369843e1988Sjohnlev for (cnt = 0; cnt < xpv_dump_pages; cnt++) { 370843e1988Sjohnlev pfn = (pfn_t)cnt | PFN_IS_FOREIGN_MFN; 371843e1988Sjohnlev dumpvp_write(&pfn, sizeof (pfn)); 372843e1988Sjohnlev } 373843e1988Sjohnlev } 374843e1988Sjohnlev 375843e1988Sjohnlev int 376843e1988Sjohnlev dump_xpv_data(void *dump_cbuf) 377843e1988Sjohnlev { 378843e1988Sjohnlev uintptr_t va; 379843e1988Sjohnlev uint32_t csize; 380843e1988Sjohnlev int cnt = 0; 381843e1988Sjohnlev 382843e1988Sjohnlev /* 383843e1988Sjohnlev * XXX: we should probably run this data through a UE check. The 384843e1988Sjohnlev * catch is that the UE code relies on on_trap() and getpfnum() 385843e1988Sjohnlev * working. 
386843e1988Sjohnlev */ 387843e1988Sjohnlev va = xen_virt_start; 388843e1988Sjohnlev 389843e1988Sjohnlev while (xpv_va_walk(&va) != PFN_INVALID) { 390843e1988Sjohnlev csize = (uint32_t)compress((void *)va, dump_cbuf, PAGESIZE); 391843e1988Sjohnlev dumpvp_write(&csize, sizeof (uint32_t)); 392843e1988Sjohnlev dumpvp_write(dump_cbuf, csize); 393843e1988Sjohnlev if (dump_ioerr) { 394843e1988Sjohnlev dumphdr->dump_flags &= ~DF_COMPLETE; 395843e1988Sjohnlev return (cnt); 396843e1988Sjohnlev } 397843e1988Sjohnlev cnt++; 398843e1988Sjohnlev va += MMU_PAGESIZE; 399843e1988Sjohnlev } 400843e1988Sjohnlev 401843e1988Sjohnlev /* 402843e1988Sjohnlev * Finally, dump the shared_info page 403843e1988Sjohnlev */ 404843e1988Sjohnlev csize = (uint32_t)compress((void *)HYPERVISOR_shared_info, dump_cbuf, 405843e1988Sjohnlev PAGESIZE); 406843e1988Sjohnlev dumpvp_write(&csize, sizeof (uint32_t)); 407843e1988Sjohnlev dumpvp_write(dump_cbuf, csize); 408843e1988Sjohnlev if (dump_ioerr) 409843e1988Sjohnlev dumphdr->dump_flags &= ~DF_COMPLETE; 410843e1988Sjohnlev cnt++; 411843e1988Sjohnlev 412843e1988Sjohnlev return (cnt); 413843e1988Sjohnlev } 414843e1988Sjohnlev 415843e1988Sjohnlev static void * 416843e1988Sjohnlev showstack(void *fpreg, int xpv_only) 417843e1988Sjohnlev { 418843e1988Sjohnlev struct frame *fpp; 419843e1988Sjohnlev ulong_t off; 420843e1988Sjohnlev char *sym; 421843e1988Sjohnlev uintptr_t pc, fp, lastfp; 422843e1988Sjohnlev uintptr_t minaddr = min(KERNELBASE, xen_virt_start); 423843e1988Sjohnlev 424843e1988Sjohnlev fp = (uintptr_t)fpreg; 425843e1988Sjohnlev if (fp < minaddr) { 426843e1988Sjohnlev xpv_panic_printf("Bad frame ptr: 0x%p\n", fpreg); 427843e1988Sjohnlev return (fpreg); 428843e1988Sjohnlev } 429843e1988Sjohnlev 430843e1988Sjohnlev do { 431843e1988Sjohnlev fpp = (struct frame *)fp; 432843e1988Sjohnlev pc = fpp->fr_savpc; 433843e1988Sjohnlev 434843e1988Sjohnlev if ((xpv_only != 0) && 435843e1988Sjohnlev (fp > xpv_end || fp < xen_virt_start)) 436843e1988Sjohnlev break; 
437843e1988Sjohnlev if ((sym = kobj_getsymname(pc, &off)) != NULL) 438843e1988Sjohnlev xpv_panic_printf("%08lx %s:%s+%lx\n", fp, 439843e1988Sjohnlev mod_containing_pc((caddr_t)pc), sym, off); 440843e1988Sjohnlev else if ((pc >= xen_virt_start) && (pc <= xpv_end)) 441843e1988Sjohnlev xpv_panic_printf("%08lx 0x%lx (in Xen)\n", fp, pc); 442843e1988Sjohnlev else 443843e1988Sjohnlev xpv_panic_printf("%08lx %lx\n", fp, pc); 444843e1988Sjohnlev 445843e1988Sjohnlev lastfp = fp; 446843e1988Sjohnlev fp = fpp->fr_savfp; 447843e1988Sjohnlev 448843e1988Sjohnlev /* 449843e1988Sjohnlev * Xen marks an exception frame by inverting the frame 450843e1988Sjohnlev * pointer. 451843e1988Sjohnlev */ 452843e1988Sjohnlev if (fp < lastfp) { 453843e1988Sjohnlev if ((~fp > minaddr) && ((~fp) ^ lastfp) < 0xfff) 454843e1988Sjohnlev fp = ~fp; 455843e1988Sjohnlev } 456843e1988Sjohnlev } while (fp > lastfp); 457843e1988Sjohnlev return ((void *)fp); 458843e1988Sjohnlev } 459843e1988Sjohnlev 460843e1988Sjohnlev void * 461843e1988Sjohnlev xpv_traceback(void *fpreg) 462843e1988Sjohnlev { 463843e1988Sjohnlev return (showstack(fpreg, 1)); 464843e1988Sjohnlev } 465843e1988Sjohnlev 466843e1988Sjohnlev #if defined(__amd64) 467843e1988Sjohnlev static void 468843e1988Sjohnlev xpv_panic_hypercall(ulong_t call) 469843e1988Sjohnlev { 470843e1988Sjohnlev panic("Illegally issued hypercall %d during panic!\n", (int)call); 471843e1988Sjohnlev } 472843e1988Sjohnlev #endif 473843e1988Sjohnlev 474843e1988Sjohnlev void 475843e1988Sjohnlev xpv_die(struct regs *rp) 476843e1988Sjohnlev { 477843e1988Sjohnlev struct panic_trap_info ti; 478843e1988Sjohnlev struct cregs creg; 479843e1988Sjohnlev 480843e1988Sjohnlev ti.trap_regs = rp; 481843e1988Sjohnlev ti.trap_type = rp->r_trapno; 482843e1988Sjohnlev 483843e1988Sjohnlev curthread->t_panic_trap = &ti; 484843e1988Sjohnlev if (ti.trap_type == T_PGFLT) { 485843e1988Sjohnlev getcregs(&creg); 486843e1988Sjohnlev ti.trap_addr = (caddr_t)creg.cr_cr2; 487843e1988Sjohnlev panic("Fatal 
pagefault at 0x%lx. fault addr=0x%p rp=0x%p", 488903a11ebSrh87107 rp->r_pc, (void *)ti.trap_addr, (void *)rp); 489843e1988Sjohnlev } else { 490843e1988Sjohnlev ti.trap_addr = (caddr_t)rp->r_pc; 491843e1988Sjohnlev panic("Fatal trap %ld at 0x%lx. rp=0x%p", rp->r_trapno, 492903a11ebSrh87107 rp->r_pc, (void *)rp); 493843e1988Sjohnlev } 494843e1988Sjohnlev } 495843e1988Sjohnlev 496843e1988Sjohnlev /* 497843e1988Sjohnlev * Build IDT to handle a Xen panic 498843e1988Sjohnlev */ 499843e1988Sjohnlev static void 500843e1988Sjohnlev switch_to_xpv_panic_idt() 501843e1988Sjohnlev { 502843e1988Sjohnlev int i; 503843e1988Sjohnlev desctbr_t idtr; 504843e1988Sjohnlev gate_desc_t *idt = xpv_panic_idt; 505843e1988Sjohnlev selector_t cs = get_cs_register(); 506843e1988Sjohnlev 507843e1988Sjohnlev for (i = 0; i < 32; i++) 5089844da31SSeth Goldberg set_gatesegd(&idt[i], &xpv_invaltrap, cs, SDT_SYSIGT, TRP_XPL, 5099844da31SSeth Goldberg 0); 510843e1988Sjohnlev 5119844da31SSeth Goldberg set_gatesegd(&idt[T_ZERODIV], &xpv_div0trap, cs, SDT_SYSIGT, TRP_XPL, 5129844da31SSeth Goldberg 0); 5139844da31SSeth Goldberg set_gatesegd(&idt[T_SGLSTP], &xpv_dbgtrap, cs, SDT_SYSIGT, TRP_XPL, 0); 5149844da31SSeth Goldberg set_gatesegd(&idt[T_NMIFLT], &xpv_nmiint, cs, SDT_SYSIGT, TRP_XPL, 0); 515843e1988Sjohnlev set_gatesegd(&idt[T_BOUNDFLT], &xpv_boundstrap, cs, SDT_SYSIGT, 5169844da31SSeth Goldberg TRP_XPL, 0); 5179844da31SSeth Goldberg set_gatesegd(&idt[T_ILLINST], &xpv_invoptrap, cs, SDT_SYSIGT, TRP_XPL, 5189844da31SSeth Goldberg 0); 5199844da31SSeth Goldberg set_gatesegd(&idt[T_NOEXTFLT], &xpv_ndptrap, cs, SDT_SYSIGT, TRP_XPL, 5209844da31SSeth Goldberg 0); 5219844da31SSeth Goldberg set_gatesegd(&idt[T_TSSFLT], &xpv_invtsstrap, cs, SDT_SYSIGT, TRP_XPL, 5229844da31SSeth Goldberg 0); 5239844da31SSeth Goldberg set_gatesegd(&idt[T_SEGFLT], &xpv_segnptrap, cs, SDT_SYSIGT, TRP_XPL, 5249844da31SSeth Goldberg 0); 5259844da31SSeth Goldberg set_gatesegd(&idt[T_STKFLT], &xpv_stktrap, cs, SDT_SYSIGT, TRP_XPL, 
0); 5269844da31SSeth Goldberg set_gatesegd(&idt[T_GPFLT], &xpv_gptrap, cs, SDT_SYSIGT, TRP_XPL, 0); 5279844da31SSeth Goldberg set_gatesegd(&idt[T_PGFLT], &xpv_pftrap, cs, SDT_SYSIGT, TRP_XPL, 0); 5289844da31SSeth Goldberg set_gatesegd(&idt[T_EXTERRFLT], &xpv_ndperr, cs, SDT_SYSIGT, TRP_XPL, 5299844da31SSeth Goldberg 0); 5309844da31SSeth Goldberg set_gatesegd(&idt[T_ALIGNMENT], &xpv_achktrap, cs, SDT_SYSIGT, TRP_XPL, 5319844da31SSeth Goldberg 0); 5329844da31SSeth Goldberg set_gatesegd(&idt[T_MCE], &xpv_mcetrap, cs, SDT_SYSIGT, TRP_XPL, 0); 5339844da31SSeth Goldberg set_gatesegd(&idt[T_SIMDFPE], &xpv_xmtrap, cs, SDT_SYSIGT, TRP_XPL, 0); 534843e1988Sjohnlev 535843e1988Sjohnlev /* 536843e1988Sjohnlev * We have no double fault handler. Any single fault represents a 537843e1988Sjohnlev * catastrophic failure for us, so there is no attempt to handle 538843e1988Sjohnlev * them cleanly: we just print a message and reboot. If we 539843e1988Sjohnlev * encounter a second fault while doing that, there is nothing 540843e1988Sjohnlev * else we can do. 541843e1988Sjohnlev */ 542843e1988Sjohnlev 543843e1988Sjohnlev /* 544843e1988Sjohnlev * Be prepared to absorb any stray device interrupts received 545843e1988Sjohnlev * while writing the core to disk. 546843e1988Sjohnlev */ 547843e1988Sjohnlev for (i = 33; i < NIDT; i++) 548843e1988Sjohnlev set_gatesegd(&idt[i], &xpv_surprise_intr, cs, SDT_SYSIGT, 5499844da31SSeth Goldberg TRP_XPL, 0); 550843e1988Sjohnlev 551843e1988Sjohnlev /* The one interrupt we expect to get is from the APIC timer. */ 552843e1988Sjohnlev set_gatesegd(&idt[T_XPV_TIMER], &xpv_timer_trap, cs, SDT_SYSIGT, 5539844da31SSeth Goldberg TRP_XPL, 0); 554843e1988Sjohnlev 555843e1988Sjohnlev idtr.dtr_base = (uintptr_t)xpv_panic_idt; 556843e1988Sjohnlev idtr.dtr_limit = sizeof (xpv_panic_idt) - 1; 557843e1988Sjohnlev wr_idtr(&idtr); 558843e1988Sjohnlev 559843e1988Sjohnlev #if defined(__amd64) 560843e1988Sjohnlev /* Catch any hypercalls. 
*/ 561843e1988Sjohnlev wrmsr(MSR_AMD_LSTAR, (uintptr_t)xpv_panic_hypercall); 562843e1988Sjohnlev wrmsr(MSR_AMD_CSTAR, (uintptr_t)xpv_panic_hypercall); 563843e1988Sjohnlev #endif 564843e1988Sjohnlev } 565843e1988Sjohnlev 566843e1988Sjohnlev static void 567843e1988Sjohnlev xpv_apic_clkinit() 568843e1988Sjohnlev { 569843e1988Sjohnlev uint_t apic_ticks = 0; 570843e1988Sjohnlev 571843e1988Sjohnlev /* 572843e1988Sjohnlev * Measure how many APIC ticks there are within a fixed time 573843e1988Sjohnlev * period. We're going to be fairly coarse here. This timer is 574843e1988Sjohnlev * just being used to detect a stalled panic, so as long as we have 575843e1988Sjohnlev * the right order of magnitude, everything should be fine. 576843e1988Sjohnlev */ 577843e1988Sjohnlev xpv_apicadr[APIC_SPUR_INT_REG] = AV_UNIT_ENABLE | APIC_SPUR_INTR; 578843e1988Sjohnlev xpv_apicadr[APIC_LOCAL_TIMER] = AV_MASK; 579843e1988Sjohnlev xpv_apicadr[APIC_INT_VECT0] = AV_MASK; /* local intr reg 0 */ 580843e1988Sjohnlev 581843e1988Sjohnlev xpv_apicadr[APIC_DIVIDE_REG] = 0; 582843e1988Sjohnlev xpv_apicadr[APIC_INIT_COUNT] = APIC_MAXVAL; 583843e1988Sjohnlev drv_usecwait(XPV_TIMER_INTERVAL); 584843e1988Sjohnlev apic_ticks = APIC_MAXVAL - xpv_apicadr[APIC_CURR_COUNT]; 585843e1988Sjohnlev 586843e1988Sjohnlev /* 587843e1988Sjohnlev * apic_ticks now represents roughly how many apic ticks comprise 588843e1988Sjohnlev * one timeout interval. Program the timer to send us an interrupt 589843e1988Sjohnlev * every time that interval expires. 
590843e1988Sjohnlev */ 59141afdfa7SKrishnendu Sadhukhan - Sun Microsystems xpv_apicadr[APIC_LOCAL_TIMER] = T_XPV_TIMER | AV_PERIODIC; 592843e1988Sjohnlev xpv_apicadr[APIC_INIT_COUNT] = apic_ticks; 593843e1988Sjohnlev xpv_apicadr[APIC_EOI_REG] = 0; 594843e1988Sjohnlev } 595843e1988Sjohnlev 596843e1988Sjohnlev void 597843e1988Sjohnlev xpv_timer_tick(void) 598843e1988Sjohnlev { 599843e1988Sjohnlev static int ticks = 0; 600843e1988Sjohnlev 601843e1988Sjohnlev if (ticks++ >= MICROSEC / XPV_TIMER_INTERVAL) { 602843e1988Sjohnlev ticks = 0; 603843e1988Sjohnlev if (dump_timeleft && (--dump_timeleft == 0)) 604843e1988Sjohnlev panic("Xen panic timeout\n"); 605843e1988Sjohnlev } 606843e1988Sjohnlev xpv_apicadr[APIC_EOI_REG] = 0; 607843e1988Sjohnlev } 608843e1988Sjohnlev 609843e1988Sjohnlev void 610843e1988Sjohnlev xpv_interrupt(void) 611843e1988Sjohnlev { 612843e1988Sjohnlev #ifdef DEBUG 613843e1988Sjohnlev static int cnt = 0; 614843e1988Sjohnlev 615843e1988Sjohnlev if (cnt++ < 10) 616843e1988Sjohnlev xpv_panic_printf("Unexpected interrupt received.\n"); 617843e1988Sjohnlev if ((cnt < 1000) && ((cnt % 100) == 0)) 618843e1988Sjohnlev xpv_panic_printf("%d unexpected interrupts received.\n", cnt); 619843e1988Sjohnlev #endif 620843e1988Sjohnlev 621843e1988Sjohnlev xpv_apicadr[APIC_EOI_REG] = 0; 622843e1988Sjohnlev } 623843e1988Sjohnlev 624843e1988Sjohnlev /* 625843e1988Sjohnlev * Managing time in panic context is trivial. We only have a single CPU, 626843e1988Sjohnlev * we never get rescheduled, we never get suspended. We just need to 627843e1988Sjohnlev * convert clock ticks into nanoseconds. 
628843e1988Sjohnlev */ 629843e1988Sjohnlev static hrtime_t 630843e1988Sjohnlev xpv_panic_gethrtime(void) 631843e1988Sjohnlev { 632843e1988Sjohnlev hrtime_t tsc, hrt; 633843e1988Sjohnlev unsigned int *l = (unsigned int *)&(tsc); 634843e1988Sjohnlev 635843e1988Sjohnlev tsc = __rdtsc_insn(); 636843e1988Sjohnlev hrt = (mul32(l[1], nsec_scale) << NSEC_SHIFT) + 637843e1988Sjohnlev (mul32(l[0], nsec_scale) >> (32 - NSEC_SHIFT)); 638843e1988Sjohnlev 639843e1988Sjohnlev return (hrt); 640843e1988Sjohnlev } 641843e1988Sjohnlev 642843e1988Sjohnlev static void 643843e1988Sjohnlev xpv_panic_time_init() 644843e1988Sjohnlev { 645843e1988Sjohnlev nsec_scale = 646843e1988Sjohnlev CPU->cpu_m.mcpu_vcpu_info->time.tsc_to_system_mul >> NSEC_SHIFT; 647843e1988Sjohnlev 648843e1988Sjohnlev gethrtimef = xpv_panic_gethrtime; 649843e1988Sjohnlev } 650843e1988Sjohnlev 651843e1988Sjohnlev static void 652843e1988Sjohnlev xpv_panicsys(struct regs *rp, char *fmt, ...) 653843e1988Sjohnlev { 654843e1988Sjohnlev extern void panicsys(const char *, va_list, struct regs *, int); 655843e1988Sjohnlev va_list alist; 656843e1988Sjohnlev 657843e1988Sjohnlev va_start(alist, fmt); 658843e1988Sjohnlev panicsys(fmt, alist, rp, 1); 659843e1988Sjohnlev va_end(alist); 660843e1988Sjohnlev } 661843e1988Sjohnlev 662843e1988Sjohnlev void 663843e1988Sjohnlev xpv_do_panic(void *arg) 664843e1988Sjohnlev { 665843e1988Sjohnlev struct panic_info *pip = (struct panic_info *)arg; 666843e1988Sjohnlev int l; 667843e1988Sjohnlev struct cregs creg; 668843e1988Sjohnlev #if defined(__amd64) 669843e1988Sjohnlev extern uintptr_t postbootkernelbase; 670843e1988Sjohnlev #endif 671843e1988Sjohnlev 672843e1988Sjohnlev if (xpv_panicking++ > 0) 673843e1988Sjohnlev panic("multiple calls to xpv_do_panic()"); 674843e1988Sjohnlev 675843e1988Sjohnlev /* 676843e1988Sjohnlev * Indicate to the underlying panic framework that a panic has been 677843e1988Sjohnlev * initiated. This is ordinarily done as part of vpanic(). 
Since 678843e1988Sjohnlev * we already have all the register state saved by the hypervisor, 679843e1988Sjohnlev * we skip that and jump straight into the panic processing code. 680e4b86885SCheng Sean Ye * 681e4b86885SCheng Sean Ye * XXX If another thread grabs and wins the panic_quiesce trigger 682e4b86885SCheng Sean Ye * then we'll have two threads in panicsys believing they are in 683e4b86885SCheng Sean Ye * charge of the panic attempt! 684843e1988Sjohnlev */ 685843e1988Sjohnlev (void) panic_trigger(&panic_quiesce); 686843e1988Sjohnlev 687843e1988Sjohnlev #if defined(__amd64) 688843e1988Sjohnlev /* 689843e1988Sjohnlev * bzero() and bcopy() get unhappy when asked to operate on 690843e1988Sjohnlev * addresses outside of the kernel. At this point Xen is really a 691843e1988Sjohnlev * part of the kernel, so we update the routines' notion of where 692843e1988Sjohnlev * the kernel starts. 693843e1988Sjohnlev */ 694843e1988Sjohnlev postbootkernelbase = xen_virt_start; 695843e1988Sjohnlev #endif 696843e1988Sjohnlev 697843e1988Sjohnlev #if defined(HYPERVISOR_VIRT_END) 698843e1988Sjohnlev xpv_end = HYPERVISOR_VIRT_END; 699843e1988Sjohnlev #else 700843e1988Sjohnlev xpv_end = (uintptr_t)UINTPTR_MAX - sizeof (uintptr_t); 701843e1988Sjohnlev #endif 702843e1988Sjohnlev 703843e1988Sjohnlev /* 704843e1988Sjohnlev * If we were redirecting console output to the hypervisor, we have 705843e1988Sjohnlev * to stop. 
706843e1988Sjohnlev */ 707843e1988Sjohnlev use_polledio = B_FALSE; 7080d928757SGary Mills if (boot_console_type(NULL) == CONS_HYPERVISOR) { 709843e1988Sjohnlev bcons_device_change(CONS_HYPERVISOR); 710843e1988Sjohnlev } else if (cons_polledio != NULL && 711843e1988Sjohnlev cons_polledio->cons_polledio_putchar != NULL) { 712843e1988Sjohnlev if (cons_polledio->cons_polledio_enter != NULL) 713843e1988Sjohnlev cons_polledio->cons_polledio_enter( 714843e1988Sjohnlev cons_polledio->cons_polledio_argument); 715843e1988Sjohnlev use_polledio = 1; 716843e1988Sjohnlev } 717843e1988Sjohnlev 718843e1988Sjohnlev /* Make sure we handle all console output from here on. */ 719843e1988Sjohnlev sysp->bsvc_putchar = xpv_panic_putc; 720843e1988Sjohnlev 721843e1988Sjohnlev /* 722843e1988Sjohnlev * If we find an unsupported panic_info structure, there's not much 723843e1988Sjohnlev * we can do other than complain, plow on, and hope for the best. 724843e1988Sjohnlev */ 725843e1988Sjohnlev if (pip->pi_version != PANIC_INFO_VERSION) 726843e1988Sjohnlev xpv_panic_printf("Warning: Xen is using an unsupported " 727843e1988Sjohnlev "version of the panic_info structure.\n"); 728843e1988Sjohnlev 729843e1988Sjohnlev xpv_panic_info = pip; 730843e1988Sjohnlev 731a576ab5bSrab #if defined(__amd64) 732a576ab5bSrab kpm1_low = (uintptr_t)xpv_panic_info->pi_ram_start; 733a576ab5bSrab if (xpv_panic_info->pi_xen_start == NULL) { 734a576ab5bSrab kpm1_high = (uintptr_t)xpv_panic_info->pi_ram_end; 735a576ab5bSrab } else { 736a576ab5bSrab kpm1_high = (uintptr_t)xpv_panic_info->pi_xen_start; 737a576ab5bSrab kpm2_low = (uintptr_t)xpv_panic_info->pi_xen_end; 738a576ab5bSrab kpm2_high = (uintptr_t)xpv_panic_info->pi_ram_end; 739a576ab5bSrab } 740a576ab5bSrab #endif 741a576ab5bSrab 742843e1988Sjohnlev /* 743843e1988Sjohnlev * Make sure we are running on the Solaris %gs. The Xen panic code 744843e1988Sjohnlev * should already have set up the GDT properly. 
745843e1988Sjohnlev */ 746843e1988Sjohnlev xpv_panic_resetgs(); 747843e1988Sjohnlev #if defined(__amd64) 748843e1988Sjohnlev wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]); 749843e1988Sjohnlev #endif 750843e1988Sjohnlev 751843e1988Sjohnlev xpv_panic_time_init(); 752843e1988Sjohnlev 753843e1988Sjohnlev /* 754843e1988Sjohnlev * Switch to our own IDT, avoiding any accidental returns to Xen 755843e1988Sjohnlev * world. 756843e1988Sjohnlev */ 757843e1988Sjohnlev switch_to_xpv_panic_idt(); 758843e1988Sjohnlev 759843e1988Sjohnlev /* 760843e1988Sjohnlev * Initialize the APIC timer, which is used to detect a hung dump 761843e1988Sjohnlev * attempt. 762843e1988Sjohnlev */ 763843e1988Sjohnlev xpv_apicadr = pip->pi_apic; 764843e1988Sjohnlev xpv_apic_clkinit(); 765843e1988Sjohnlev 766843e1988Sjohnlev /* 767843e1988Sjohnlev * Set up a few values that we'll need repeatedly. 768843e1988Sjohnlev */ 769843e1988Sjohnlev getcregs(&creg); 770843e1988Sjohnlev xpv_panic_cr3 = creg.cr_cr3; 771843e1988Sjohnlev for (l = mmu.max_level; l >= 0; l--) 772843e1988Sjohnlev xpv_panic_nptes[l] = mmu.ptes_per_table; 773843e1988Sjohnlev #ifdef __i386 774843e1988Sjohnlev if (mmu.pae_hat) 775843e1988Sjohnlev xpv_panic_nptes[mmu.max_level] = 4; 776843e1988Sjohnlev #endif 777843e1988Sjohnlev 778843e1988Sjohnlev /* Add the fake Xen module to the module list */ 779843e1988Sjohnlev if (xpv_module != NULL) { 780843e1988Sjohnlev extern int last_module_id; 781843e1988Sjohnlev 782843e1988Sjohnlev xpv_modctl->mod_id = last_module_id++; 783843e1988Sjohnlev xpv_modctl->mod_next = &modules; 784843e1988Sjohnlev xpv_modctl->mod_prev = modules.mod_prev; 785843e1988Sjohnlev modules.mod_prev->mod_next = xpv_modctl; 786843e1988Sjohnlev modules.mod_prev = xpv_modctl; 787843e1988Sjohnlev } 788e4b86885SCheng Sean Ye 789e4b86885SCheng Sean Ye if (pip->pi_mca.mpd_magic == MCA_PANICDATA_MAGIC) 790e4b86885SCheng Sean Ye xpv_mca_panic_data = &pip->pi_mca; 791e4b86885SCheng Sean Ye 792843e1988Sjohnlev xpv_panic_printf = printf; 
793843e1988Sjohnlev xpv_panicsys((struct regs *)pip->pi_regs, pip->pi_panicstr); 794843e1988Sjohnlev xpv_panic_printf("Failed to reboot following panic.\n"); 795843e1988Sjohnlev for (;;) 796843e1988Sjohnlev ; 797843e1988Sjohnlev } 798843e1988Sjohnlev 799843e1988Sjohnlev /* 800843e1988Sjohnlev * Set up the necessary data structures to pretend that the Xen hypervisor 801843e1988Sjohnlev * is a loadable module, allowing mdb to find the Xen symbols in a crash 802843e1988Sjohnlev * dump. Since these symbols all map to VA space Solaris doesn't normally 803843e1988Sjohnlev * have access to, we don't link these structures into the kernel's lists 804843e1988Sjohnlev * until/unless we hit a Xen panic. 805843e1988Sjohnlev * 806843e1988Sjohnlev * The observant reader will note a striking amount of overlap between this 807843e1988Sjohnlev * code and that found in krtld. While it would be handy if we could just 808843e1988Sjohnlev * ask krtld to do this work for us, it's not that simple. Among the 809843e1988Sjohnlev * complications: we're not actually loading the text here (grub did it at 810843e1988Sjohnlev * boot), the .text section is writable, there are no relocations to do, 811843e1988Sjohnlev * none of the module text/data is in readable memory, etc. Training krtld 812843e1988Sjohnlev * to deal with this weird module is as complicated, and more risky, than 813843e1988Sjohnlev * reimplementing the necessary subset of it here. 
814843e1988Sjohnlev */ 815843e1988Sjohnlev static void 816843e1988Sjohnlev init_xen_module() 817843e1988Sjohnlev { 818843e1988Sjohnlev struct _buf *file = NULL; 819843e1988Sjohnlev struct module *mp; 820843e1988Sjohnlev struct modctl *mcp; 821843e1988Sjohnlev int i, shn; 822843e1988Sjohnlev Shdr *shp, *ctf_shp; 823843e1988Sjohnlev char *names = NULL; 824843e1988Sjohnlev size_t n, namesize, text_align, data_align; 825843e1988Sjohnlev #if defined(__amd64) 826843e1988Sjohnlev const char machine = EM_AMD64; 827843e1988Sjohnlev #else 828843e1988Sjohnlev const char machine = EM_386; 829843e1988Sjohnlev #endif 830843e1988Sjohnlev 831843e1988Sjohnlev /* Allocate and init the module structure */ 832843e1988Sjohnlev mp = kmem_zalloc(sizeof (*mp), KM_SLEEP); 833843e1988Sjohnlev mp->filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP); 834843e1988Sjohnlev (void) strcpy(mp->filename, XPV_FILENAME); 835843e1988Sjohnlev 836843e1988Sjohnlev /* Allocate and init the modctl structure */ 837843e1988Sjohnlev mcp = kmem_zalloc(sizeof (*mcp), KM_SLEEP); 838843e1988Sjohnlev mcp->mod_modname = kobj_zalloc(strlen(XPV_MODNAME) + 1, KM_SLEEP); 839843e1988Sjohnlev (void) strcpy(mcp->mod_modname, XPV_MODNAME); 840843e1988Sjohnlev mcp->mod_filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP); 841843e1988Sjohnlev (void) strcpy(mcp->mod_filename, XPV_FILENAME); 842843e1988Sjohnlev mcp->mod_inprogress_thread = (kthread_id_t)-1; 843843e1988Sjohnlev mcp->mod_ref = 1; 844843e1988Sjohnlev mcp->mod_loaded = 1; 845843e1988Sjohnlev mcp->mod_loadcnt = 1; 846843e1988Sjohnlev mcp->mod_mp = mp; 847843e1988Sjohnlev 848843e1988Sjohnlev /* 849843e1988Sjohnlev * Try to open a Xen image that hasn't had its symbol and CTF 850843e1988Sjohnlev * information stripped off. 
851843e1988Sjohnlev */ 852843e1988Sjohnlev file = kobj_open_file(XPV_FILENAME); 853843e1988Sjohnlev if (file == (struct _buf *)-1) { 854843e1988Sjohnlev file = NULL; 855843e1988Sjohnlev goto err; 856843e1988Sjohnlev } 857843e1988Sjohnlev 858843e1988Sjohnlev /* 859843e1988Sjohnlev * Read the header and ensure that this is an ELF file for the 860843e1988Sjohnlev * proper ISA. If it's not, somebody has done something very 861843e1988Sjohnlev * stupid. Why bother? See Mencken. 862843e1988Sjohnlev */ 863843e1988Sjohnlev if (kobj_read_file(file, (char *)&mp->hdr, sizeof (mp->hdr), 0) < 0) 864843e1988Sjohnlev goto err; 865843e1988Sjohnlev for (i = 0; i < SELFMAG; i++) 866843e1988Sjohnlev if (mp->hdr.e_ident[i] != ELFMAG[i]) 867843e1988Sjohnlev goto err; 868843e1988Sjohnlev if ((mp->hdr.e_ident[EI_DATA] != ELFDATA2LSB) || 869843e1988Sjohnlev (mp->hdr.e_machine != machine)) 870843e1988Sjohnlev goto err; 871843e1988Sjohnlev 872843e1988Sjohnlev /* Read in the section headers */ 873843e1988Sjohnlev n = mp->hdr.e_shentsize * mp->hdr.e_shnum; 874843e1988Sjohnlev mp->shdrs = kmem_zalloc(n, KM_SLEEP); 875843e1988Sjohnlev if (kobj_read_file(file, mp->shdrs, n, mp->hdr.e_shoff) < 0) 876843e1988Sjohnlev goto err; 877843e1988Sjohnlev 878843e1988Sjohnlev /* Read the section names */ 879843e1988Sjohnlev shp = (Shdr *)(mp->shdrs + mp->hdr.e_shstrndx * mp->hdr.e_shentsize); 880843e1988Sjohnlev namesize = shp->sh_size; 881843e1988Sjohnlev names = kmem_zalloc(shp->sh_size, KM_SLEEP); 882843e1988Sjohnlev if (kobj_read_file(file, names, shp->sh_size, shp->sh_offset) < 0) 883843e1988Sjohnlev goto err; 884843e1988Sjohnlev 885843e1988Sjohnlev /* 886843e1988Sjohnlev * Fill in the text and data size fields. 
887843e1988Sjohnlev */ 888843e1988Sjohnlev ctf_shp = NULL; 889843e1988Sjohnlev text_align = data_align = 0; 890843e1988Sjohnlev for (shn = 1; shn < mp->hdr.e_shnum; shn++) { 891843e1988Sjohnlev shp = (Shdr *)(mp->shdrs + shn * mp->hdr.e_shentsize); 892843e1988Sjohnlev 893843e1988Sjohnlev /* Sanity check the offset of the section name */ 894843e1988Sjohnlev if (shp->sh_name >= namesize) 895843e1988Sjohnlev continue; 896843e1988Sjohnlev 897843e1988Sjohnlev /* If we find the symtab section, remember it for later. */ 898843e1988Sjohnlev if (shp->sh_type == SHT_SYMTAB) { 899843e1988Sjohnlev mp->symtbl_section = shn; 900843e1988Sjohnlev mp->symhdr = shp; 901843e1988Sjohnlev continue; 902843e1988Sjohnlev } 903843e1988Sjohnlev 904843e1988Sjohnlev /* If we find the CTF section, remember it for later. */ 905843e1988Sjohnlev if ((shp->sh_size != 0) && 906843e1988Sjohnlev (strcmp(names + shp->sh_name, ".SUNW_ctf") == 0)) { 907843e1988Sjohnlev ctf_shp = shp; 908843e1988Sjohnlev continue; 909843e1988Sjohnlev } 910843e1988Sjohnlev 911843e1988Sjohnlev if (!(shp->sh_flags & SHF_ALLOC)) 912843e1988Sjohnlev continue; 913843e1988Sjohnlev 914843e1988Sjohnlev /* 915843e1988Sjohnlev * Xen marks its text section as writable, so we need to 916843e1988Sjohnlev * look for the name - not just the flag. 
917843e1988Sjohnlev */ 918a9ba5504SRichard PALO if ((strcmp(&names[shp->sh_name], ".text") != 0) && 919843e1988Sjohnlev (shp->sh_flags & SHF_WRITE) != 0) { 920843e1988Sjohnlev if (shp->sh_addralign > data_align) 921843e1988Sjohnlev data_align = shp->sh_addralign; 922843e1988Sjohnlev mp->data_size = ALIGN(mp->data_size, data_align); 923843e1988Sjohnlev mp->data_size += ALIGN(shp->sh_size, 8); 924843e1988Sjohnlev if (mp->data == NULL || mp->data > (char *)shp->sh_addr) 925843e1988Sjohnlev mp->data = (char *)shp->sh_addr; 926843e1988Sjohnlev } else { 927843e1988Sjohnlev if (shp->sh_addralign > text_align) 928843e1988Sjohnlev text_align = shp->sh_addralign; 929843e1988Sjohnlev mp->text_size = ALIGN(mp->text_size, text_align); 930843e1988Sjohnlev mp->text_size += ALIGN(shp->sh_size, 8); 931843e1988Sjohnlev if (mp->text == NULL || mp->text > (char *)shp->sh_addr) 932843e1988Sjohnlev mp->text = (char *)shp->sh_addr; 933843e1988Sjohnlev } 934843e1988Sjohnlev } 935843e1988Sjohnlev kmem_free(names, namesize); 936843e1988Sjohnlev names = NULL; 9378cdfbd11Snn35248 shp = NULL; 938843e1988Sjohnlev mcp->mod_text = mp->text; 939843e1988Sjohnlev mcp->mod_text_size = mp->text_size; 940843e1988Sjohnlev 941843e1988Sjohnlev /* 942843e1988Sjohnlev * If we have symbol table and string table sections, read them in 943843e1988Sjohnlev * now. If we don't, we just plow on. We'll still get a valid 944843e1988Sjohnlev * core dump, but finding anything useful will be just a bit 945843e1988Sjohnlev * harder. 946843e1988Sjohnlev * 947843e1988Sjohnlev * Note: we don't bother with a hash table. We'll never do a 948843e1988Sjohnlev * symbol lookup unless we crash, and then mdb creates its own. We 949843e1988Sjohnlev * also don't try to perform any relocations. Xen should be loaded 950843e1988Sjohnlev * exactly where the ELF file indicates, and the symbol information 951843e1988Sjohnlev * in the file should be complete and correct already. Static 952843e1988Sjohnlev * linking ain't all bad. 
953843e1988Sjohnlev */ 954843e1988Sjohnlev if ((mp->symhdr != NULL) && (mp->symhdr->sh_link < mp->hdr.e_shnum)) { 955843e1988Sjohnlev mp->strhdr = (Shdr *) 956843e1988Sjohnlev (mp->shdrs + mp->symhdr->sh_link * mp->hdr.e_shentsize); 957843e1988Sjohnlev mp->nsyms = mp->symhdr->sh_size / mp->symhdr->sh_entsize; 958843e1988Sjohnlev 959843e1988Sjohnlev /* Allocate space for the symbol table and strings. */ 960843e1988Sjohnlev mp->symsize = mp->symhdr->sh_size + 961843e1988Sjohnlev mp->nsyms * sizeof (symid_t) + mp->strhdr->sh_size; 962843e1988Sjohnlev mp->symspace = kmem_zalloc(mp->symsize, KM_SLEEP); 963843e1988Sjohnlev mp->symtbl = mp->symspace; 964843e1988Sjohnlev mp->strings = (char *)(mp->symtbl + mp->symhdr->sh_size); 965843e1988Sjohnlev 966843e1988Sjohnlev if ((kobj_read_file(file, mp->symtbl, 967843e1988Sjohnlev mp->symhdr->sh_size, mp->symhdr->sh_offset) < 0) || 968843e1988Sjohnlev (kobj_read_file(file, mp->strings, 969843e1988Sjohnlev mp->strhdr->sh_size, mp->strhdr->sh_offset) < 0)) 970843e1988Sjohnlev goto err; 971843e1988Sjohnlev } 972843e1988Sjohnlev 973843e1988Sjohnlev /* 974843e1988Sjohnlev * Read in the CTF section 975843e1988Sjohnlev */ 976843e1988Sjohnlev if ((ctf_shp != NULL) && ((moddebug & MODDEBUG_NOCTF) == 0)) { 9778cdfbd11Snn35248 mp->ctfdata = kmem_zalloc(ctf_shp->sh_size, KM_SLEEP); 978843e1988Sjohnlev mp->ctfsize = ctf_shp->sh_size; 979843e1988Sjohnlev if (kobj_read_file(file, mp->ctfdata, mp->ctfsize, 980843e1988Sjohnlev ctf_shp->sh_offset) < 0) 981843e1988Sjohnlev goto err; 982843e1988Sjohnlev } 983843e1988Sjohnlev 984843e1988Sjohnlev kobj_close_file(file); 985843e1988Sjohnlev 986843e1988Sjohnlev xpv_module = mp; 987843e1988Sjohnlev xpv_modctl = mcp; 988843e1988Sjohnlev return; 989843e1988Sjohnlev 990843e1988Sjohnlev err: 991843e1988Sjohnlev cmn_err(CE_WARN, "Failed to initialize xpv module."); 992843e1988Sjohnlev if (file != NULL) 993843e1988Sjohnlev kobj_close_file(file); 994843e1988Sjohnlev 995843e1988Sjohnlev kmem_free(mp->filename, 
strlen(XPV_FILENAME) + 1); 996843e1988Sjohnlev if (mp->shdrs != NULL) 997843e1988Sjohnlev kmem_free(mp->shdrs, mp->hdr.e_shentsize * mp->hdr.e_shnum); 998843e1988Sjohnlev if (mp->symspace != NULL) 999843e1988Sjohnlev kmem_free(mp->symspace, mp->symsize); 1000843e1988Sjohnlev if (mp->ctfdata != NULL) 1001843e1988Sjohnlev kmem_free(mp->ctfdata, mp->ctfsize); 1002843e1988Sjohnlev kmem_free(mp, sizeof (*mp)); 1003843e1988Sjohnlev kmem_free(mcp->mod_filename, strlen(XPV_FILENAME) + 1); 1004843e1988Sjohnlev kmem_free(mcp->mod_modname, strlen(XPV_MODNAME) + 1); 1005843e1988Sjohnlev kmem_free(mcp, sizeof (*mcp)); 1006843e1988Sjohnlev if (names != NULL) 1007843e1988Sjohnlev kmem_free(names, namesize); 1008843e1988Sjohnlev } 1009843e1988Sjohnlev 1010843e1988Sjohnlev void 1011843e1988Sjohnlev xpv_panic_init() 1012843e1988Sjohnlev { 1013843e1988Sjohnlev xen_platform_op_t op; 1014843e1988Sjohnlev int i; 1015843e1988Sjohnlev 1016843e1988Sjohnlev ASSERT(DOMAIN_IS_INITDOMAIN(xen_info)); 1017843e1988Sjohnlev 1018843e1988Sjohnlev for (i = 0; i < mmu.num_level; i++) 1019843e1988Sjohnlev ptable_pfn[i] = PFN_INVALID; 1020843e1988Sjohnlev 1021843e1988Sjohnlev /* Let Xen know where to jump if/when it panics. */ 1022843e1988Sjohnlev op.cmd = XENPF_panic_init; 1023843e1988Sjohnlev op.interface_version = XENPF_INTERFACE_VERSION; 1024843e1988Sjohnlev op.u.panic_init.panic_addr = (unsigned long)xpv_panic_hdlr; 1025843e1988Sjohnlev 1026843e1988Sjohnlev (void) HYPERVISOR_platform_op(&op); 1027843e1988Sjohnlev 1028843e1988Sjohnlev init_xen_module(); 1029843e1988Sjohnlev } 1030