1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ 28 /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ 29 /* All Rights Reserved */ 30 /* */ 31 /* Copyright (c) 1987, 1988 Microsoft Corporation */ 32 /* All Rights Reserved */ 33 /* */ 34 35 #pragma ident "%Z%%M% %I% %E% SMI" 36 37 #include <sys/types.h> 38 #include <sys/sysmacros.h> 39 #include <sys/param.h> 40 #include <sys/signal.h> 41 #include <sys/systm.h> 42 #include <sys/user.h> 43 #include <sys/proc.h> 44 #include <sys/disp.h> 45 #include <sys/class.h> 46 #include <sys/core.h> 47 #include <sys/syscall.h> 48 #include <sys/cpuvar.h> 49 #include <sys/vm.h> 50 #include <sys/sysinfo.h> 51 #include <sys/fault.h> 52 #include <sys/stack.h> 53 #include <sys/mmu.h> 54 #include <sys/psw.h> 55 #include <sys/regset.h> 56 #include <sys/fp.h> 57 #include <sys/trap.h> 58 #include <sys/kmem.h> 59 #include <sys/vtrace.h> 60 #include <sys/cmn_err.h> 61 #include <sys/prsystm.h> 62 #include <sys/mutex_impl.h> 63 #include <sys/machsystm.h> 64 #include <sys/archsystm.h> 65 #include <sys/sdt.h> 66 #include <sys/avintr.h> 67 #include <sys/kobj.h> 68 69 #include <vm/hat.h> 70 71 #include <vm/seg_kmem.h> 72 #include <vm/as.h> 73 #include <vm/seg.h> 74 #include <vm/hat_pte.h> 75 76 #include <sys/procfs.h> 77 78 #include <sys/reboot.h> 79 #include <sys/debug.h> 80 #include <sys/debugreg.h> 81 #include <sys/modctl.h> 82 #include <sys/aio_impl.h> 83 #include <sys/tnf.h> 84 #include <sys/tnf_probe.h> 85 #include <sys/cred.h> 86 #include <sys/mman.h> 87 #include <sys/x86_archext.h> 88 #include <sys/copyops.h> 89 #include <c2/audit.h> 90 #include <sys/ftrace.h> 91 #include <sys/panic.h> 92 #include <sys/traptrace.h> 93 #include <sys/ontrap.h> 94 #include <sys/cpc_impl.h> 95 96 #define USER 0x10000 /* user-mode flag added to trap type */ 97 98 static const char *trap_type_mnemonic[] = { 99 "de", "db", "2", "bp", 100 "of", "br", "ud", "nm", 101 "df", "9", "ts", "np", 102 "ss", "gp", "pf", "15", 103 "mf", "ac", "mc", "xf" 104 }; 105 106 static const char *trap_type[] = { 107 "Divide error", /* trap id 0 */ 108 "Debug", /* trap id 1 */ 109 "NMI interrupt", /* trap id 2 */ 110 "Breakpoint", /* trap id 3 */ 111 "Overflow", /* trap id 4 */ 112 "BOUND range exceeded", /* trap id 5 */ 113 "Invalid opcode", /* trap id 6 */ 114 "Device not available", /* trap id 7 */ 115 "Double fault", /* trap id 8 */ 116 "Coprocessor segment overrun", /* trap id 9 */ 117 "Invalid TSS", /* trap id 10 */ 118 "Segment not present", /* trap id 11 */ 119 "Stack segment fault", /* trap id 12 */ 120 "General protection", /* trap id 13 */ 121 "Page fault", /* trap id 14 */ 122 "Reserved", /* trap id 15 */ 123 "x87 floating point error", /* trap id 16 */ 124 "Alignment check", /* trap id 17 */ 125 "Machine check", /* trap id 18 */ 126 "SIMD floating point exception", /* trap id 19 */ 127 }; 128 129 #define TRAP_TYPES (sizeof (trap_type) / sizeof (trap_type[0])) 130 131 int tudebug = 0; 132 int tudebugbpt = 0; 133 int tudebugfpe = 0; 134 int tudebugsse = 0; 135 136 #if defined(TRAPDEBUG) || defined(lint) 137 int tdebug = 0; 138 int lodebug = 0; 139 int faultdebug = 0; 140 #else 141 #define tdebug 0 142 #define lodebug 0 143 #define faultdebug 0 144 #endif /* defined(TRAPDEBUG) || defined(lint) */ 145 146 #if defined(TRAPTRACE) 147 static void dump_ttrace(void); 148 #endif /* TRAPTRACE */ 149 static void dumpregs(struct regs *); 150 static void showregs(uint_t, struct regs *, caddr_t); 151 static void dump_tss(void); 152 static int kern_gpfault(struct regs *); 153 154 struct trap_info { 155 struct regs *trap_regs; 156 uint_t trap_type; 157 caddr_t trap_addr; 158 }; 159 160 /*ARGSUSED*/ 161 static int 162 die(uint_t type, struct regs *rp, caddr_t addr, processorid_t cpuid) 163 { 164 struct trap_info ti; 165 const char *trap_name, *trap_mnemonic; 166 167 if (type < TRAP_TYPES) { 168 trap_name = trap_type[type]; 169 trap_mnemonic = trap_type_mnemonic[type]; 170 } else { 171 trap_name = "trap"; 172 trap_mnemonic = "-"; 173 } 174 175 #ifdef TRAPTRACE 176 TRAPTRACE_FREEZE; 177 #endif 178 179 ti.trap_regs = rp; 180 ti.trap_type = type & ~USER; 181 ti.trap_addr = addr; 182 183 curthread->t_panic_trap = &ti; 184 185 if (type == T_PGFLT && addr < (caddr_t)KERNELBASE) { 186 panic("BAD TRAP: type=%x (#%s %s) rp=%p addr=%p " 187 "occurred in module \"%s\" due to %s", 188 type, trap_mnemonic, trap_name, (void *)rp, (void *)addr, 189 mod_containing_pc((caddr_t)rp->r_pc), 190 addr < (caddr_t)PAGESIZE ? 191 "a NULL pointer dereference" : 192 "an illegal access to a user address"); 193 } else 194 panic("BAD TRAP: type=%x (#%s %s) rp=%p addr=%p", 195 type, trap_mnemonic, trap_name, (void *)rp, (void *)addr); 196 return (0); 197 } 198 199 /* 200 * Rewrite the instruction at pc to be an int $T_SYSCALLINT instruction. 201 * 202 * int <vector> is two bytes: 0xCD <vector> 203 */ 204 205 #define SLOW_SCALL_SIZE 2 206 207 static int 208 rewrite_syscall(caddr_t pc) 209 { 210 uchar_t instr[SLOW_SCALL_SIZE] = { 0xCD, T_SYSCALLINT }; 211 212 if (uwrite(curthread->t_procp, instr, SLOW_SCALL_SIZE, 213 (uintptr_t)pc) != 0) 214 return (1); 215 216 return (0); 217 } 218 219 /* 220 * Test to see if the instruction at pc is sysenter or syscall. The second 221 * argument should be the x86 feature flag corresponding to the expected 222 * instruction. 223 * 224 * sysenter is two bytes: 0x0F 0x34 225 * syscall is two bytes: 0x0F 0x05 226 */ 227 228 #define FAST_SCALL_SIZE 2 229 230 static int 231 instr_is_fast_syscall(caddr_t pc, int which) 232 { 233 uchar_t instr[FAST_SCALL_SIZE]; 234 235 ASSERT(which == X86_SEP || which == X86_ASYSC); 236 237 if (copyin_nowatch(pc, (caddr_t)instr, FAST_SCALL_SIZE) != 0 || 238 instr[0] != 0x0F) 239 return (0); 240 241 if ((which == X86_SEP && instr[1] == 0x34) || 242 (which == X86_ASYSC && instr[1] == 0x05)) 243 return (1); 244 245 return (0); 246 } 247 248 /* 249 * Test to see if the instruction at pc is a system call instruction. 250 * 251 * The bytes of an lcall instruction used for the syscall trap. 252 * static uchar_t lcall[7] = { 0x9a, 0, 0, 0, 0, 0x7, 0 }; 253 * static uchar_t lcallalt[7] = { 0x9a, 0, 0, 0, 0, 0x27, 0 }; 254 */ 255 256 #define LCALLSIZE 7 257 258 static int 259 instr_is_syscall(caddr_t pc) 260 { 261 uchar_t instr[LCALLSIZE]; 262 263 if (copyin_nowatch(pc, (caddr_t)instr, LCALLSIZE) == 0 && 264 instr[0] == 0x9a && 265 instr[1] == 0 && 266 instr[2] == 0 && 267 instr[3] == 0 && 268 instr[4] == 0 && 269 (instr[5] == 0x7 || instr[5] == 0x27) && 270 instr[6] == 0) 271 return (1); 272 273 return (0); 274 } 275 276 #ifdef OPTERON_ERRATUM_91 277 278 /* 279 * Test to see if the instruction at pc is a prefetch instruction. 280 * 281 * The first byte of prefetch instructions is always 0x0F. 282 * The second byte is 0x18 for regular prefetch or 0x0D for AMD 3dnow prefetch. 283 * The third byte is between 0 and 3 inclusive. 284 */ 285 286 #define PREFETCHSIZE 3 287 288 static int 289 cmp_to_prefetch(uchar_t *p) 290 { 291 if (*p == 0x0F && (*(p+1) == 0x18 || *(p+1) == 0x0D) && *(p+2) <= 3) 292 return (1); 293 return (0); 294 } 295 296 static int 297 instr_is_prefetch(caddr_t pc) 298 { 299 uchar_t instr[PREFETCHSIZE]; 300 int error; 301 302 error = copyin_nowatch(pc, (caddr_t)instr, PREFETCHSIZE); 303 304 if (error == 0 && cmp_to_prefetch(instr)) 305 return (1); 306 return (0); 307 } 308 309 #endif /* OPTERON_ERRATUM_91 */ 310 311 /* 312 * Called from the trap handler when a processor trap occurs. 313 * 314 * Note: All user-level traps that might call stop() must exit 315 * trap() by 'goto out' or by falling through. 316 */ 317 void 318 trap(struct regs *rp, caddr_t addr, processorid_t cpuid) 319 { 320 kthread_t *cur_thread = curthread; 321 enum seg_rw rw; 322 unsigned type; 323 extern int stop_on_fault(uint_t, k_siginfo_t *); 324 proc_t *p = ttoproc(cur_thread); 325 klwp_t *lwp = ttolwp(cur_thread); 326 uintptr_t lofault; 327 faultcode_t pagefault(), res, errcode; 328 enum fault_type fault_type; 329 k_siginfo_t siginfo; 330 uint_t fault = 0; 331 int mstate; 332 int sicode = 0; 333 int watchcode; 334 int watchpage; 335 caddr_t vaddr; 336 size_t sz; 337 int ta; 338 339 ASSERT_STACK_ALIGNED(); 340 341 type = rp->r_trapno; 342 CPU_STATS_ADDQ(CPU, sys, trap, 1); 343 344 ASSERT(cur_thread->t_schedflag & TS_DONT_SWAP); 345 346 if (type == T_PGFLT) { 347 348 errcode = rp->r_err; 349 if (errcode & PF_ERR_WRITE) 350 rw = S_WRITE; 351 else if ((caddr_t)rp->r_pc == addr || 352 (mmu.pt_nx != 0 && (errcode & PF_ERR_EXEC))) 353 rw = S_EXEC; 354 else 355 rw = S_READ; 356 357 #if defined(__i386) 358 /* 359 * Pentium Pro work-around 360 */ 361 if ((errcode & PF_ERR_PROT) && pentiumpro_bug4046376) { 362 uint_t attr; 363 uint_t priv_violation; 364 uint_t access_violation; 365 366 if (hat_getattr(addr < (caddr_t)kernelbase ? 367 curproc->p_as->a_hat : kas.a_hat, addr, &attr) 368 == -1) { 369 errcode &= ~PF_ERR_PROT; 370 } else { 371 priv_violation = (errcode & PF_ERR_USER) && 372 !(attr & PROT_USER); 373 access_violation = (errcode & PF_ERR_WRITE) && 374 !(attr & PROT_WRITE); 375 if (!priv_violation && !access_violation) 376 goto cleanup; 377 } 378 } 379 #endif /* __i386 */ 380 381 } 382 383 if (tdebug) 384 showregs(type, rp, addr); 385 386 if (USERMODE(rp->r_cs)) { 387 /* 388 * Set up the current cred to use during this trap. u_cred 389 * no longer exists. t_cred is used instead. 390 * The current process credential applies to the thread for 391 * the entire trap. If trapping from the kernel, this 392 * should already be set up. 393 */ 394 if (cur_thread->t_cred != p->p_cred) { 395 cred_t *oldcred = cur_thread->t_cred; 396 /* 397 * DTrace accesses t_cred in probe context. t_cred 398 * must always be either NULL, or point to a valid, 399 * allocated cred structure. 400 */ 401 cur_thread->t_cred = crgetcred(); 402 crfree(oldcred); 403 } 404 ASSERT(lwp != NULL); 405 type |= USER; 406 ASSERT(lwptoregs(lwp) == rp); 407 lwp->lwp_state = LWP_SYS; 408 409 switch (type) { 410 case T_PGFLT + USER: 411 if ((caddr_t)rp->r_pc == addr) 412 mstate = LMS_TFAULT; 413 else 414 mstate = LMS_DFAULT; 415 break; 416 default: 417 mstate = LMS_TRAP; 418 break; 419 } 420 /* Kernel probe */ 421 TNF_PROBE_1(thread_state, "thread", /* CSTYLED */, 422 tnf_microstate, state, mstate); 423 mstate = new_mstate(cur_thread, mstate); 424 425 bzero(&siginfo, sizeof (siginfo)); 426 } 427 428 switch (type) { 429 case T_PGFLT + USER: 430 case T_SGLSTP: 431 case T_SGLSTP + USER: 432 case T_BPTFLT + USER: 433 break; 434 435 default: 436 FTRACE_2("trap(): type=0x%lx, regs=0x%lx", 437 (ulong_t)type, (ulong_t)rp); 438 break; 439 } 440 441 switch (type) { 442 443 case T_MCE: /* Machine check exception */ 444 case T_MCE + USER: 445 if (x86_feature & X86_MCA) { 446 if (mca_exception(rp)) 447 (void) die(type, rp, addr, cpuid); 448 type &= ~USER; 449 goto cleanup; 450 } 451 default: 452 if (type & USER) { 453 if (tudebug) 454 showregs(type, rp, (caddr_t)0); 455 printf("trap: Unknown trap type %d in user mode\n", 456 type & ~USER); 457 siginfo.si_signo = SIGILL; 458 siginfo.si_code = ILL_ILLTRP; 459 siginfo.si_addr = (caddr_t)rp->r_pc; 460 siginfo.si_trapno = type & ~USER; 461 fault = FLTILL; 462 break; 463 } else { 464 (void) die(type, rp, addr, cpuid); 465 /*NOTREACHED*/ 466 } 467 468 case T_PGFLT: /* system page fault */ 469 /* 470 * If we're under on_trap() protection (see <sys/ontrap.h>), 471 * set ot_trap and longjmp back to the on_trap() call site. 472 */ 473 if ((cur_thread->t_ontrap != NULL) && 474 (cur_thread->t_ontrap->ot_prot & OT_DATA_ACCESS)) { 475 curthread->t_ontrap->ot_trap |= OT_DATA_ACCESS; 476 longjmp(&curthread->t_ontrap->ot_jmpbuf); 477 } 478 479 /* 480 * See if we can handle as pagefault. Save lofault 481 * across this. Here we assume that an address 482 * less than KERNELBASE is a user fault. 483 * We can do this as copy.s routines verify that the 484 * starting address is less than KERNELBASE before 485 * starting and because we know that we always have 486 * KERNELBASE mapped as invalid to serve as a "barrier". 487 */ 488 lofault = cur_thread->t_lofault; 489 cur_thread->t_lofault = 0; 490 491 mstate = new_mstate(cur_thread, LMS_KFAULT); 492 493 if (addr < (caddr_t)kernelbase) { 494 res = pagefault(addr, 495 (errcode & PF_ERR_PROT)? F_PROT: F_INVAL, rw, 0); 496 if (res == FC_NOMAP && 497 addr < p->p_usrstack && 498 grow(addr)) 499 res = 0; 500 } else { 501 res = pagefault(addr, 502 (errcode & PF_ERR_PROT)? F_PROT: F_INVAL, rw, 1); 503 } 504 (void) new_mstate(cur_thread, mstate); 505 506 /* 507 * Restore lofault. If we resolved the fault, exit. 508 * If we didn't and lofault wasn't set, die. 509 */ 510 cur_thread->t_lofault = lofault; 511 if (res == 0) 512 goto cleanup; 513 514 #if defined(OPTERON_ERRATUM_93) && defined(_LP64) 515 if (lofault == 0 && opteron_erratum_93) { 516 /* 517 * Workaround for Opteron Erratum 93. On return from 518 * a System Managment Interrupt at a HLT instruction 519 * the %rip might be truncated to a 32 bit value. 520 * BIOS is supposed to fix this, but some don't. 521 * If this occurs we simply restore the high order bits. 522 * The HLT instruction is 1 byte of 0xf4. 523 */ 524 uintptr_t rip = rp->r_pc; 525 526 if ((rip & 0xfffffffful) == rip) { 527 rip |= 0xfffffffful << 32; 528 if (hat_getpfnum(kas.a_hat, (caddr_t)rip) != 529 PFN_INVALID && 530 (*(uchar_t *)rip == 0xf4 || 531 *(uchar_t *)(rip - 1) == 0xf4)) { 532 rp->r_pc = rip; 533 goto cleanup; 534 } 535 } 536 } 537 #endif /* OPTERON_ERRATUM_93 && _LP64 */ 538 539 #ifdef OPTERON_ERRATUM_91 540 if (lofault == 0 && opteron_erratum_91) { 541 /* 542 * Workaround for Opteron Erratum 91. Prefetches may 543 * generate a page fault (they're not supposed to do 544 * that!). If this occurs we simply return back to the 545 * instruction. 546 */ 547 caddr_t pc = (caddr_t)rp->r_pc; 548 549 /* 550 * If the faulting PC is not mapped, this is a 551 * legitimate kernel page fault that must result in a 552 * panic. If the faulting PC is mapped, it could contain 553 * a prefetch instruction. Check for that here. 554 */ 555 if (hat_getpfnum(kas.a_hat, pc) != PFN_INVALID) { 556 if (cmp_to_prefetch((uchar_t *)pc)) { 557 #ifdef DEBUG 558 cmn_err(CE_WARN, "Opteron erratum 91 " 559 "occurred: kernel prefetch" 560 " at %p generated a page fault!", 561 (void *)rp->r_pc); 562 #endif /* DEBUG */ 563 goto cleanup; 564 } 565 } 566 (void) die(type, rp, addr, cpuid); 567 } 568 #endif /* OPTERON_ERRATUM_91 */ 569 570 if (lofault == 0) 571 (void) die(type, rp, addr, cpuid); 572 573 /* 574 * Cannot resolve fault. Return to lofault. 575 */ 576 if (lodebug) { 577 showregs(type, rp, addr); 578 traceregs(rp); 579 } 580 if (FC_CODE(res) == FC_OBJERR) 581 res = FC_ERRNO(res); 582 else 583 res = EFAULT; 584 rp->r_r0 = res; 585 rp->r_pc = cur_thread->t_lofault; 586 goto cleanup; 587 588 case T_PGFLT + USER: /* user page fault */ 589 if (faultdebug) { 590 char *fault_str; 591 592 switch (rw) { 593 case S_READ: 594 fault_str = "read"; 595 break; 596 case S_WRITE: 597 fault_str = "write"; 598 break; 599 case S_EXEC: 600 fault_str = "exec"; 601 break; 602 default: 603 fault_str = ""; 604 break; 605 } 606 printf("user %s fault: addr=0x%lx errcode=0x%x\n", 607 fault_str, (uintptr_t)addr, errcode); 608 } 609 610 #if defined(OPTERON_ERRATUM_100) && defined(_LP64) 611 /* 612 * Workaround for AMD erratum 100 613 * 614 * A 32-bit process may receive a page fault on a non 615 * 32-bit address by mistake. The range of the faulting 616 * address will be 617 * 618 * 0xffffffff80000000 .. 0xffffffffffffffff or 619 * 0x0000000100000000 .. 0x000000017fffffff 620 * 621 * The fault is always due to an instruction fetch, however 622 * the value of r_pc should be correct (in 32 bit range), 623 * so we ignore the page fault on the bogus address. 624 */ 625 if (p->p_model == DATAMODEL_ILP32 && 626 (0xffffffff80000000 <= (uintptr_t)addr || 627 (0x100000000 <= (uintptr_t)addr && 628 (uintptr_t)addr <= 0x17fffffff))) { 629 if (!opteron_erratum_100) 630 panic("unexpected erratum #100"); 631 if (rp->r_pc <= 0xffffffff) 632 goto out; 633 } 634 #endif /* OPTERON_ERRATUM_100 && _LP64 */ 635 636 ASSERT(!(curthread->t_flag & T_WATCHPT)); 637 watchpage = (pr_watch_active(p) && pr_is_watchpage(addr, rw)); 638 #ifdef __i386 639 /* 640 * In 32-bit mode, the lcall (system call) instruction fetches 641 * one word from the stack, at the stack pointer, because of the 642 * way the call gate is constructed. This is a bogus 643 * read and should not be counted as a read watchpoint. 644 * We work around the problem here by testing to see if 645 * this situation applies and, if so, simply jumping to 646 * the code in locore.s that fields the system call trap. 647 * The registers on the stack are already set up properly 648 * due to the match between the call gate sequence and the 649 * trap gate sequence. We just have to adjust the pc. 650 */ 651 if (watchpage && addr == (caddr_t)rp->r_sp && 652 rw == S_READ && instr_is_syscall((caddr_t)rp->r_pc)) { 653 extern void watch_syscall(void); 654 655 rp->r_pc += LCALLSIZE; 656 watch_syscall(); /* never returns */ 657 /* NOTREACHED */ 658 } 659 #endif /* __i386 */ 660 vaddr = addr; 661 if (!watchpage || (sz = instr_size(rp, &vaddr, rw)) <= 0) 662 fault_type = (errcode & PF_ERR_PROT)? F_PROT: F_INVAL; 663 else if ((watchcode = pr_is_watchpoint(&vaddr, &ta, 664 sz, NULL, rw)) != 0) { 665 if (ta) { 666 do_watch_step(vaddr, sz, rw, 667 watchcode, rp->r_pc); 668 fault_type = F_INVAL; 669 } else { 670 bzero(&siginfo, sizeof (siginfo)); 671 siginfo.si_signo = SIGTRAP; 672 siginfo.si_code = watchcode; 673 siginfo.si_addr = vaddr; 674 siginfo.si_trapafter = 0; 675 siginfo.si_pc = (caddr_t)rp->r_pc; 676 fault = FLTWATCH; 677 break; 678 } 679 } else { 680 /* XXX pr_watch_emul() never succeeds (for now) */ 681 if (rw != S_EXEC && pr_watch_emul(rp, vaddr, rw)) 682 goto out; 683 do_watch_step(vaddr, sz, rw, 0, 0); 684 fault_type = F_INVAL; 685 } 686 687 res = pagefault(addr, fault_type, rw, 0); 688 689 /* 690 * If pagefault() succeeded, ok. 691 * Otherwise attempt to grow the stack. 692 */ 693 if (res == 0 || 694 (res == FC_NOMAP && 695 addr < p->p_usrstack && 696 grow(addr))) { 697 lwp->lwp_lastfault = FLTPAGE; 698 lwp->lwp_lastfaddr = addr; 699 if (prismember(&p->p_fltmask, FLTPAGE)) { 700 bzero(&siginfo, sizeof (siginfo)); 701 siginfo.si_addr = addr; 702 (void) stop_on_fault(FLTPAGE, &siginfo); 703 } 704 goto out; 705 } else if (res == FC_PROT && addr < p->p_usrstack && 706 (mmu.pt_nx != 0 && (errcode & PF_ERR_EXEC))) { 707 report_stack_exec(p, addr); 708 } 709 710 #ifdef OPTERON_ERRATUM_91 711 /* 712 * Workaround for Opteron Erratum 91. Prefetches may generate a 713 * page fault (they're not supposed to do that!). If this 714 * occurs we simply return back to the instruction. 715 * 716 * We rely on copyin to properly fault in the page with r_pc. 717 */ 718 if (opteron_erratum_91 && 719 addr != (caddr_t)rp->r_pc && 720 instr_is_prefetch((caddr_t)rp->r_pc)) { 721 #ifdef DEBUG 722 cmn_err(CE_WARN, "Opteron erratum 91 occurred: " 723 "prefetch at %p in pid %d generated a trap!", 724 (void *)rp->r_pc, p->p_pid); 725 #endif /* DEBUG */ 726 goto out; 727 } 728 #endif /* OPTERON_ERRATUM_91 */ 729 730 if (tudebug) 731 showregs(type, rp, addr); 732 /* 733 * In the case where both pagefault and grow fail, 734 * set the code to the value provided by pagefault. 735 * We map all errors returned from pagefault() to SIGSEGV. 736 */ 737 bzero(&siginfo, sizeof (siginfo)); 738 siginfo.si_addr = addr; 739 switch (FC_CODE(res)) { 740 case FC_HWERR: 741 case FC_NOSUPPORT: 742 siginfo.si_signo = SIGBUS; 743 siginfo.si_code = BUS_ADRERR; 744 fault = FLTACCESS; 745 break; 746 case FC_ALIGN: 747 siginfo.si_signo = SIGBUS; 748 siginfo.si_code = BUS_ADRALN; 749 fault = FLTACCESS; 750 break; 751 case FC_OBJERR: 752 if ((siginfo.si_errno = FC_ERRNO(res)) != EINTR) { 753 siginfo.si_signo = SIGBUS; 754 siginfo.si_code = BUS_OBJERR; 755 fault = FLTACCESS; 756 } 757 break; 758 default: /* FC_NOMAP or FC_PROT */ 759 siginfo.si_signo = SIGSEGV; 760 siginfo.si_code = 761 (res == FC_NOMAP)? SEGV_MAPERR : SEGV_ACCERR; 762 fault = FLTBOUNDS; 763 break; 764 } 765 break; 766 767 case T_ILLINST + USER: /* invalid opcode fault */ 768 /* 769 * If the syscall instruction is disabled due to LDT usage, a 770 * user program that attempts to execute it will trigger a #ud 771 * trap. Check for that case here. If this occurs on a CPU which 772 * doesn't even support syscall, the result of all of this will 773 * be to emulate that particular instruction. 774 */ 775 if (p->p_ldt != NULL && 776 instr_is_fast_syscall((caddr_t)rp->r_pc, X86_ASYSC)) { 777 if (rewrite_syscall((caddr_t)rp->r_pc) == 0) 778 goto out; 779 #ifdef DEBUG 780 else 781 cmn_err(CE_WARN, "failed to rewrite syscall " 782 "instruction in process %d", 783 curthread->t_procp->p_pid); 784 #endif /* DEBUG */ 785 } 786 /*FALLTHROUGH*/ 787 788 if (tudebug) 789 showregs(type, rp, (caddr_t)0); 790 siginfo.si_signo = SIGILL; 791 siginfo.si_code = ILL_ILLOPC; 792 siginfo.si_addr = (caddr_t)rp->r_pc; 793 fault = FLTILL; 794 break; 795 796 case T_ZERODIV + USER: /* integer divide by zero */ 797 if (tudebug && tudebugfpe) 798 showregs(type, rp, (caddr_t)0); 799 siginfo.si_signo = SIGFPE; 800 siginfo.si_code = FPE_INTDIV; 801 siginfo.si_addr = (caddr_t)rp->r_pc; 802 fault = FLTIZDIV; 803 break; 804 805 case T_OVFLW + USER: /* integer overflow */ 806 if (tudebug && tudebugfpe) 807 showregs(type, rp, (caddr_t)0); 808 siginfo.si_signo = SIGFPE; 809 siginfo.si_code = FPE_INTOVF; 810 siginfo.si_addr = (caddr_t)rp->r_pc; 811 fault = FLTIOVF; 812 break; 813 814 case T_NOEXTFLT + USER: /* math coprocessor not available */ 815 if (tudebug && tudebugfpe) 816 showregs(type, rp, addr); 817 if (fpnoextflt(rp)) { 818 siginfo.si_signo = SIGFPE; 819 siginfo.si_code = ILL_ILLOPC; 820 siginfo.si_addr = (caddr_t)rp->r_pc; 821 fault = FLTFPE; 822 } 823 break; 824 825 case T_EXTOVRFLT: /* extension overrun fault */ 826 /* check if we took a kernel trap on behalf of user */ 827 { 828 extern void ndptrap_frstor(void); 829 if (rp->r_pc != (uintptr_t)ndptrap_frstor) 830 (void) die(type, rp, addr, cpuid); 831 type |= USER; 832 } 833 /*FALLTHROUGH*/ 834 case T_EXTOVRFLT + USER: /* extension overrun fault */ 835 if (tudebug && tudebugfpe) 836 showregs(type, rp, addr); 837 if (fpextovrflt(rp)) { 838 siginfo.si_signo = SIGSEGV; 839 siginfo.si_code = SEGV_MAPERR; 840 siginfo.si_addr = (caddr_t)rp->r_pc; 841 fault = FLTBOUNDS; 842 } 843 break; 844 845 case T_EXTERRFLT: /* x87 floating point exception pending */ 846 /* check if we took a kernel trap on behalf of user */ 847 { 848 extern void ndptrap_frstor(void); 849 if (rp->r_pc != (uintptr_t)ndptrap_frstor) 850 (void) die(type, rp, addr, cpuid); 851 type |= USER; 852 } 853 /*FALLTHROUGH*/ 854 855 case T_EXTERRFLT + USER: /* x87 floating point exception pending */ 856 if (tudebug && tudebugfpe) 857 showregs(type, rp, addr); 858 if (sicode = fpexterrflt(rp)) { 859 siginfo.si_signo = SIGFPE; 860 siginfo.si_code = sicode; 861 siginfo.si_addr = (caddr_t)rp->r_pc; 862 fault = FLTFPE; 863 } 864 break; 865 866 case T_SIMDFPE + USER: /* SSE and SSE2 exceptions */ 867 if (tudebug && tudebugsse) 868 showregs(type, rp, addr); 869 if ((x86_feature & (X86_SSE|X86_SSE2)) == 0) { 870 /* 871 * There are rumours that some user instructions 872 * on older CPUs can cause this trap to occur; in 873 * which case send a SIGILL instead of a SIGFPE. 874 */ 875 siginfo.si_signo = SIGILL; 876 siginfo.si_code = ILL_ILLTRP; 877 siginfo.si_addr = (caddr_t)rp->r_pc; 878 siginfo.si_trapno = type & ~USER; 879 fault = FLTILL; 880 } else if ((sicode = fpsimderrflt(rp)) != 0) { 881 siginfo.si_signo = SIGFPE; 882 siginfo.si_code = sicode; 883 siginfo.si_addr = (caddr_t)rp->r_pc; 884 fault = FLTFPE; 885 } 886 break; 887 888 case T_BPTFLT: /* breakpoint trap */ 889 /* 890 * Kernel breakpoint traps should only happen when kmdb is 891 * active, and even then, it'll have interposed on the IDT, so 892 * control won't get here. If it does, we've hit a breakpoint 893 * without the debugger, which is very strange, and very 894 * fatal. 895 */ 896 if (tudebug && tudebugbpt) 897 showregs(type, rp, (caddr_t)0); 898 899 (void) die(type, rp, addr, cpuid); 900 break; 901 902 case T_SGLSTP: /* single step/hw breakpoint exception */ 903 if (tudebug && tudebugbpt) 904 showregs(type, rp, (caddr_t)0); 905 906 /* Now evaluate how we got here */ 907 if (lwp != NULL && (lwp->lwp_pcb.pcb_drstat & DR_SINGLESTEP)) { 908 /* 909 * i386 single-steps even through lcalls which 910 * change the privilege level. So we take a trap at 911 * the first instruction in privileged mode. 912 * 913 * Set a flag to indicate that upon completion of 914 * the system call, deal with the single-step trap. 915 * 916 * The same thing happens for sysenter, too. 917 */ 918 #if defined(__amd64) 919 if (rp->r_pc == (uintptr_t)sys_sysenter) { 920 /* 921 * Adjust the pc so that we don't execute the 922 * swapgs instruction at the head of the 923 * handler and completely confuse things. 924 */ 925 rp->r_pc = (uintptr_t) 926 _sys_sysenter_post_swapgs; 927 #elif defined(__i386) 928 if (rp->r_pc == (uintptr_t)sys_call || 929 rp->r_pc == (uintptr_t)sys_sysenter) { 930 #endif 931 rp->r_ps &= ~PS_T; /* turn off trace */ 932 lwp->lwp_pcb.pcb_flags |= DEBUG_PENDING; 933 cur_thread->t_post_sys = 1; 934 goto cleanup; 935 } 936 } 937 /* XXX - needs review on debugger interface? */ 938 if (boothowto & RB_DEBUG) 939 debug_enter((char *)NULL); 940 else 941 (void) die(type, rp, addr, cpuid); 942 break; 943 944 case T_NMIFLT: /* NMI interrupt */ 945 printf("Unexpected NMI in system mode\n"); 946 goto cleanup; 947 948 case T_NMIFLT + USER: /* NMI interrupt */ 949 printf("Unexpected NMI in user mode\n"); 950 break; 951 952 case T_GPFLT: /* general protection violation */ 953 #if defined(__amd64) 954 /* 955 * On amd64, we can get a #gp from referencing addresses 956 * in the virtual address hole e.g. from a copyin. 957 */ 958 959 /* 960 * If we're under on_trap() protection (see <sys/ontrap.h>), 961 * set ot_trap and longjmp back to the on_trap() call site. 962 */ 963 if ((cur_thread->t_ontrap != NULL) && 964 (cur_thread->t_ontrap->ot_prot & OT_DATA_ACCESS)) { 965 curthread->t_ontrap->ot_trap |= OT_DATA_ACCESS; 966 longjmp(&curthread->t_ontrap->ot_jmpbuf); 967 } 968 969 /* 970 * If we're under lofault protection (copyin etc.), 971 * longjmp back to lofault with an EFAULT. 972 */ 973 if (cur_thread->t_lofault) { 974 /* 975 * Fault is not resolvable, so just return to lofault 976 */ 977 if (lodebug) { 978 showregs(type, rp, addr); 979 traceregs(rp); 980 } 981 rp->r_r0 = EFAULT; 982 rp->r_pc = cur_thread->t_lofault; 983 goto cleanup; 984 } 985 /*FALLTHROUGH*/ 986 #endif 987 case T_STKFLT: /* stack fault */ 988 case T_TSSFLT: /* invalid TSS fault */ 989 case T_SEGFLT: /* segment not present fault */ 990 if (tudebug) 991 showregs(type, rp, (caddr_t)0); 992 if (kern_gpfault(rp)) 993 (void) die(type, rp, addr, cpuid); 994 goto cleanup; 995 996 case T_SEGFLT + USER: /* segment not present fault */ 997 #ifdef _SYSCALL32_IMPL 998 if (instr_is_syscall((caddr_t)rp->r_pc)) { 999 /* 1000 * System calls via the call gate come in through 1001 * not-present traps. 1002 * 1003 * Since this is a not-present trap, rp->r_pc points to 1004 * the trapping lcall instruction. We need to bump it 1005 * to the next insn so the app can continue on. 1006 */ 1007 rp->r_pc += LCALLSIZE; 1008 lwp->lwp_regs = rp; 1009 1010 /* 1011 * Normally the microstate of the LWP is forced back to 1012 * LMS_USER by the syscall handlers. Emulate that 1013 * behavior here. 1014 */ 1015 mstate = LMS_USER; 1016 1017 dosyscall(); 1018 goto out; 1019 } 1020 #endif /* _SYSCALL32_IMPL */ 1021 /*FALLTHROUGH*/ 1022 1023 case T_GPFLT + USER: /* general protection violation */ 1024 /* 1025 * If the current process is using a private LDT and the 1026 * trapping instruction is sysenter, the sysenter instruction 1027 * has been disabled on the CPU because it destroys segment 1028 * registers. If this is the case, rewrite the instruction to 1029 * be a safe system call and retry it. If this occurs on a CPU 1030 * which doesn't even support sysenter, the result of all of 1031 * this will be to emulate that particular instruction. 1032 */ 1033 if (p->p_ldt != NULL && 1034 instr_is_fast_syscall((caddr_t)rp->r_pc, X86_SEP)) { 1035 if (rewrite_syscall((caddr_t)rp->r_pc) == 0) 1036 goto out; 1037 #ifdef DEBUG 1038 else 1039 cmn_err(CE_WARN, "failed to rewrite sysenter " 1040 "instruction in process %d", 1041 curthread->t_procp->p_pid); 1042 #endif /* DEBUG */ 1043 } 1044 /*FALLTHROUGH*/ 1045 1046 case T_BOUNDFLT + USER: /* bound fault */ 1047 case T_STKFLT + USER: /* stack fault */ 1048 case T_TSSFLT + USER: /* invalid TSS fault */ 1049 if (tudebug) 1050 showregs(type, rp, (caddr_t)0); 1051 siginfo.si_signo = SIGSEGV; 1052 siginfo.si_code = SEGV_MAPERR; 1053 siginfo.si_addr = (caddr_t)rp->r_pc; 1054 fault = FLTBOUNDS; 1055 break; 1056 1057 case T_ALIGNMENT + USER: /* user alignment error (486) */ 1058 if (tudebug) 1059 showregs(type, rp, (caddr_t)0); 1060 bzero(&siginfo, sizeof (siginfo)); 1061 siginfo.si_signo = SIGBUS; 1062 siginfo.si_code = BUS_ADRALN; 1063 siginfo.si_addr = (caddr_t)rp->r_pc; 1064 fault = FLTACCESS; 1065 break; 1066 1067 case T_SGLSTP + USER: /* single step/hw breakpoint exception */ 1068 if (tudebug && tudebugbpt) 1069 showregs(type, rp, (caddr_t)0); 1070 1071 /* Was it single-stepping? */ 1072 if (lwp->lwp_pcb.pcb_drstat & DR_SINGLESTEP) { 1073 pcb_t *pcb = &lwp->lwp_pcb; 1074 1075 rp->r_ps &= ~PS_T; 1076 /* 1077 * If both NORMAL_STEP and WATCH_STEP are in effect, 1078 * give precedence to NORMAL_STEP. If neither is set, 1079 * user must have set the PS_T bit in %efl; treat this 1080 * as NORMAL_STEP. 1081 */ 1082 if ((pcb->pcb_flags & NORMAL_STEP) || 1083 !(pcb->pcb_flags & WATCH_STEP)) { 1084 siginfo.si_signo = SIGTRAP; 1085 siginfo.si_code = TRAP_TRACE; 1086 siginfo.si_addr = (caddr_t)rp->r_pc; 1087 fault = FLTTRACE; 1088 if (pcb->pcb_flags & WATCH_STEP) 1089 (void) undo_watch_step(NULL); 1090 } else { 1091 fault = undo_watch_step(&siginfo); 1092 } 1093 pcb->pcb_flags &= ~(NORMAL_STEP|WATCH_STEP); 1094 } else { 1095 cmn_err(CE_WARN, 1096 "Unexpected INT 1 in user mode, dr6=%lx", 1097 lwp->lwp_pcb.pcb_drstat); 1098 } 1099 break; 1100 1101 case T_BPTFLT + USER: /* breakpoint trap */ 1102 if (tudebug && tudebugbpt) 1103 showregs(type, rp, (caddr_t)0); 1104 /* 1105 * int 3 (the breakpoint instruction) leaves the pc referring 1106 * to the address one byte after the breakpointed address. 1107 * If the P_PR_BPTADJ flag has been set via /proc, We adjust 1108 * it back so it refers to the breakpointed address. 1109 */ 1110 if (p->p_proc_flag & P_PR_BPTADJ) 1111 rp->r_pc--; 1112 siginfo.si_signo = SIGTRAP; 1113 siginfo.si_code = TRAP_BRKPT; 1114 siginfo.si_addr = (caddr_t)rp->r_pc; 1115 fault = FLTBPT; 1116 break; 1117 1118 case T_AST: 1119 /* 1120 * This occurs only after the cs register has been made to 1121 * look like a kernel selector, either through debugging or 1122 * possibly by functions like setcontext(). The thread is 1123 * about to cause a general protection fault at common_iret() 1124 * in locore. We let that happen immediately instead of 1125 * doing the T_AST processing. 1126 */ 1127 goto cleanup; 1128 1129 case T_AST + USER: /* profiling or resched pseudo trap */ 1130 if (lwp->lwp_pcb.pcb_flags & CPC_OVERFLOW) { 1131 lwp->lwp_pcb.pcb_flags &= ~CPC_OVERFLOW; 1132 if (kcpc_overflow_ast()) { 1133 /* 1134 * Signal performance counter overflow 1135 */ 1136 if (tudebug) 1137 showregs(type, rp, (caddr_t)0); 1138 bzero(&siginfo, sizeof (siginfo)); 1139 siginfo.si_signo = SIGEMT; 1140 siginfo.si_code = EMT_CPCOVF; 1141 siginfo.si_addr = (caddr_t)rp->r_pc; 1142 fault = FLTCPCOVF; 1143 } 1144 } 1145 break; 1146 } 1147 1148 /* 1149 * We can't get here from a system trap 1150 */ 1151 ASSERT(type & USER); 1152 1153 if (fault) { 1154 /* 1155 * Remember the fault and fault adddress 1156 * for real-time (SIGPROF) profiling. 1157 */ 1158 lwp->lwp_lastfault = fault; 1159 lwp->lwp_lastfaddr = siginfo.si_addr; 1160 1161 DTRACE_PROC2(fault, int, fault, ksiginfo_t *, &siginfo); 1162 1163 /* 1164 * If a debugger has declared this fault to be an 1165 * event of interest, stop the lwp. Otherwise just 1166 * deliver the associated signal. 1167 */ 1168 if (siginfo.si_signo != SIGKILL && 1169 prismember(&p->p_fltmask, fault) && 1170 stop_on_fault(fault, &siginfo) == 0) 1171 siginfo.si_signo = 0; 1172 } 1173 1174 if (siginfo.si_signo) 1175 trapsig(&siginfo, (fault == FLTCPCOVF)? 0 : 1); 1176 1177 if (lwp->lwp_oweupc) 1178 profil_tick(rp->r_pc); 1179 1180 if (cur_thread->t_astflag | cur_thread->t_sig_check) { 1181 /* 1182 * Turn off the AST flag before checking all the conditions that 1183 * may have caused an AST. This flag is on whenever a signal or 1184 * unusual condition should be handled after the next trap or 1185 * syscall. 1186 */ 1187 astoff(cur_thread); 1188 cur_thread->t_sig_check = 0; 1189 1190 mutex_enter(&p->p_lock); 1191 if (curthread->t_proc_flag & TP_CHANGEBIND) { 1192 timer_lwpbind(); 1193 curthread->t_proc_flag &= ~TP_CHANGEBIND; 1194 } 1195 mutex_exit(&p->p_lock); 1196 1197 /* 1198 * for kaio requests that are on the per-process poll queue, 1199 * aiop->aio_pollq, they're AIO_POLL bit is set, the kernel 1200 * should copyout their result_t to user memory. by copying 1201 * out the result_t, the user can poll on memory waiting 1202 * for the kaio request to complete. 1203 */ 1204 if (p->p_aio) 1205 aio_cleanup(0); 1206 /* 1207 * If this LWP was asked to hold, call holdlwp(), which will 1208 * stop. holdlwps() sets this up and calls pokelwps() which 1209 * sets the AST flag. 1210 * 1211 * Also check TP_EXITLWP, since this is used by fresh new LWPs 1212 * through lwp_rtt(). That flag is set if the lwp_create(2) 1213 * syscall failed after creating the LWP. 1214 */ 1215 if (ISHOLD(p)) 1216 holdlwp(); 1217 1218 /* 1219 * All code that sets signals and makes ISSIG evaluate true must 1220 * set t_astflag afterwards. 1221 */ 1222 if (ISSIG_PENDING(cur_thread, lwp, p)) { 1223 if (issig(FORREAL)) 1224 psig(); 1225 cur_thread->t_sig_check = 1; 1226 } 1227 1228 if (cur_thread->t_rprof != NULL) { 1229 realsigprof(0, 0); 1230 cur_thread->t_sig_check = 1; 1231 } 1232 } 1233 1234 out: /* We can't get here from a system trap */ 1235 ASSERT(type & USER); 1236 1237 if (ISHOLD(p)) 1238 holdlwp(); 1239 1240 /* 1241 * Set state to LWP_USER here so preempt won't give us a kernel 1242 * priority if it occurs after this point. Call CL_TRAPRET() to 1243 * restore the user-level priority. 1244 * 1245 * It is important that no locks (other than spinlocks) be entered 1246 * after this point before returning to user mode (unless lwp_state 1247 * is set back to LWP_SYS). 1248 */ 1249 lwp->lwp_state = LWP_USER; 1250 1251 if (cur_thread->t_trapret) { 1252 cur_thread->t_trapret = 0; 1253 thread_lock(cur_thread); 1254 CL_TRAPRET(cur_thread); 1255 thread_unlock(cur_thread); 1256 } 1257 if (CPU->cpu_runrun) 1258 preempt(); 1259 (void) new_mstate(cur_thread, mstate); 1260 1261 /* Kernel probe */ 1262 TNF_PROBE_1(thread_state, "thread", /* CSTYLED */, 1263 tnf_microstate, state, LMS_USER); 1264 1265 return; 1266 1267 cleanup: /* system traps end up here */ 1268 ASSERT(!(type & USER)); 1269 } 1270 1271 /* 1272 * Patch non-zero to disable preemption of threads in the kernel. 1273 */ 1274 int IGNORE_KERNEL_PREEMPTION = 0; /* XXX - delete this someday */ 1275 1276 struct kpreempt_cnts { /* kernel preemption statistics */ 1277 int kpc_idle; /* executing idle thread */ 1278 int kpc_intr; /* executing interrupt thread */ 1279 int kpc_clock; /* executing clock thread */ 1280 int kpc_blocked; /* thread has blocked preemption (t_preempt) */ 1281 int kpc_notonproc; /* thread is surrendering processor */ 1282 int kpc_inswtch; /* thread has ratified scheduling decision */ 1283 int kpc_prilevel; /* processor interrupt level is too high */ 1284 int kpc_apreempt; /* asynchronous preemption */ 1285 int kpc_spreempt; /* synchronous preemption */ 1286 } kpreempt_cnts; 1287 1288 /* 1289 * kernel preemption: forced rescheduling, preempt the running kernel thread. 1290 * the argument is old PIL for an interrupt, 1291 * or the distingished value KPREEMPT_SYNC. 1292 */ 1293 void 1294 kpreempt(int asyncspl) 1295 { 1296 kthread_t *cur_thread = curthread; 1297 1298 if (IGNORE_KERNEL_PREEMPTION) { 1299 aston(CPU->cpu_dispthread); 1300 return; 1301 } 1302 1303 /* 1304 * Check that conditions are right for kernel preemption 1305 */ 1306 do { 1307 if (cur_thread->t_preempt) { 1308 /* 1309 * either a privileged thread (idle, panic, interrupt) 1310 * or will check when t_preempt is lowered 1311 */ 1312 if (cur_thread->t_pri < 0) 1313 kpreempt_cnts.kpc_idle++; 1314 else if (cur_thread->t_flag & T_INTR_THREAD) { 1315 kpreempt_cnts.kpc_intr++; 1316 if (cur_thread->t_pil == CLOCK_LEVEL) 1317 kpreempt_cnts.kpc_clock++; 1318 } else 1319 kpreempt_cnts.kpc_blocked++; 1320 aston(CPU->cpu_dispthread); 1321 return; 1322 } 1323 if (cur_thread->t_state != TS_ONPROC || 1324 cur_thread->t_disp_queue != CPU->cpu_disp) { 1325 /* this thread will be calling swtch() shortly */ 1326 kpreempt_cnts.kpc_notonproc++; 1327 if (CPU->cpu_thread != CPU->cpu_dispthread) { 1328 /* already in swtch(), force another */ 1329 kpreempt_cnts.kpc_inswtch++; 1330 siron(); 1331 } 1332 return; 1333 } 1334 if (getpil() >= DISP_LEVEL) { 1335 /* 1336 * We can't preempt this thread if it is at 1337 * a PIL >= DISP_LEVEL since it may be holding 1338 * a spin lock (like sched_lock). 1339 */ 1340 siron(); /* check back later */ 1341 kpreempt_cnts.kpc_prilevel++; 1342 return; 1343 } 1344 1345 if (asyncspl != KPREEMPT_SYNC) 1346 kpreempt_cnts.kpc_apreempt++; 1347 else 1348 kpreempt_cnts.kpc_spreempt++; 1349 1350 cur_thread->t_preempt++; 1351 preempt(); 1352 cur_thread->t_preempt--; 1353 } while (CPU->cpu_kprunrun); 1354 } 1355 1356 /* 1357 * Print out debugging info. 1358 */ 1359 static void 1360 showregs(uint_t type, struct regs *rp, caddr_t addr) 1361 { 1362 int s; 1363 1364 s = spl7(); 1365 type &= ~USER; 1366 if (u.u_comm[0]) 1367 printf("%s: ", u.u_comm); 1368 if (type < TRAP_TYPES) 1369 printf("#%s %s\n", trap_type_mnemonic[type], trap_type[type]); 1370 else 1371 switch (type) { 1372 case T_SYSCALL: 1373 printf("Syscall Trap:\n"); 1374 break; 1375 case T_AST: 1376 printf("AST\n"); 1377 break; 1378 default: 1379 printf("Bad Trap = %d\n", type); 1380 break; 1381 } 1382 if (type == T_PGFLT) { 1383 printf("Bad %s fault at addr=0x%lx\n", 1384 USERMODE(rp->r_cs) ? "user": "kernel", (uintptr_t)addr); 1385 } else if (addr) { 1386 printf("addr=0x%lx\n", (uintptr_t)addr); 1387 } 1388 1389 printf("pid=%d, pc=0x%lx, sp=0x%lx, eflags=0x%lx\n", 1390 (ttoproc(curthread) && ttoproc(curthread)->p_pidp) ? 1391 ttoproc(curthread)->p_pid : 0, rp->r_pc, rp->r_sp, rp->r_ps); 1392 1393 #if defined(__lint) 1394 /* 1395 * this clause can be deleted when lint bug 4870403 is fixed 1396 * (lint thinks that bit 32 is illegal in a %b format string) 1397 */ 1398 printf("cr0: %x cr4: %b\n", 1399 (uint_t)getcr0(), (uint_t)getcr4(), FMT_CR4); 1400 #else 1401 printf("cr0: %b cr4: %b\n", 1402 (uint_t)getcr0(), FMT_CR0, (uint_t)getcr4(), FMT_CR4); 1403 #endif 1404 1405 #if defined(__amd64) 1406 printf("cr2: %lx cr3: %lx cr8: %lx\n", getcr2(), getcr3(), getcr8()); 1407 #elif defined(__i386) 1408 printf("cr2: %lx cr3: %lx\n", getcr2(), getcr3()); 1409 #endif 1410 1411 dumpregs(rp); 1412 splx(s); 1413 } 1414 1415 static void 1416 dumpregs(struct regs *rp) 1417 { 1418 #if defined(__amd64) 1419 const char fmt[] = "\t%3s: %16lx %3s: %16lx %3s: %16lx\n"; 1420 1421 printf(fmt, "rdi", rp->r_rdi, "rsi", rp->r_rsi, "rdx", rp->r_rdx); 1422 printf(fmt, "rcx", rp->r_rcx, " r8", rp->r_r8, " r9", rp->r_r9); 1423 printf(fmt, "rax", rp->r_rax, "rbx", rp->r_rbx, "rbp", rp->r_rbp); 1424 printf(fmt, "r10", rp->r_r10, "r11", rp->r_r11, "r12", rp->r_r12); 1425 printf(fmt, "r13", rp->r_r13, "r14", rp->r_r14, "r15", rp->r_r15); 1426 1427 printf(fmt, "fsb", rp->r_fsbase, "gsb", rp->r_gsbase, " ds", rp->r_ds); 1428 printf(fmt, " es", rp->r_es, " fs", rp->r_fs, " gs", rp->r_gs); 1429 1430 printf(fmt, "trp", rp->r_trapno, "err", rp->r_err, "rip", rp->r_rip); 1431 printf(fmt, " cs", rp->r_cs, "rfl", rp->r_rfl, "rsp", rp->r_rsp); 1432 1433 printf("\t%3s: %16lx\n", " ss", rp->r_ss); 1434 1435 #elif defined(__i386) 1436 const char fmt[] = "\t%3s: %8lx %3s: %8lx %3s: %8lx %3s: %8lx\n"; 1437 1438 printf(fmt, " gs", rp->r_gs, " fs", rp->r_fs, 1439 " es", rp->r_es, " ds", rp->r_ds); 1440 printf(fmt, "edi", rp->r_edi, "esi", rp->r_esi, 1441 "ebp", rp->r_ebp, "esp", rp->r_esp); 1442 printf(fmt, "ebx", rp->r_ebx, "edx", rp->r_edx, 1443 "ecx", rp->r_ecx, "eax", rp->r_eax); 1444 printf(fmt, "trp", rp->r_trapno, "err", rp->r_err, 1445 "eip", rp->r_eip, " cs", rp->r_cs); 1446 printf("\t%3s: %8lx %3s: %8lx %3s: %8lx\n", 1447 "efl", rp->r_efl, "usp", rp->r_uesp, " ss", rp->r_ss); 1448 1449 #endif /* __i386 */ 1450 } 1451 1452 /* 1453 * Handle #gp faults in kernel mode. 1454 * 1455 * One legitimate way this can happen is if we attempt to update segment 1456 * registers to naughty values on the way out of the kernel. 1457 * 1458 * This can happen in a couple of ways: someone - either accidentally or 1459 * on purpose - creates (setcontext(2), lwp_create(2)) or modifies 1460 * (signal(2)) a ucontext that contains silly segment register values. 1461 * Or someone - either accidentally or on purpose - modifies the prgregset_t 1462 * of a subject process via /proc to contain silly segment register values. 1463 * 1464 * (The unfortunate part is that we can end up discovering the bad segment 1465 * register value in the middle of an 'iret' after we've popped most of the 1466 * stack. So it becomes quite difficult to associate an accurate ucontext 1467 * with the lwp, because the act of taking the #gp trap overwrites most of 1468 * what we were going to send the lwp.) 1469 * 1470 * OTOH if it turns out that's -not- the problem, and we're -not- an lwp 1471 * trying to return to user mode and we get a #gp fault, then we need 1472 * to die() -- which will happen if we return non-zero from this routine. 1473 */ 1474 static int 1475 kern_gpfault(struct regs *rp) 1476 { 1477 kthread_t *t = curthread; 1478 proc_t *p = ttoproc(t); 1479 klwp_t *lwp = ttolwp(t); 1480 struct regs tmpregs, *trp = NULL; 1481 caddr_t pc = (caddr_t)rp->r_pc; 1482 int v; 1483 1484 extern void _sys_rtt(), sr_sup(); 1485 1486 #if defined(__amd64) 1487 extern void _update_sregs(), _update_sregs_done(); 1488 static const uint8_t iretq_insn[2] = { 0x48, 0xcf }; 1489 1490 #elif defined(__i386) 1491 static const uint8_t iret_insn[1] = { 0xcf }; 1492 1493 /* 1494 * Note carefully the appallingly awful dependency between 1495 * the instruction sequence used in __SEGREGS_POP and these 1496 * instructions encoded here. 1497 * 1498 * XX64 Add some commentary to locore.s/privregs.h to document this. 1499 */ 1500 static const uint8_t movw_0_esp_gs[4] = { 0x8e, 0x6c, 0x24, 0x0 }; 1501 static const uint8_t movw_4_esp_fs[4] = { 0x8e, 0x64, 0x24, 0x4 }; 1502 static const uint8_t movw_8_esp_es[4] = { 0x8e, 0x44, 0x24, 0x8 }; 1503 static const uint8_t movw_c_esp_ds[4] = { 0x8e, 0x5c, 0x24, 0xc }; 1504 #endif 1505 /* 1506 * if we're not an lwp, or the pc range is outside _sys_rtt, then 1507 * we should immediately be die()ing horribly 1508 */ 1509 if (lwp == NULL || 1510 (uintptr_t)pc < (uintptr_t)_sys_rtt || 1511 (uintptr_t)pc > (uintptr_t)sr_sup) 1512 return (1); 1513 1514 /* 1515 * So at least we're in the right part of the kernel. 1516 * 1517 * Disassemble the instruction at the faulting pc. 1518 * Once we know what it is, we carefully reconstruct the stack 1519 * based on the order in which the stack is deconstructed in 1520 * _sys_rtt. Ew. 1521 */ 1522 1523 #if defined(__amd64) 1524 1525 if (bcmp(pc, iretq_insn, sizeof (iretq_insn)) == 0) { 1526 /* 1527 * We took the #gp while trying to perform the iretq. 1528 * This means that either %cs or %ss are bad. 1529 * All we know for sure is that most of the general 1530 * registers have been restored, including the 1531 * segment registers, and all we have left on the 1532 * topmost part of the lwp's stack are the 1533 * registers that the iretq was unable to consume. 1534 * 1535 * All the rest of the state was crushed by the #gp 1536 * which pushed -its- registers atop our old save area 1537 * (because we had to decrement the stack pointer, sigh) so 1538 * all that we can try and do is to reconstruct the 1539 * crushed frame from the #gp trap frame itself. 1540 */ 1541 trp = &tmpregs; 1542 trp->r_ss = lwptoregs(lwp)->r_ss; 1543 trp->r_sp = lwptoregs(lwp)->r_sp; 1544 trp->r_ps = lwptoregs(lwp)->r_ps; 1545 trp->r_cs = lwptoregs(lwp)->r_cs; 1546 trp->r_pc = lwptoregs(lwp)->r_pc; 1547 bcopy(rp, trp, offsetof(struct regs, r_pc)); 1548 1549 /* 1550 * Validate simple math 1551 */ 1552 ASSERT(trp->r_pc == lwptoregs(lwp)->r_pc); 1553 ASSERT(trp->r_err == rp->r_err); 1554 1555 } else if ((lwp->lwp_pcb.pcb_flags & RUPDATE_PENDING) != 0 && 1556 pc >= (caddr_t)_update_sregs && 1557 pc < (caddr_t)_update_sregs_done) { 1558 /* 1559 * This is the common case -- we're trying to load 1560 * a bad segment register value in the only section 1561 * of kernel code that ever loads segment registers. 1562 * 1563 * We don't need to do anything at this point because 1564 * the pcb contains all the pending segment register 1565 * state, and the regs are still intact because we 1566 * didn't adjust the stack pointer yet. Given the fidelity 1567 * of all this, we could conceivably send a signal 1568 * to the lwp, rather than core-ing. 1569 */ 1570 trp = lwptoregs(lwp); 1571 ASSERT((caddr_t)trp == (caddr_t)rp->r_sp); 1572 } 1573 1574 #elif defined(__i386) 1575 1576 if (bcmp(pc, iret_insn, sizeof (iret_insn)) == 0) { 1577 /* 1578 * We took the #gp while trying to perform the iret. 1579 * This means that either %cs or %ss are bad. 1580 * All we know for sure is that most of the general 1581 * registers have been restored, including the 1582 * segment registers, and all we have left on the 1583 * topmost part of the lwp's stack are the registers that 1584 * the iret was unable to consume. 1585 * 1586 * All the rest of the state was crushed by the #gp 1587 * which pushed -its- registers atop our old save area 1588 * (because we had to decrement the stack pointer, sigh) so 1589 * all that we can try and do is to reconstruct the 1590 * crushed frame from the #gp trap frame itself. 1591 */ 1592 trp = &tmpregs; 1593 trp->r_ss = lwptoregs(lwp)->r_ss; 1594 trp->r_sp = lwptoregs(lwp)->r_sp; 1595 trp->r_ps = lwptoregs(lwp)->r_ps; 1596 trp->r_cs = lwptoregs(lwp)->r_cs; 1597 trp->r_pc = lwptoregs(lwp)->r_pc; 1598 bcopy(rp, trp, offsetof(struct regs, r_pc)); 1599 1600 ASSERT(trp->r_pc == lwptoregs(lwp)->r_pc); 1601 ASSERT(trp->r_err == rp->r_err); 1602 1603 } else { 1604 /* 1605 * Segment registers are reloaded in _sys_rtt 1606 * via the following sequence: 1607 * 1608 * movw 0(%esp), %gs 1609 * movw 4(%esp), %fs 1610 * movw 8(%esp), %es 1611 * movw 12(%esp), %ds 1612 * addl $16, %esp 1613 * 1614 * Thus if any of them fault, we know the user 1615 * registers are left unharmed on the stack. 1616 */ 1617 if (bcmp(pc, movw_0_esp_gs, sizeof (movw_0_esp_gs)) == 0 || 1618 bcmp(pc, movw_4_esp_fs, sizeof (movw_4_esp_fs)) == 0 || 1619 bcmp(pc, movw_8_esp_es, sizeof (movw_8_esp_es)) == 0 || 1620 bcmp(pc, movw_c_esp_ds, sizeof (movw_c_esp_ds)) == 0) 1621 trp = lwptoregs(lwp); 1622 } 1623 #endif /* __amd64 */ 1624 1625 if (trp == NULL) 1626 return (1); 1627 1628 /* 1629 * If we get to here, we're reasonably confident that we've 1630 * correctly decoded what happened on the way out of the kernel. 1631 * Rewrite the lwp's registers so that we can create a core dump 1632 * the (at least vaguely) represents the mcontext we were 1633 * being asked to restore when things went so terribly wrong. 1634 */ 1635 1636 /* 1637 * Make sure that we have a meaningful %trapno and %err. 1638 */ 1639 trp->r_trapno = rp->r_trapno; 1640 trp->r_err = rp->r_err; 1641 1642 if ((caddr_t)trp != (caddr_t)lwptoregs(lwp)) 1643 bcopy(trp, lwptoregs(lwp), sizeof (*trp)); 1644 1645 mutex_enter(&p->p_lock); 1646 lwp->lwp_cursig = SIGSEGV; 1647 mutex_exit(&p->p_lock); 1648 1649 /* 1650 * Terminate all LWPs but don't discard them. If another lwp beat us to 1651 * the punch by calling exit(), evaporate now. 1652 */ 1653 if (exitlwps(1) != 0) { 1654 mutex_enter(&p->p_lock); 1655 lwp_exit(); 1656 } 1657 1658 #ifdef C2_AUDIT 1659 if (audit_active) /* audit core dump */ 1660 audit_core_start(SIGSEGV); 1661 #endif 1662 v = core(SIGSEGV, B_FALSE); 1663 #ifdef C2_AUDIT 1664 if (audit_active) /* audit core dump */ 1665 audit_core_finish(v ? CLD_KILLED : CLD_DUMPED); 1666 #endif 1667 exit(v ? CLD_KILLED : CLD_DUMPED, SIGSEGV); 1668 return (0); 1669 } 1670 1671 /* 1672 * dump_tss() - Display the TSS structure 1673 */ 1674 1675 #if defined(__amd64) 1676 1677 static void 1678 dump_tss(void) 1679 { 1680 const char tss_fmt[] = "tss.%s:\t0x%p\n"; /* Format string */ 1681 struct tss *tss = CPU->cpu_tss; 1682 1683 printf(tss_fmt, "tss_rsp0", (void *)tss->tss_rsp0); 1684 printf(tss_fmt, "tss_rsp1", (void *)tss->tss_rsp1); 1685 printf(tss_fmt, "tss_rsp2", (void *)tss->tss_rsp2); 1686 1687 printf(tss_fmt, "tss_ist1", (void *)tss->tss_ist1); 1688 printf(tss_fmt, "tss_ist2", (void *)tss->tss_ist2); 1689 printf(tss_fmt, "tss_ist3", (void *)tss->tss_ist3); 1690 printf(tss_fmt, "tss_ist4", (void *)tss->tss_ist4); 1691 printf(tss_fmt, "tss_ist5", (void *)tss->tss_ist5); 1692 printf(tss_fmt, "tss_ist6", (void *)tss->tss_ist6); 1693 printf(tss_fmt, "tss_ist7", (void *)tss->tss_ist7); 1694 } 1695 1696 #elif defined(__i386) 1697 1698 static void 1699 dump_tss(void) 1700 { 1701 const char tss_fmt[] = "tss.%s:\t0x%p\n"; /* Format string */ 1702 struct tss *tss = CPU->cpu_tss; 1703 1704 printf(tss_fmt, "tss_link", (void *)tss->tss_link); 1705 printf(tss_fmt, "tss_esp0", (void *)tss->tss_esp0); 1706 printf(tss_fmt, "tss_ss0", (void *)tss->tss_ss0); 1707 printf(tss_fmt, "tss_esp1", (void *)tss->tss_esp1); 1708 printf(tss_fmt, "tss_ss1", (void *)tss->tss_ss1); 1709 printf(tss_fmt, "tss_esp2", (void *)tss->tss_esp2); 1710 printf(tss_fmt, "tss_ss2", (void *)tss->tss_ss2); 1711 printf(tss_fmt, "tss_cr3", (void *)tss->tss_cr3); 1712 printf(tss_fmt, "tss_eip", (void *)tss->tss_eip); 1713 printf(tss_fmt, "tss_eflags", (void *)tss->tss_eflags); 1714 printf(tss_fmt, "tss_eax", (void *)tss->tss_eax); 1715 printf(tss_fmt, "tss_ebx", (void *)tss->tss_ebx); 1716 printf(tss_fmt, "tss_ecx", (void *)tss->tss_ecx); 1717 printf(tss_fmt, "tss_edx", (void *)tss->tss_edx); 1718 printf(tss_fmt, "tss_esp", (void *)tss->tss_esp); 1719 } 1720 1721 #endif /* __amd64 */ 1722 1723 #if defined(TRAPTRACE) 1724 1725 int ttrace_nrec = 0; /* number of records to dump out */ 1726 int ttrace_dump_nregs = 5; /* dump out this many records with regs too */ 1727 1728 /* 1729 * Dump out the last ttrace_nrec traptrace records on each CPU 1730 */ 1731 static void 1732 dump_ttrace(void) 1733 { 1734 trap_trace_ctl_t *ttc; 1735 trap_trace_rec_t *rec; 1736 uintptr_t current; 1737 int i, j, k; 1738 int n = NCPU; 1739 #if defined(__amd64) 1740 const char banner[] = 1741 "\ncpu address timestamp " 1742 "type vc handler pc\n"; 1743 const char fmt1[] = "%3d %016lx %12llx "; 1744 #elif defined(__i386) 1745 const char banner[] = 1746 "\ncpu address timestamp type vc handler pc\n"; 1747 const char fmt1[] = "%3d %08lx %12llx "; 1748 #endif 1749 const char fmt2[] = "%4s %3x "; 1750 const char fmt3[] = "%8s "; 1751 1752 if (ttrace_nrec == 0) 1753 return; 1754 1755 printf(banner); 1756 1757 for (i = 0; i < n; i++) { 1758 ttc = &trap_trace_ctl[i]; 1759 if (ttc->ttc_first == NULL) 1760 continue; 1761 1762 current = ttc->ttc_next - sizeof (trap_trace_rec_t); 1763 for (j = 0; j < ttrace_nrec; j++) { 1764 struct sysent *sys; 1765 struct autovec *vec; 1766 extern struct av_head autovect[]; 1767 int type; 1768 ulong_t off; 1769 char *sym, *stype; 1770 1771 if (current < ttc->ttc_first) 1772 current = 1773 ttc->ttc_limit - sizeof (trap_trace_rec_t); 1774 1775 if (current == NULL) 1776 continue; 1777 1778 rec = (trap_trace_rec_t *)current; 1779 1780 if (rec->ttr_stamp == 0) 1781 break; 1782 1783 printf(fmt1, i, (uintptr_t)rec, rec->ttr_stamp); 1784 1785 switch (rec->ttr_marker) { 1786 case TT_SYSCALL: 1787 case TT_SYSENTER: 1788 case TT_SYSC: 1789 case TT_SYSC64: 1790 #if defined(__amd64) 1791 sys = &sysent32[rec->ttr_sysnum]; 1792 switch (rec->ttr_marker) { 1793 case TT_SYSC64: 1794 sys = &sysent[rec->ttr_sysnum]; 1795 /*FALLTHROUGH*/ 1796 #elif defined(__i386) 1797 sys = &sysent[rec->ttr_sysnum]; 1798 switch (rec->ttr_marker) { 1799 case TT_SYSC64: 1800 #endif 1801 case TT_SYSC: 1802 stype = "sysc"; /* syscall */ 1803 break; 1804 case TT_SYSCALL: 1805 stype = "lcal"; /* lcall */ 1806 break; 1807 case TT_SYSENTER: 1808 stype = "syse"; /* sysenter */ 1809 break; 1810 default: 1811 break; 1812 } 1813 printf(fmt2, "sysc", rec->ttr_sysnum); 1814 if (sys != NULL) { 1815 sym = kobj_getsymname( 1816 (uintptr_t)sys->sy_callc, 1817 &off); 1818 if (sym != NULL) 1819 printf("%s ", sym); 1820 else 1821 printf("%p ", sys->sy_callc); 1822 } else { 1823 printf("unknown "); 1824 } 1825 break; 1826 1827 case TT_INTERRUPT: 1828 printf(fmt2, "intr", rec->ttr_vector); 1829 vec = (&autovect[rec->ttr_vector])->avh_link; 1830 if (vec != NULL) { 1831 sym = kobj_getsymname( 1832 (uintptr_t)vec->av_vector, &off); 1833 if (sym != NULL) 1834 printf("%s ", sym); 1835 else 1836 printf("%p ", vec->av_vector); 1837 } else { 1838 printf("unknown "); 1839 } 1840 break; 1841 1842 case TT_TRAP: 1843 type = rec->ttr_regs.r_trapno; 1844 printf(fmt2, "trap", type); 1845 printf("#%s ", type < TRAP_TYPES ? 1846 trap_type_mnemonic[type] : "trap"); 1847 break; 1848 1849 default: 1850 break; 1851 } 1852 1853 sym = kobj_getsymname(rec->ttr_regs.r_pc, &off); 1854 if (sym != NULL) 1855 printf("%s+%lx\n", sym, off); 1856 else 1857 printf("%lx\n", rec->ttr_regs.r_pc); 1858 1859 if (ttrace_dump_nregs-- > 0) { 1860 int s; 1861 1862 if (rec->ttr_marker == TT_INTERRUPT) 1863 printf( 1864 "\t\tipl %x spl %x pri %x\n", 1865 rec->ttr_ipl, 1866 rec->ttr_spl, 1867 rec->ttr_pri); 1868 1869 dumpregs(&rec->ttr_regs); 1870 1871 printf("\t%3s: %p\n\n", " ct", 1872 (void *)rec->ttr_curthread); 1873 1874 /* 1875 * print out the pc stack that we recorded 1876 * at trap time (if any) 1877 */ 1878 for (s = 0; s < rec->ttr_sdepth; s++) { 1879 uintptr_t fullpc; 1880 1881 if (s >= TTR_STACK_DEPTH) { 1882 printf("ttr_sdepth corrupt\n"); 1883 break; 1884 } 1885 1886 fullpc = (uintptr_t)rec->ttr_stack[s]; 1887 1888 sym = kobj_getsymname(fullpc, &off); 1889 if (sym != NULL) 1890 printf("-> %s+0x%lx()\n", 1891 sym, off); 1892 else 1893 printf("-> 0x%lx()\n", fullpc); 1894 } 1895 printf("\n"); 1896 } 1897 current -= sizeof (trap_trace_rec_t); 1898 } 1899 } 1900 } 1901 1902 #endif /* TRAPTRACE */ 1903 1904 void 1905 panic_showtrap(struct trap_info *tip) 1906 { 1907 showregs(tip->trap_type, tip->trap_regs, tip->trap_addr); 1908 1909 #if defined(TRAPTRACE) 1910 dump_ttrace(); 1911 #endif /* TRAPTRACE */ 1912 1913 if (tip->trap_type == T_DBLFLT) 1914 dump_tss(); 1915 } 1916 1917 void 1918 panic_savetrap(panic_data_t *pdp, struct trap_info *tip) 1919 { 1920 panic_saveregs(pdp, tip->trap_regs); 1921 } 1922