1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23 /* 24 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 25 * Use is subject to license terms. 26 */ 27 28 /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ 29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ 30 /* All Rights Reserved */ 31 /* */ 32 /* Copyright (c) 1987, 1988 Microsoft Corporation */ 33 /* All Rights Reserved */ 34 /* */ 35 36 #pragma ident "%Z%%M% %I% %E% SMI" 37 38 #include <sys/types.h> 39 #include <sys/sysmacros.h> 40 #include <sys/param.h> 41 #include <sys/signal.h> 42 #include <sys/systm.h> 43 #include <sys/user.h> 44 #include <sys/proc.h> 45 #include <sys/disp.h> 46 #include <sys/class.h> 47 #include <sys/core.h> 48 #include <sys/syscall.h> 49 #include <sys/cpuvar.h> 50 #include <sys/vm.h> 51 #include <sys/sysinfo.h> 52 #include <sys/fault.h> 53 #include <sys/stack.h> 54 #include <sys/mmu.h> 55 #include <sys/psw.h> 56 #include <sys/regset.h> 57 #include <sys/fp.h> 58 #include <sys/trap.h> 59 #include <sys/kmem.h> 60 #include <sys/vtrace.h> 61 #include <sys/cmn_err.h> 62 #include <sys/prsystm.h> 63 #include <sys/mutex_impl.h> 64 #include <sys/machsystm.h> 65 #include <sys/archsystm.h> 66 #include <sys/sdt.h> 67 #include <sys/avintr.h> 68 #include <sys/kobj.h> 69 70 #include <vm/hat.h> 71 72 #include <vm/seg_kmem.h> 73 #include <vm/as.h> 74 #include <vm/seg.h> 75 #include <vm/hat_pte.h> 76 77 #include <sys/procfs.h> 78 79 #include <sys/reboot.h> 80 #include <sys/debug.h> 81 #include <sys/debugreg.h> 82 #include <sys/modctl.h> 83 #include <sys/aio_impl.h> 84 #include <sys/tnf.h> 85 #include <sys/tnf_probe.h> 86 #include <sys/cred.h> 87 #include <sys/mman.h> 88 #include <sys/x86_archext.h> 89 #include <sys/copyops.h> 90 #include <c2/audit.h> 91 #include <sys/ftrace.h> 92 #include <sys/panic.h> 93 #include <sys/traptrace.h> 94 #include <sys/ontrap.h> 95 #include <sys/cpc_impl.h> 96 97 #define USER 0x10000 /* user-mode flag added to trap type */ 98 99 static const char *trap_type_mnemonic[] = { 100 "de", "db", "2", "bp", 101 "of", "br", "ud", "nm", 102 "df", "9", "ts", "np", 103 "ss", "gp", "pf", "15", 104 "mf", "ac", "mc", "xf" 105 }; 106 107 static const char *trap_type[] = { 108 "Divide error", /* trap id 0 */ 109 "Debug", /* trap id 1 */ 110 "NMI interrupt", /* trap id 2 */ 111 "Breakpoint", /* trap id 3 */ 112 "Overflow", /* trap id 4 */ 113 "BOUND range exceeded", /* trap id 5 */ 114 "Invalid opcode", /* trap id 6 */ 115 "Device not available", /* trap id 7 */ 116 "Double fault", /* trap id 8 */ 117 "Coprocessor segment overrun", /* trap id 9 */ 118 "Invalid TSS", /* trap id 10 */ 119 "Segment not present", /* trap id 11 */ 120 "Stack segment fault", /* trap id 12 */ 121 "General protection", /* trap id 13 */ 122 "Page fault", /* trap id 14 */ 123 "Reserved", /* trap id 15 */ 124 "x87 floating point error", /* trap id 16 */ 125 "Alignment check", /* trap id 17 */ 126 "Machine check", /* trap id 18 */ 127 "SIMD floating point exception", /* trap id 19 */ 128 }; 129 130 #define TRAP_TYPES (sizeof (trap_type) / sizeof (trap_type[0])) 131 132 int tudebug = 0; 133 int tudebugbpt = 0; 134 int tudebugfpe = 0; 135 int tudebugsse = 0; 136 137 #if defined(TRAPDEBUG) || defined(lint) 138 int tdebug = 0; 139 int lodebug = 0; 140 int faultdebug = 0; 141 #else 142 #define tdebug 0 143 #define lodebug 0 144 #define faultdebug 0 145 #endif /* defined(TRAPDEBUG) || defined(lint) */ 146 147 #if defined(TRAPTRACE) 148 static void dump_ttrace(void); 149 #endif /* TRAPTRACE */ 150 static void dumpregs(struct regs *); 151 static void showregs(uint_t, struct regs *, caddr_t); 152 static void dump_tss(void); 153 static int kern_gpfault(struct regs *); 154 155 struct trap_info { 156 struct regs *trap_regs; 157 uint_t trap_type; 158 caddr_t trap_addr; 159 }; 160 161 /*ARGSUSED*/ 162 static int 163 die(uint_t type, struct regs *rp, caddr_t addr, processorid_t cpuid) 164 { 165 struct trap_info ti; 166 const char *trap_name, *trap_mnemonic; 167 168 if (type < TRAP_TYPES) { 169 trap_name = trap_type[type]; 170 trap_mnemonic = trap_type_mnemonic[type]; 171 } else { 172 trap_name = "trap"; 173 trap_mnemonic = "-"; 174 } 175 176 #ifdef TRAPTRACE 177 TRAPTRACE_FREEZE; 178 #endif 179 180 ti.trap_regs = rp; 181 ti.trap_type = type & ~USER; 182 ti.trap_addr = addr; 183 184 curthread->t_panic_trap = &ti; 185 186 if (type == T_PGFLT && addr < (caddr_t)KERNELBASE) { 187 panic("BAD TRAP: type=%x (#%s %s) rp=%p addr=%p " 188 "occurred in module \"%s\" due to %s", 189 type, trap_mnemonic, trap_name, (void *)rp, (void *)addr, 190 mod_containing_pc((caddr_t)rp->r_pc), 191 addr < (caddr_t)PAGESIZE ? 192 "a NULL pointer dereference" : 193 "an illegal access to a user address"); 194 } else 195 panic("BAD TRAP: type=%x (#%s %s) rp=%p addr=%p", 196 type, trap_mnemonic, trap_name, (void *)rp, (void *)addr); 197 return (0); 198 } 199 200 /* 201 * Rewrite the instruction at pc to be an int $T_SYSCALLINT instruction. 202 * 203 * int <vector> is two bytes: 0xCD <vector> 204 */ 205 206 #define SLOW_SCALL_SIZE 2 207 208 static int 209 rewrite_syscall(caddr_t pc) 210 { 211 uchar_t instr[SLOW_SCALL_SIZE] = { 0xCD, T_SYSCALLINT }; 212 213 if (uwrite(curthread->t_procp, instr, SLOW_SCALL_SIZE, 214 (uintptr_t)pc) != 0) 215 return (1); 216 217 return (0); 218 } 219 220 /* 221 * Test to see if the instruction at pc is sysenter or syscall. The second 222 * argument should be the x86 feature flag corresponding to the expected 223 * instruction. 224 * 225 * sysenter is two bytes: 0x0F 0x34 226 * syscall is two bytes: 0x0F 0x05 227 */ 228 229 #define FAST_SCALL_SIZE 2 230 231 static int 232 instr_is_fast_syscall(caddr_t pc, int which) 233 { 234 uchar_t instr[FAST_SCALL_SIZE]; 235 236 ASSERT(which == X86_SEP || which == X86_ASYSC); 237 238 if (copyin_nowatch(pc, (caddr_t)instr, FAST_SCALL_SIZE) != 0 || 239 instr[0] != 0x0F) 240 return (0); 241 242 if ((which == X86_SEP && instr[1] == 0x34) || 243 (which == X86_ASYSC && instr[1] == 0x05)) 244 return (1); 245 246 return (0); 247 } 248 249 /* 250 * Test to see if the instruction at pc is a system call instruction. 251 * 252 * The bytes of an lcall instruction used for the syscall trap. 253 * static uchar_t lcall[7] = { 0x9a, 0, 0, 0, 0, 0x7, 0 }; 254 * static uchar_t lcallalt[7] = { 0x9a, 0, 0, 0, 0, 0x27, 0 }; 255 */ 256 257 #define LCALLSIZE 7 258 259 static int 260 instr_is_syscall(caddr_t pc) 261 { 262 uchar_t instr[LCALLSIZE]; 263 264 if (copyin_nowatch(pc, (caddr_t)instr, LCALLSIZE) == 0 && 265 instr[0] == 0x9a && 266 instr[1] == 0 && 267 instr[2] == 0 && 268 instr[3] == 0 && 269 instr[4] == 0 && 270 (instr[5] == 0x7 || instr[5] == 0x27) && 271 instr[6] == 0) 272 return (1); 273 274 return (0); 275 } 276 277 #ifdef OPTERON_ERRATUM_91 278 279 /* 280 * Test to see if the instruction at pc is a prefetch instruction. 281 * 282 * The first byte of prefetch instructions is always 0x0F. 283 * The second byte is 0x18 for regular prefetch or 0x0D for AMD 3dnow prefetch. 284 * The third byte is between 0 and 3 inclusive. 285 */ 286 287 #define PREFETCHSIZE 3 288 289 static int 290 cmp_to_prefetch(uchar_t *p) 291 { 292 if (*p == 0x0F && (*(p+1) == 0x18 || *(p+1) == 0x0D) && *(p+2) <= 3) 293 return (1); 294 return (0); 295 } 296 297 static int 298 instr_is_prefetch(caddr_t pc) 299 { 300 uchar_t instr[PREFETCHSIZE]; 301 int error; 302 303 error = copyin_nowatch(pc, (caddr_t)instr, PREFETCHSIZE); 304 305 if (error == 0 && cmp_to_prefetch(instr)) 306 return (1); 307 return (0); 308 } 309 310 #endif /* OPTERON_ERRATUM_91 */ 311 312 /* 313 * Called from the trap handler when a processor trap occurs. 314 * 315 * Note: All user-level traps that might call stop() must exit 316 * trap() by 'goto out' or by falling through. 317 */ 318 void 319 trap(struct regs *rp, caddr_t addr, processorid_t cpuid) 320 { 321 kthread_t *cur_thread = curthread; 322 enum seg_rw rw; 323 unsigned type; 324 extern int stop_on_fault(uint_t, k_siginfo_t *); 325 proc_t *p = ttoproc(cur_thread); 326 klwp_t *lwp = ttolwp(cur_thread); 327 uintptr_t lofault; 328 faultcode_t pagefault(), res, errcode; 329 enum fault_type fault_type; 330 k_siginfo_t siginfo; 331 uint_t fault = 0; 332 int mstate; 333 int sicode = 0; 334 int watchcode; 335 int watchpage; 336 caddr_t vaddr; 337 size_t sz; 338 int ta; 339 340 ASSERT_STACK_ALIGNED(); 341 342 type = rp->r_trapno; 343 CPU_STATS_ADDQ(CPU, sys, trap, 1); 344 345 ASSERT(cur_thread->t_schedflag & TS_DONT_SWAP); 346 347 if (type == T_PGFLT) { 348 349 errcode = rp->r_err; 350 if (errcode & PF_ERR_WRITE) 351 rw = S_WRITE; 352 else if ((caddr_t)rp->r_pc == addr || 353 (mmu.pt_nx != 0 && (errcode & PF_ERR_EXEC))) 354 rw = S_EXEC; 355 else 356 rw = S_READ; 357 358 #if defined(__i386) 359 /* 360 * Pentium Pro work-around 361 */ 362 if ((errcode & PF_ERR_PROT) && pentiumpro_bug4046376) { 363 uint_t attr; 364 uint_t priv_violation; 365 uint_t access_violation; 366 367 if (hat_getattr(addr < (caddr_t)kernelbase ? 368 curproc->p_as->a_hat : kas.a_hat, addr, &attr) 369 == -1) { 370 errcode &= ~PF_ERR_PROT; 371 } else { 372 priv_violation = (errcode & PF_ERR_USER) && 373 !(attr & PROT_USER); 374 access_violation = (errcode & PF_ERR_WRITE) && 375 !(attr & PROT_WRITE); 376 if (!priv_violation && !access_violation) 377 goto cleanup; 378 } 379 } 380 #endif /* __i386 */ 381 382 } 383 384 if (tdebug) 385 showregs(type, rp, addr); 386 387 if (USERMODE(rp->r_cs)) { 388 /* 389 * Set up the current cred to use during this trap. u_cred 390 * no longer exists. t_cred is used instead. 391 * The current process credential applies to the thread for 392 * the entire trap. If trapping from the kernel, this 393 * should already be set up. 394 */ 395 if (cur_thread->t_cred != p->p_cred) { 396 cred_t *oldcred = cur_thread->t_cred; 397 /* 398 * DTrace accesses t_cred in probe context. t_cred 399 * must always be either NULL, or point to a valid, 400 * allocated cred structure. 401 */ 402 cur_thread->t_cred = crgetcred(); 403 crfree(oldcred); 404 } 405 ASSERT(lwp != NULL); 406 type |= USER; 407 ASSERT(lwptoregs(lwp) == rp); 408 lwp->lwp_state = LWP_SYS; 409 410 switch (type) { 411 case T_PGFLT + USER: 412 if ((caddr_t)rp->r_pc == addr) 413 mstate = LMS_TFAULT; 414 else 415 mstate = LMS_DFAULT; 416 break; 417 default: 418 mstate = LMS_TRAP; 419 break; 420 } 421 /* Kernel probe */ 422 TNF_PROBE_1(thread_state, "thread", /* CSTYLED */, 423 tnf_microstate, state, mstate); 424 mstate = new_mstate(cur_thread, mstate); 425 426 bzero(&siginfo, sizeof (siginfo)); 427 } 428 429 switch (type) { 430 case T_PGFLT + USER: 431 case T_SGLSTP: 432 case T_SGLSTP + USER: 433 case T_BPTFLT + USER: 434 break; 435 436 default: 437 FTRACE_2("trap(): type=0x%lx, regs=0x%lx", 438 (ulong_t)type, (ulong_t)rp); 439 break; 440 } 441 442 switch (type) { 443 444 case T_MCE: /* Machine check exception */ 445 case T_MCE + USER: 446 if (x86_feature & X86_MCA) { 447 if (mca_exception(rp)) 448 (void) die(type, rp, addr, cpuid); 449 type &= ~USER; 450 goto cleanup; 451 } 452 default: 453 if (type & USER) { 454 if (tudebug) 455 showregs(type, rp, (caddr_t)0); 456 printf("trap: Unknown trap type %d in user mode\n", 457 type & ~USER); 458 siginfo.si_signo = SIGILL; 459 siginfo.si_code = ILL_ILLTRP; 460 siginfo.si_addr = (caddr_t)rp->r_pc; 461 siginfo.si_trapno = type & ~USER; 462 fault = FLTILL; 463 break; 464 } else { 465 (void) die(type, rp, addr, cpuid); 466 /*NOTREACHED*/ 467 } 468 469 case T_PGFLT: /* system page fault */ 470 /* 471 * If we're under on_trap() protection (see <sys/ontrap.h>), 472 * set ot_trap and longjmp back to the on_trap() call site. 473 */ 474 if ((cur_thread->t_ontrap != NULL) && 475 (cur_thread->t_ontrap->ot_prot & OT_DATA_ACCESS)) { 476 curthread->t_ontrap->ot_trap |= OT_DATA_ACCESS; 477 longjmp(&curthread->t_ontrap->ot_jmpbuf); 478 } 479 480 /* 481 * See if we can handle as pagefault. Save lofault 482 * across this. Here we assume that an address 483 * less than KERNELBASE is a user fault. 484 * We can do this as copy.s routines verify that the 485 * starting address is less than KERNELBASE before 486 * starting and because we know that we always have 487 * KERNELBASE mapped as invalid to serve as a "barrier". 488 */ 489 lofault = cur_thread->t_lofault; 490 cur_thread->t_lofault = 0; 491 492 mstate = new_mstate(cur_thread, LMS_KFAULT); 493 494 if (addr < (caddr_t)kernelbase) { 495 res = pagefault(addr, 496 (errcode & PF_ERR_PROT)? F_PROT: F_INVAL, rw, 0); 497 if (res == FC_NOMAP && 498 addr < p->p_usrstack && 499 grow(addr)) 500 res = 0; 501 } else { 502 res = pagefault(addr, 503 (errcode & PF_ERR_PROT)? F_PROT: F_INVAL, rw, 1); 504 } 505 (void) new_mstate(cur_thread, mstate); 506 507 /* 508 * Restore lofault. If we resolved the fault, exit. 509 * If we didn't and lofault wasn't set, die. 510 */ 511 cur_thread->t_lofault = lofault; 512 if (res == 0) 513 goto cleanup; 514 515 #if defined(OPTERON_ERRATUM_93) && defined(_LP64) 516 if (lofault == 0 && opteron_erratum_93) { 517 /* 518 * Workaround for Opteron Erratum 93. On return from 519 * a System Managment Interrupt at a HLT instruction 520 * the %rip might be truncated to a 32 bit value. 521 * BIOS is supposed to fix this, but some don't. 522 * If this occurs we simply restore the high order bits. 523 * The HLT instruction is 1 byte of 0xf4. 524 */ 525 uintptr_t rip = rp->r_pc; 526 527 if ((rip & 0xfffffffful) == rip) { 528 rip |= 0xfffffffful << 32; 529 if (hat_getpfnum(kas.a_hat, (caddr_t)rip) != 530 PFN_INVALID && 531 (*(uchar_t *)rip == 0xf4 || 532 *(uchar_t *)(rip - 1) == 0xf4)) { 533 rp->r_pc = rip; 534 goto cleanup; 535 } 536 } 537 } 538 #endif /* OPTERON_ERRATUM_93 && _LP64 */ 539 540 #ifdef OPTERON_ERRATUM_91 541 if (lofault == 0 && opteron_erratum_91) { 542 /* 543 * Workaround for Opteron Erratum 91. Prefetches may 544 * generate a page fault (they're not supposed to do 545 * that!). If this occurs we simply return back to the 546 * instruction. 547 */ 548 caddr_t pc = (caddr_t)rp->r_pc; 549 550 /* 551 * If the faulting PC is not mapped, this is a 552 * legitimate kernel page fault that must result in a 553 * panic. If the faulting PC is mapped, it could contain 554 * a prefetch instruction. Check for that here. 555 */ 556 if (hat_getpfnum(kas.a_hat, pc) != PFN_INVALID) { 557 if (cmp_to_prefetch((uchar_t *)pc)) { 558 #ifdef DEBUG 559 cmn_err(CE_WARN, "Opteron erratum 91 " 560 "occurred: kernel prefetch" 561 " at %p generated a page fault!", 562 (void *)rp->r_pc); 563 #endif /* DEBUG */ 564 goto cleanup; 565 } 566 } 567 (void) die(type, rp, addr, cpuid); 568 } 569 #endif /* OPTERON_ERRATUM_91 */ 570 571 if (lofault == 0) 572 (void) die(type, rp, addr, cpuid); 573 574 /* 575 * Cannot resolve fault. Return to lofault. 576 */ 577 if (lodebug) { 578 showregs(type, rp, addr); 579 traceregs(rp); 580 } 581 if (FC_CODE(res) == FC_OBJERR) 582 res = FC_ERRNO(res); 583 else 584 res = EFAULT; 585 rp->r_r0 = res; 586 rp->r_pc = cur_thread->t_lofault; 587 goto cleanup; 588 589 case T_PGFLT + USER: /* user page fault */ 590 if (faultdebug) { 591 char *fault_str; 592 593 switch (rw) { 594 case S_READ: 595 fault_str = "read"; 596 break; 597 case S_WRITE: 598 fault_str = "write"; 599 break; 600 case S_EXEC: 601 fault_str = "exec"; 602 break; 603 default: 604 fault_str = ""; 605 break; 606 } 607 printf("user %s fault: addr=0x%lx errcode=0x%x\n", 608 fault_str, (uintptr_t)addr, errcode); 609 } 610 611 #if defined(OPTERON_ERRATUM_100) && defined(_LP64) 612 /* 613 * Workaround for AMD erratum 100 614 * 615 * A 32-bit process may receive a page fault on a non 616 * 32-bit address by mistake. The range of the faulting 617 * address will be 618 * 619 * 0xffffffff80000000 .. 0xffffffffffffffff or 620 * 0x0000000100000000 .. 0x000000017fffffff 621 * 622 * The fault is always due to an instruction fetch, however 623 * the value of r_pc should be correct (in 32 bit range), 624 * so we ignore the page fault on the bogus address. 625 */ 626 if (p->p_model == DATAMODEL_ILP32 && 627 (0xffffffff80000000 <= (uintptr_t)addr || 628 (0x100000000 <= (uintptr_t)addr && 629 (uintptr_t)addr <= 0x17fffffff))) { 630 if (!opteron_erratum_100) 631 panic("unexpected erratum #100"); 632 if (rp->r_pc <= 0xffffffff) 633 goto out; 634 } 635 #endif /* OPTERON_ERRATUM_100 && _LP64 */ 636 637 ASSERT(!(curthread->t_flag & T_WATCHPT)); 638 watchpage = (pr_watch_active(p) && pr_is_watchpage(addr, rw)); 639 #ifdef __i386 640 /* 641 * In 32-bit mode, the lcall (system call) instruction fetches 642 * one word from the stack, at the stack pointer, because of the 643 * way the call gate is constructed. This is a bogus 644 * read and should not be counted as a read watchpoint. 645 * We work around the problem here by testing to see if 646 * this situation applies and, if so, simply jumping to 647 * the code in locore.s that fields the system call trap. 648 * The registers on the stack are already set up properly 649 * due to the match between the call gate sequence and the 650 * trap gate sequence. We just have to adjust the pc. 651 */ 652 if (watchpage && addr == (caddr_t)rp->r_sp && 653 rw == S_READ && instr_is_syscall((caddr_t)rp->r_pc)) { 654 extern void watch_syscall(void); 655 656 rp->r_pc += LCALLSIZE; 657 watch_syscall(); /* never returns */ 658 /* NOTREACHED */ 659 } 660 #endif /* __i386 */ 661 vaddr = addr; 662 if (!watchpage || (sz = instr_size(rp, &vaddr, rw)) <= 0) 663 fault_type = (errcode & PF_ERR_PROT)? F_PROT: F_INVAL; 664 else if ((watchcode = pr_is_watchpoint(&vaddr, &ta, 665 sz, NULL, rw)) != 0) { 666 if (ta) { 667 do_watch_step(vaddr, sz, rw, 668 watchcode, rp->r_pc); 669 fault_type = F_INVAL; 670 } else { 671 bzero(&siginfo, sizeof (siginfo)); 672 siginfo.si_signo = SIGTRAP; 673 siginfo.si_code = watchcode; 674 siginfo.si_addr = vaddr; 675 siginfo.si_trapafter = 0; 676 siginfo.si_pc = (caddr_t)rp->r_pc; 677 fault = FLTWATCH; 678 break; 679 } 680 } else { 681 /* XXX pr_watch_emul() never succeeds (for now) */ 682 if (rw != S_EXEC && pr_watch_emul(rp, vaddr, rw)) 683 goto out; 684 do_watch_step(vaddr, sz, rw, 0, 0); 685 fault_type = F_INVAL; 686 } 687 688 res = pagefault(addr, fault_type, rw, 0); 689 690 /* 691 * If pagefault() succeeded, ok. 692 * Otherwise attempt to grow the stack. 693 */ 694 if (res == 0 || 695 (res == FC_NOMAP && 696 addr < p->p_usrstack && 697 grow(addr))) { 698 lwp->lwp_lastfault = FLTPAGE; 699 lwp->lwp_lastfaddr = addr; 700 if (prismember(&p->p_fltmask, FLTPAGE)) { 701 bzero(&siginfo, sizeof (siginfo)); 702 siginfo.si_addr = addr; 703 (void) stop_on_fault(FLTPAGE, &siginfo); 704 } 705 goto out; 706 } else if (res == FC_PROT && addr < p->p_usrstack && 707 (mmu.pt_nx != 0 && (errcode & PF_ERR_EXEC))) { 708 report_stack_exec(p, addr); 709 } 710 711 #ifdef OPTERON_ERRATUM_91 712 /* 713 * Workaround for Opteron Erratum 91. Prefetches may generate a 714 * page fault (they're not supposed to do that!). If this 715 * occurs we simply return back to the instruction. 716 * 717 * We rely on copyin to properly fault in the page with r_pc. 718 */ 719 if (opteron_erratum_91 && 720 addr != (caddr_t)rp->r_pc && 721 instr_is_prefetch((caddr_t)rp->r_pc)) { 722 #ifdef DEBUG 723 cmn_err(CE_WARN, "Opteron erratum 91 occurred: " 724 "prefetch at %p in pid %d generated a trap!", 725 (void *)rp->r_pc, p->p_pid); 726 #endif /* DEBUG */ 727 goto out; 728 } 729 #endif /* OPTERON_ERRATUM_91 */ 730 731 if (tudebug) 732 showregs(type, rp, addr); 733 /* 734 * In the case where both pagefault and grow fail, 735 * set the code to the value provided by pagefault. 736 * We map all errors returned from pagefault() to SIGSEGV. 737 */ 738 bzero(&siginfo, sizeof (siginfo)); 739 siginfo.si_addr = addr; 740 switch (FC_CODE(res)) { 741 case FC_HWERR: 742 case FC_NOSUPPORT: 743 siginfo.si_signo = SIGBUS; 744 siginfo.si_code = BUS_ADRERR; 745 fault = FLTACCESS; 746 break; 747 case FC_ALIGN: 748 siginfo.si_signo = SIGBUS; 749 siginfo.si_code = BUS_ADRALN; 750 fault = FLTACCESS; 751 break; 752 case FC_OBJERR: 753 if ((siginfo.si_errno = FC_ERRNO(res)) != EINTR) { 754 siginfo.si_signo = SIGBUS; 755 siginfo.si_code = BUS_OBJERR; 756 fault = FLTACCESS; 757 } 758 break; 759 default: /* FC_NOMAP or FC_PROT */ 760 siginfo.si_signo = SIGSEGV; 761 siginfo.si_code = 762 (res == FC_NOMAP)? SEGV_MAPERR : SEGV_ACCERR; 763 fault = FLTBOUNDS; 764 break; 765 } 766 break; 767 768 case T_ILLINST + USER: /* invalid opcode fault */ 769 /* 770 * If the syscall instruction is disabled due to LDT usage, a 771 * user program that attempts to execute it will trigger a #ud 772 * trap. Check for that case here. If this occurs on a CPU which 773 * doesn't even support syscall, the result of all of this will 774 * be to emulate that particular instruction. 775 */ 776 if (p->p_ldt != NULL && 777 instr_is_fast_syscall((caddr_t)rp->r_pc, X86_ASYSC)) { 778 if (rewrite_syscall((caddr_t)rp->r_pc) == 0) 779 goto out; 780 #ifdef DEBUG 781 else 782 cmn_err(CE_WARN, "failed to rewrite syscall " 783 "instruction in process %d", 784 curthread->t_procp->p_pid); 785 #endif /* DEBUG */ 786 } 787 /*FALLTHROUGH*/ 788 789 if (tudebug) 790 showregs(type, rp, (caddr_t)0); 791 siginfo.si_signo = SIGILL; 792 siginfo.si_code = ILL_ILLOPC; 793 siginfo.si_addr = (caddr_t)rp->r_pc; 794 fault = FLTILL; 795 break; 796 797 case T_ZERODIV + USER: /* integer divide by zero */ 798 if (tudebug && tudebugfpe) 799 showregs(type, rp, (caddr_t)0); 800 siginfo.si_signo = SIGFPE; 801 siginfo.si_code = FPE_INTDIV; 802 siginfo.si_addr = (caddr_t)rp->r_pc; 803 fault = FLTIZDIV; 804 break; 805 806 case T_OVFLW + USER: /* integer overflow */ 807 if (tudebug && tudebugfpe) 808 showregs(type, rp, (caddr_t)0); 809 siginfo.si_signo = SIGFPE; 810 siginfo.si_code = FPE_INTOVF; 811 siginfo.si_addr = (caddr_t)rp->r_pc; 812 fault = FLTIOVF; 813 break; 814 815 case T_NOEXTFLT + USER: /* math coprocessor not available */ 816 if (tudebug && tudebugfpe) 817 showregs(type, rp, addr); 818 if (fpnoextflt(rp)) { 819 siginfo.si_signo = SIGFPE; 820 siginfo.si_code = ILL_ILLOPC; 821 siginfo.si_addr = (caddr_t)rp->r_pc; 822 fault = FLTFPE; 823 } 824 break; 825 826 case T_EXTOVRFLT: /* extension overrun fault */ 827 /* check if we took a kernel trap on behalf of user */ 828 { 829 extern void ndptrap_frstor(void); 830 if (rp->r_pc != (uintptr_t)ndptrap_frstor) 831 (void) die(type, rp, addr, cpuid); 832 type |= USER; 833 } 834 /*FALLTHROUGH*/ 835 case T_EXTOVRFLT + USER: /* extension overrun fault */ 836 if (tudebug && tudebugfpe) 837 showregs(type, rp, addr); 838 if (fpextovrflt(rp)) { 839 siginfo.si_signo = SIGSEGV; 840 siginfo.si_code = SEGV_MAPERR; 841 siginfo.si_addr = (caddr_t)rp->r_pc; 842 fault = FLTBOUNDS; 843 } 844 break; 845 846 case T_EXTERRFLT: /* x87 floating point exception pending */ 847 /* check if we took a kernel trap on behalf of user */ 848 { 849 extern void ndptrap_frstor(void); 850 if (rp->r_pc != (uintptr_t)ndptrap_frstor) 851 (void) die(type, rp, addr, cpuid); 852 type |= USER; 853 } 854 /*FALLTHROUGH*/ 855 856 case T_EXTERRFLT + USER: /* x87 floating point exception pending */ 857 if (tudebug && tudebugfpe) 858 showregs(type, rp, addr); 859 if (sicode = fpexterrflt(rp)) { 860 siginfo.si_signo = SIGFPE; 861 siginfo.si_code = sicode; 862 siginfo.si_addr = (caddr_t)rp->r_pc; 863 fault = FLTFPE; 864 } 865 break; 866 867 case T_SIMDFPE + USER: /* SSE and SSE2 exceptions */ 868 if (tudebug && tudebugsse) 869 showregs(type, rp, addr); 870 if ((x86_feature & (X86_SSE|X86_SSE2)) == 0) { 871 /* 872 * There are rumours that some user instructions 873 * on older CPUs can cause this trap to occur; in 874 * which case send a SIGILL instead of a SIGFPE. 875 */ 876 siginfo.si_signo = SIGILL; 877 siginfo.si_code = ILL_ILLTRP; 878 siginfo.si_addr = (caddr_t)rp->r_pc; 879 siginfo.si_trapno = type & ~USER; 880 fault = FLTILL; 881 } else if ((sicode = fpsimderrflt(rp)) != 0) { 882 siginfo.si_signo = SIGFPE; 883 siginfo.si_code = sicode; 884 siginfo.si_addr = (caddr_t)rp->r_pc; 885 fault = FLTFPE; 886 } 887 break; 888 889 case T_BPTFLT: /* breakpoint trap */ 890 /* 891 * Kernel breakpoint traps should only happen when kmdb is 892 * active, and even then, it'll have interposed on the IDT, so 893 * control won't get here. If it does, we've hit a breakpoint 894 * without the debugger, which is very strange, and very 895 * fatal. 896 */ 897 if (tudebug && tudebugbpt) 898 showregs(type, rp, (caddr_t)0); 899 900 (void) die(type, rp, addr, cpuid); 901 break; 902 903 case T_SGLSTP: /* single step/hw breakpoint exception */ 904 if (tudebug && tudebugbpt) 905 showregs(type, rp, (caddr_t)0); 906 907 /* Now evaluate how we got here */ 908 if (lwp != NULL && (lwp->lwp_pcb.pcb_drstat & DR_SINGLESTEP)) { 909 /* 910 * i386 single-steps even through lcalls which 911 * change the privilege level. So we take a trap at 912 * the first instruction in privileged mode. 913 * 914 * Set a flag to indicate that upon completion of 915 * the system call, deal with the single-step trap. 916 * 917 * The same thing happens for sysenter, too. 918 */ 919 #if defined(__amd64) 920 if (rp->r_pc == (uintptr_t)sys_sysenter) { 921 /* 922 * Adjust the pc so that we don't execute the 923 * swapgs instruction at the head of the 924 * handler and completely confuse things. 925 */ 926 rp->r_pc = (uintptr_t) 927 _sys_sysenter_post_swapgs; 928 #elif defined(__i386) 929 if (rp->r_pc == (uintptr_t)sys_call || 930 rp->r_pc == (uintptr_t)sys_sysenter) { 931 #endif 932 rp->r_ps &= ~PS_T; /* turn off trace */ 933 lwp->lwp_pcb.pcb_flags |= DEBUG_PENDING; 934 cur_thread->t_post_sys = 1; 935 goto cleanup; 936 } 937 } 938 /* XXX - needs review on debugger interface? */ 939 if (boothowto & RB_DEBUG) 940 debug_enter((char *)NULL); 941 else 942 (void) die(type, rp, addr, cpuid); 943 break; 944 945 case T_NMIFLT: /* NMI interrupt */ 946 printf("Unexpected NMI in system mode\n"); 947 goto cleanup; 948 949 case T_NMIFLT + USER: /* NMI interrupt */ 950 printf("Unexpected NMI in user mode\n"); 951 break; 952 953 case T_GPFLT: /* general protection violation */ 954 #if defined(__amd64) 955 /* 956 * On amd64, we can get a #gp from referencing addresses 957 * in the virtual address hole e.g. from a copyin. 958 */ 959 960 /* 961 * If we're under on_trap() protection (see <sys/ontrap.h>), 962 * set ot_trap and longjmp back to the on_trap() call site. 963 */ 964 if ((cur_thread->t_ontrap != NULL) && 965 (cur_thread->t_ontrap->ot_prot & OT_DATA_ACCESS)) { 966 curthread->t_ontrap->ot_trap |= OT_DATA_ACCESS; 967 longjmp(&curthread->t_ontrap->ot_jmpbuf); 968 } 969 970 /* 971 * If we're under lofault protection (copyin etc.), 972 * longjmp back to lofault with an EFAULT. 973 */ 974 if (cur_thread->t_lofault) { 975 /* 976 * Fault is not resolvable, so just return to lofault 977 */ 978 if (lodebug) { 979 showregs(type, rp, addr); 980 traceregs(rp); 981 } 982 rp->r_r0 = EFAULT; 983 rp->r_pc = cur_thread->t_lofault; 984 goto cleanup; 985 } 986 /*FALLTHROUGH*/ 987 #endif 988 case T_STKFLT: /* stack fault */ 989 case T_TSSFLT: /* invalid TSS fault */ 990 case T_SEGFLT: /* segment not present fault */ 991 if (tudebug) 992 showregs(type, rp, (caddr_t)0); 993 if (kern_gpfault(rp)) 994 (void) die(type, rp, addr, cpuid); 995 goto cleanup; 996 997 case T_SEGFLT + USER: /* segment not present fault */ 998 #ifdef _SYSCALL32_IMPL 999 if (instr_is_syscall((caddr_t)rp->r_pc)) { 1000 /* 1001 * System calls via the call gate come in through 1002 * not-present traps. 1003 * 1004 * Since this is a not-present trap, rp->r_pc points to 1005 * the trapping lcall instruction. We need to bump it 1006 * to the next insn so the app can continue on. 1007 */ 1008 rp->r_pc += LCALLSIZE; 1009 lwp->lwp_regs = rp; 1010 1011 /* 1012 * Normally the microstate of the LWP is forced back to 1013 * LMS_USER by the syscall handlers. Emulate that 1014 * behavior here. 1015 */ 1016 mstate = LMS_USER; 1017 1018 dosyscall(); 1019 goto out; 1020 } 1021 #endif /* _SYSCALL32_IMPL */ 1022 /*FALLTHROUGH*/ 1023 1024 case T_GPFLT + USER: /* general protection violation */ 1025 /* 1026 * If the current process is using a private LDT and the 1027 * trapping instruction is sysenter, the sysenter instruction 1028 * has been disabled on the CPU because it destroys segment 1029 * registers. If this is the case, rewrite the instruction to 1030 * be a safe system call and retry it. If this occurs on a CPU 1031 * which doesn't even support sysenter, the result of all of 1032 * this will be to emulate that particular instruction. 1033 */ 1034 if (p->p_ldt != NULL && 1035 instr_is_fast_syscall((caddr_t)rp->r_pc, X86_SEP)) { 1036 if (rewrite_syscall((caddr_t)rp->r_pc) == 0) 1037 goto out; 1038 #ifdef DEBUG 1039 else 1040 cmn_err(CE_WARN, "failed to rewrite sysenter " 1041 "instruction in process %d", 1042 curthread->t_procp->p_pid); 1043 #endif /* DEBUG */ 1044 } 1045 /*FALLTHROUGH*/ 1046 1047 case T_BOUNDFLT + USER: /* bound fault */ 1048 case T_STKFLT + USER: /* stack fault */ 1049 case T_TSSFLT + USER: /* invalid TSS fault */ 1050 if (tudebug) 1051 showregs(type, rp, (caddr_t)0); 1052 siginfo.si_signo = SIGSEGV; 1053 siginfo.si_code = SEGV_MAPERR; 1054 siginfo.si_addr = (caddr_t)rp->r_pc; 1055 fault = FLTBOUNDS; 1056 break; 1057 1058 case T_ALIGNMENT + USER: /* user alignment error (486) */ 1059 if (tudebug) 1060 showregs(type, rp, (caddr_t)0); 1061 bzero(&siginfo, sizeof (siginfo)); 1062 siginfo.si_signo = SIGBUS; 1063 siginfo.si_code = BUS_ADRALN; 1064 siginfo.si_addr = (caddr_t)rp->r_pc; 1065 fault = FLTACCESS; 1066 break; 1067 1068 case T_SGLSTP + USER: /* single step/hw breakpoint exception */ 1069 if (tudebug && tudebugbpt) 1070 showregs(type, rp, (caddr_t)0); 1071 1072 /* Was it single-stepping? */ 1073 if (lwp->lwp_pcb.pcb_drstat & DR_SINGLESTEP) { 1074 pcb_t *pcb = &lwp->lwp_pcb; 1075 1076 rp->r_ps &= ~PS_T; 1077 /* 1078 * If both NORMAL_STEP and WATCH_STEP are in effect, 1079 * give precedence to NORMAL_STEP. If neither is set, 1080 * user must have set the PS_T bit in %efl; treat this 1081 * as NORMAL_STEP. 1082 */ 1083 if ((pcb->pcb_flags & NORMAL_STEP) || 1084 !(pcb->pcb_flags & WATCH_STEP)) { 1085 siginfo.si_signo = SIGTRAP; 1086 siginfo.si_code = TRAP_TRACE; 1087 siginfo.si_addr = (caddr_t)rp->r_pc; 1088 fault = FLTTRACE; 1089 if (pcb->pcb_flags & WATCH_STEP) 1090 (void) undo_watch_step(NULL); 1091 } else { 1092 fault = undo_watch_step(&siginfo); 1093 } 1094 pcb->pcb_flags &= ~(NORMAL_STEP|WATCH_STEP); 1095 } else { 1096 cmn_err(CE_WARN, 1097 "Unexpected INT 1 in user mode, dr6=%lx", 1098 lwp->lwp_pcb.pcb_drstat); 1099 } 1100 break; 1101 1102 case T_BPTFLT + USER: /* breakpoint trap */ 1103 if (tudebug && tudebugbpt) 1104 showregs(type, rp, (caddr_t)0); 1105 /* 1106 * int 3 (the breakpoint instruction) leaves the pc referring 1107 * to the address one byte after the breakpointed address. 1108 * If the P_PR_BPTADJ flag has been set via /proc, We adjust 1109 * it back so it refers to the breakpointed address. 1110 */ 1111 if (p->p_proc_flag & P_PR_BPTADJ) 1112 rp->r_pc--; 1113 siginfo.si_signo = SIGTRAP; 1114 siginfo.si_code = TRAP_BRKPT; 1115 siginfo.si_addr = (caddr_t)rp->r_pc; 1116 fault = FLTBPT; 1117 break; 1118 1119 case T_AST: 1120 /* 1121 * This occurs only after the cs register has been made to 1122 * look like a kernel selector, either through debugging or 1123 * possibly by functions like setcontext(). The thread is 1124 * about to cause a general protection fault at common_iret() 1125 * in locore. We let that happen immediately instead of 1126 * doing the T_AST processing. 1127 */ 1128 goto cleanup; 1129 1130 case T_AST + USER: /* profiling or resched pseudo trap */ 1131 if (lwp->lwp_pcb.pcb_flags & CPC_OVERFLOW) { 1132 lwp->lwp_pcb.pcb_flags &= ~CPC_OVERFLOW; 1133 if (kcpc_overflow_ast()) { 1134 /* 1135 * Signal performance counter overflow 1136 */ 1137 if (tudebug) 1138 showregs(type, rp, (caddr_t)0); 1139 bzero(&siginfo, sizeof (siginfo)); 1140 siginfo.si_signo = SIGEMT; 1141 siginfo.si_code = EMT_CPCOVF; 1142 siginfo.si_addr = (caddr_t)rp->r_pc; 1143 fault = FLTCPCOVF; 1144 } 1145 } 1146 break; 1147 } 1148 1149 /* 1150 * We can't get here from a system trap 1151 */ 1152 ASSERT(type & USER); 1153 1154 if (fault) { 1155 /* 1156 * Remember the fault and fault adddress 1157 * for real-time (SIGPROF) profiling. 1158 */ 1159 lwp->lwp_lastfault = fault; 1160 lwp->lwp_lastfaddr = siginfo.si_addr; 1161 1162 DTRACE_PROC2(fault, int, fault, ksiginfo_t *, &siginfo); 1163 1164 /* 1165 * If a debugger has declared this fault to be an 1166 * event of interest, stop the lwp. Otherwise just 1167 * deliver the associated signal. 1168 */ 1169 if (siginfo.si_signo != SIGKILL && 1170 prismember(&p->p_fltmask, fault) && 1171 stop_on_fault(fault, &siginfo) == 0) 1172 siginfo.si_signo = 0; 1173 } 1174 1175 if (siginfo.si_signo) 1176 trapsig(&siginfo, (fault == FLTCPCOVF)? 0 : 1); 1177 1178 if (lwp->lwp_oweupc) 1179 profil_tick(rp->r_pc); 1180 1181 if (cur_thread->t_astflag | cur_thread->t_sig_check) { 1182 /* 1183 * Turn off the AST flag before checking all the conditions that 1184 * may have caused an AST. This flag is on whenever a signal or 1185 * unusual condition should be handled after the next trap or 1186 * syscall. 1187 */ 1188 astoff(cur_thread); 1189 cur_thread->t_sig_check = 0; 1190 1191 mutex_enter(&p->p_lock); 1192 if (curthread->t_proc_flag & TP_CHANGEBIND) { 1193 timer_lwpbind(); 1194 curthread->t_proc_flag &= ~TP_CHANGEBIND; 1195 } 1196 mutex_exit(&p->p_lock); 1197 1198 /* 1199 * for kaio requests that are on the per-process poll queue, 1200 * aiop->aio_pollq, they're AIO_POLL bit is set, the kernel 1201 * should copyout their result_t to user memory. by copying 1202 * out the result_t, the user can poll on memory waiting 1203 * for the kaio request to complete. 1204 */ 1205 if (p->p_aio) 1206 aio_cleanup(0); 1207 /* 1208 * If this LWP was asked to hold, call holdlwp(), which will 1209 * stop. holdlwps() sets this up and calls pokelwps() which 1210 * sets the AST flag. 1211 * 1212 * Also check TP_EXITLWP, since this is used by fresh new LWPs 1213 * through lwp_rtt(). That flag is set if the lwp_create(2) 1214 * syscall failed after creating the LWP. 1215 */ 1216 if (ISHOLD(p)) 1217 holdlwp(); 1218 1219 /* 1220 * All code that sets signals and makes ISSIG evaluate true must 1221 * set t_astflag afterwards. 1222 */ 1223 if (ISSIG_PENDING(cur_thread, lwp, p)) { 1224 if (issig(FORREAL)) 1225 psig(); 1226 cur_thread->t_sig_check = 1; 1227 } 1228 1229 if (cur_thread->t_rprof != NULL) { 1230 realsigprof(0, 0); 1231 cur_thread->t_sig_check = 1; 1232 } 1233 } 1234 1235 out: /* We can't get here from a system trap */ 1236 ASSERT(type & USER); 1237 1238 if (ISHOLD(p)) 1239 holdlwp(); 1240 1241 /* 1242 * Set state to LWP_USER here so preempt won't give us a kernel 1243 * priority if it occurs after this point. Call CL_TRAPRET() to 1244 * restore the user-level priority. 1245 * 1246 * It is important that no locks (other than spinlocks) be entered 1247 * after this point before returning to user mode (unless lwp_state 1248 * is set back to LWP_SYS). 1249 */ 1250 lwp->lwp_state = LWP_USER; 1251 1252 if (cur_thread->t_trapret) { 1253 cur_thread->t_trapret = 0; 1254 thread_lock(cur_thread); 1255 CL_TRAPRET(cur_thread); 1256 thread_unlock(cur_thread); 1257 } 1258 if (CPU->cpu_runrun) 1259 preempt(); 1260 (void) new_mstate(cur_thread, mstate); 1261 1262 /* Kernel probe */ 1263 TNF_PROBE_1(thread_state, "thread", /* CSTYLED */, 1264 tnf_microstate, state, LMS_USER); 1265 1266 return; 1267 1268 cleanup: /* system traps end up here */ 1269 ASSERT(!(type & USER)); 1270 } 1271 1272 /* 1273 * Patch non-zero to disable preemption of threads in the kernel. 1274 */ 1275 int IGNORE_KERNEL_PREEMPTION = 0; /* XXX - delete this someday */ 1276 1277 struct kpreempt_cnts { /* kernel preemption statistics */ 1278 int kpc_idle; /* executing idle thread */ 1279 int kpc_intr; /* executing interrupt thread */ 1280 int kpc_clock; /* executing clock thread */ 1281 int kpc_blocked; /* thread has blocked preemption (t_preempt) */ 1282 int kpc_notonproc; /* thread is surrendering processor */ 1283 int kpc_inswtch; /* thread has ratified scheduling decision */ 1284 int kpc_prilevel; /* processor interrupt level is too high */ 1285 int kpc_apreempt; /* asynchronous preemption */ 1286 int kpc_spreempt; /* synchronous preemption */ 1287 } kpreempt_cnts; 1288 1289 /* 1290 * kernel preemption: forced rescheduling, preempt the running kernel thread. 1291 * the argument is old PIL for an interrupt, 1292 * or the distingished value KPREEMPT_SYNC. 1293 */ 1294 void 1295 kpreempt(int asyncspl) 1296 { 1297 kthread_t *cur_thread = curthread; 1298 1299 if (IGNORE_KERNEL_PREEMPTION) { 1300 aston(CPU->cpu_dispthread); 1301 return; 1302 } 1303 1304 /* 1305 * Check that conditions are right for kernel preemption 1306 */ 1307 do { 1308 if (cur_thread->t_preempt) { 1309 /* 1310 * either a privileged thread (idle, panic, interrupt) 1311 * or will check when t_preempt is lowered 1312 */ 1313 if (cur_thread->t_pri < 0) 1314 kpreempt_cnts.kpc_idle++; 1315 else if (cur_thread->t_flag & T_INTR_THREAD) { 1316 kpreempt_cnts.kpc_intr++; 1317 if (cur_thread->t_pil == CLOCK_LEVEL) 1318 kpreempt_cnts.kpc_clock++; 1319 } else 1320 kpreempt_cnts.kpc_blocked++; 1321 aston(CPU->cpu_dispthread); 1322 return; 1323 } 1324 if (cur_thread->t_state != TS_ONPROC || 1325 cur_thread->t_disp_queue != CPU->cpu_disp) { 1326 /* this thread will be calling swtch() shortly */ 1327 kpreempt_cnts.kpc_notonproc++; 1328 if (CPU->cpu_thread != CPU->cpu_dispthread) { 1329 /* already in swtch(), force another */ 1330 kpreempt_cnts.kpc_inswtch++; 1331 siron(); 1332 } 1333 return; 1334 } 1335 if (getpil() >= DISP_LEVEL) { 1336 /* 1337 * We can't preempt this thread if it is at 1338 * a PIL >= DISP_LEVEL since it may be holding 1339 * a spin lock (like sched_lock). 1340 */ 1341 siron(); /* check back later */ 1342 kpreempt_cnts.kpc_prilevel++; 1343 return; 1344 } 1345 1346 if (asyncspl != KPREEMPT_SYNC) 1347 kpreempt_cnts.kpc_apreempt++; 1348 else 1349 kpreempt_cnts.kpc_spreempt++; 1350 1351 cur_thread->t_preempt++; 1352 preempt(); 1353 cur_thread->t_preempt--; 1354 } while (CPU->cpu_kprunrun); 1355 } 1356 1357 /* 1358 * Print out debugging info. 1359 */ 1360 static void 1361 showregs(uint_t type, struct regs *rp, caddr_t addr) 1362 { 1363 int s; 1364 1365 s = spl7(); 1366 type &= ~USER; 1367 if (u.u_comm[0]) 1368 printf("%s: ", u.u_comm); 1369 if (type < TRAP_TYPES) 1370 printf("#%s %s\n", trap_type_mnemonic[type], trap_type[type]); 1371 else 1372 switch (type) { 1373 case T_SYSCALL: 1374 printf("Syscall Trap:\n"); 1375 break; 1376 case T_AST: 1377 printf("AST\n"); 1378 break; 1379 default: 1380 printf("Bad Trap = %d\n", type); 1381 break; 1382 } 1383 if (type == T_PGFLT) { 1384 printf("Bad %s fault at addr=0x%lx\n", 1385 USERMODE(rp->r_cs) ? "user": "kernel", (uintptr_t)addr); 1386 } else if (addr) { 1387 printf("addr=0x%lx\n", (uintptr_t)addr); 1388 } 1389 1390 printf("pid=%d, pc=0x%lx, sp=0x%lx, eflags=0x%lx\n", 1391 (ttoproc(curthread) && ttoproc(curthread)->p_pidp) ? 1392 ttoproc(curthread)->p_pid : 0, rp->r_pc, rp->r_sp, rp->r_ps); 1393 1394 #if defined(__lint) 1395 /* 1396 * this clause can be deleted when lint bug 4870403 is fixed 1397 * (lint thinks that bit 32 is illegal in a %b format string) 1398 */ 1399 printf("cr0: %x cr4: %b\n", 1400 (uint_t)getcr0(), (uint_t)getcr4(), FMT_CR4); 1401 #else 1402 printf("cr0: %b cr4: %b\n", 1403 (uint_t)getcr0(), FMT_CR0, (uint_t)getcr4(), FMT_CR4); 1404 #endif 1405 1406 #if defined(__amd64) 1407 printf("cr2: %lx cr3: %lx cr8: %lx\n", getcr2(), getcr3(), getcr8()); 1408 #elif defined(__i386) 1409 printf("cr2: %lx cr3: %lx\n", getcr2(), getcr3()); 1410 #endif 1411 1412 dumpregs(rp); 1413 splx(s); 1414 } 1415 1416 static void 1417 dumpregs(struct regs *rp) 1418 { 1419 #if defined(__amd64) 1420 const char fmt[] = "\t%3s: %16lx %3s: %16lx %3s: %16lx\n"; 1421 1422 printf(fmt, "rdi", rp->r_rdi, "rsi", rp->r_rsi, "rdx", rp->r_rdx); 1423 printf(fmt, "rcx", rp->r_rcx, " r8", rp->r_r8, " r9", rp->r_r9); 1424 printf(fmt, "rax", rp->r_rax, "rbx", rp->r_rbx, "rbp", rp->r_rbp); 1425 printf(fmt, "r10", rp->r_r10, "r11", rp->r_r11, "r12", rp->r_r12); 1426 printf(fmt, "r13", rp->r_r13, "r14", rp->r_r14, "r15", rp->r_r15); 1427 1428 printf(fmt, "fsb", rp->r_fsbase, "gsb", rp->r_gsbase, " ds", rp->r_ds); 1429 printf(fmt, " es", rp->r_es, " fs", rp->r_fs, " gs", rp->r_gs); 1430 1431 printf(fmt, "trp", rp->r_trapno, "err", rp->r_err, "rip", rp->r_rip); 1432 printf(fmt, " cs", rp->r_cs, "rfl", rp->r_rfl, "rsp", rp->r_rsp); 1433 1434 printf("\t%3s: %16lx\n", " ss", rp->r_ss); 1435 1436 #elif defined(__i386) 1437 const char fmt[] = "\t%3s: %8lx %3s: %8lx %3s: %8lx %3s: %8lx\n"; 1438 1439 printf(fmt, " gs", rp->r_gs, " fs", rp->r_fs, 1440 " es", rp->r_es, " ds", rp->r_ds); 1441 printf(fmt, "edi", rp->r_edi, "esi", rp->r_esi, 1442 "ebp", rp->r_ebp, "esp", rp->r_esp); 1443 printf(fmt, "ebx", rp->r_ebx, "edx", rp->r_edx, 1444 "ecx", rp->r_ecx, "eax", rp->r_eax); 1445 printf(fmt, "trp", rp->r_trapno, "err", rp->r_err, 1446 "eip", rp->r_eip, " cs", rp->r_cs); 1447 printf("\t%3s: %8lx %3s: %8lx %3s: %8lx\n", 1448 "efl", rp->r_efl, "usp", rp->r_uesp, " ss", rp->r_ss); 1449 1450 #endif /* __i386 */ 1451 } 1452 1453 /* 1454 * Handle #gp faults in kernel mode. 1455 * 1456 * One legitimate way this can happen is if we attempt to update segment 1457 * registers to naughty values on the way out of the kernel. 1458 * 1459 * This can happen in a couple of ways: someone - either accidentally or 1460 * on purpose - creates (setcontext(2), lwp_create(2)) or modifies 1461 * (signal(2)) a ucontext that contains silly segment register values. 1462 * Or someone - either accidentally or on purpose - modifies the prgregset_t 1463 * of a subject process via /proc to contain silly segment register values. 1464 * 1465 * (The unfortunate part is that we can end up discovering the bad segment 1466 * register value in the middle of an 'iret' after we've popped most of the 1467 * stack. So it becomes quite difficult to associate an accurate ucontext 1468 * with the lwp, because the act of taking the #gp trap overwrites most of 1469 * what we were going to send the lwp.) 1470 * 1471 * OTOH if it turns out that's -not- the problem, and we're -not- an lwp 1472 * trying to return to user mode and we get a #gp fault, then we need 1473 * to die() -- which will happen if we return non-zero from this routine. 1474 */ 1475 static int 1476 kern_gpfault(struct regs *rp) 1477 { 1478 kthread_t *t = curthread; 1479 proc_t *p = ttoproc(t); 1480 klwp_t *lwp = ttolwp(t); 1481 struct regs tmpregs, *trp = NULL; 1482 caddr_t pc = (caddr_t)rp->r_pc; 1483 int v; 1484 1485 extern void _sys_rtt(), sr_sup(); 1486 1487 #if defined(__amd64) 1488 extern void _update_sregs(), _update_sregs_done(); 1489 static const uint8_t iretq_insn[2] = { 0x48, 0xcf }; 1490 1491 #elif defined(__i386) 1492 static const uint8_t iret_insn[1] = { 0xcf }; 1493 1494 /* 1495 * Note carefully the appallingly awful dependency between 1496 * the instruction sequence used in __SEGREGS_POP and these 1497 * instructions encoded here. 1498 * 1499 * XX64 Add some commentary to locore.s/privregs.h to document this. 1500 */ 1501 static const uint8_t movw_0_esp_gs[4] = { 0x8e, 0x6c, 0x24, 0x0 }; 1502 static const uint8_t movw_4_esp_fs[4] = { 0x8e, 0x64, 0x24, 0x4 }; 1503 static const uint8_t movw_8_esp_es[4] = { 0x8e, 0x44, 0x24, 0x8 }; 1504 static const uint8_t movw_c_esp_ds[4] = { 0x8e, 0x5c, 0x24, 0xc }; 1505 #endif 1506 /* 1507 * if we're not an lwp, or the pc range is outside _sys_rtt, then 1508 * we should immediately be die()ing horribly 1509 */ 1510 if (lwp == NULL || 1511 (uintptr_t)pc < (uintptr_t)_sys_rtt || 1512 (uintptr_t)pc > (uintptr_t)sr_sup) 1513 return (1); 1514 1515 /* 1516 * So at least we're in the right part of the kernel. 1517 * 1518 * Disassemble the instruction at the faulting pc. 1519 * Once we know what it is, we carefully reconstruct the stack 1520 * based on the order in which the stack is deconstructed in 1521 * _sys_rtt. Ew. 1522 */ 1523 1524 #if defined(__amd64) 1525 1526 if (bcmp(pc, iretq_insn, sizeof (iretq_insn)) == 0) { 1527 /* 1528 * We took the #gp while trying to perform the iretq. 1529 * This means that either %cs or %ss are bad. 1530 * All we know for sure is that most of the general 1531 * registers have been restored, including the 1532 * segment registers, and all we have left on the 1533 * topmost part of the lwp's stack are the 1534 * registers that the iretq was unable to consume. 1535 * 1536 * All the rest of the state was crushed by the #gp 1537 * which pushed -its- registers atop our old save area 1538 * (because we had to decrement the stack pointer, sigh) so 1539 * all that we can try and do is to reconstruct the 1540 * crushed frame from the #gp trap frame itself. 1541 */ 1542 trp = &tmpregs; 1543 trp->r_ss = lwptoregs(lwp)->r_ss; 1544 trp->r_sp = lwptoregs(lwp)->r_sp; 1545 trp->r_ps = lwptoregs(lwp)->r_ps; 1546 trp->r_cs = lwptoregs(lwp)->r_cs; 1547 trp->r_pc = lwptoregs(lwp)->r_pc; 1548 bcopy(rp, trp, offsetof(struct regs, r_pc)); 1549 1550 /* 1551 * Validate simple math 1552 */ 1553 ASSERT(trp->r_pc == lwptoregs(lwp)->r_pc); 1554 ASSERT(trp->r_err == rp->r_err); 1555 1556 } else if ((lwp->lwp_pcb.pcb_flags & RUPDATE_PENDING) != 0 && 1557 pc >= (caddr_t)_update_sregs && 1558 pc < (caddr_t)_update_sregs_done) { 1559 /* 1560 * This is the common case -- we're trying to load 1561 * a bad segment register value in the only section 1562 * of kernel code that ever loads segment registers. 1563 * 1564 * We don't need to do anything at this point because 1565 * the pcb contains all the pending segment register 1566 * state, and the regs are still intact because we 1567 * didn't adjust the stack pointer yet. Given the fidelity 1568 * of all this, we could conceivably send a signal 1569 * to the lwp, rather than core-ing. 1570 */ 1571 trp = lwptoregs(lwp); 1572 ASSERT((caddr_t)trp == (caddr_t)rp->r_sp); 1573 } 1574 1575 #elif defined(__i386) 1576 1577 if (bcmp(pc, iret_insn, sizeof (iret_insn)) == 0) { 1578 /* 1579 * We took the #gp while trying to perform the iret. 1580 * This means that either %cs or %ss are bad. 1581 * All we know for sure is that most of the general 1582 * registers have been restored, including the 1583 * segment registers, and all we have left on the 1584 * topmost part of the lwp's stack are the registers that 1585 * the iret was unable to consume. 1586 * 1587 * All the rest of the state was crushed by the #gp 1588 * which pushed -its- registers atop our old save area 1589 * (because we had to decrement the stack pointer, sigh) so 1590 * all that we can try and do is to reconstruct the 1591 * crushed frame from the #gp trap frame itself. 1592 */ 1593 trp = &tmpregs; 1594 trp->r_ss = lwptoregs(lwp)->r_ss; 1595 trp->r_sp = lwptoregs(lwp)->r_sp; 1596 trp->r_ps = lwptoregs(lwp)->r_ps; 1597 trp->r_cs = lwptoregs(lwp)->r_cs; 1598 trp->r_pc = lwptoregs(lwp)->r_pc; 1599 bcopy(rp, trp, offsetof(struct regs, r_pc)); 1600 1601 ASSERT(trp->r_pc == lwptoregs(lwp)->r_pc); 1602 ASSERT(trp->r_err == rp->r_err); 1603 1604 } else { 1605 /* 1606 * Segment registers are reloaded in _sys_rtt 1607 * via the following sequence: 1608 * 1609 * movw 0(%esp), %gs 1610 * movw 4(%esp), %fs 1611 * movw 8(%esp), %es 1612 * movw 12(%esp), %ds 1613 * addl $16, %esp 1614 * 1615 * Thus if any of them fault, we know the user 1616 * registers are left unharmed on the stack. 1617 */ 1618 if (bcmp(pc, movw_0_esp_gs, sizeof (movw_0_esp_gs)) == 0 || 1619 bcmp(pc, movw_4_esp_fs, sizeof (movw_4_esp_fs)) == 0 || 1620 bcmp(pc, movw_8_esp_es, sizeof (movw_8_esp_es)) == 0 || 1621 bcmp(pc, movw_c_esp_ds, sizeof (movw_c_esp_ds)) == 0) 1622 trp = lwptoregs(lwp); 1623 } 1624 #endif /* __amd64 */ 1625 1626 if (trp == NULL) 1627 return (1); 1628 1629 /* 1630 * If we get to here, we're reasonably confident that we've 1631 * correctly decoded what happened on the way out of the kernel. 1632 * Rewrite the lwp's registers so that we can create a core dump 1633 * the (at least vaguely) represents the mcontext we were 1634 * being asked to restore when things went so terribly wrong. 1635 */ 1636 1637 /* 1638 * Make sure that we have a meaningful %trapno and %err. 1639 */ 1640 trp->r_trapno = rp->r_trapno; 1641 trp->r_err = rp->r_err; 1642 1643 if ((caddr_t)trp != (caddr_t)lwptoregs(lwp)) 1644 bcopy(trp, lwptoregs(lwp), sizeof (*trp)); 1645 1646 mutex_enter(&p->p_lock); 1647 lwp->lwp_cursig = SIGSEGV; 1648 mutex_exit(&p->p_lock); 1649 1650 /* 1651 * Terminate all LWPs but don't discard them. If another lwp beat us to 1652 * the punch by calling exit(), evaporate now. 1653 */ 1654 proc_is_exiting(p); 1655 if (exitlwps(1) != 0) { 1656 mutex_enter(&p->p_lock); 1657 lwp_exit(); 1658 } 1659 1660 #ifdef C2_AUDIT 1661 if (audit_active) /* audit core dump */ 1662 audit_core_start(SIGSEGV); 1663 #endif 1664 v = core(SIGSEGV, B_FALSE); 1665 #ifdef C2_AUDIT 1666 if (audit_active) /* audit core dump */ 1667 audit_core_finish(v ? CLD_KILLED : CLD_DUMPED); 1668 #endif 1669 exit(v ? CLD_KILLED : CLD_DUMPED, SIGSEGV); 1670 return (0); 1671 } 1672 1673 /* 1674 * dump_tss() - Display the TSS structure 1675 */ 1676 1677 #if defined(__amd64) 1678 1679 static void 1680 dump_tss(void) 1681 { 1682 const char tss_fmt[] = "tss.%s:\t0x%p\n"; /* Format string */ 1683 struct tss *tss = CPU->cpu_tss; 1684 1685 printf(tss_fmt, "tss_rsp0", (void *)tss->tss_rsp0); 1686 printf(tss_fmt, "tss_rsp1", (void *)tss->tss_rsp1); 1687 printf(tss_fmt, "tss_rsp2", (void *)tss->tss_rsp2); 1688 1689 printf(tss_fmt, "tss_ist1", (void *)tss->tss_ist1); 1690 printf(tss_fmt, "tss_ist2", (void *)tss->tss_ist2); 1691 printf(tss_fmt, "tss_ist3", (void *)tss->tss_ist3); 1692 printf(tss_fmt, "tss_ist4", (void *)tss->tss_ist4); 1693 printf(tss_fmt, "tss_ist5", (void *)tss->tss_ist5); 1694 printf(tss_fmt, "tss_ist6", (void *)tss->tss_ist6); 1695 printf(tss_fmt, "tss_ist7", (void *)tss->tss_ist7); 1696 } 1697 1698 #elif defined(__i386) 1699 1700 static void 1701 dump_tss(void) 1702 { 1703 const char tss_fmt[] = "tss.%s:\t0x%p\n"; /* Format string */ 1704 struct tss *tss = CPU->cpu_tss; 1705 1706 printf(tss_fmt, "tss_link", (void *)(uintptr_t)tss->tss_link); 1707 printf(tss_fmt, "tss_esp0", (void *)(uintptr_t)tss->tss_esp0); 1708 printf(tss_fmt, "tss_ss0", (void *)(uintptr_t)tss->tss_ss0); 1709 printf(tss_fmt, "tss_esp1", (void *)(uintptr_t)tss->tss_esp1); 1710 printf(tss_fmt, "tss_ss1", (void *)(uintptr_t)tss->tss_ss1); 1711 printf(tss_fmt, "tss_esp2", (void *)(uintptr_t)tss->tss_esp2); 1712 printf(tss_fmt, "tss_ss2", (void *)(uintptr_t)tss->tss_ss2); 1713 printf(tss_fmt, "tss_cr3", (void *)(uintptr_t)tss->tss_cr3); 1714 printf(tss_fmt, "tss_eip", (void *)(uintptr_t)tss->tss_eip); 1715 printf(tss_fmt, "tss_eflags", (void *)(uintptr_t)tss->tss_eflags); 1716 printf(tss_fmt, "tss_eax", (void *)(uintptr_t)tss->tss_eax); 1717 printf(tss_fmt, "tss_ebx", (void *)(uintptr_t)tss->tss_ebx); 1718 printf(tss_fmt, "tss_ecx", (void *)(uintptr_t)tss->tss_ecx); 1719 printf(tss_fmt, "tss_edx", (void *)(uintptr_t)tss->tss_edx); 1720 printf(tss_fmt, "tss_esp", (void *)(uintptr_t)tss->tss_esp); 1721 } 1722 1723 #endif /* __amd64 */ 1724 1725 #if defined(TRAPTRACE) 1726 1727 int ttrace_nrec = 0; /* number of records to dump out */ 1728 int ttrace_dump_nregs = 5; /* dump out this many records with regs too */ 1729 1730 /* 1731 * Dump out the last ttrace_nrec traptrace records on each CPU 1732 */ 1733 static void 1734 dump_ttrace(void) 1735 { 1736 trap_trace_ctl_t *ttc; 1737 trap_trace_rec_t *rec; 1738 uintptr_t current; 1739 int i, j, k; 1740 int n = NCPU; 1741 #if defined(__amd64) 1742 const char banner[] = 1743 "\ncpu address timestamp " 1744 "type vc handler pc\n"; 1745 const char fmt1[] = "%3d %016lx %12llx "; 1746 #elif defined(__i386) 1747 const char banner[] = 1748 "\ncpu address timestamp type vc handler pc\n"; 1749 const char fmt1[] = "%3d %08lx %12llx "; 1750 #endif 1751 const char fmt2[] = "%4s %3x "; 1752 const char fmt3[] = "%8s "; 1753 1754 if (ttrace_nrec == 0) 1755 return; 1756 1757 printf(banner); 1758 1759 for (i = 0; i < n; i++) { 1760 ttc = &trap_trace_ctl[i]; 1761 if (ttc->ttc_first == NULL) 1762 continue; 1763 1764 current = ttc->ttc_next - sizeof (trap_trace_rec_t); 1765 for (j = 0; j < ttrace_nrec; j++) { 1766 struct sysent *sys; 1767 struct autovec *vec; 1768 extern struct av_head autovect[]; 1769 int type; 1770 ulong_t off; 1771 char *sym, *stype; 1772 1773 if (current < ttc->ttc_first) 1774 current = 1775 ttc->ttc_limit - sizeof (trap_trace_rec_t); 1776 1777 if (current == NULL) 1778 continue; 1779 1780 rec = (trap_trace_rec_t *)current; 1781 1782 if (rec->ttr_stamp == 0) 1783 break; 1784 1785 printf(fmt1, i, (uintptr_t)rec, rec->ttr_stamp); 1786 1787 switch (rec->ttr_marker) { 1788 case TT_SYSCALL: 1789 case TT_SYSENTER: 1790 case TT_SYSC: 1791 case TT_SYSC64: 1792 #if defined(__amd64) 1793 sys = &sysent32[rec->ttr_sysnum]; 1794 switch (rec->ttr_marker) { 1795 case TT_SYSC64: 1796 sys = &sysent[rec->ttr_sysnum]; 1797 /*FALLTHROUGH*/ 1798 #elif defined(__i386) 1799 sys = &sysent[rec->ttr_sysnum]; 1800 switch (rec->ttr_marker) { 1801 case TT_SYSC64: 1802 #endif 1803 case TT_SYSC: 1804 stype = "sysc"; /* syscall */ 1805 break; 1806 case TT_SYSCALL: 1807 stype = "lcal"; /* lcall */ 1808 break; 1809 case TT_SYSENTER: 1810 stype = "syse"; /* sysenter */ 1811 break; 1812 default: 1813 break; 1814 } 1815 printf(fmt2, "sysc", rec->ttr_sysnum); 1816 if (sys != NULL) { 1817 sym = kobj_getsymname( 1818 (uintptr_t)sys->sy_callc, 1819 &off); 1820 if (sym != NULL) 1821 printf("%s ", sym); 1822 else 1823 printf("%p ", sys->sy_callc); 1824 } else { 1825 printf("unknown "); 1826 } 1827 break; 1828 1829 case TT_INTERRUPT: 1830 printf(fmt2, "intr", rec->ttr_vector); 1831 vec = (&autovect[rec->ttr_vector])->avh_link; 1832 if (vec != NULL) { 1833 sym = kobj_getsymname( 1834 (uintptr_t)vec->av_vector, &off); 1835 if (sym != NULL) 1836 printf("%s ", sym); 1837 else 1838 printf("%p ", vec->av_vector); 1839 } else { 1840 printf("unknown "); 1841 } 1842 break; 1843 1844 case TT_TRAP: 1845 type = rec->ttr_regs.r_trapno; 1846 printf(fmt2, "trap", type); 1847 printf("#%s ", type < TRAP_TYPES ? 1848 trap_type_mnemonic[type] : "trap"); 1849 break; 1850 1851 default: 1852 break; 1853 } 1854 1855 sym = kobj_getsymname(rec->ttr_regs.r_pc, &off); 1856 if (sym != NULL) 1857 printf("%s+%lx\n", sym, off); 1858 else 1859 printf("%lx\n", rec->ttr_regs.r_pc); 1860 1861 if (ttrace_dump_nregs-- > 0) { 1862 int s; 1863 1864 if (rec->ttr_marker == TT_INTERRUPT) 1865 printf( 1866 "\t\tipl %x spl %x pri %x\n", 1867 rec->ttr_ipl, 1868 rec->ttr_spl, 1869 rec->ttr_pri); 1870 1871 dumpregs(&rec->ttr_regs); 1872 1873 printf("\t%3s: %p\n\n", " ct", 1874 (void *)rec->ttr_curthread); 1875 1876 /* 1877 * print out the pc stack that we recorded 1878 * at trap time (if any) 1879 */ 1880 for (s = 0; s < rec->ttr_sdepth; s++) { 1881 uintptr_t fullpc; 1882 1883 if (s >= TTR_STACK_DEPTH) { 1884 printf("ttr_sdepth corrupt\n"); 1885 break; 1886 } 1887 1888 fullpc = (uintptr_t)rec->ttr_stack[s]; 1889 1890 sym = kobj_getsymname(fullpc, &off); 1891 if (sym != NULL) 1892 printf("-> %s+0x%lx()\n", 1893 sym, off); 1894 else 1895 printf("-> 0x%lx()\n", fullpc); 1896 } 1897 printf("\n"); 1898 } 1899 current -= sizeof (trap_trace_rec_t); 1900 } 1901 } 1902 } 1903 1904 #endif /* TRAPTRACE */ 1905 1906 void 1907 panic_showtrap(struct trap_info *tip) 1908 { 1909 showregs(tip->trap_type, tip->trap_regs, tip->trap_addr); 1910 1911 #if defined(TRAPTRACE) 1912 dump_ttrace(); 1913 #endif /* TRAPTRACE */ 1914 1915 if (tip->trap_type == T_DBLFLT) 1916 dump_tss(); 1917 } 1918 1919 void 1920 panic_savetrap(panic_data_t *pdp, struct trap_info *tip) 1921 { 1922 panic_saveregs(pdp, tip->trap_regs); 1923 } 1924