1 /*-
2 * SPDX-License-Identifier: BSD-4-Clause
3 *
4 * Copyright (C) 1994, David Greenman
5 * Copyright (c) 1990, 1993
6 * The Regents of the University of California. All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * the University of Utah, and William Jolitz.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by the University of
22 * California, Berkeley and its contributors.
23 * 4. Neither the name of the University nor the names of its contributors
24 * may be used to endorse or promote products derived from this software
25 * without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
38 */
39
40 #include <sys/cdefs.h>
41 /*
42 * 386 Trap and System call handling
43 */
44
45 #include "opt_clock.h"
46 #include "opt_cpu.h"
47 #include "opt_hwpmc_hooks.h"
48 #include "opt_isa.h"
49 #include "opt_kdb.h"
50 #include "opt_trap.h"
51
52 #include <sys/param.h>
53 #include <sys/bus.h>
54 #include <sys/systm.h>
55 #include <sys/proc.h>
56 #include <sys/ptrace.h>
57 #include <sys/kdb.h>
58 #include <sys/kernel.h>
59 #include <sys/ktr.h>
60 #include <sys/lock.h>
61 #include <sys/mutex.h>
62 #include <sys/resourcevar.h>
63 #include <sys/signalvar.h>
64 #include <sys/syscall.h>
65 #include <sys/sysctl.h>
66 #include <sys/sysent.h>
67 #include <sys/uio.h>
68 #include <sys/vmmeter.h>
69 #ifdef HWPMC_HOOKS
70 #include <sys/pmckern.h>
71 PMC_SOFT_DEFINE( , , page_fault, all);
72 PMC_SOFT_DEFINE( , , page_fault, read);
73 PMC_SOFT_DEFINE( , , page_fault, write);
74 #endif
75 #include <security/audit/audit.h>
76
77 #include <vm/vm.h>
78 #include <vm/vm_param.h>
79 #include <vm/pmap.h>
80 #include <vm/vm_kern.h>
81 #include <vm/vm_map.h>
82 #include <vm/vm_page.h>
83 #include <vm/vm_extern.h>
84
85 #include <machine/cpu.h>
86 #include <machine/intr_machdep.h>
87 #include <x86/mca.h>
88 #include <machine/md_var.h>
89 #include <machine/pcb.h>
90 #ifdef SMP
91 #include <machine/smp.h>
92 #endif
93 #include <machine/stack.h>
94 #include <machine/trap.h>
95 #include <machine/tss.h>
96 #include <machine/vm86.h>
97
98 #ifdef POWERFAIL_NMI
99 #include <sys/syslog.h>
100 #include <machine/clock.h>
101 #endif
102
103 #ifdef KDTRACE_HOOKS
104 #include <sys/dtrace_bsd.h>
105 #endif
106
/* C-level handlers defined below; each receives the saved trap frame. */
void trap(struct trapframe *frame);
void syscall(struct trapframe *frame);

/* File-local helpers; see the definitions further down for their contracts. */
static int trap_pfault(struct trapframe *, bool, vm_offset_t, int *, int *);
static void trap_fatal(struct trapframe *, vm_offset_t);
#ifdef KDTRACE_HOOKS
static bool trap_user_dtrace(struct trapframe *,
    int (**hook)(struct trapframe *));
#endif
void dblfault_handler(void);

/*
 * IDT entry stubs.  trap() compares a faulting %eip against these addresses
 * (plus setidt_disp) when filtering kernel-mode debug exceptions.
 */
extern inthand_t IDTVEC(bpt), IDTVEC(dbg), IDTVEC(int0x80_syscall);
/* Checked for non-zero before the PGEX_I error-code bit is interpreted. */
extern uint64_t pg_nx;
120
/*
 * Static attributes of one trap type; instances live in the trap_data[]
 * table, which is indexed by trap number.
 */
struct trap_data {
	bool ei;		/* value reported by trap_enable_intr() */
	const char *msg;	/* description; NULL means "no entry" */
};
125
/*
 * Trap attribute table, indexed by T_* trap number.  Slots that have no
 * designated initializer are zero-filled, so their msg is NULL;
 * trap_enable_intr() and trap_msg() treat such slots (and indices past
 * the end of the table) as unknown traps.
 */
static const struct trap_data trap_data[] = {
	[T_PRIVINFLT] = { .ei = true, .msg = "privileged instruction fault" },
	[T_BPTFLT] = { .ei = false, .msg = "breakpoint instruction fault" },
	[T_ARITHTRAP] = { .ei = true, .msg = "arithmetic trap" },
	[T_PROTFLT] = { .ei = true, .msg = "general protection fault" },
	[T_TRCTRAP] = { .ei = false, .msg = "debug exception" },
	[T_PAGEFLT] = { .ei = true, .msg = "page fault" },
	[T_ALIGNFLT] = { .ei = true, .msg = "alignment fault" },
	[T_DIVIDE] = { .ei = true, .msg = "integer divide fault" },
	[T_NMI] = { .ei = false, .msg = "non-maskable interrupt trap" },
	[T_OFLOW] = { .ei = true, .msg = "overflow trap" },
	[T_BOUND] = { .ei = true, .msg = "FPU bounds check fault" },
	[T_DNA] = { .ei = true, .msg = "FPU device not available" },
	[T_DOUBLEFLT] = { .ei = false, .msg = "double fault" },
	[T_FPOPFLT] = { .ei = true, .msg = "FPU operand fetch fault" },
	[T_TSSFLT] = { .ei = true, .msg = "invalid TSS fault" },
	[T_SEGNPFLT] = { .ei = true, .msg = "segment not present fault" },
	[T_STKFLT] = { .ei = true, .msg = "stack fault" },
	[T_MCHK] = { .ei = true, .msg = "machine check trap" },
	[T_XMMFLT] = { .ei = true, .msg = "SIMD floating-point exception" },
	[T_DTRACE_RET] ={ .ei = true, .msg = "DTrace pid return trap" },
};
148
149 static bool
trap_enable_intr(int trapno)150 trap_enable_intr(int trapno)
151 {
152
153 MPASS(trapno > 0);
154 if (trapno < nitems(trap_data) && trap_data[trapno].msg != NULL)
155 return (trap_data[trapno].ei);
156 return (false);
157 }
158
159 static const char *
trap_msg(int trapno)160 trap_msg(int trapno)
161 {
162 const char *res;
163 static const char unkn[] = "UNKNOWN";
164
165 res = NULL;
166 if (trapno < nitems(trap_data))
167 res = trap_data[trapno].msg;
168 if (res == NULL)
169 res = unkn;
170 return (res);
171 }
172
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
int has_f00f_bug = 0;		/* Initialized so that it can be patched. */
#endif

/*
 * Backing variable for the read-write machdep.uprintf_signal sysctl.
 * When non-zero, trap() uprintf()s the signal number, error code, trap
 * type, selected registers and the bytes at the faulting %eip before it
 * posts a trap-generated signal.
 */
static int uprintf_signal;
SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RW,
    &uprintf_signal, 0,
    "Print debugging information on trap signal to ctty");
181
182
183 #ifdef INVARIANTS
184 static __inline register_t
read_esp(void)185 read_esp(void)
186 {
187 register_t res;
188
189 __asm __volatile("movl\t%%esp,%0" : "=r" (res));
190 return (res);
191 }
192
193 void
trap_check_kstack(void)194 trap_check_kstack(void)
195 {
196 struct thread *td;
197 vm_offset_t stk;
198
199 td = curthread;
200 stk = read_esp();
201 if (stk >= PMAP_TRM_MIN_ADDRESS)
202 panic("td %p stack %#x in trampoline", td, stk);
203 if (!kstack_contains(td, stk, 0))
204 panic("td %p stack %#x not in kstack VA %#x %d",
205 td, stk, td->td_kstack, td->td_kstack_pages);
206 }
207 #endif
208
209 /*
210 * Exception, fault, and trap interface to the FreeBSD kernel.
211 * This common code is called from assembly language IDT gate entry
212 * routines that prepare a suitable stack frame, and restore this
213 * frame after the exception has been processed.
214 */
215
216 void
trap(struct trapframe * frame)217 trap(struct trapframe *frame)
218 {
219 ksiginfo_t ksi;
220 struct thread *td;
221 struct proc *p;
222 int pf, signo, ucode;
223 u_int type;
224 register_t addr, dr6;
225 vm_offset_t eva;
226 #ifdef POWERFAIL_NMI
227 static int lastalert = 0;
228 #endif
229
230 td = curthread;
231 p = td->td_proc;
232 dr6 = 0;
233
234 VM_CNT_INC(v_trap);
235 type = frame->tf_trapno;
236
237 KASSERT((read_eflags() & PSL_I) == 0,
238 ("trap: interrupts enabled, type %d frame %p", type, frame));
239
240 #ifdef SMP
241 /* Handler for NMI IPIs used for stopping CPUs. */
242 if (type == T_NMI && ipi_nmi_handler() == 0)
243 return;
244 #endif /* SMP */
245
246 #ifdef KDB
247 if (kdb_active) {
248 kdb_reenter();
249 return;
250 }
251 #endif
252 trap_check_kstack();
253
254 if (type == T_RESERVED) {
255 trap_fatal(frame, 0);
256 return;
257 }
258
259 if (type == T_NMI) {
260 #ifdef HWPMC_HOOKS
261 /*
262 * CPU PMCs interrupt using an NMI so we check for that first.
263 * If the HWPMC module is active, 'pmc_hook' will point to
264 * the function to be called. A non-zero return value from the
265 * hook means that the NMI was consumed by it and that we can
266 * return immediately.
267 */
268 if (pmc_intr != NULL &&
269 (*pmc_intr)(frame) != 0)
270 return;
271 #endif
272 }
273
274 if (type == T_MCHK) {
275 mca_intr();
276 return;
277 }
278
279 #ifdef KDTRACE_HOOKS
280 /*
281 * A trap can occur while DTrace executes a probe. Before
282 * executing the probe, DTrace blocks re-scheduling and sets
283 * a flag in its per-cpu flags to indicate that it doesn't
284 * want to fault. On returning from the probe, the no-fault
285 * flag is cleared and finally re-scheduling is enabled.
286 */
287 if ((type == T_PROTFLT || type == T_PAGEFLT) &&
288 dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type))
289 return;
290 #endif
291
292 /*
293 * We must not allow context switches until %cr2 is read.
294 * Also, for some Cyrix CPUs, %cr2 is clobbered by interrupts.
295 * All faults use interrupt gates, so %cr2 can be safely read
296 * now, before optional enable of the interrupts below.
297 */
298 if (type == T_PAGEFLT)
299 eva = rcr2();
300
301 /*
302 * Buggy application or kernel code has disabled interrupts
303 * and then trapped. Enabling interrupts now is wrong, but it
304 * is better than running with interrupts disabled until they
305 * are accidentally enabled later.
306 */
307 if ((frame->tf_eflags & PSL_I) == 0 && TRAPF_USERMODE(frame) &&
308 (curpcb->pcb_flags & PCB_VM86CALL) == 0)
309 uprintf("pid %ld (%s): usermode trap %d (%s) with "
310 "interrupts disabled\n",
311 (long)curproc->p_pid, curthread->td_name, type,
312 trap_data[type].msg);
313
314 /*
315 * Conditionally reenable interrupts. If we hold a spin lock,
316 * then we must not reenable interrupts. This might be a
317 * spurious page fault.
318 */
319 if (trap_enable_intr(type) && td->td_md.md_spinlock_count == 0 &&
320 frame->tf_eip != (int)cpu_switch_load_gs)
321 enable_intr();
322
323 if (TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0) {
324 /* user trap */
325
326 td->td_pticks = 0;
327 td->td_frame = frame;
328 addr = frame->tf_eip;
329 if (td->td_cowgen != atomic_load_int(&p->p_cowgen))
330 thread_cow_update(td);
331
332 switch (type) {
333 case T_PRIVINFLT: /* privileged instruction fault */
334 signo = SIGILL;
335 ucode = ILL_PRVOPC;
336 break;
337
338 case T_BPTFLT: /* bpt instruction fault */
339 #ifdef KDTRACE_HOOKS
340 if (trap_user_dtrace(frame, &dtrace_pid_probe_ptr))
341 return;
342 #else
343 enable_intr();
344 #endif
345 signo = SIGTRAP;
346 ucode = TRAP_BRKPT;
347 break;
348
349 case T_TRCTRAP: /* debug exception */
350 enable_intr();
351 user_trctrap_out:
352 signo = SIGTRAP;
353 ucode = TRAP_TRACE;
354 dr6 = rdr6();
355 if ((dr6 & DBREG_DR6_BS) != 0) {
356 PROC_LOCK(td->td_proc);
357 if ((td->td_dbgflags & TDB_STEP) != 0) {
358 td->td_frame->tf_eflags &= ~PSL_T;
359 td->td_dbgflags &= ~TDB_STEP;
360 }
361 PROC_UNLOCK(td->td_proc);
362 }
363 break;
364
365 case T_ARITHTRAP: /* arithmetic trap */
366 ucode = npxtrap_x87();
367 if (ucode == -1)
368 return;
369 signo = SIGFPE;
370 break;
371
372 /*
373 * The following two traps can happen in vm86 mode,
374 * and, if so, we want to handle them specially.
375 */
376 case T_PROTFLT: /* general protection fault */
377 case T_STKFLT: /* stack fault */
378 if (frame->tf_eflags & PSL_VM) {
379 signo = vm86_emulate((struct vm86frame *)frame);
380 ucode = 0; /* XXXKIB: better code ? */
381 if (signo == SIGTRAP) {
382 load_dr6(rdr6() | 0x4000);
383 goto user_trctrap_out;
384 }
385 if (signo == 0)
386 goto user;
387 break;
388 }
389 signo = SIGBUS;
390 ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR;
391 break;
392 case T_SEGNPFLT: /* segment not present fault */
393 signo = SIGBUS;
394 ucode = BUS_ADRERR;
395 break;
396 case T_TSSFLT: /* invalid TSS fault */
397 signo = SIGBUS;
398 ucode = BUS_OBJERR;
399 break;
400 case T_ALIGNFLT:
401 signo = SIGBUS;
402 ucode = BUS_ADRALN;
403 break;
404 case T_DOUBLEFLT: /* double fault */
405 default:
406 signo = SIGBUS;
407 ucode = BUS_OBJERR;
408 break;
409
410 case T_PAGEFLT: /* page fault */
411 addr = eva;
412 pf = trap_pfault(frame, true, eva, &signo, &ucode);
413 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
414 if (pf == -2) {
415 /*
416 * The f00f hack workaround has triggered, so
417 * treat the fault as an illegal instruction
418 * (T_PRIVINFLT) instead of a page fault.
419 */
420 type = frame->tf_trapno = T_PRIVINFLT;
421 break;
422 }
423 #endif
424 if (pf == -1)
425 return;
426 if (pf == 0)
427 goto user;
428 break;
429
430 case T_DIVIDE: /* integer divide fault */
431 ucode = FPE_INTDIV;
432 signo = SIGFPE;
433 break;
434
435 case T_NMI:
436 #ifdef POWERFAIL_NMI
437 #ifndef TIMER_FREQ
438 # define TIMER_FREQ 1193182
439 #endif
440 if (time_second - lastalert > 10) {
441 log(LOG_WARNING, "NMI: power fail\n");
442 sysbeep(880, SBT_1S);
443 lastalert = time_second;
444 }
445 return;
446 #else /* !POWERFAIL_NMI */
447 nmi_handle_intr(type, frame);
448 return;
449 #endif /* POWERFAIL_NMI */
450
451 case T_OFLOW: /* integer overflow fault */
452 ucode = FPE_INTOVF;
453 signo = SIGFPE;
454 break;
455
456 case T_BOUND: /* bounds check fault */
457 ucode = FPE_FLTSUB;
458 signo = SIGFPE;
459 break;
460
461 case T_DNA:
462 KASSERT(PCB_USER_FPU(td->td_pcb),
463 ("kernel FPU ctx has leaked"));
464 /* transparent fault (due to context switch "late") */
465 if (npxdna())
466 return;
467 uprintf("pid %d killed due to lack of floating point\n",
468 p->p_pid);
469 signo = SIGKILL;
470 ucode = 0;
471 break;
472
473 case T_FPOPFLT: /* FPU operand fetch fault */
474 ucode = ILL_COPROC;
475 signo = SIGILL;
476 break;
477
478 case T_XMMFLT: /* SIMD floating-point exception */
479 ucode = npxtrap_sse();
480 if (ucode == -1)
481 return;
482 signo = SIGFPE;
483 break;
484 #ifdef KDTRACE_HOOKS
485 case T_DTRACE_RET:
486 (void)trap_user_dtrace(frame, &dtrace_return_probe_ptr);
487 return;
488 #endif
489 }
490 } else {
491 /* kernel trap */
492
493 KASSERT(cold || td->td_ucred != NULL,
494 ("kernel trap doesn't have ucred"));
495 switch (type) {
496 case T_PAGEFLT: /* page fault */
497 (void)trap_pfault(frame, false, eva, NULL, NULL);
498 return;
499
500 case T_DNA:
501 if (PCB_USER_FPU(td->td_pcb))
502 panic("Unregistered use of FPU in kernel");
503 if (npxdna())
504 return;
505 break;
506
507 case T_ARITHTRAP: /* arithmetic trap */
508 case T_XMMFLT: /* SIMD floating-point exception */
509 case T_FPOPFLT: /* FPU operand fetch fault */
510 /*
511 * XXXKIB for now disable any FPU traps in kernel
512 * handler registration seems to be overkill
513 */
514 trap_fatal(frame, 0);
515 return;
516
517 /*
518 * The following two traps can happen in
519 * vm86 mode, and, if so, we want to handle
520 * them specially.
521 */
522 case T_PROTFLT: /* general protection fault */
523 case T_STKFLT: /* stack fault */
524 if (frame->tf_eflags & PSL_VM) {
525 signo = vm86_emulate((struct vm86frame *)frame);
526 if (signo == SIGTRAP) {
527 type = T_TRCTRAP;
528 load_dr6(rdr6() | 0x4000);
529 goto kernel_trctrap;
530 }
531 if (signo != 0)
532 /*
533 * returns to original process
534 */
535 vm86_trap((struct vm86frame *)frame);
536 return;
537 }
538 /* FALL THROUGH */
539 case T_SEGNPFLT: /* segment not present fault */
540 if (curpcb->pcb_flags & PCB_VM86CALL)
541 break;
542
543 /*
544 * Invalid %fs's and %gs's can be created using
545 * procfs or PT_SETREGS or by invalidating the
546 * underlying LDT entry. This causes a fault
547 * in kernel mode when the kernel attempts to
548 * switch contexts. Lose the bad context
549 * (XXX) so that we can continue, and generate
550 * a signal.
551 */
552 if (frame->tf_eip == (int)cpu_switch_load_gs) {
553 curpcb->pcb_gs = 0;
554 #if 0
555 PROC_LOCK(p);
556 kern_psignal(p, SIGBUS);
557 PROC_UNLOCK(p);
558 #endif
559 return;
560 }
561
562 if (td->td_intr_nesting_level != 0)
563 break;
564
565 /*
566 * Invalid segment selectors and out of bounds
567 * %eip's and %esp's can be set up in user mode.
568 * This causes a fault in kernel mode when the
569 * kernel tries to return to user mode. We want
570 * to get this fault so that we can fix the
571 * problem here and not have to check all the
572 * selectors and pointers when the user changes
573 * them.
574 *
575 * N.B. Comparing to long mode, 32-bit mode
576 * does not push %esp on the trap frame,
577 * because iretl faulted while in ring 0. As
578 * the consequence, there is no need to fixup
579 * the stack pointer for doreti_iret_fault,
580 * the fixup and the complimentary trap() call
581 * are executed on the main thread stack, not
582 * on the trampoline stack.
583 */
584 if (frame->tf_eip == (int)doreti_iret + setidt_disp) {
585 frame->tf_eip = (int)doreti_iret_fault +
586 setidt_disp;
587 return;
588 }
589 if (type == T_STKFLT)
590 break;
591
592 if (frame->tf_eip == (int)doreti_popl_ds +
593 setidt_disp) {
594 frame->tf_eip = (int)doreti_popl_ds_fault +
595 setidt_disp;
596 return;
597 }
598 if (frame->tf_eip == (int)doreti_popl_es +
599 setidt_disp) {
600 frame->tf_eip = (int)doreti_popl_es_fault +
601 setidt_disp;
602 return;
603 }
604 if (frame->tf_eip == (int)doreti_popl_fs +
605 setidt_disp) {
606 frame->tf_eip = (int)doreti_popl_fs_fault +
607 setidt_disp;
608 return;
609 }
610 if (curpcb->pcb_onfault != NULL) {
611 frame->tf_eip = (int)curpcb->pcb_onfault;
612 return;
613 }
614 break;
615
616 case T_TSSFLT:
617 /*
618 * PSL_NT can be set in user mode and isn't cleared
619 * automatically when the kernel is entered. This
620 * causes a TSS fault when the kernel attempts to
621 * `iret' because the TSS link is uninitialized. We
622 * want to get this fault so that we can fix the
623 * problem here and not every time the kernel is
624 * entered.
625 */
626 if (frame->tf_eflags & PSL_NT) {
627 frame->tf_eflags &= ~PSL_NT;
628 return;
629 }
630 break;
631
632 case T_TRCTRAP: /* debug exception */
633 kernel_trctrap:
634 /* Clear any pending debug events. */
635 dr6 = rdr6();
636 load_dr6(0);
637
638 /*
639 * Ignore debug register exceptions due to
640 * accesses in the user's address space, which
641 * can happen under several conditions such as
642 * if a user sets a watchpoint on a buffer and
643 * then passes that buffer to a system call.
644 * We still want to get TRCTRAPS for addresses
645 * in kernel space because that is useful when
646 * debugging the kernel.
647 */
648 if (user_dbreg_trap(dr6) &&
649 !(curpcb->pcb_flags & PCB_VM86CALL))
650 return;
651
652 /*
653 * Malicious user code can configure a debug
654 * register watchpoint to trap on data access
655 * to the top of stack and then execute 'pop
656 * %ss; int 3'. Due to exception deferral for
657 * 'pop %ss', the CPU will not interrupt 'int
658 * 3' to raise the DB# exception for the debug
659 * register but will postpone the DB# until
660 * execution of the first instruction of the
661 * BP# handler (in kernel mode). Normally the
662 * previous check would ignore DB# exceptions
663 * for watchpoints on user addresses raised in
664 * kernel mode. However, some CPU errata
665 * include cases where DB# exceptions do not
666 * properly set bits in %dr6, e.g. Haswell
667 * HSD23 and Skylake-X SKZ24.
668 *
669 * A deferred DB# can also be raised on the
670 * first instructions of system call entry
671 * points or single-step traps via similar use
672 * of 'pop %ss' or 'mov xxx, %ss'.
673 */
674 if (frame->tf_eip ==
675 (uintptr_t)IDTVEC(int0x80_syscall) + setidt_disp ||
676 frame->tf_eip == (uintptr_t)IDTVEC(bpt) +
677 setidt_disp ||
678 frame->tf_eip == (uintptr_t)IDTVEC(dbg) +
679 setidt_disp)
680 return;
681 /*
682 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
683 */
684 case T_BPTFLT:
685 /*
686 * If KDB is enabled, let it handle the debugger trap.
687 * Otherwise, debugger traps "can't happen".
688 */
689 #ifdef KDB
690 if (kdb_trap(type, dr6, frame))
691 return;
692 #endif
693 break;
694
695 case T_NMI:
696 #ifdef POWERFAIL_NMI
697 if (time_second - lastalert > 10) {
698 log(LOG_WARNING, "NMI: power fail\n");
699 sysbeep(880, SBT_1S);
700 lastalert = time_second;
701 }
702 return;
703 #else /* !POWERFAIL_NMI */
704 nmi_handle_intr(type, frame);
705 return;
706 #endif /* POWERFAIL_NMI */
707 }
708
709 trap_fatal(frame, eva);
710 return;
711 }
712
713 ksiginfo_init_trap(&ksi);
714 ksi.ksi_signo = signo;
715 ksi.ksi_code = ucode;
716 ksi.ksi_addr = (void *)addr;
717 ksi.ksi_trapno = type;
718 if (uprintf_signal) {
719 uprintf("pid %d comm %s: signal %d err %#x code %d type %d "
720 "addr %#x ss %#04x esp %#08x cs %#04x eip %#08x eax %#08x"
721 "<%02x %02x %02x %02x %02x %02x %02x %02x>\n",
722 p->p_pid, p->p_comm, signo, frame->tf_err, ucode, type,
723 addr, frame->tf_ss, frame->tf_esp, frame->tf_cs,
724 frame->tf_eip, frame->tf_eax,
725 fubyte((void *)(frame->tf_eip + 0)),
726 fubyte((void *)(frame->tf_eip + 1)),
727 fubyte((void *)(frame->tf_eip + 2)),
728 fubyte((void *)(frame->tf_eip + 3)),
729 fubyte((void *)(frame->tf_eip + 4)),
730 fubyte((void *)(frame->tf_eip + 5)),
731 fubyte((void *)(frame->tf_eip + 6)),
732 fubyte((void *)(frame->tf_eip + 7)));
733 }
734 KASSERT((read_eflags() & PSL_I) != 0, ("interrupts disabled"));
735 trapsignal(td, &ksi);
736
737 user:
738 userret(td, frame);
739 KASSERT(PCB_USER_FPU(td->td_pcb),
740 ("Return from trap with kernel FPU ctx leaked"));
741 }
742
743 /*
744 * Handle all details of a page fault.
745 * Returns:
746 * -2 if the fault was caused by triggered workaround for Intel Pentium
747 * 0xf00f bug.
748 * -1 if this fault was fatal, typically from kernel mode
749 * (cannot happen, but we need to return something).
750 * 0 if this fault was handled by updating either the user or kernel
751 * page table, execution can continue.
752 * 1 if this fault was from usermode and it was not handled, a synchronous
753 * signal should be delivered to the thread. *signo returns the signal
754 * number, *ucode gives si_code.
755 */
756 static int
trap_pfault(struct trapframe * frame,bool usermode,vm_offset_t eva,int * signo,int * ucode)757 trap_pfault(struct trapframe *frame, bool usermode, vm_offset_t eva,
758 int *signo, int *ucode)
759 {
760 struct thread *td;
761 struct proc *p;
762 vm_map_t map;
763 int rv;
764 vm_prot_t ftype;
765
766 MPASS(!usermode || (signo != NULL && ucode != NULL));
767
768 td = curthread;
769 p = td->td_proc;
770
771 if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
772 /*
773 * Due to both processor errata and lazy TLB invalidation when
774 * access restrictions are removed from virtual pages, memory
775 * accesses that are allowed by the physical mapping layer may
776 * nonetheless cause one spurious page fault per virtual page.
777 * When the thread is executing a "no faulting" section that
778 * is bracketed by vm_fault_{disable,enable}_pagefaults(),
779 * every page fault is treated as a spurious page fault,
780 * unless it accesses the same virtual address as the most
781 * recent page fault within the same "no faulting" section.
782 */
783 if (td->td_md.md_spurflt_addr != eva ||
784 (td->td_pflags & TDP_RESETSPUR) != 0) {
785 /*
786 * Do nothing to the TLB. A stale TLB entry is
787 * flushed automatically by a page fault.
788 */
789 td->td_md.md_spurflt_addr = eva;
790 td->td_pflags &= ~TDP_RESETSPUR;
791 return (0);
792 }
793 } else {
794 /*
795 * If we get a page fault while in a critical section, then
796 * it is most likely a fatal kernel page fault. The kernel
797 * is already going to panic trying to get a sleep lock to
798 * do the VM lookup, so just consider it a fatal trap so the
799 * kernel can print out a useful trap message and even get
800 * to the debugger.
801 *
802 * If we get a page fault while holding a non-sleepable
803 * lock, then it is most likely a fatal kernel page fault.
804 * If WITNESS is enabled, then it's going to whine about
805 * bogus LORs with various VM locks, so just skip to the
806 * fatal trap handling directly.
807 */
808 if (td->td_critnest != 0 ||
809 WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
810 "Kernel page fault") != 0) {
811 trap_fatal(frame, eva);
812 return (-1);
813 }
814 }
815 if (eva >= PMAP_TRM_MIN_ADDRESS) {
816 /*
817 * Don't allow user-mode faults in kernel address space.
818 * An exception: if the faulting address is the invalid
819 * instruction entry in the IDT, then the Intel Pentium
820 * F00F bug workaround was triggered, and we need to
821 * treat it is as an illegal instruction, and not a page
822 * fault.
823 */
824 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
825 if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) {
826 *ucode = ILL_PRVOPC;
827 *signo = SIGILL;
828 return (-2);
829 }
830 #endif
831 if (usermode) {
832 *signo = SIGSEGV;
833 *ucode = SEGV_MAPERR;
834 return (1);
835 }
836 trap_fatal(frame, eva);
837 return (-1);
838 } else {
839 map = usermode ? &p->p_vmspace->vm_map : kernel_map;
840
841 /*
842 * Kernel cannot access a user-space address directly
843 * because user pages are not mapped. Also, page
844 * faults must not be caused during the interrupts.
845 */
846 if (!usermode && td->td_intr_nesting_level != 0) {
847 trap_fatal(frame, eva);
848 return (-1);
849 }
850 }
851
852 /*
853 * If the trap was caused by errant bits in the PTE then panic.
854 */
855 if (frame->tf_err & PGEX_RSV) {
856 trap_fatal(frame, eva);
857 return (-1);
858 }
859
860 /*
861 * PGEX_I is defined only if the execute disable bit capability is
862 * supported and enabled.
863 */
864 if (frame->tf_err & PGEX_W)
865 ftype = VM_PROT_WRITE;
866 else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
867 ftype = VM_PROT_EXECUTE;
868 else
869 ftype = VM_PROT_READ;
870
871 /* Fault in the page. */
872 rv = vm_fault_trap(map, eva, ftype, VM_FAULT_NORMAL, signo, ucode);
873 if (rv == KERN_SUCCESS) {
874 #ifdef HWPMC_HOOKS
875 if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
876 PMC_SOFT_CALL_TF( , , page_fault, all, frame);
877 if (ftype == VM_PROT_READ)
878 PMC_SOFT_CALL_TF( , , page_fault, read,
879 frame);
880 else
881 PMC_SOFT_CALL_TF( , , page_fault, write,
882 frame);
883 }
884 #endif
885 return (0);
886 }
887 if (usermode)
888 return (1);
889 if (td->td_intr_nesting_level == 0 &&
890 curpcb->pcb_onfault != NULL) {
891 frame->tf_eip = (int)curpcb->pcb_onfault;
892 return (0);
893 }
894 trap_fatal(frame, eva);
895 return (-1);
896 }
897
898 static void
trap_fatal(struct trapframe * frame,vm_offset_t eva)899 trap_fatal(struct trapframe *frame, vm_offset_t eva)
900 {
901 int code, ss, esp;
902 u_int type;
903 struct soft_segment_descriptor softseg;
904 #ifdef KDB
905 bool handled;
906 #endif
907
908 code = frame->tf_err;
909 type = frame->tf_trapno;
910 sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);
911
912 printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg(type),
913 frame->tf_eflags & PSL_VM ? "vm86" :
914 ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
915 #ifdef SMP
916 /* two separate prints in case of a trap on an unmapped page */
917 printf("cpuid = %d; ", PCPU_GET(cpuid));
918 printf("apic id = %02x\n", PCPU_GET(apic_id));
919 #endif
920 if (type == T_PAGEFLT) {
921 printf("fault virtual address = 0x%x\n", eva);
922 printf("fault code = %s %s%s, %s\n",
923 code & PGEX_U ? "user" : "supervisor",
924 code & PGEX_W ? "write" : "read",
925 pg_nx != 0 ?
926 (code & PGEX_I ? " instruction" : " data") :
927 "",
928 code & PGEX_RSV ? "reserved bits in PTE" :
929 code & PGEX_P ? "protection violation" : "page not present");
930 } else {
931 printf("error code = %#x\n", code);
932 }
933 printf("instruction pointer = 0x%x:0x%x\n",
934 frame->tf_cs & 0xffff, frame->tf_eip);
935 if (TF_HAS_STACKREGS(frame)) {
936 ss = frame->tf_ss & 0xffff;
937 esp = frame->tf_esp;
938 } else {
939 ss = GSEL(GDATA_SEL, SEL_KPL);
940 esp = (int)&frame->tf_esp;
941 }
942 printf("stack pointer = 0x%x:0x%x\n", ss, esp);
943 printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp);
944 printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n",
945 softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
946 printf(" = DPL %d, pres %d, def32 %d, gran %d\n",
947 softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
948 softseg.ssd_gran);
949 printf("processor eflags = ");
950 if (frame->tf_eflags & PSL_T)
951 printf("trace trap, ");
952 if (frame->tf_eflags & PSL_I)
953 printf("interrupt enabled, ");
954 if (frame->tf_eflags & PSL_NT)
955 printf("nested task, ");
956 if (frame->tf_eflags & PSL_RF)
957 printf("resume, ");
958 if (frame->tf_eflags & PSL_VM)
959 printf("vm86, ");
960 printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
961 printf("current process = %d (%s)\n",
962 curproc->p_pid, curthread->td_name);
963
964 #ifdef KDB
965 if (debugger_on_trap) {
966 kdb_why = KDB_WHY_TRAP;
967 frame->tf_err = eva; /* smuggle fault address to ddb */
968 handled = kdb_trap(type, 0, frame);
969 frame->tf_err = code; /* restore error code */
970 kdb_why = KDB_WHY_UNSET;
971 if (handled)
972 return;
973 }
974 #endif
975 printf("trap number = %d\n", type);
976 if (trap_msg(type) != NULL)
977 panic("%s", trap_msg(type));
978 else
979 panic("unknown/reserved trap");
980 }
981
982 #ifdef KDTRACE_HOOKS
983 /*
984 * Invoke a userspace DTrace hook. The hook pointer is cleared when no
985 * userspace probes are enabled, so we must synchronize with DTrace to ensure
986 * that a trapping thread is able to call the hook before it is cleared.
987 */
988 static bool
trap_user_dtrace(struct trapframe * frame,int (** hookp)(struct trapframe *))989 trap_user_dtrace(struct trapframe *frame, int (**hookp)(struct trapframe *))
990 {
991 int (*hook)(struct trapframe *);
992
993 hook = atomic_load_ptr(hookp);
994 enable_intr();
995 if (hook != NULL)
996 return ((hook)(frame) == 0);
997 return (false);
998 }
999 #endif
1000
1001 /*
1002 * Double fault handler. Called when a fault occurs while writing
1003 * a frame for a trap/exception onto the stack. This usually occurs
1004 * when the stack overflows (such is the case with infinite recursion,
1005 * for example).
1006 *
1007 * XXX Note that the current PTD gets replaced by IdlePTD when the
1008 * task switch occurs. This means that the stack that was active at
1009 * the time of the double fault is not available at <kstack> unless
1010 * the machine was idle when the double fault occurred. The downside
1011 * of this is that "trace <ebp>" in ddb won't work.
1012 */
1013 void
dblfault_handler(void)1014 dblfault_handler(void)
1015 {
1016 struct i386tss *t;
1017
1018 #ifdef KDTRACE_HOOKS
1019 if (dtrace_doubletrap_func != NULL)
1020 (*dtrace_doubletrap_func)();
1021 #endif
1022 printf("\nFatal double fault:\n");
1023 t = PCPU_GET(common_tssp);
1024 printf(
1025 "eip = %#08x esp = %#08x ebp = %#08x eax = %#08x\n"
1026 "edx = %#08x ecx = %#08x edi = %#08x esi = %#08x\n"
1027 "ebx = %#08x\n"
1028 "psl = %#08x cs = %#08x ss = %#08x ds = %#08x\n"
1029 "es = %#08x fs = %#08x gs = %#08x cr3 = %#08x\n",
1030 t->tss_eip, t->tss_esp, t->tss_ebp, t->tss_eax,
1031 t->tss_edx, t->tss_ecx, t->tss_edi, t->tss_esi,
1032 t->tss_ebx,
1033 t->tss_eflags, t->tss_cs, t->tss_ss, t->tss_ds,
1034 t->tss_es, t->tss_fs, t->tss_gs, t->tss_cr3);
1035 #ifdef SMP
1036 printf("cpuid = %d; apic id = %02x\n", PCPU_GET(cpuid),
1037 PCPU_GET(apic_id));
1038 #endif
1039 panic("double fault");
1040 }
1041
1042 int
cpu_fetch_syscall_args(struct thread * td)1043 cpu_fetch_syscall_args(struct thread *td)
1044 {
1045 struct proc *p;
1046 struct trapframe *frame;
1047 struct syscall_args *sa;
1048 caddr_t params;
1049 long tmp;
1050 int error;
1051 #ifdef COMPAT_43
1052 u_int32_t eip;
1053 int cs;
1054 #endif
1055
1056 p = td->td_proc;
1057 frame = td->td_frame;
1058 sa = &td->td_sa;
1059
1060 #ifdef COMPAT_43
1061 if (__predict_false(frame->tf_cs == 7 && frame->tf_eip == 2)) {
1062 /*
1063 * In lcall $7,$0 after int $0x80. Convert the user
1064 * frame to what it would be for a direct int 0x80 instead
1065 * of lcall $7,$0, by popping the lcall return address.
1066 */
1067 error = fueword32((void *)frame->tf_esp, &eip);
1068 if (error == -1)
1069 return (EFAULT);
1070 cs = fuword16((void *)(frame->tf_esp + sizeof(u_int32_t)));
1071 if (cs == -1)
1072 return (EFAULT);
1073
1074 /*
1075 * Unwind in-kernel frame after all stack frame pieces
1076 * were successfully read.
1077 */
1078 frame->tf_eip = eip;
1079 frame->tf_cs = cs;
1080 frame->tf_esp += 2 * sizeof(u_int32_t);
1081 frame->tf_err = 7; /* size of lcall $7,$0 */
1082 }
1083 #endif
1084
1085 sa->code = frame->tf_eax;
1086 sa->original_code = sa->code;
1087 params = (caddr_t)frame->tf_esp + sizeof(uint32_t);
1088
1089 /*
1090 * Need to check if this is a 32 bit or 64 bit syscall.
1091 */
1092 if (sa->code == SYS_syscall) {
1093 /*
1094 * Code is first argument, followed by actual args.
1095 */
1096 error = fueword(params, &tmp);
1097 if (error == -1)
1098 return (EFAULT);
1099 sa->code = tmp;
1100 params += sizeof(uint32_t);
1101 } else if (sa->code == SYS___syscall) {
1102 /*
1103 * Like syscall, but code is a quad, so as to maintain
1104 * quad alignment for the rest of the arguments.
1105 */
1106 error = fueword(params, &tmp);
1107 if (error == -1)
1108 return (EFAULT);
1109 sa->code = tmp;
1110 params += sizeof(quad_t);
1111 }
1112
1113 if (sa->code >= p->p_sysent->sv_size)
1114 sa->callp = &nosys_sysent;
1115 else
1116 sa->callp = &p->p_sysent->sv_table[sa->code];
1117
1118 if (params != NULL && sa->callp->sy_narg != 0)
1119 error = copyin(params, (caddr_t)sa->args,
1120 (u_int)(sa->callp->sy_narg * sizeof(uint32_t)));
1121 else
1122 error = 0;
1123
1124 if (error == 0) {
1125 td->td_retval[0] = 0;
1126 td->td_retval[1] = frame->tf_edx;
1127 }
1128
1129 return (error);
1130 }
1131
1132 #include "../../kern/subr_syscall.c"
1133
1134 /*
1135 * syscall - system call request C handler. A system call is
1136 * essentially treated as a trap by reusing the frame layout.
1137 */
1138 void
syscall(struct trapframe * frame)1139 syscall(struct trapframe *frame)
1140 {
1141 struct thread *td;
1142 register_t orig_tf_eflags;
1143 ksiginfo_t ksi;
1144
1145 #ifdef DIAGNOSTIC
1146 if (!(TRAPF_USERMODE(frame) &&
1147 (curpcb->pcb_flags & PCB_VM86CALL) == 0)) {
1148 panic("syscall");
1149 /* NOT REACHED */
1150 }
1151 #endif
1152 trap_check_kstack();
1153 orig_tf_eflags = frame->tf_eflags;
1154
1155 td = curthread;
1156 td->td_frame = frame;
1157
1158 syscallenter(td);
1159
1160 /*
1161 * Traced syscall.
1162 */
1163 if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) {
1164 frame->tf_eflags &= ~PSL_T;
1165 ksiginfo_init_trap(&ksi);
1166 ksi.ksi_signo = SIGTRAP;
1167 ksi.ksi_code = TRAP_TRACE;
1168 ksi.ksi_addr = (void *)frame->tf_eip;
1169 trapsignal(td, &ksi);
1170 }
1171
1172 KASSERT(PCB_USER_FPU(td->td_pcb),
1173 ("System call %s returning with kernel FPU ctx leaked",
1174 syscallname(td->td_proc, td->td_sa.code)));
1175 KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
1176 ("System call %s returning with mangled pcb_save",
1177 syscallname(td->td_proc, td->td_sa.code)));
1178
1179 syscallret(td);
1180 }
1181