/*-
 * Copyright (C) 1994, David Greenman
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the University of Utah, and William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
 * $FreeBSD$
 */

/*
 * 386 Trap and System call handling
 */

#include "opt_clock.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_isa.h"
#include "opt_ktrace.h"
#include "opt_npx.h"
#include "opt_trap.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/ipl.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/mutex.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/uio.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_extern.h>

#include <machine/cpu.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#include <machine/tss.h>

#include <i386/isa/icu.h>
#include <i386/isa/intr_machdep.h>

#ifdef POWERFAIL_NMI
#include <sys/syslog.h>
#include <machine/clock.h>
#endif

#include <machine/vm86.h>

#include <ddb/ddb.h>

int (*pmath_emulate) __P((struct trapframe *));

extern void trap __P((struct trapframe frame));
extern int trapwrite __P((unsigned addr));
extern void syscall __P((struct trapframe frame));
extern void ast __P((struct trapframe *framep));

static int trap_pfault __P((struct trapframe *, int, vm_offset_t));
static void trap_fatal __P((struct trapframe *, vm_offset_t));
void dblfault_handler __P((void));

extern inthand_t IDTVEC(lcall_syscall);

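/*
 * Printable names for the T_* trap codes; the table is indexed by trap
 * number, so callers must check type <= MAX_TRAP_MSG before using it.
 */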
#define MAX_TRAP_MSG		28
static char *trap_msg[] = {
	"",					/*  0 unused */
	"privileged instruction fault",		/*  1 T_PRIVINFLT */
	"",					/*  2 unused */
	"breakpoint instruction fault",		/*  3 T_BPTFLT */
	"",					/*  4 unused */
	"",					/*  5 unused */
	"arithmetic trap",			/*  6 T_ARITHTRAP */
	"",					/*  7 unused */
	"",					/*  8 unused */
	"general protection fault",		/*  9 T_PROTFLT */
	"trace trap",				/* 10 T_TRCTRAP */
	"",					/* 11 unused */
	"page fault",				/* 12 T_PAGEFLT */
	"",					/* 13 unused */
	"alignment fault",			/* 14 T_ALIGNFLT */
	"",					/* 15 unused */
	"",					/* 16 unused */
	"",					/* 17 unused */
	"integer divide fault",			/* 18 T_DIVIDE */
	"non-maskable interrupt trap",		/* 19 T_NMI */
	"overflow trap",			/* 20 T_OFLOW */
	"FPU bounds check fault",		/* 21 T_BOUND */
	"FPU device not available",		/* 22 T_DNA */
	"double fault",				/* 23 T_DOUBLEFLT */
	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
	"invalid TSS fault",			/* 25 T_TSSFLT */
	"segment not present fault",		/* 26 T_SEGNPFLT */
	"stack fault",				/* 27 T_STKFLT */
	"machine check trap",			/* 28 T_MCHK */
};

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif

#ifdef DDB
static int ddb_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW,
	&ddb_on_nmi, 0, "Go to DDB on NMI");
#endif
static int panic_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
	&panic_on_nmi, 0, "Panic on NMI");

#ifdef WITNESS
extern char *syscallnames[];
#endif

void
userret(p, frame, oticks)
	struct proc *p;
	struct trapframe *frame;
	u_quad_t oticks;
{
	int sig;

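	/* Deliver any signals that became pending while in the kernel. */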
	while ((sig = CURSIG(p)) != 0)
		postsig(sig);

	mtx_lock_spin(&sched_lock);
	p->p_pri.pri_level = p->p_pri.pri_user;
	if (resched_wanted(p)) {
		/*
		 * Since we are curproc, clock will normally just change
		 * our priority without moving us from one queue to another
		 * (since the running process is not on a queue).
		 * If that happened after we setrunqueue ourselves but
		 * before we mi_switch()'ed, we might not be on the queue
		 * indicated by our priority.
		 */
		DROP_GIANT_NOSWITCH();
		setrunqueue(p);
		p->p_stats->p_ru.ru_nivcsw++;
		mi_switch();
		mtx_unlock_spin(&sched_lock);
		PICKUP_GIANT();
		while ((sig = CURSIG(p)) != 0)
			postsig(sig);
		mtx_lock_spin(&sched_lock);
	}

	/*
	 * Charge system time if profiling.
	 */
	if (p->p_sflag & PS_PROFIL) {
		mtx_unlock_spin(&sched_lock);
		/* XXX - do we need Giant? */
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		addupc_task(p, TRAPF_PC(frame),
			    (u_int)(p->p_sticks - oticks) * psratio);
	} else
		mtx_unlock_spin(&sched_lock);
}

/*
 * Exception, fault, and trap interface to the FreeBSD kernel.
 * This common code is called from assembly language IDT gate entry
 * routines that prepare a suitable stack frame, and restore this
 * frame after the exception has been processed.
 */

void
trap(frame)
	struct trapframe frame;
{
	struct proc *p = curproc;
	u_quad_t sticks = 0;
	int i = 0, ucode = 0, type, code;
	vm_offset_t eva;
#ifdef POWERFAIL_NMI
	static int lastalert = 0;
#endif

	atomic_add_int(&cnt.v_trap, 1);

	if ((frame.tf_eflags & PSL_I) == 0) {
		/*
		 * Buggy application or kernel code has disabled
		 * interrupts and then trapped.  Enabling interrupts
		 * now is wrong, but it is better than running with
		 * interrupts disabled until they are accidentally
		 * enabled later.  XXX This is really bad if we trap
		 * while holding a spin lock.
		 */
		type = frame.tf_trapno;
		if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM))
			printf(
			    "pid %ld (%s): trap %d with interrupts disabled\n",
			    (long)curproc->p_pid, curproc->p_comm, type);
		else if (type != T_BPTFLT && type != T_TRCTRAP) {
			/*
			 * XXX not quite right, since this may be for a
			 * multiple fault in user mode.
			 */
			printf("kernel trap %d with interrupts disabled\n",
			    type);
			/*
			 * We should walk p_heldmtx here and see if any are
			 * spin mutexes, and not do this if so.
			 */
			enable_intr();
		}
	}

	eva = 0;

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
restart:
#endif

	type = frame.tf_trapno;
	code = frame.tf_err;

	if ((ISPL(frame.tf_cs) == SEL_UPL) ||
	    ((frame.tf_eflags & PSL_VM) && !in_vm86call)) {
		/* user trap */

		mtx_lock_spin(&sched_lock);
		sticks = p->p_sticks;
		mtx_unlock_spin(&sched_lock);
		p->p_md.md_regs = &frame;

		switch (type) {
		case T_PRIVINFLT:	/* privileged instruction fault */
			ucode = type;
			i = SIGILL;
			break;

		case T_BPTFLT:		/* bpt instruction fault */
		case T_TRCTRAP:		/* trace trap */
			frame.tf_eflags &= ~PSL_T;
			i = SIGTRAP;
			break;

		case T_ARITHTRAP:	/* arithmetic trap */
			ucode = code;
			i = SIGFPE;
			break;

			/*
			 * The following two traps can happen in
			 * vm86 mode, and, if so, we want to handle
			 * them specially.
			 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
			if (frame.tf_eflags & PSL_VM) {
				mtx_lock(&Giant);
				i = vm86_emulate((struct vm86frame *)&frame);
				mtx_unlock(&Giant);
				if (i == 0)
					goto user;
				break;
			}
			/* FALL THROUGH */

		case T_SEGNPFLT:	/* segment not present fault */
		case T_TSSFLT:		/* invalid TSS fault */
		case T_DOUBLEFLT:	/* double fault */
		default:
			ucode = code + BUS_SEGM_FAULT;
			i = SIGBUS;
			break;

		case T_PAGEFLT:		/* page fault */
			/*
			 * For some Cyrix CPUs, %cr2 is clobbered by
			 * interrupts.  This problem is worked around by using
			 * an interrupt gate for the pagefault handler.  We
			 * are finally ready to read %cr2 and then must
			 * reenable interrupts.
			 */
			eva = rcr2();
			enable_intr();
			mtx_lock(&Giant);
			i = trap_pfault(&frame, TRUE, eva);
			mtx_unlock(&Giant);
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
			if (i == -2) {
				/*
				 * The f00f hack workaround has triggered;
				 * treat it as an illegal instruction, not
				 * a page fault.
				 */
				frame.tf_trapno = T_PRIVINFLT;
				goto restart;
			}
#endif
			if (i == -1)
				goto out;
			if (i == 0)
				goto user;

			ucode = T_PAGEFLT;
			break;

		case T_DIVIDE:		/* integer divide fault */
			ucode = FPE_INTDIV;
			i = SIGFPE;
			break;

#ifdef DEV_ISA
		case T_NMI:
#ifdef POWERFAIL_NMI
#ifndef TIMER_FREQ
#  define TIMER_FREQ 1193182
#endif
			mtx_lock(&Giant);
			if (time_second - lastalert > 10) {
				log(LOG_WARNING, "NMI: power fail\n");
				sysbeep(TIMER_FREQ/880, hz);
				lastalert = time_second;
			}
			mtx_unlock(&Giant);
			goto out;
#else /* !POWERFAIL_NMI */
			/* machine/parity/power fail/"kitchen sink" faults */
			/* XXX Giant */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					printf("NMI ... going to debugger\n");
					kdb_trap(type, 0, &frame);
				}
#endif /* DDB */
				goto out;
			} else if (panic_on_nmi)
				panic("NMI indicates hardware failure");
			break;
#endif /* POWERFAIL_NMI */
#endif /* DEV_ISA */

		case T_OFLOW:		/* integer overflow fault */
			ucode = FPE_INTOVF;
			i = SIGFPE;
			break;

		case T_BOUND:		/* bounds check fault */
			ucode = FPE_FLTSUB;
			i = SIGFPE;
			break;

		case T_DNA:
#ifdef DEV_NPX
			/* transparent fault (due to context switch "late") */
			if (npxdna())
				goto out;
#endif
			if (!pmath_emulate) {
				i = SIGFPE;
				ucode = FPE_FPU_NP_TRAP;
				break;
			}
			mtx_lock(&Giant);
			i = (*pmath_emulate)(&frame);
			mtx_unlock(&Giant);
			if (i == 0) {
				if (!(frame.tf_eflags & PSL_T))
					goto out;
				frame.tf_eflags &= ~PSL_T;
				i = SIGTRAP;
			}
			/* else ucode = emulator_only_knows() XXX */
			break;

		case T_FPOPFLT:		/* FPU operand fetch fault */
			ucode = T_FPOPFLT;
			i = SIGILL;
			break;
		}
	} else {
		/* kernel trap */

		switch (type) {
		case T_PAGEFLT:			/* page fault */
			/*
			 * For some Cyrix CPUs, %cr2 is clobbered by
			 * interrupts.  This problem is worked around by using
			 * an interrupt gate for the pagefault handler.  We
			 * are finally ready to read %cr2 and then must
			 * reenable interrupts.
			 */
			eva = rcr2();
			enable_intr();
			mtx_lock(&Giant);
			(void) trap_pfault(&frame, FALSE, eva);
			mtx_unlock(&Giant);
			goto out;

		case T_DNA:
#ifdef DEV_NPX
			/*
			 * The kernel is apparently using npx for copying.
			 * XXX this should be fatal unless the kernel has
			 * registered such use.
			 */
			if (npxdna())
				goto out;
#endif
			break;

			/*
			 * The following two traps can happen in
			 * vm86 mode, and, if so, we want to handle
			 * them specially.
			 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
			if (frame.tf_eflags & PSL_VM) {
				mtx_lock(&Giant);
				i = vm86_emulate((struct vm86frame *)&frame);
				mtx_unlock(&Giant);
				if (i != 0)
					/*
					 * returns to original process
					 */
					vm86_trap((struct vm86frame *)&frame);
				goto out;
			}
			if (type == T_STKFLT)
				break;

			/* FALL THROUGH */

		case T_SEGNPFLT:	/* segment not present fault */
			if (in_vm86call)
				break;

			if (p->p_intr_nesting_level != 0)
				break;

			/*
			 * Invalid %fs's and %gs's can be created using
			 * procfs or PT_SETREGS or by invalidating the
			 * underlying LDT entry.  This causes a fault
			 * in kernel mode when the kernel attempts to
			 * switch contexts.  Lose the bad context
			 * (XXX) so that we can continue, and generate
			 * a signal.
			 */
			if (frame.tf_eip == (int)cpu_switch_load_gs) {
				PCPU_GET(curpcb)->pcb_gs = 0;
				PROC_LOCK(p);
				psignal(p, SIGBUS);
				PROC_UNLOCK(p);
				goto out;
			}

			/*
			 * Invalid segment selectors and out of bounds
			 * %eip's and %esp's can be set up in user mode.
			 * This causes a fault in kernel mode when the
			 * kernel tries to return to user mode.  We want
			 * to get this fault so that we can fix the
			 * problem here and not have to check all the
			 * selectors and pointers when the user changes
			 * them.
			 */
			if (frame.tf_eip == (int)doreti_iret) {
				frame.tf_eip = (int)doreti_iret_fault;
				goto out;
			}
			if (frame.tf_eip == (int)doreti_popl_ds) {
				frame.tf_eip = (int)doreti_popl_ds_fault;
				goto out;
			}
			if (frame.tf_eip == (int)doreti_popl_es) {
				frame.tf_eip = (int)doreti_popl_es_fault;
				goto out;
			}
			if (frame.tf_eip == (int)doreti_popl_fs) {
				frame.tf_eip = (int)doreti_popl_fs_fault;
				goto out;
			}
			if (PCPU_GET(curpcb) != NULL &&
			    PCPU_GET(curpcb)->pcb_onfault != NULL) {
				frame.tf_eip =
				    (int)PCPU_GET(curpcb)->pcb_onfault;
				goto out;
			}
			break;

		case T_TSSFLT:
			/*
			 * PSL_NT can be set in user mode and isn't cleared
			 * automatically when the kernel is entered.  This
			 * causes a TSS fault when the kernel attempts to
			 * `iret' because the TSS link is uninitialized.  We
			 * want to get this fault so that we can fix the
			 * problem here and not every time the kernel is
			 * entered.
			 */
			if (frame.tf_eflags & PSL_NT) {
				frame.tf_eflags &= ~PSL_NT;
				goto out;
			}
			break;

		case T_TRCTRAP:		/* trace trap */
			if (frame.tf_eip == (int)IDTVEC(lcall_syscall)) {
				/*
				 * We've just entered system mode via the
				 * syscall lcall.  Continue single stepping
				 * silently until the syscall handler has
				 * saved the flags.
				 */
				goto out;
			}
			if (frame.tf_eip == (int)IDTVEC(lcall_syscall) + 1) {
				/*
				 * The syscall handler has now saved the
				 * flags.  Stop single stepping it.
				 */
				frame.tf_eflags &= ~PSL_T;
				goto out;
			}
			/*
			 * Ignore debug register trace traps due to
			 * accesses in the user's address space, which
			 * can happen under several conditions such as
			 * if a user sets a watchpoint on a buffer and
			 * then passes that buffer to a system call.
			 * We still want to get TRCTRAPS for addresses
			 * in kernel space because that is useful when
			 * debugging the kernel.
			 */
			/* XXX Giant */
			if (user_dbreg_trap() && !in_vm86call) {
				/*
				 * Reset the breakpoint bits because the
				 * processor doesn't clear them itself.
				 */
				load_dr6(rdr6() & 0xfffffff0);
				goto out;
			}
			/*
			 * Fall through (TRCTRAP kernel mode, kernel address)
			 */
		case T_BPTFLT:
			/*
			 * If DDB is enabled, let it handle the debugger trap.
			 * Otherwise, debugger traps "can't happen".
			 */
#ifdef DDB
			/* XXX Giant */
			if (kdb_trap(type, 0, &frame))
				goto out;
#endif
			break;

#ifdef DEV_ISA
		case T_NMI:
#ifdef POWERFAIL_NMI
			mtx_lock(&Giant);
			if (time_second - lastalert > 10) {
				log(LOG_WARNING, "NMI: power fail\n");
				sysbeep(TIMER_FREQ/880, hz);
				lastalert = time_second;
			}
			mtx_unlock(&Giant);
			goto out;
#else /* !POWERFAIL_NMI */
			/* XXX Giant */
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					printf("NMI ... going to debugger\n");
					kdb_trap(type, 0, &frame);
				}
#endif /* DDB */
				goto out;
			} else if (panic_on_nmi == 0)
				goto out;
			/* FALL THROUGH */
#endif /* POWERFAIL_NMI */
#endif /* DEV_ISA */
		}

		mtx_lock(&Giant);
		trap_fatal(&frame, eva);
		mtx_unlock(&Giant);
		goto out;
	}

	mtx_lock(&Giant);
	/* Translate fault for emulators (e.g. Linux) */
	if (*p->p_sysent->sv_transtrap)
		i = (*p->p_sysent->sv_transtrap)(i, type);

	trapsignal(p, i, ucode);

#ifdef DEBUG
	if (type <= MAX_TRAP_MSG) {
		uprintf("fatal process exception: %s",
			trap_msg[type]);
		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
			uprintf(", fault VA = 0x%lx", (u_long)eva);
		uprintf("\n");
	}
#endif
	mtx_unlock(&Giant);

user:
	userret(p, &frame, sticks);
	if (mtx_owned(&Giant))
		mtx_unlock(&Giant);
out:
	return;
}

#ifdef notyet
/*
 * This version doesn't allow a page fault to user space while
 * in the kernel.  The rest of the kernel needs to be made "safe"
 * before this can be used.  I think the only things remaining
 * to be made safe are the iBCS2 code and the process tracing/
 * debugging code.
 */
static int
trap_pfault(frame, usermode, eva)
	struct trapframe *frame;
	int usermode;
	vm_offset_t eva;
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	struct proc *p = curproc;

	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_WRITE;
	else
		ftype = VM_PROT_READ;

	va = trunc_page(eva);
	if (va < VM_MIN_KERNEL_ADDRESS) {
		vm_offset_t v;
		vm_page_t mpte;

		if (p == NULL ||
		    (!usermode && va < VM_MAXUSER_ADDRESS &&
		     (p->p_intr_nesting_level != 0 ||
		      PCPU_GET(curpcb) == NULL ||
		      PCPU_GET(curpcb)->pcb_onfault == NULL))) {
			trap_fatal(frame, eva);
			return (-1);
		}

		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		vm = p->p_vmspace;
		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;

		/*
		 * Keep swapout from messing with us during this
		 * critical time.
		 */
		PROC_LOCK(p);
		++p->p_lock;
		PROC_UNLOCK(p);

		/*
		 * Grow the stack if necessary.  grow_stack() returns
		 * false only if va falls into a growable stack region
		 * and the stack growth fails.  It returns true if va
		 * was not within a growable stack region, or if the
		 * stack growth succeeded.
		 */
		if (!grow_stack(p, va)) {
			rv = KERN_FAILURE;
			PROC_LOCK(p);
			--p->p_lock;
			PROC_UNLOCK(p);
			goto nogo;
		}

		/* Fault in the user page: */
		rv = vm_fault(map, va, ftype,
			      (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY
						      : VM_FAULT_NORMAL);

		PROC_LOCK(p);
		--p->p_lock;
		PROC_UNLOCK(p);
	} else {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 */
		if (usermode)
			goto nogo;

		/*
		 * Since we know that kernel virtual addresses always
		 * have pte pages mapped, we just have to fault the
		 * page.
		 */
		rv = vm_fault(kernel_map, va, ftype, VM_FAULT_NORMAL);
	}

	if (rv == KERN_SUCCESS)
		return (0);
nogo:
	if (!usermode) {
		if (p->p_intr_nesting_level == 0 &&
		    PCPU_GET(curpcb) != NULL &&
		    PCPU_GET(curpcb)->pcb_onfault != NULL) {
			frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, eva);
		return (-1);
	}

	/* kludge to pass faulting virtual address to sendsig */
	frame->tf_err = eva;

	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}
#endif

int
trap_pfault(frame, usermode, eva)
	struct trapframe *frame;
	int usermode;
	vm_offset_t eva;
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	struct proc *p = curproc;

	va = trunc_page(eva);
	if (va >= KERNBASE) {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 * An exception:  if the faulting address is the invalid
		 * instruction entry in the IDT, then the Intel Pentium
		 * F00F bug workaround was triggered, and we need to
		 * treat it as an illegal instruction, and not a page
		 * fault.
		 */
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
		if ((eva == (unsigned int)&idt[6]) && has_f00f_bug)
			return (-2);
#endif
		if (usermode)
			goto nogo;

		map = kernel_map;
	} else {
		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		if (p != NULL)
			vm = p->p_vmspace;

		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;
	}

	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_WRITE;
	else
		ftype = VM_PROT_READ;

	if (map != kernel_map) {
		/*
		 * Keep swapout from messing with us during this
		 * critical time.
		 */
		PROC_LOCK(p);
		++p->p_lock;
		PROC_UNLOCK(p);

		/*
		 * Grow the stack if necessary.  grow_stack() returns
		 * false only if va falls into a growable stack region
		 * and the stack growth fails.  It returns true if va
		 * was not within a growable stack region, or if the
		 * stack growth succeeded.
		 */
		if (!grow_stack(p, va)) {
			rv = KERN_FAILURE;
			PROC_LOCK(p);
			--p->p_lock;
			PROC_UNLOCK(p);
			goto nogo;
		}

		/* Fault in the user page: */
		rv = vm_fault(map, va, ftype,
			      (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY
						      : VM_FAULT_NORMAL);

		PROC_LOCK(p);
		--p->p_lock;
		PROC_UNLOCK(p);
	} else {
		/*
		 * Don't have to worry about process locking or stacks in the
		 * kernel.
		 */
		rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
	}

	if (rv == KERN_SUCCESS)
		return (0);
nogo:
	if (!usermode) {
		if (p->p_intr_nesting_level == 0 &&
		    PCPU_GET(curpcb) != NULL &&
		    PCPU_GET(curpcb)->pcb_onfault != NULL) {
			frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, eva);
		return (-1);
	}

	/* kludge to pass faulting virtual address to sendsig */
	frame->tf_err = eva;

	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}

static void
trap_fatal(frame, eva)
	struct trapframe *frame;
	vm_offset_t eva;
{
	int code, type, ss, esp;
	struct soft_segment_descriptor softseg;

	code = frame->tf_err;
	type = frame->tf_trapno;
	sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);

	if (type <= MAX_TRAP_MSG)
		printf("\n\nFatal trap %d: %s while in %s mode\n",
			type, trap_msg[type],
			frame->tf_eflags & PSL_VM ? "vm86" :
			ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
#ifdef SMP
	/* two separate prints in case of a trap on an unmapped page */
	printf("cpuid = %d; ", PCPU_GET(cpuid));
	printf("lapic.id = %08x\n", lapic.id);
#endif
	if (type == T_PAGEFLT) {
		printf("fault virtual address	= 0x%x\n", eva);
		printf("fault code		= %s %s, %s\n",
			code & PGEX_U ? "user" : "supervisor",
			code & PGEX_W ? "write" : "read",
			code & PGEX_P ? "protection violation" : "page not present");
	}
	printf("instruction pointer	= 0x%x:0x%x\n",
	       frame->tf_cs & 0xffff, frame->tf_eip);
	if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) {
		ss = frame->tf_ss & 0xffff;
		esp = frame->tf_esp;
	} else {
		ss = GSEL(GDATA_SEL, SEL_KPL);
		esp = (int)&frame->tf_esp;
	}
	printf("stack pointer	        = 0x%x:0x%x\n", ss, esp);
	printf("frame pointer	        = 0x%x:0x%x\n", ss, frame->tf_ebp);
	printf("code segment		= base 0x%x, limit 0x%x, type 0x%x\n",
	       softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
	printf("			= DPL %d, pres %d, def32 %d, gran %d\n",
	       softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
	       softseg.ssd_gran);
	printf("processor eflags	= ");
	if (frame->tf_eflags & PSL_T)
		printf("trace trap, ");
	if (frame->tf_eflags & PSL_I)
		printf("interrupt enabled, ");
	if (frame->tf_eflags & PSL_NT)
		printf("nested task, ");
	if (frame->tf_eflags & PSL_RF)
		printf("resume, ");
	if (frame->tf_eflags & PSL_VM)
		printf("vm86, ");
	printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
	printf("current process		= ");
	if (curproc) {
		printf("%lu (%s)\n",
		    (u_long)curproc->p_pid, curproc->p_comm ?
		    curproc->p_comm : "");
	} else {
		printf("Idle\n");
	}

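	/* XXX The KDB hook below looks vestigial: `psl' is not declared in
	   this function, so the block cannot compile if KDB is defined. */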
#ifdef KDB
	if (kdb_trap(&psl))
		return;
#endif
#ifdef DDB
	if ((debugger_on_panic || db_active) && kdb_trap(type, 0, frame))
		return;
#endif
	printf("trap number		= %d\n", type);
	if (type <= MAX_TRAP_MSG)
		panic(trap_msg[type]);
	else
		panic("unknown/reserved trap");
}

/*
 * Double fault handler.  Called when a fault occurs while writing
 * a frame for a trap/exception onto the stack.  This usually occurs
 * when the stack overflows (such is the case with infinite recursion,
 * for example).
 *
 * XXX Note that the current PTD gets replaced by IdlePTD when the
 * task switch occurs.  This means that the stack that was active at
 * the time of the double fault is not available at <kstack> unless
 * the machine was idle when the double fault occurred.  The downside
 * of this is that "trace <ebp>" in ddb won't work.
 */
void
dblfault_handler()
{
	printf("\nFatal double fault:\n");
	printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip));
	printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp));
	printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp));
#ifdef SMP
	/* two separate prints in case of a trap on an unmapped page */
	printf("cpuid = %d; ", PCPU_GET(cpuid));
	printf("lapic.id = %08x\n", lapic.id);
#endif
	panic("double fault");
}

/*
 * Compensate for 386 brain damage (missing URKR).
 * This is a little simpler than the pagefault handler in trap() because
 * the page tables have already been faulted in and high addresses
 * are thrown out early for other reasons.
 */
int
trapwrite(addr)
	unsigned addr;
{
	struct proc *p;
	vm_offset_t va;
	struct vmspace *vm;
	int rv;

	va = trunc_page((vm_offset_t)addr);
	/*
	 * XXX - MAX is END.  Changed > to >= for temp. fix.
	 */
	if (va >= VM_MAXUSER_ADDRESS)
		return (1);

	p = curproc;
	vm = p->p_vmspace;

	PROC_LOCK(p);
	++p->p_lock;
	PROC_UNLOCK(p);

	if (!grow_stack(p, va)) {
		PROC_LOCK(p);
		--p->p_lock;
		PROC_UNLOCK(p);
		return (1);
	}

	/*
	 * Fault the data page.
	 */
	rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY);

	PROC_LOCK(p);
	--p->p_lock;
	PROC_UNLOCK(p);

	if (rv != KERN_SUCCESS)
		return (1);

	return (0);
}

/*
 *	syscall -	MP aware system call request C handler
 *
 *	A system call is essentially treated as a trap except that the
 *	MP lock is not held on entry or return.  We are responsible for
 *	obtaining the MP lock if necessary and for handling ASTs
 *	(e.g. a task switch) prior to return.
 *
 *	In general, only simple access and manipulation of curproc and
 *	the current stack is allowed without having to hold MP lock.
 */
void
syscall(frame)
	struct trapframe frame;
{
	caddr_t params;
	int i;
	struct sysent *callp;
	struct proc *p = curproc;
	u_quad_t sticks;
	int error;
	int narg;
	int args[8];
	u_int code;

	atomic_add_int(&cnt.v_syscall, 1);

#ifdef DIAGNOSTIC
	if (ISPL(frame.tf_cs) != SEL_UPL) {
		mtx_lock(&Giant);
		panic("syscall");
		/* NOT REACHED */
	}
#endif

	mtx_lock_spin(&sched_lock);
	sticks = p->p_sticks;
	mtx_unlock_spin(&sched_lock);

	p->p_md.md_regs = &frame;
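	/*
	 * The syscall number arrives in %eax; its arguments sit on the
	 * user stack just above the return address pushed by the entry
	 * stub, hence the sizeof(int) offset from %esp.
	 */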
	params = (caddr_t)frame.tf_esp + sizeof(int);
	code = frame.tf_eax;

	if (p->p_sysent->sv_prepsyscall) {
		/*
		 * The prep code is not MP aware.
		 */
		mtx_lock(&Giant);
		(*p->p_sysent->sv_prepsyscall)(&frame, args, &code, &params);
		mtx_unlock(&Giant);
	} else {
		/*
		 * Need to check if this is a 32 bit or 64 bit syscall.
		 * fuword is MP aware.
		 */
		if (code == SYS_syscall) {
			/*
			 * Code is first argument, followed by actual args.
			 */
			code = fuword(params);
			params += sizeof(int);
		} else if (code == SYS___syscall) {
			/*
			 * Like syscall, but code is a quad, so as to maintain
			 * quad alignment for the rest of the arguments.
			 */
			code = fuword(params);
			params += sizeof(quad_t);
		}
	}

	if (p->p_sysent->sv_mask)
		code &= p->p_sysent->sv_mask;

	if (code >= p->p_sysent->sv_size)
		callp = &p->p_sysent->sv_table[0];
	else
		callp = &p->p_sysent->sv_table[code];

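	/*
	 * The low bits of sy_narg hold the argument count; the upper
	 * bits carry SYF_* flags such as SYF_MPSAFE, checked below.
	 */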
	narg = callp->sy_narg & SYF_ARGMASK;

	/*
	 * copyin() is MP aware, but the tracing code is not.
	 */
	if (params && (i = narg * sizeof(int)) &&
	    (error = copyin(params, (caddr_t)args, (u_int)i))) {
		mtx_lock(&Giant);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_SYSCALL))
			ktrsyscall(p->p_tracep, code, narg, args);
#endif
		goto bad;
	}

	/*
	 * Try to run the syscall without the MP lock if the syscall
	 * is MP safe.  We have to obtain the MP lock no matter what if
	 * we are ktracing.
	 */
	if ((callp->sy_narg & SYF_MPSAFE) == 0) {
		mtx_lock(&Giant);
	}

#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSCALL)) {
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		ktrsyscall(p->p_tracep, code, narg, args);
	}
#endif
	p->p_retval[0] = 0;
	p->p_retval[1] = frame.tf_edx;

	STOPEVENT(p, S_SCE, narg);	/* MP aware */

	error = (*callp->sy_call)(p, args);

	/*
	 * MP SAFE (we may or may not have the MP lock at this point)
	 */
	switch (error) {
	case 0:
		frame.tf_eax = p->p_retval[0];
		frame.tf_edx = p->p_retval[1];
		frame.tf_eflags &= ~PSL_C;
		break;

	case ERESTART:
		/*
		 * Reconstruct pc, assuming lcall $X,y is 7 bytes,
		 * int 0x80 is 2 bytes.  We saved this in tf_err.
		 */
		frame.tf_eip -= frame.tf_err;
		break;

	case EJUSTRETURN:
		break;

	default:
bad:
		if (p->p_sysent->sv_errsize) {
			if (error >= p->p_sysent->sv_errsize)
				error = -1;	/* XXX */
			else
				error = p->p_sysent->sv_errtbl[error];
		}
		frame.tf_eax = error;
		frame.tf_eflags |= PSL_C;
		break;
	}

	/*
	 * Traced syscall.  trapsignal() is not MP aware.
	 */
	if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) {
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		frame.tf_eflags &= ~PSL_T;
		trapsignal(p, SIGTRAP, 0);
	}

	/*
	 * Handle reschedule and other end-of-syscall issues.
	 */
	userret(p, &frame, sticks);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSRET)) {
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		ktrsysret(p->p_tracep, code, error, p->p_retval[0]);
	}
#endif

	/*
	 * Release Giant if we had to get it.
	 */
	if (mtx_owned(&Giant))
		mtx_unlock(&Giant);

	/*
	 * This works because errno is findable through the
	 * register set.  If we ever support an emulation where this
	 * is not the case, this code will need to be revisited.
	 */
	STOPEVENT(p, S_SCX, code);

#ifdef WITNESS
	if (witness_list(p)) {
		panic("system call %s returning with mutex(s) held\n",
		    syscallnames[code]);
	}
#endif
	mtx_assert(&sched_lock, MA_NOTOWNED);
	mtx_assert(&Giant, MA_NOTOWNED);
}

void
ast(framep)
	struct trapframe *framep;
{
	struct proc *p = CURPROC;
	u_quad_t sticks;

	KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode"));

	/*
	 * We check for a pending AST here rather than in the assembly as
	 * acquiring and releasing mutexes in assembly is not fun.
	 */
	mtx_lock_spin(&sched_lock);
	if (!(astpending(p) || resched_wanted(p))) {
		mtx_unlock_spin(&sched_lock);
		return;
	}

	sticks = p->p_sticks;
	p->p_md.md_regs = framep;

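	/* Clear the pending-AST flag; this pass counts as a soft interrupt. */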
	astoff(p);
	cnt.v_soft++;
	mtx_intr_enable(&sched_lock);
	if (p->p_sflag & PS_OWEUPC) {
		p->p_sflag &= ~PS_OWEUPC;
		mtx_unlock_spin(&sched_lock);
		mtx_lock(&Giant);
		mtx_lock_spin(&sched_lock);
		addupc_task(p, p->p_stats->p_prof.pr_addr,
			    p->p_stats->p_prof.pr_ticks);
	}
	if (p->p_sflag & PS_ALRMPEND) {
		p->p_sflag &= ~PS_ALRMPEND;
		mtx_unlock_spin(&sched_lock);
		PROC_LOCK(p);
		psignal(p, SIGVTALRM);
		PROC_UNLOCK(p);
		mtx_lock_spin(&sched_lock);
	}
	if (p->p_sflag & PS_PROFPEND) {
		p->p_sflag &= ~PS_PROFPEND;
		mtx_unlock_spin(&sched_lock);
		PROC_LOCK(p);
		psignal(p, SIGPROF);
		PROC_UNLOCK(p);
	} else
		mtx_unlock_spin(&sched_lock);

	userret(p, framep, sticks);

	if (mtx_owned(&Giant))
		mtx_unlock(&Giant);
}