/*-
 * Copyright (C) 1994, David Greenman
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the University of Utah, and William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
 * $FreeBSD$
 */

/*
 * 386 Trap and System call handling
 */

#include "opt_clock.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_isa.h"
#include "opt_ktrace.h"
#include "opt_npx.h"
#include "opt_trap.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/ipl.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/mutex.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/uio.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_extern.h>

#include <machine/cpu.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#include <machine/tss.h>

#include <i386/isa/icu.h>
#include <i386/isa/intr_machdep.h>

#ifdef POWERFAIL_NMI
#include <sys/syslog.h>
#include <machine/clock.h>
#endif

#include <machine/vm86.h>

#include <ddb/ddb.h>

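/*
 * Hook for an optional floating point emulator; a math emulation
 * module is expected to set this pointer, and it is left NULL when
 * no emulator is configured.
 */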
int (*pmath_emulate) __P((struct trapframe *));

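/*
 * Forward declarations; trap(), syscall(), and ast() are entered from
 * the assembly-language interrupt and exception stubs.
 */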
extern void trap __P((struct trapframe frame));
extern int trapwrite __P((unsigned addr));
extern void syscall __P((struct trapframe frame));
extern void ast __P((struct trapframe *framep));

static int trap_pfault __P((struct trapframe *, int, vm_offset_t));
static void trap_fatal __P((struct trapframe *, vm_offset_t));
void dblfault_handler __P((void));

extern inthand_t IDTVEC(lcall_syscall);

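/*
 * Messages for fatal/user trap reporting, indexed by trap type (the
 * T_* codes from <machine/trap.h>); empty strings mark codes that are
 * unused here.
 */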
#define MAX_TRAP_MSG		28
static char *trap_msg[] = {
	"",					/*  0 unused */
	"privileged instruction fault",		/*  1 T_PRIVINFLT */
	"",					/*  2 unused */
	"breakpoint instruction fault",		/*  3 T_BPTFLT */
	"",					/*  4 unused */
	"",					/*  5 unused */
	"arithmetic trap",			/*  6 T_ARITHTRAP */
	"",					/*  7 unused */
	"",					/*  8 unused */
	"general protection fault",		/*  9 T_PROTFLT */
	"trace trap",				/* 10 T_TRCTRAP */
	"",					/* 11 unused */
	"page fault",				/* 12 T_PAGEFLT */
	"",					/* 13 unused */
	"alignment fault",			/* 14 T_ALIGNFLT */
	"",					/* 15 unused */
	"",					/* 16 unused */
	"",					/* 17 unused */
	"integer divide fault",			/* 18 T_DIVIDE */
	"non-maskable interrupt trap",		/* 19 T_NMI */
	"overflow trap",			/* 20 T_OFLOW */
	"FPU bounds check fault",		/* 21 T_BOUND */
	"FPU device not available",		/* 22 T_DNA */
	"double fault",				/* 23 T_DOUBLEFLT */
	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
	"invalid TSS fault",			/* 25 T_TSSFLT */
	"segment not present fault",		/* 26 T_SEGNPFLT */
	"stack fault",				/* 27 T_STKFLT */
	"machine check trap",			/* 28 T_MCHK */
};

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif

#ifdef DDB
static int ddb_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW,
	&ddb_on_nmi, 0, "Go to DDB on NMI");
#endif
static int panic_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
	&panic_on_nmi, 0, "Panic on NMI");

#ifdef WITNESS
extern char *syscallnames[];
#endif

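/*
 * Define the code needed before returning to user mode, for trap and
 * syscall: deliver pending signals, restore the user priority, and
 * reschedule or charge profiling time as needed.
 */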
void
userret(p, frame, oticks)
	struct proc *p;
	struct trapframe *frame;
	u_quad_t oticks;
{
	int sig;

	while ((sig = CURSIG(p)) != 0)
		postsig(sig);

	mtx_lock_spin(&sched_lock);
	p->p_pri.pri_level = p->p_pri.pri_user;
	if (resched_wanted(p)) {
		/*
		 * Since we are curproc, a clock interrupt will normally
		 * just change our priority without moving us from one run
		 * queue to another (since the running process is not on a
		 * queue).  If that happened after we setrunqueue()
		 * ourselves but before we mi_switch()'ed, we might not be
		 * on the queue indicated by our priority.
		 */
		DROP_GIANT_NOSWITCH();
		setrunqueue(p);
		p->p_stats->p_ru.ru_nivcsw++;
		mi_switch();
		mtx_unlock_spin(&sched_lock);
		PICKUP_GIANT();
		while ((sig = CURSIG(p)) != 0)
			postsig(sig);
		mtx_lock_spin(&sched_lock);
	}

	/*
	 * Charge system time if profiling.
	 */
	if (p->p_sflag & PS_PROFIL) {
		mtx_unlock_spin(&sched_lock);
		/* XXX - do we need Giant? */
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		addupc_task(p, TRAPF_PC(frame),
			    (u_int)(p->p_sticks - oticks) * psratio);
	} else
		mtx_unlock_spin(&sched_lock);
}

/*
 * Exception, fault, and trap interface to the FreeBSD kernel.
 * This common code is called from assembly language IDT gate entry
 * routines that prepare a suitable stack frame, and restore this
 * frame after the exception has been processed.
 */

void
trap(frame)
	struct trapframe frame;
{
	struct proc *p = curproc;
	u_quad_t sticks = 0;
	int i = 0, ucode = 0, type, code;
	vm_offset_t eva;
#ifdef POWERFAIL_NMI
	static int lastalert = 0;
#endif

	atomic_add_int(&cnt.v_trap, 1);

	if ((frame.tf_eflags & PSL_I) == 0) {
		/*
		 * Buggy application or kernel code has disabled
		 * interrupts and then trapped.  Enabling interrupts
		 * now is wrong, but it is better than running with
		 * interrupts disabled until they are accidentally
		 * enabled later.  XXX This is really bad if we trap
		 * while holding a spin lock.
		 */
		type = frame.tf_trapno;
		if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM))
			printf(
			    "pid %ld (%s): trap %d with interrupts disabled\n",
			    (long)curproc->p_pid, curproc->p_comm, type);
		else if (type != T_BPTFLT && type != T_TRCTRAP) {
			/*
			 * XXX not quite right, since this may be for a
			 * multiple fault in user mode.
			 */
			printf("kernel trap %d with interrupts disabled\n",
			    type);
			/*
			 * We should walk p_heldmtx here and see if any are
			 * spin mutexes, and not do this if so.
			 */
			enable_intr();
		}
	}

	eva = 0;

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
restart:
#endif

	type = frame.tf_trapno;
	code = frame.tf_err;

	if ((ISPL(frame.tf_cs) == SEL_UPL) ||
	    ((frame.tf_eflags & PSL_VM) && !in_vm86call)) {
		/* user trap */

		mtx_lock_spin(&sched_lock);
		sticks = p->p_sticks;
		mtx_unlock_spin(&sched_lock);
		p->p_md.md_regs = &frame;

		switch (type) {
		case T_PRIVINFLT:	/* privileged instruction fault */
			ucode = type;
			i = SIGILL;
			break;

		case T_BPTFLT:		/* bpt instruction fault */
		case T_TRCTRAP:		/* trace trap */
			frame.tf_eflags &= ~PSL_T;
			i = SIGTRAP;
			break;

		case T_ARITHTRAP:	/* arithmetic trap */
			ucode = code;
			i = SIGFPE;
			break;

			/*
			 * The following two traps can happen in
			 * vm86 mode, and, if so, we want to handle
			 * them specially.
			 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
			if (frame.tf_eflags & PSL_VM) {
				mtx_lock(&Giant);
				i = vm86_emulate((struct vm86frame *)&frame);
				mtx_unlock(&Giant);
				if (i == 0)
					goto user;
				break;
			}
			/* FALL THROUGH */

		case T_SEGNPFLT:	/* segment not present fault */
		case T_TSSFLT:		/* invalid TSS fault */
		case T_DOUBLEFLT:	/* double fault */
		default:
			ucode = code + BUS_SEGM_FAULT;
			i = SIGBUS;
			break;

		case T_PAGEFLT:		/* page fault */
			/*
			 * For some Cyrix CPUs, %cr2 is clobbered by
			 * interrupts.  This problem is worked around by using
			 * an interrupt gate for the pagefault handler.  We
			 * are finally ready to read %cr2 and then must
			 * reenable interrupts.
			 */
			eva = rcr2();
			enable_intr();
			mtx_lock(&Giant);
			i = trap_pfault(&frame, TRUE, eva);
			mtx_unlock(&Giant);
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
			if (i == -2) {
				/*
				 * The f00f hack workaround has triggered, so
				 * treat the fault as an illegal instruction
				 * rather than a page fault.
				 */
				frame.tf_trapno = T_PRIVINFLT;
				goto restart;
			}
#endif
			if (i == -1)
				goto out;
			if (i == 0)
				goto user;

			ucode = T_PAGEFLT;
			break;

		case T_DIVIDE:		/* integer divide fault */
			ucode = FPE_INTDIV;
			i = SIGFPE;
			break;

#ifdef DEV_ISA
		case T_NMI:
#ifdef POWERFAIL_NMI
#ifndef TIMER_FREQ
#  define TIMER_FREQ 1193182
#endif
			mtx_lock(&Giant);
			if (time_second - lastalert > 10) {
				log(LOG_WARNING, "NMI: power fail\n");
				sysbeep(TIMER_FREQ/880, hz);
				lastalert = time_second;
			}
			mtx_unlock(&Giant);
			goto out;
#else /* !POWERFAIL_NMI */
			/* machine/parity/power fail/"kitchen sink" faults */
			/* XXX Giant */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					printf("NMI ... going to debugger\n");
					kdb_trap(type, 0, &frame);
				}
#endif /* DDB */
				goto out;
			} else if (panic_on_nmi)
				panic("NMI indicates hardware failure");
			break;
#endif /* POWERFAIL_NMI */
#endif /* DEV_ISA */

		case T_OFLOW:		/* integer overflow fault */
			ucode = FPE_INTOVF;
			i = SIGFPE;
			break;

		case T_BOUND:		/* bounds check fault */
			ucode = FPE_FLTSUB;
			i = SIGFPE;
			break;

		case T_DNA:
#ifdef DEV_NPX
			/* transparent fault (due to context switch "late") */
			if (npxdna())
				goto out;
#endif
			if (!pmath_emulate) {
				i = SIGFPE;
				ucode = FPE_FPU_NP_TRAP;
				break;
			}
			mtx_lock(&Giant);
			i = (*pmath_emulate)(&frame);
			mtx_unlock(&Giant);
			if (i == 0) {
				if (!(frame.tf_eflags & PSL_T))
					goto out;
				frame.tf_eflags &= ~PSL_T;
				i = SIGTRAP;
			}
			/* else ucode = emulator_only_knows() XXX */
			break;

		case T_FPOPFLT:		/* FPU operand fetch fault */
			ucode = T_FPOPFLT;
			i = SIGILL;
			break;
		}
	} else {
		/* kernel trap */

		switch (type) {
		case T_PAGEFLT:			/* page fault */
			/*
			 * For some Cyrix CPUs, %cr2 is clobbered by
			 * interrupts.  This problem is worked around by using
			 * an interrupt gate for the pagefault handler.  We
			 * are finally ready to read %cr2 and then must
			 * reenable interrupts.
			 */
			eva = rcr2();
			enable_intr();
			mtx_lock(&Giant);
			(void) trap_pfault(&frame, FALSE, eva);
			mtx_unlock(&Giant);
			goto out;

		case T_DNA:
#ifdef DEV_NPX
			/*
			 * The kernel is apparently using npx for copying.
			 * XXX this should be fatal unless the kernel has
			 * registered such use.
			 */
			if (npxdna())
				goto out;
#endif
			break;

			/*
			 * The following two traps can happen in
			 * vm86 mode, and, if so, we want to handle
			 * them specially.
			 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
			if (frame.tf_eflags & PSL_VM) {
				mtx_lock(&Giant);
				i = vm86_emulate((struct vm86frame *)&frame);
				mtx_unlock(&Giant);
				if (i != 0)
					/*
					 * returns to original process
					 */
					vm86_trap((struct vm86frame *)&frame);
				goto out;
			}
			if (type == T_STKFLT)
				break;

			/* FALL THROUGH */

		case T_SEGNPFLT:	/* segment not present fault */
			if (in_vm86call)
				break;

			if (p->p_intr_nesting_level != 0)
				break;

			/*
			 * Invalid %fs's and %gs's can be created using
			 * procfs or PT_SETREGS or by invalidating the
			 * underlying LDT entry.  This causes a fault
			 * in kernel mode when the kernel attempts to
			 * switch contexts.  Lose the bad context
			 * (XXX) so that we can continue, and generate
			 * a signal.
			 */
			if (frame.tf_eip == (int)cpu_switch_load_gs) {
				PCPU_GET(curpcb)->pcb_gs = 0;
				PROC_LOCK(p);
				psignal(p, SIGBUS);
				PROC_UNLOCK(p);
				goto out;
			}

			/*
			 * Invalid segment selectors and out of bounds
			 * %eip's and %esp's can be set up in user mode.
			 * This causes a fault in kernel mode when the
			 * kernel tries to return to user mode.  We want
			 * to get this fault so that we can fix the
			 * problem here and not have to check all the
			 * selectors and pointers when the user changes
			 * them.
			 */
			if (frame.tf_eip == (int)doreti_iret) {
				frame.tf_eip = (int)doreti_iret_fault;
				goto out;
			}
			if (frame.tf_eip == (int)doreti_popl_ds) {
				frame.tf_eip = (int)doreti_popl_ds_fault;
				goto out;
			}
			if (frame.tf_eip == (int)doreti_popl_es) {
				frame.tf_eip = (int)doreti_popl_es_fault;
				goto out;
			}
			if (frame.tf_eip == (int)doreti_popl_fs) {
				frame.tf_eip = (int)doreti_popl_fs_fault;
				goto out;
			}
			if (PCPU_GET(curpcb) != NULL &&
			    PCPU_GET(curpcb)->pcb_onfault != NULL) {
				frame.tf_eip =
				    (int)PCPU_GET(curpcb)->pcb_onfault;
				goto out;
			}
			break;

		case T_TSSFLT:
			/*
			 * PSL_NT can be set in user mode and isn't cleared
			 * automatically when the kernel is entered.  This
			 * causes a TSS fault when the kernel attempts to
			 * `iret' because the TSS link is uninitialized.  We
			 * want to get this fault so that we can fix the
			 * problem here and not every time the kernel is
			 * entered.
			 */
			if (frame.tf_eflags & PSL_NT) {
				frame.tf_eflags &= ~PSL_NT;
				goto out;
			}
			break;

		case T_TRCTRAP:	 /* trace trap */
			if (frame.tf_eip == (int)IDTVEC(lcall_syscall)) {
				/*
				 * We've just entered system mode via the
				 * syscall lcall.  Continue single stepping
				 * silently until the syscall handler has
				 * saved the flags.
				 */
				goto out;
			}
			if (frame.tf_eip == (int)IDTVEC(lcall_syscall) + 1) {
				/*
				 * The syscall handler has now saved the
				 * flags.  Stop single stepping it.
				 */
				frame.tf_eflags &= ~PSL_T;
				goto out;
			}
			/*
			 * Ignore debug register trace traps due to
			 * accesses in the user's address space, which
			 * can happen under several conditions such as
			 * if a user sets a watchpoint on a buffer and
			 * then passes that buffer to a system call.
			 * We still want to get TRCTRAPS for addresses
			 * in kernel space because that is useful when
			 * debugging the kernel.
			 */
			/* XXX Giant */
			if (user_dbreg_trap() && !in_vm86call) {
				/*
				 * Reset breakpoint bits because the
				 * processor doesn't clear them itself.
				 */
				load_dr6(rdr6() & 0xfffffff0);
				goto out;
			}
			/*
			 * Fall through (TRCTRAP kernel mode, kernel address)
			 */
		case T_BPTFLT:
			/*
			 * If DDB is enabled, let it handle the debugger trap.
			 * Otherwise, debugger traps "can't happen".
			 */
#ifdef DDB
			/* XXX Giant */
			if (kdb_trap(type, 0, &frame))
				goto out;
#endif
			break;

#ifdef DEV_ISA
		case T_NMI:
#ifdef POWERFAIL_NMI
			mtx_lock(&Giant);
			if (time_second - lastalert > 10) {
				log(LOG_WARNING, "NMI: power fail\n");
				sysbeep(TIMER_FREQ/880, hz);
				lastalert = time_second;
			}
			mtx_unlock(&Giant);
			goto out;
#else /* !POWERFAIL_NMI */
			/* XXX Giant */
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					printf("NMI ... going to debugger\n");
					kdb_trap(type, 0, &frame);
				}
#endif /* DDB */
				goto out;
			} else if (panic_on_nmi == 0)
				goto out;
			/* FALL THROUGH */
#endif /* POWERFAIL_NMI */
#endif /* DEV_ISA */
		}

		mtx_lock(&Giant);
		trap_fatal(&frame, eva);
		mtx_unlock(&Giant);
		goto out;
	}

	mtx_lock(&Giant);
	/* Translate fault for emulators (e.g. Linux) */
	if (*p->p_sysent->sv_transtrap)
		i = (*p->p_sysent->sv_transtrap)(i, type);

	trapsignal(p, i, ucode);

#ifdef DEBUG
	if (type <= MAX_TRAP_MSG) {
		uprintf("fatal process exception: %s",
			trap_msg[type]);
		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
			uprintf(", fault VA = 0x%lx", (u_long)eva);
		uprintf("\n");
	}
#endif
	mtx_unlock(&Giant);

user:
	userret(p, &frame, sticks);
	if (mtx_owned(&Giant))
		mtx_unlock(&Giant);
out:
	return;
}

#ifdef notyet
/*
 * This version doesn't allow a page fault to user space while
 * in the kernel. The rest of the kernel needs to be made "safe"
 * before this can be used. I think the only things remaining
 * to be made safe are the iBCS2 code and the process tracing/
 * debugging code.
 */
static int
trap_pfault(frame, usermode, eva)
	struct trapframe *frame;
	int usermode;
	vm_offset_t eva;
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	struct proc *p = curproc;

	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_WRITE;
	else
		ftype = VM_PROT_READ;

	va = trunc_page(eva);
	if (va < VM_MIN_KERNEL_ADDRESS) {
		vm_offset_t v;
		vm_page_t mpte;

		if (p == NULL ||
		    (!usermode && va < VM_MAXUSER_ADDRESS &&
		     (p->p_intr_nesting_level != 0 ||
		      PCPU_GET(curpcb) == NULL ||
		      PCPU_GET(curpcb)->pcb_onfault == NULL))) {
			trap_fatal(frame, eva);
			return (-1);
		}

		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		vm = p->p_vmspace;
		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;

		/*
		 * Keep swapout from messing with us during this
		 * critical time.
		 */
		PROC_LOCK(p);
		++p->p_lock;
		PROC_UNLOCK(p);

		/*
		 * Grow the stack if necessary.
		 * grow_stack() returns false only if va falls into a
		 * growable stack region and the stack growth fails.  It
		 * returns true if va was not within a growable stack
		 * region, or if the stack growth succeeded.
		 */
		if (!grow_stack(p, va))
			rv = KERN_FAILURE;
		else
			/* Fault in the user page: */
			rv = vm_fault(map, va, ftype,
			      (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY
						      : VM_FAULT_NORMAL);

		PROC_LOCK(p);
		--p->p_lock;
		PROC_UNLOCK(p);
	} else {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 */
		if (usermode)
			goto nogo;

		/*
		 * Since we know that kernel virtual addresses always
		 * have pte pages mapped, we just have to fault the page.
		 */
		rv = vm_fault(kernel_map, va, ftype, VM_FAULT_NORMAL);
	}

	if (rv == KERN_SUCCESS)
		return (0);
nogo:
	if (!usermode) {
		if (p->p_intr_nesting_level == 0 &&
		    PCPU_GET(curpcb) != NULL &&
		    PCPU_GET(curpcb)->pcb_onfault != NULL) {
			frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, eva);
		return (-1);
	}

	/* kludge to pass faulting virtual address to sendsig */
	frame->tf_err = eva;

	return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}
#endif

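/*
 * Handle a page fault: dispatch to vm_fault() on the kernel map or the
 * process's map as appropriate.  Returns 0 if the fault was handled, -1
 * if it was fatal (trap_fatal() has been called), -2 if the Pentium
 * f00f workaround fired, or the signal number to deliver otherwise.
 */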
int
trap_pfault(frame, usermode, eva)
	struct trapframe *frame;
	int usermode;
	vm_offset_t eva;
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	struct proc *p = curproc;

	va = trunc_page(eva);
	if (va >= KERNBASE) {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 * An exception:  if the faulting address is the invalid
		 * instruction entry in the IDT, then the Intel Pentium
		 * F00F bug workaround was triggered, and we need to
		 * treat it as an illegal instruction, and not a page
		 * fault.
		 */
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
		if ((eva == (unsigned int)&idt[6]) && has_f00f_bug)
			return (-2);
#endif
		if (usermode)
			goto nogo;

		map = kernel_map;
	} else {
		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		if (p != NULL)
			vm = p->p_vmspace;

		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;
	}

	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_WRITE;
	else
		ftype = VM_PROT_READ;

	if (map != kernel_map) {
		/*
		 * Keep swapout from messing with us during this
		 * critical time.
		 */
		PROC_LOCK(p);
		++p->p_lock;
		PROC_UNLOCK(p);

		/*
		 * Grow the stack if necessary.
		 * grow_stack() returns false only if va falls into a
		 * growable stack region and the stack growth fails.  It
		 * returns true if va was not within a growable stack
		 * region, or if the stack growth succeeded.
		 */
		if (!grow_stack(p, va))
			rv = KERN_FAILURE;
		else
			/* Fault in the user page: */
			rv = vm_fault(map, va, ftype,
			      (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY
						      : VM_FAULT_NORMAL);

		PROC_LOCK(p);
		--p->p_lock;
		PROC_UNLOCK(p);
	} else {
		/*
		 * Don't have to worry about process locking or stacks in the
		 * kernel.
		 */
		rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
	}

	if (rv == KERN_SUCCESS)
		return (0);
nogo:
	if (!usermode) {
		if (p->p_intr_nesting_level == 0 &&
		    PCPU_GET(curpcb) != NULL &&
		    PCPU_GET(curpcb)->pcb_onfault != NULL) {
			frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, eva);
		return (-1);
	}

	/* kludge to pass faulting virtual address to sendsig */
	frame->tf_err = eva;

	return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}

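/*
 * Print the machine state for an unrecoverable trap, then drop into the
 * debugger if one is configured, or panic.
 */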
static void
trap_fatal(frame, eva)
	struct trapframe *frame;
	vm_offset_t eva;
{
	int code, type, ss, esp;
	struct soft_segment_descriptor softseg;

	code = frame->tf_err;
	type = frame->tf_trapno;
	sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);

	if (type <= MAX_TRAP_MSG)
		printf("\n\nFatal trap %d: %s while in %s mode\n",
			type, trap_msg[type],
			frame->tf_eflags & PSL_VM ? "vm86" :
			ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
#ifdef SMP
	/* two separate prints in case of a trap on an unmapped page */
	printf("cpuid = %d; ", PCPU_GET(cpuid));
	printf("lapic.id = %08x\n", lapic.id);
#endif
	if (type == T_PAGEFLT) {
		printf("fault virtual address	= 0x%x\n", eva);
		printf("fault code		= %s %s, %s\n",
			code & PGEX_U ? "user" : "supervisor",
			code & PGEX_W ? "write" : "read",
			code & PGEX_P ? "protection violation" : "page not present");
	}
	printf("instruction pointer	= 0x%x:0x%x\n",
	       frame->tf_cs & 0xffff, frame->tf_eip);
	if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) {
		ss = frame->tf_ss & 0xffff;
		esp = frame->tf_esp;
	} else {
		ss = GSEL(GDATA_SEL, SEL_KPL);
		esp = (int)&frame->tf_esp;
	}
	printf("stack pointer	        = 0x%x:0x%x\n", ss, esp);
	printf("frame pointer	        = 0x%x:0x%x\n", ss, frame->tf_ebp);
	printf("code segment		= base 0x%x, limit 0x%x, type 0x%x\n",
	       softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
	printf("			= DPL %d, pres %d, def32 %d, gran %d\n",
	       softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
	       softseg.ssd_gran);
	printf("processor eflags	= ");
	if (frame->tf_eflags & PSL_T)
		printf("trace trap, ");
	if (frame->tf_eflags & PSL_I)
		printf("interrupt enabled, ");
	if (frame->tf_eflags & PSL_NT)
		printf("nested task, ");
	if (frame->tf_eflags & PSL_RF)
		printf("resume, ");
	if (frame->tf_eflags & PSL_VM)
		printf("vm86, ");
	printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
	printf("current process		= ");
	if (curproc) {
		printf("%lu (%s)\n",
		    (u_long)curproc->p_pid, curproc->p_comm ?
		    curproc->p_comm : "");
	} else {
		printf("Idle\n");
	}

#ifdef DDB
	if ((debugger_on_panic || db_active) && kdb_trap(type, 0, frame))
		return;
#endif
	printf("trap number		= %d\n", type);
	if (type <= MAX_TRAP_MSG)
		panic(trap_msg[type]);
	else
		panic("unknown/reserved trap");
}

/*
 * Double fault handler. Called when a fault occurs while writing
 * a frame for a trap/exception onto the stack. This usually occurs
 * when the stack overflows (such is the case with infinite recursion,
 * for example).
 *
 * XXX Note that the current PTD gets replaced by IdlePTD when the
 * task switch occurs. This means that the stack that was active at
 * the time of the double fault is not available at <kstack> unless
 * the machine was idle when the double fault occurred. The downside
 * of this is that "trace <ebp>" in ddb won't work.
 */
void
dblfault_handler()
{
	printf("\nFatal double fault:\n");
	printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip));
	printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp));
	printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp));
#ifdef SMP
	/* two separate prints in case of a trap on an unmapped page */
	printf("cpuid = %d; ", PCPU_GET(cpuid));
	printf("lapic.id = %08x\n", lapic.id);
#endif
	panic("double fault");
}

/*
 * Compensate for 386 brain damage (missing URKR).
 * This is a little simpler than the page fault handler in trap()
 * because the page tables have already been faulted in and high
 * addresses are thrown out early for other reasons.
 */
int
trapwrite(addr)
	unsigned addr;
{
	struct proc *p;
	vm_offset_t va;
	struct vmspace *vm;
	int rv;

	va = trunc_page((vm_offset_t)addr);
	/*
	 * XXX - MAX is END.  Changed > to >= for temp. fix.
	 */
	if (va >= VM_MAXUSER_ADDRESS)
		return (1);

	p = curproc;
	vm = p->p_vmspace;

	PROC_LOCK(p);
	++p->p_lock;
	PROC_UNLOCK(p);

	if (!grow_stack(p, va))
		rv = KERN_FAILURE;
	else
		/*
		 * fault the data page
		 */
		rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY);

	PROC_LOCK(p);
	--p->p_lock;
	PROC_UNLOCK(p);

	if (rv != KERN_SUCCESS)
		return (1);

	return (0);
}

/*
 *	syscall -	MP aware system call request C handler
 *
 *	A system call is essentially treated as a trap except that the
 *	MP lock is not held on entry or return.  We are responsible for
 *	obtaining the MP lock if necessary and for handling ASTs
 *	(e.g. a task switch) prior to return.
 *
 *	In general, only simple access and manipulation of curproc and
 *	the current stack is allowed without having to hold MP lock.
 */
void
syscall(frame)
	struct trapframe frame;
{
	caddr_t params;
	int i;
	struct sysent *callp;
	struct proc *p = curproc;
	u_quad_t sticks;
	int error;
	int narg;
	int args[8];
	u_int code;

	atomic_add_int(&cnt.v_syscall, 1);

#ifdef DIAGNOSTIC
	if (ISPL(frame.tf_cs) != SEL_UPL) {
		mtx_lock(&Giant);
		panic("syscall");
		/* NOT REACHED */
	}
#endif

	mtx_lock_spin(&sched_lock);
	sticks = p->p_sticks;
	mtx_unlock_spin(&sched_lock);

	p->p_md.md_regs = &frame;
	params = (caddr_t)frame.tf_esp + sizeof(int);
	code = frame.tf_eax;

	if (p->p_sysent->sv_prepsyscall) {
		/*
		 * The prep code is not MP aware.
		 */
		mtx_lock(&Giant);
		(*p->p_sysent->sv_prepsyscall)(&frame, args, &code, &params);
		mtx_unlock(&Giant);
	} else {
		/*
		 * Need to check if this is a 32 bit or 64 bit syscall.
		 * fuword is MP aware.
		 */
		if (code == SYS_syscall) {
			/*
			 * Code is first argument, followed by actual args.
			 */
			code = fuword(params);
			params += sizeof(int);
		} else if (code == SYS___syscall) {
			/*
			 * Like syscall, but code is a quad, so as to maintain
			 * quad alignment for the rest of the arguments.
			 */
			code = fuword(params);
			params += sizeof(quad_t);
		}
	}

	if (p->p_sysent->sv_mask)
		code &= p->p_sysent->sv_mask;

	if (code >= p->p_sysent->sv_size)
		callp = &p->p_sysent->sv_table[0];
	else
		callp = &p->p_sysent->sv_table[code];

	narg = callp->sy_narg & SYF_ARGMASK;

	/*
	 * copyin() is MP aware, but the tracing code is not.
	 */
	if (params && (i = narg * sizeof(int)) &&
	    (error = copyin(params, (caddr_t)args, (u_int)i))) {
		mtx_lock(&Giant);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_SYSCALL))
			ktrsyscall(p->p_tracep, code, narg, args);
#endif
		goto bad;
	}

	/*
	 * Try to run the syscall without the MP lock if the syscall
	 * is MP safe.  We have to obtain the MP lock no matter what if
	 * we are ktracing.
	 */
	if ((callp->sy_narg & SYF_MPSAFE) == 0) {
		mtx_lock(&Giant);
	}

#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSCALL)) {
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		ktrsyscall(p->p_tracep, code, narg, args);
	}
#endif
	p->p_retval[0] = 0;
	p->p_retval[1] = frame.tf_edx;

	STOPEVENT(p, S_SCE, narg);	/* MP aware */

	error = (*callp->sy_call)(p, args);

	/*
	 * MP SAFE (we may or may not have the MP lock at this point)
	 */
	switch (error) {
	case 0:
		frame.tf_eax = p->p_retval[0];
		frame.tf_edx = p->p_retval[1];
		frame.tf_eflags &= ~PSL_C;
		break;

	case ERESTART:
		/*
		 * Reconstruct pc, assuming lcall $X,$y is 7 bytes and
		 * int 0x80 is 2 bytes.  We saved this in tf_err.
		 */
		frame.tf_eip -= frame.tf_err;
		break;

	case EJUSTRETURN:
		break;

	default:
bad:
		if (p->p_sysent->sv_errsize) {
			if (error >= p->p_sysent->sv_errsize)
				error = -1;	/* XXX */
			else
				error = p->p_sysent->sv_errtbl[error];
		}
		frame.tf_eax = error;
		frame.tf_eflags |= PSL_C;
		break;
	}

	/*
	 * Traced syscall.  trapsignal() is not MP aware.
	 */
	if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) {
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		frame.tf_eflags &= ~PSL_T;
		trapsignal(p, SIGTRAP, 0);
	}

	/*
	 * Handle reschedule and other end-of-syscall issues.
	 */
	userret(p, &frame, sticks);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSRET)) {
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		ktrsysret(p->p_tracep, code, error, p->p_retval[0]);
	}
#endif

	/*
	 * Release Giant if we had to get it.
	 */
	if (mtx_owned(&Giant))
		mtx_unlock(&Giant);

	/*
	 * This works because errno is findable through the
	 * register set.  If we ever support an emulation where this
	 * is not the case, this code will need to be revisited.
	 */
	STOPEVENT(p, S_SCX, code);

#ifdef WITNESS
	if (witness_list(p)) {
		panic("system call %s returning with mutex(s) held\n",
		    syscallnames[code]);
	}
#endif
	mtx_assert(&sched_lock, MA_NOTOWNED);
	mtx_assert(&Giant, MA_NOTOWNED);
}

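/*
 * Process an asynchronous software trap: handle pending profiling,
 * SIGVTALRM, and SIGPROF work, then do the usual return-to-user work.
 */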
void
ast(framep)
	struct trapframe *framep;
{
	struct proc *p = CURPROC;
	u_quad_t sticks;

	KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode"));

	/*
	 * We check for a pending AST here rather than in the assembly as
	 * acquiring and releasing mutexes in assembly is not fun.
	 */
	mtx_lock_spin(&sched_lock);
	if (!(astpending(p) || resched_wanted(p))) {
		mtx_unlock_spin(&sched_lock);
		return;
	}

	sticks = p->p_sticks;
	p->p_md.md_regs = framep;

	astoff(p);
	cnt.v_soft++;
	mtx_intr_enable(&sched_lock);
	if (p->p_sflag & PS_OWEUPC) {
		p->p_sflag &= ~PS_OWEUPC;
		mtx_unlock_spin(&sched_lock);
		mtx_lock(&Giant);
		mtx_lock_spin(&sched_lock);
		addupc_task(p, p->p_stats->p_prof.pr_addr,
			    p->p_stats->p_prof.pr_ticks);
	}
	if (p->p_sflag & PS_ALRMPEND) {
		p->p_sflag &= ~PS_ALRMPEND;
		mtx_unlock_spin(&sched_lock);
		PROC_LOCK(p);
		psignal(p, SIGVTALRM);
		PROC_UNLOCK(p);
		mtx_lock_spin(&sched_lock);
	}
	if (p->p_sflag & PS_PROFPEND) {
		p->p_sflag &= ~PS_PROFPEND;
		mtx_unlock_spin(&sched_lock);
		PROC_LOCK(p);
		psignal(p, SIGPROF);
		PROC_UNLOCK(p);
	} else
		mtx_unlock_spin(&sched_lock);

	userret(p, framep, sticks);

	if (mtx_owned(&Giant))
		mtx_unlock(&Giant);
}