xref: /freebsd/sys/i386/linux/linux_sysvec.c (revision b28624fde638caadd4a89f50c9b7e7da0f98c4d2)
1 /*-
2  * Copyright (c) 1994-1996 Søren Schmidt
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer
10  *    in this position and unchanged.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. The name of the author may not be used to endorse or promote products
15  *    derived from this software without specific prior written permission
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/exec.h>
35 #include <sys/imgact.h>
36 #include <sys/imgact_aout.h>
37 #include <sys/imgact_elf.h>
38 #include <sys/kernel.h>
39 #include <sys/lock.h>
40 #include <sys/malloc.h>
41 #include <sys/module.h>
42 #include <sys/mutex.h>
43 #include <sys/proc.h>
44 #include <sys/signalvar.h>
45 #include <sys/syscallsubr.h>
46 #include <sys/sysent.h>
47 #include <sys/sysproto.h>
48 #include <sys/vnode.h>
49 #include <sys/eventhandler.h>
50 
51 #include <vm/vm.h>
52 #include <vm/pmap.h>
53 #include <vm/vm_extern.h>
54 #include <vm/vm_map.h>
55 #include <vm/vm_object.h>
56 #include <vm/vm_page.h>
57 #include <vm/vm_param.h>
58 
59 #include <machine/cpu.h>
60 #include <machine/md_var.h>
61 #include <machine/pcb.h>
62 
63 #include <i386/linux/linux.h>
64 #include <i386/linux/linux_proto.h>
65 #include <compat/linux/linux_emul.h>
66 #include <compat/linux/linux_mib.h>
67 #include <compat/linux/linux_signal.h>
68 #include <compat/linux/linux_util.h>
69 
70 MODULE_VERSION(linux, 1);
71 
72 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
73 
74 #if BYTE_ORDER == LITTLE_ENDIAN
75 #define SHELLMAGIC      0x2123 /* #! */
76 #else
77 #define SHELLMAGIC      0x2321
78 #endif
79 
80 /*
81  * Allow the sendsig functions to use the ldebug() facility
82  * even though they are not syscalls themselves. Map them
83  * to syscall 0. This is slightly less bogus than using
84  * ldebug(sigreturn).
85  */
86 #define	LINUX_SYS_linux_rt_sendsig	0
87 #define	LINUX_SYS_linux_sendsig		0
88 
89 #define	fldcw(addr)		__asm("fldcw %0" : : "m" (*(addr)))
90 #define	__LINUX_NPXCW__		0x37f
91 
92 extern char linux_sigcode[];
93 extern int linux_szsigcode;
94 
95 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
96 
97 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
98 SET_DECLARE(linux_device_handler_set, struct linux_device_handler);
99 
100 static int	linux_fixup(register_t **stack_base,
101 		    struct image_params *iparams);
102 static int	elf_linux_fixup(register_t **stack_base,
103 		    struct image_params *iparams);
104 static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
105 		    caddr_t *params);
106 static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
107 static void	exec_linux_setregs(struct thread *td, u_long entry,
108 				   u_long stack, u_long ps_strings);
109 
110 extern LIST_HEAD(futex_list, futex) futex_list;
111 extern struct sx futex_sx;
112 
113 static eventhandler_tag linux_exit_tag;
114 static eventhandler_tag linux_schedtail_tag;
115 static eventhandler_tag linux_exec_tag;
116 
117 /*
118  * Linux syscalls return negative errno's, we do positive and map them
119  * Reference:
120  *   FreeBSD: src/sys/sys/errno.h
121  *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
122  *            linux-2.6.17.8/include/asm-generic/errno.h
123  */
124 static int bsd_to_linux_errno[ELAST + 1] = {
125 	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
126 	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
127 	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
128 	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
129 	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
130 	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
131 	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
132 	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
133 	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,
134 	 -72, -67, -71
135 };
136 
137 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
138 	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
139 	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
140 	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
141 	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
142 	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
143 	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
144 	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
145 	0, LINUX_SIGUSR1, LINUX_SIGUSR2
146 };
147 
148 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
149 	SIGHUP, SIGINT, SIGQUIT, SIGILL,
150 	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
151 	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
152 	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
153 	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
154 	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
155 	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
156 	SIGIO, SIGURG, SIGSYS
157 };
158 
/* Result for every FreeBSD trap code that has no entry of its own. */
#define	LINUX_T_UNKNOWN	255

/*
 * Trap number translation table, indexed by FreeBSD trap code.  The value
 * found is what the sendsig routines store in the sc_trapno member of the
 * Linux signal context.
 */
static int _bsd_to_linux_trapcode[] = {
	/*  0               */	LINUX_T_UNKNOWN,
	/*  1 T_PRIVINFLT   */	6,
	/*  2               */	LINUX_T_UNKNOWN,
	/*  3 T_BPTFLT      */	3,
	/*  4               */	LINUX_T_UNKNOWN,
	/*  5               */	LINUX_T_UNKNOWN,
	/*  6 T_ARITHTRAP   */	16,
	/*  7 T_ASTFLT      */	254,
	/*  8               */	LINUX_T_UNKNOWN,
	/*  9 T_PROTFLT     */	13,
	/* 10 T_TRCTRAP     */	1,
	/* 11               */	LINUX_T_UNKNOWN,
	/* 12 T_PAGEFLT     */	14,
	/* 13               */	LINUX_T_UNKNOWN,
	/* 14 T_ALIGNFLT    */	17,
	/* 15               */	LINUX_T_UNKNOWN,
	/* 16               */	LINUX_T_UNKNOWN,
	/* 17               */	LINUX_T_UNKNOWN,
	/* 18 T_DIVIDE      */	0,
	/* 19 T_NMI         */	2,
	/* 20 T_OFLOW       */	4,
	/* 21 T_BOUND       */	5,
	/* 22 T_DNA         */	7,
	/* 23 T_DOUBLEFLT   */	8,
	/* 24 T_FPOPFLT     */	9,
	/* 25 T_TSSFLT      */	10,
	/* 26 T_SEGNPFLT    */	11,
	/* 27 T_STKFLT      */	12,
	/* 28 T_MCHK        */	18,
	/* 29 T_XMMFLT      */	19,
	/* 30 T_RESERVED    */	15
};

/*
 * Bounds-checked lookup in the table above; codes that fall outside of it
 * yield LINUX_T_UNKNOWN.  The comparison is made against a size_t, so a
 * negative int code converts to a huge value and is rejected as well.
 */
#define bsd_to_linux_trapcode(code)					\
	((code) < sizeof(_bsd_to_linux_trapcode) /			\
	    sizeof(_bsd_to_linux_trapcode[0]) ?				\
	    _bsd_to_linux_trapcode[(code)] : LINUX_T_UNKNOWN)
197 
198 /*
199  * If FreeBSD & Linux have a difference of opinion about what a trap
200  * means, deal with it here.
201  *
202  * MPSAFE
203  */
204 static int
205 translate_traps(int signal, int trap_code)
206 {
207 	if (signal != SIGBUS)
208 		return signal;
209 	switch (trap_code) {
210 	case T_PROTFLT:
211 	case T_TSSFLT:
212 	case T_DOUBLEFLT:
213 	case T_PAGEFLT:
214 		return SIGSEGV;
215 	default:
216 		return signal;
217 	}
218 }
219 
220 static int
221 linux_fixup(register_t **stack_base, struct image_params *imgp)
222 {
223 	register_t *argv, *envp;
224 
225 	argv = *stack_base;
226 	envp = *stack_base + (imgp->args->argc + 1);
227 	(*stack_base)--;
228 	**stack_base = (intptr_t)(void *)envp;
229 	(*stack_base)--;
230 	**stack_base = (intptr_t)(void *)argv;
231 	(*stack_base)--;
232 	**stack_base = imgp->args->argc;
233 	return 0;
234 }
235 
236 static int
237 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
238 {
239 	Elf32_Auxargs *args;
240 	register_t *pos;
241 
242 	KASSERT(curthread->td_proc == imgp->proc &&
243 	    (curthread->td_proc->p_flag & P_SA) == 0,
244 	    ("unsafe elf_linux_fixup(), should be curproc"));
245 	args = (Elf32_Auxargs *)imgp->auxargs;
246 	pos = *stack_base + (imgp->args->argc + imgp->args->envc + 2);
247 
248 	if (args->trace)
249 		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
250 	if (args->execfd != -1)
251 		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
252 	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
253 	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
254 	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
255 	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
256 	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
257 	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
258 	AUXARGS_ENTRY(pos, AT_BASE, args->base);
259 	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
260 	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
261 	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
262 	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
263 	AUXARGS_ENTRY(pos, AT_NULL, 0);
264 
265 	free(imgp->auxargs, M_TEMP);
266 	imgp->auxargs = NULL;
267 
268 	(*stack_base)--;
269 	**stack_base = (register_t)imgp->args->argc;
270 	return 0;
271 }
272 
273 extern int _ucodesel, _udatasel;
274 extern unsigned long linux_sznonrtsigcode;
275 
276 static void
277 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
278 {
279 	struct thread *td = curthread;
280 	struct proc *p = td->td_proc;
281 	struct sigacts *psp;
282 	struct trapframe *regs;
283 	struct l_rt_sigframe *fp, frame;
284 	int sig, code;
285 	int oonstack;
286 
287 	sig = ksi->ksi_signo;
288 	code = ksi->ksi_code;
289 	PROC_LOCK_ASSERT(p, MA_OWNED);
290 	psp = p->p_sigacts;
291 	mtx_assert(&psp->ps_mtx, MA_OWNED);
292 	regs = td->td_frame;
293 	oonstack = sigonstack(regs->tf_esp);
294 
295 #ifdef DEBUG
296 	if (ldebug(rt_sendsig))
297 		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
298 		    catcher, sig, (void*)mask, code);
299 #endif
300 	/*
301 	 * Allocate space for the signal handler context.
302 	 */
303 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
304 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
305 		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
306 		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
307 	} else
308 		fp = (struct l_rt_sigframe *)regs->tf_esp - 1;
309 	mtx_unlock(&psp->ps_mtx);
310 
311 	/*
312 	 * Build the argument list for the signal handler.
313 	 */
314 	if (p->p_sysent->sv_sigtbl)
315 		if (sig <= p->p_sysent->sv_sigsize)
316 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
317 
318 	bzero(&frame, sizeof(frame));
319 
320 	frame.sf_handler = catcher;
321 	frame.sf_sig = sig;
322 	frame.sf_siginfo = &fp->sf_si;
323 	frame.sf_ucontext = &fp->sf_sc;
324 
325 	/* Fill in POSIX parts */
326 	frame.sf_si.lsi_signo = sig;
327 	frame.sf_si.lsi_code = code;
328 	frame.sf_si.lsi_addr = ksi->ksi_addr;
329 
330 	/*
331 	 * Build the signal context to be used by sigreturn.
332 	 */
333 	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
334 	frame.sf_sc.uc_link = NULL;		/* XXX ??? */
335 
336 	frame.sf_sc.uc_stack.ss_sp = td->td_sigstk.ss_sp;
337 	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
338 	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
339 	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
340 	PROC_UNLOCK(p);
341 
342 	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
343 
344 	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
345 	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
346 	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
347 	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
348 	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
349 	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_edi;
350 	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_esi;
351 	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_ebp;
352 	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_ebx;
353 	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_edx;
354 	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_ecx;
355 	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_eax;
356 	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_eip;
357 	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
358 	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags;
359 	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp;
360 	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
361 	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
362 	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
363 
364 #ifdef DEBUG
365 	if (ldebug(rt_sendsig))
366 		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
367 		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
368 		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
369 #endif
370 
371 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
372 		/*
373 		 * Process has trashed its stack; give it an illegal
374 		 * instruction to halt it in its tracks.
375 		 */
376 #ifdef DEBUG
377 		if (ldebug(rt_sendsig))
378 			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
379 			    fp, oonstack);
380 #endif
381 		PROC_LOCK(p);
382 		sigexit(td, SIGILL);
383 	}
384 
385 	/*
386 	 * Build context to run handler in.
387 	 */
388 	regs->tf_esp = (int)fp;
389 	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
390 	    linux_sznonrtsigcode;
391 	regs->tf_eflags &= ~(PSL_T | PSL_VM);
392 	regs->tf_cs = _ucodesel;
393 	regs->tf_ds = _udatasel;
394 	regs->tf_es = _udatasel;
395 	regs->tf_fs = _udatasel;
396 	regs->tf_ss = _udatasel;
397 	PROC_LOCK(p);
398 	mtx_lock(&psp->ps_mtx);
399 }
400 
401 
402 /*
403  * Send an interrupt to process.
404  *
405  * Stack is set up to allow sigcode stored
406  * in u. to call routine, followed by kcall
407  * to sigreturn routine below.  After sigreturn
408  * resets the signal mask, the stack, and the
409  * frame pointer, it returns to the user
410  * specified pc, psl.
411  */
412 static void
413 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
414 {
415 	struct thread *td = curthread;
416 	struct proc *p = td->td_proc;
417 	struct sigacts *psp;
418 	struct trapframe *regs;
419 	struct l_sigframe *fp, frame;
420 	l_sigset_t lmask;
421 	int sig, code;
422 	int oonstack, i;
423 
424 	PROC_LOCK_ASSERT(p, MA_OWNED);
425 	psp = p->p_sigacts;
426 	sig = ksi->ksi_signo;
427 	code = ksi->ksi_code;
428 	mtx_assert(&psp->ps_mtx, MA_OWNED);
429 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
430 		/* Signal handler installed with SA_SIGINFO. */
431 		linux_rt_sendsig(catcher, ksi, mask);
432 		return;
433 	}
434 	regs = td->td_frame;
435 	oonstack = sigonstack(regs->tf_esp);
436 
437 #ifdef DEBUG
438 	if (ldebug(sendsig))
439 		printf(ARGS(sendsig, "%p, %d, %p, %u"),
440 		    catcher, sig, (void*)mask, code);
441 #endif
442 
443 	/*
444 	 * Allocate space for the signal handler context.
445 	 */
446 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
447 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
448 		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
449 		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
450 	} else
451 		fp = (struct l_sigframe *)regs->tf_esp - 1;
452 	mtx_unlock(&psp->ps_mtx);
453 	PROC_UNLOCK(p);
454 
455 	/*
456 	 * Build the argument list for the signal handler.
457 	 */
458 	if (p->p_sysent->sv_sigtbl)
459 		if (sig <= p->p_sysent->sv_sigsize)
460 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
461 
462 	bzero(&frame, sizeof(frame));
463 
464 	frame.sf_handler = catcher;
465 	frame.sf_sig = sig;
466 
467 	bsd_to_linux_sigset(mask, &lmask);
468 
469 	/*
470 	 * Build the signal context to be used by sigreturn.
471 	 */
472 	frame.sf_sc.sc_mask   = lmask.__bits[0];
473 	frame.sf_sc.sc_gs     = rgs();
474 	frame.sf_sc.sc_fs     = regs->tf_fs;
475 	frame.sf_sc.sc_es     = regs->tf_es;
476 	frame.sf_sc.sc_ds     = regs->tf_ds;
477 	frame.sf_sc.sc_edi    = regs->tf_edi;
478 	frame.sf_sc.sc_esi    = regs->tf_esi;
479 	frame.sf_sc.sc_ebp    = regs->tf_ebp;
480 	frame.sf_sc.sc_ebx    = regs->tf_ebx;
481 	frame.sf_sc.sc_edx    = regs->tf_edx;
482 	frame.sf_sc.sc_ecx    = regs->tf_ecx;
483 	frame.sf_sc.sc_eax    = regs->tf_eax;
484 	frame.sf_sc.sc_eip    = regs->tf_eip;
485 	frame.sf_sc.sc_cs     = regs->tf_cs;
486 	frame.sf_sc.sc_eflags = regs->tf_eflags;
487 	frame.sf_sc.sc_esp_at_signal = regs->tf_esp;
488 	frame.sf_sc.sc_ss     = regs->tf_ss;
489 	frame.sf_sc.sc_err    = regs->tf_err;
490 	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(ksi->ksi_trapno);
491 
492 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
493 		frame.sf_extramask[i] = lmask.__bits[i+1];
494 
495 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
496 		/*
497 		 * Process has trashed its stack; give it an illegal
498 		 * instruction to halt it in its tracks.
499 		 */
500 		PROC_LOCK(p);
501 		sigexit(td, SIGILL);
502 	}
503 
504 	/*
505 	 * Build context to run handler in.
506 	 */
507 	regs->tf_esp = (int)fp;
508 	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
509 	regs->tf_eflags &= ~(PSL_T | PSL_VM);
510 	regs->tf_cs = _ucodesel;
511 	regs->tf_ds = _udatasel;
512 	regs->tf_es = _udatasel;
513 	regs->tf_fs = _udatasel;
514 	regs->tf_ss = _udatasel;
515 	PROC_LOCK(p);
516 	mtx_lock(&psp->ps_mtx);
517 }
518 
519 /*
520  * System call to cleanup state after a signal
521  * has been taken.  Reset signal mask and
522  * stack state from context left by sendsig (above).
523  * Return to previous pc and psl as specified by
524  * context left by sendsig. Check carefully to
525  * make sure that the user has not modified the
526  * psl to gain improper privileges or to cause
527  * a machine fault.
528  */
529 int
530 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
531 {
532 	struct proc *p = td->td_proc;
533 	struct l_sigframe frame;
534 	struct trapframe *regs;
535 	l_sigset_t lmask;
536 	int eflags, i;
537 	ksiginfo_t ksi;
538 
539 	regs = td->td_frame;
540 
541 #ifdef DEBUG
542 	if (ldebug(sigreturn))
543 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
544 #endif
545 	/*
546 	 * The trampoline code hands us the sigframe.
547 	 * It is unsafe to keep track of it ourselves, in the event that a
548 	 * program jumps out of a signal handler.
549 	 */
550 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
551 		return (EFAULT);
552 
553 	/*
554 	 * Check for security violations.
555 	 */
556 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
557 	eflags = frame.sf_sc.sc_eflags;
558 	/*
559 	 * XXX do allow users to change the privileged flag PSL_RF.  The
560 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
561 	 * sometimes set it there too.  tf_eflags is kept in the signal
562 	 * context during signal handling and there is no other place
563 	 * to remember it, so the PSL_RF bit may be corrupted by the
564 	 * signal handler without us knowing.  Corruption of the PSL_RF
565 	 * bit at worst causes one more or one less debugger trap, so
566 	 * allowing it is fairly harmless.
567 	 */
568 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
569 		return(EINVAL);
570 
571 	/*
572 	 * Don't allow users to load a valid privileged %cs.  Let the
573 	 * hardware check for invalid selectors, excess privilege in
574 	 * other selectors, invalid %eip's and invalid %esp's.
575 	 */
576 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
577 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
578 		ksiginfo_init_trap(&ksi);
579 		ksi.ksi_signo = SIGBUS;
580 		ksi.ksi_code = BUS_OBJERR;
581 		ksi.ksi_trapno = T_PROTFLT;
582 		ksi.ksi_addr = (void *)regs->tf_eip;
583 		trapsignal(td, &ksi);
584 		return(EINVAL);
585 	}
586 
587 	lmask.__bits[0] = frame.sf_sc.sc_mask;
588 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
589 		lmask.__bits[i+1] = frame.sf_extramask[i];
590 	PROC_LOCK(p);
591 	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
592 	SIG_CANTMASK(td->td_sigmask);
593 	signotify(td);
594 	PROC_UNLOCK(p);
595 
596 	/*
597 	 * Restore signal context.
598 	 */
599 	/* %gs was restored by the trampoline. */
600 	regs->tf_fs     = frame.sf_sc.sc_fs;
601 	regs->tf_es     = frame.sf_sc.sc_es;
602 	regs->tf_ds     = frame.sf_sc.sc_ds;
603 	regs->tf_edi    = frame.sf_sc.sc_edi;
604 	regs->tf_esi    = frame.sf_sc.sc_esi;
605 	regs->tf_ebp    = frame.sf_sc.sc_ebp;
606 	regs->tf_ebx    = frame.sf_sc.sc_ebx;
607 	regs->tf_edx    = frame.sf_sc.sc_edx;
608 	regs->tf_ecx    = frame.sf_sc.sc_ecx;
609 	regs->tf_eax    = frame.sf_sc.sc_eax;
610 	regs->tf_eip    = frame.sf_sc.sc_eip;
611 	regs->tf_cs     = frame.sf_sc.sc_cs;
612 	regs->tf_eflags = eflags;
613 	regs->tf_esp    = frame.sf_sc.sc_esp_at_signal;
614 	regs->tf_ss     = frame.sf_sc.sc_ss;
615 
616 	return (EJUSTRETURN);
617 }
618 
619 /*
620  * System call to cleanup state after a signal
621  * has been taken.  Reset signal mask and
622  * stack state from context left by rt_sendsig (above).
623  * Return to previous pc and psl as specified by
624  * context left by sendsig. Check carefully to
625  * make sure that the user has not modified the
626  * psl to gain improper privileges or to cause
627  * a machine fault.
628  */
629 int
630 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
631 {
632 	struct proc *p = td->td_proc;
633 	struct l_ucontext uc;
634 	struct l_sigcontext *context;
635 	l_stack_t *lss;
636 	stack_t ss;
637 	struct trapframe *regs;
638 	int eflags;
639 	ksiginfo_t ksi;
640 
641 	regs = td->td_frame;
642 
643 #ifdef DEBUG
644 	if (ldebug(rt_sigreturn))
645 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
646 #endif
647 	/*
648 	 * The trampoline code hands us the ucontext.
649 	 * It is unsafe to keep track of it ourselves, in the event that a
650 	 * program jumps out of a signal handler.
651 	 */
652 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
653 		return (EFAULT);
654 
655 	context = &uc.uc_mcontext;
656 
657 	/*
658 	 * Check for security violations.
659 	 */
660 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
661 	eflags = context->sc_eflags;
662 	/*
663 	 * XXX do allow users to change the privileged flag PSL_RF.  The
664 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
665 	 * sometimes set it there too.  tf_eflags is kept in the signal
666 	 * context during signal handling and there is no other place
667 	 * to remember it, so the PSL_RF bit may be corrupted by the
668 	 * signal handler without us knowing.  Corruption of the PSL_RF
669 	 * bit at worst causes one more or one less debugger trap, so
670 	 * allowing it is fairly harmless.
671 	 */
672 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
673 		return(EINVAL);
674 
675 	/*
676 	 * Don't allow users to load a valid privileged %cs.  Let the
677 	 * hardware check for invalid selectors, excess privilege in
678 	 * other selectors, invalid %eip's and invalid %esp's.
679 	 */
680 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
681 	if (!CS_SECURE(context->sc_cs)) {
682 		ksiginfo_init_trap(&ksi);
683 		ksi.ksi_signo = SIGBUS;
684 		ksi.ksi_code = BUS_OBJERR;
685 		ksi.ksi_trapno = T_PROTFLT;
686 		ksi.ksi_addr = (void *)regs->tf_eip;
687 		trapsignal(td, &ksi);
688 		return(EINVAL);
689 	}
690 
691 	PROC_LOCK(p);
692 	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
693 	SIG_CANTMASK(td->td_sigmask);
694 	signotify(td);
695 	PROC_UNLOCK(p);
696 
697 	/*
698 	 * Restore signal context
699 	 */
700 	/* %gs was restored by the trampoline. */
701 	regs->tf_fs     = context->sc_fs;
702 	regs->tf_es     = context->sc_es;
703 	regs->tf_ds     = context->sc_ds;
704 	regs->tf_edi    = context->sc_edi;
705 	regs->tf_esi    = context->sc_esi;
706 	regs->tf_ebp    = context->sc_ebp;
707 	regs->tf_ebx    = context->sc_ebx;
708 	regs->tf_edx    = context->sc_edx;
709 	regs->tf_ecx    = context->sc_ecx;
710 	regs->tf_eax    = context->sc_eax;
711 	regs->tf_eip    = context->sc_eip;
712 	regs->tf_cs     = context->sc_cs;
713 	regs->tf_eflags = eflags;
714 	regs->tf_esp    = context->sc_esp_at_signal;
715 	regs->tf_ss     = context->sc_ss;
716 
717 	/*
718 	 * call sigaltstack & ignore results..
719 	 */
720 	lss = &uc.uc_stack;
721 	ss.ss_sp = lss->ss_sp;
722 	ss.ss_size = lss->ss_size;
723 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
724 
725 #ifdef DEBUG
726 	if (ldebug(rt_sigreturn))
727 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
728 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
729 #endif
730 	(void)kern_sigaltstack(td, &ss, NULL);
731 
732 	return (EJUSTRETURN);
733 }
734 
735 /*
736  * MPSAFE
737  */
738 static void
739 linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
740 {
741 	args[0] = tf->tf_ebx;
742 	args[1] = tf->tf_ecx;
743 	args[2] = tf->tf_edx;
744 	args[3] = tf->tf_esi;
745 	args[4] = tf->tf_edi;
746 	args[5] = tf->tf_ebp;	/* Unconfirmed */
747 	*params = NULL;		/* no copyin */
748 }
749 
750 /*
751  * If a linux binary is exec'ing something, try this image activator
752  * first.  We override standard shell script execution in order to
753  * be able to modify the interpreter path.  We only do this if a linux
754  * binary is doing the exec, so we do not create an EXEC module for it.
755  */
756 static int	exec_linux_imgact_try(struct image_params *iparams);
757 
758 static int
759 exec_linux_imgact_try(struct image_params *imgp)
760 {
761     const char *head = (const char *)imgp->image_header;
762     char *rpath;
763     int error = -1, len;
764 
765     /*
766      * The interpreter for shell scripts run from a linux binary needs
767      * to be located in /compat/linux if possible in order to recursively
768      * maintain linux path emulation.
769      */
770     if (((const short *)head)[0] == SHELLMAGIC) {
771 	    /*
772 	     * Run our normal shell image activator.  If it succeeds attempt
773 	     * to use the alternate path for the interpreter.  If an alternate
774 	     * path is found, use our stringspace to store it.
775 	     */
776 	    if ((error = exec_shell_imgact(imgp)) == 0) {
777 		    linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
778 			imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0);
779 		    if (rpath != NULL) {
780 			    len = strlen(rpath) + 1;
781 
782 			    if (len <= MAXSHELLCMDLEN) {
783 				    memcpy(imgp->interpreter_name, rpath, len);
784 			    }
785 			    free(rpath, M_TEMP);
786 		    }
787 	    }
788     }
789     return(error);
790 }
791 
792 /*
793  * exec_setregs may initialize some registers differently than Linux
794  * does, thus potentially confusing Linux binaries. If necessary, we
795  * override the exec_setregs default(s) here.
796  */
797 static void
798 exec_linux_setregs(struct thread *td, u_long entry,
799 		   u_long stack, u_long ps_strings)
800 {
801 	static const u_short control = __LINUX_NPXCW__;
802 	struct pcb *pcb = td->td_pcb;
803 
804 	exec_setregs(td, entry, stack, ps_strings);
805 
806 	/* Linux sets %gs to 0, we default to _udatasel */
807 	pcb->pcb_gs = 0; load_gs(0);
808 
809 	/* Linux sets the i387 to extended precision. */
810 	fldcw(&control);
811 }
812 
813 struct sysentvec linux_sysvec = {
814 	LINUX_SYS_MAXSYSCALL,
815 	linux_sysent,
816 	0,
817 	LINUX_SIGTBLSZ,
818 	bsd_to_linux_signal,
819 	ELAST + 1,
820 	bsd_to_linux_errno,
821 	translate_traps,
822 	linux_fixup,
823 	linux_sendsig,
824 	linux_sigcode,
825 	&linux_szsigcode,
826 	linux_prepsyscall,
827 	"Linux a.out",
828 	NULL,
829 	exec_linux_imgact_try,
830 	LINUX_MINSIGSTKSZ,
831 	PAGE_SIZE,
832 	VM_MIN_ADDRESS,
833 	VM_MAXUSER_ADDRESS,
834 	USRSTACK,
835 	PS_STRINGS,
836 	VM_PROT_ALL,
837 	exec_copyout_strings,
838 	exec_linux_setregs,
839 	NULL
840 };
841 
842 struct sysentvec elf_linux_sysvec = {
843 	LINUX_SYS_MAXSYSCALL,
844 	linux_sysent,
845 	0,
846 	LINUX_SIGTBLSZ,
847 	bsd_to_linux_signal,
848 	ELAST + 1,
849 	bsd_to_linux_errno,
850 	translate_traps,
851 	elf_linux_fixup,
852 	linux_sendsig,
853 	linux_sigcode,
854 	&linux_szsigcode,
855 	linux_prepsyscall,
856 	"Linux ELF",
857 	elf32_coredump,
858 	exec_linux_imgact_try,
859 	LINUX_MINSIGSTKSZ,
860 	PAGE_SIZE,
861 	VM_MIN_ADDRESS,
862 	VM_MAXUSER_ADDRESS,
863 	USRSTACK,
864 	PS_STRINGS,
865 	VM_PROT_ALL,
866 	exec_copyout_strings,
867 	exec_linux_setregs,
868 	NULL
869 };
870 
871 static Elf32_Brandinfo linux_brand = {
872 					ELFOSABI_LINUX,
873 					EM_386,
874 					"Linux",
875 					"/compat/linux",
876 					"/lib/ld-linux.so.1",
877 					&elf_linux_sysvec,
878 					NULL,
879 					BI_CAN_EXEC_DYN,
880 				 };
881 
882 static Elf32_Brandinfo linux_glibc2brand = {
883 					ELFOSABI_LINUX,
884 					EM_386,
885 					"Linux",
886 					"/compat/linux",
887 					"/lib/ld-linux.so.2",
888 					&elf_linux_sysvec,
889 					NULL,
890 					BI_CAN_EXEC_DYN,
891 				 };
892 
893 Elf32_Brandinfo *linux_brandlist[] = {
894 					&linux_brand,
895 					&linux_glibc2brand,
896 					NULL
897 				};
898 
899 static int
900 linux_elf_modevent(module_t mod, int type, void *data)
901 {
902 	Elf32_Brandinfo **brandinfo;
903 	int error;
904 	struct linux_ioctl_handler **lihp;
905 	struct linux_device_handler **ldhp;
906 
907 	error = 0;
908 
909 	switch(type) {
910 	case MOD_LOAD:
911 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
912 		     ++brandinfo)
913 			if (elf32_insert_brand_entry(*brandinfo) < 0)
914 				error = EINVAL;
915 		if (error == 0) {
916 			SET_FOREACH(lihp, linux_ioctl_handler_set)
917 				linux_ioctl_register_handler(*lihp);
918 			SET_FOREACH(ldhp, linux_device_handler_set)
919 				linux_device_register_handler(*ldhp);
920 			mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF);
921 			sx_init(&emul_shared_lock, "emuldata->shared lock");
922 			LIST_INIT(&futex_list);
923 			sx_init(&futex_sx, "futex protection lock");
924 			linux_exit_tag = EVENTHANDLER_REGISTER(process_exit, linux_proc_exit,
925 			      NULL, 1000);
926 			linux_schedtail_tag = EVENTHANDLER_REGISTER(schedtail, linux_schedtail,
927 			      NULL, 1000);
928 			linux_exec_tag = EVENTHANDLER_REGISTER(process_exec, linux_proc_exec,
929 			      NULL, 1000);
930 			if (bootverbose)
931 				printf("Linux ELF exec handler installed\n");
932 		} else
933 			printf("cannot insert Linux ELF brand handler\n");
934 		break;
935 	case MOD_UNLOAD:
936 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
937 		     ++brandinfo)
938 			if (elf32_brand_inuse(*brandinfo))
939 				error = EBUSY;
940 		if (error == 0) {
941 			for (brandinfo = &linux_brandlist[0];
942 			     *brandinfo != NULL; ++brandinfo)
943 				if (elf32_remove_brand_entry(*brandinfo) < 0)
944 					error = EINVAL;
945 		}
946 		if (error == 0) {
947 			SET_FOREACH(lihp, linux_ioctl_handler_set)
948 				linux_ioctl_unregister_handler(*lihp);
949 			SET_FOREACH(ldhp, linux_device_handler_set)
950 				linux_device_unregister_handler(*ldhp);
951 			mtx_destroy(&emul_lock);
952 			sx_destroy(&emul_shared_lock);
953 			sx_destroy(&futex_sx);
954 			EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
955 			EVENTHANDLER_DEREGISTER(schedtail, linux_schedtail_tag);
956 			EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
957 			if (bootverbose)
958 				printf("Linux ELF exec handler removed\n");
959 		} else
960 			printf("Could not deinstall ELF interpreter entry\n");
961 		break;
962 	default:
963 		return EOPNOTSUPP;
964 	}
965 	return error;
966 }
967 
968 static moduledata_t linux_elf_mod = {
969 	"linuxelf",
970 	linux_elf_modevent,
971 	0
972 };
973 
974 DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
975