xref: /freebsd/sys/i386/linux/linux_sysvec.c (revision adfa0adec0b5d7c19c220a85ef6ca729235ed172)
1 /*-
 * Copyright (c) 1994-1996 Søren Schmidt
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer
10  *    in this position and unchanged.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. The name of the author may not be used to endorse or promote products
15  *    derived from this software without specific prior written permission
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/exec.h>
35 #include <sys/imgact.h>
36 #include <sys/imgact_aout.h>
37 #include <sys/imgact_elf.h>
38 #include <sys/kernel.h>
39 #include <sys/lock.h>
40 #include <sys/malloc.h>
41 #include <sys/module.h>
42 #include <sys/mutex.h>
43 #include <sys/proc.h>
44 #include <sys/signalvar.h>
45 #include <sys/syscallsubr.h>
46 #include <sys/sysent.h>
47 #include <sys/sysproto.h>
48 #include <sys/vnode.h>
49 #include <sys/eventhandler.h>
50 
51 #include <vm/vm.h>
52 #include <vm/pmap.h>
53 #include <vm/vm_extern.h>
54 #include <vm/vm_map.h>
55 #include <vm/vm_object.h>
56 #include <vm/vm_page.h>
57 #include <vm/vm_param.h>
58 
59 #include <machine/cpu.h>
60 #include <machine/md_var.h>
61 #include <machine/pcb.h>
62 
63 #include <i386/linux/linux.h>
64 #include <i386/linux/linux_proto.h>
65 #include <compat/linux/linux_mib.h>
66 #include <compat/linux/linux_signal.h>
67 #include <compat/linux/linux_util.h>
68 
/* Module version and the malloc type declared by this file. */
MODULE_VERSION(linux, 1);

MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");

/*
 * The two characters "#!" read as one 16-bit value in host byte order.
 * exec_linux_imgact_try() compares the first short of the image header
 * against this to recognise shell scripts.
 */
#if BYTE_ORDER == LITTLE_ENDIAN
#define SHELLMAGIC      0x2123 /* #! */
#else
#define SHELLMAGIC      0x2321
#endif

/*
 * Allow the sendsig functions to use the ldebug() facility
 * even though they are not syscalls themselves. Map them
 * to syscall 0. This is slightly less bogus than using
 * ldebug(sigreturn).
 */
#define	LINUX_SYS_linux_rt_sendsig	0
#define	LINUX_SYS_linux_sendsig		0

/*
 * fldcw() issues the "fldcw" instruction with *addr as its memory operand;
 * __LINUX_NPXCW__ is the control word exec_linux_setregs() loads with it.
 */
#define	fldcw(addr)		__asm("fldcw %0" : : "m" (*(addr)))
#define	__LINUX_NPXCW__		0x37f

/* Signal trampoline code and its size, referenced from the sysentvecs below. */
extern char linux_sigcode[];
extern int linux_szsigcode;

/* Linux system call table, defined elsewhere. */
extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];

/* Handler sets walked by linux_elf_modevent() on module load and unload. */
SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
SET_DECLARE(linux_device_handler_set, struct linux_device_handler);

/* Forward declarations for the sysentvec callbacks defined in this file. */
static int	linux_fixup(register_t **stack_base,
		    struct image_params *iparams);
static int	elf_linux_fixup(register_t **stack_base,
		    struct image_params *iparams);
static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
		    caddr_t *params);
static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
static void	exec_linux_setregs(struct thread *td, u_long entry,
				   u_long stack, u_long ps_strings);

/*
 * Event callbacks and shared emulation state defined elsewhere; they are
 * registered and initialised by linux_elf_modevent().
 */
extern void linux_proc_exit(void *, struct proc *, struct image_params *);
extern void linux_proc_exec(void *, struct proc *, struct image_params *);
extern void linux_schedtail(void *, struct proc *);
extern LIST_HEAD(futex_list, futex) futex_list;
extern struct sx emul_shared_lock;
extern struct sx emul_lock;
extern struct mtx futex_mtx;

/* Tags returned by EVENTHANDLER_REGISTER(), kept so MOD_UNLOAD can deregister. */
static eventhandler_tag linux_exit_tag;
static eventhandler_tag linux_schedtail_tag;
static eventhandler_tag linux_exec_tag;
120 
/*
 * Linux syscalls return negative errno's, we do positive and map them
 * Reference:
 *   FreeBSD: src/sys/sys/errno.h
 *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
 *            linux-2.6.17.8/include/asm-generic/errno.h
 *
 * The table has ELAST + 1 slots, one per FreeBSD errno value, and each
 * slot holds the already-negated Linux value.  The trailing comment on
 * each row gives the range of FreeBSD errno indices the row covers.
 */
static int bsd_to_linux_errno[ELAST + 1] = {
	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,	/* 0-9 */
	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,	/* 10-19 */
	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,	/* 20-29 */
	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,	/* 30-39 */
	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,	/* 40-49 */
	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,	/* 50-59 */
	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,	/* 60-69 */
	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,	/* 70-79 */
	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,	/* 80-89 */
	 -72, -67, -71						/* 90-92 */
};
140 
/*
 * FreeBSD -> Linux signal number translation.  The sendsig routines below
 * look a signal up as sv_sigtbl[_SIG_IDX(sig)], and this table is what both
 * sysentvecs install as sv_sigtbl.  The first slot holds LINUX_SIGHUP, so
 * the index is presumably "FreeBSD signal number - 1" -- confirm against
 * the definition of _SIG_IDX().
 * NOTE(review): the 0 entries look like "no Linux equivalent"; confirm.
 */
int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
	0, LINUX_SIGUSR1, LINUX_SIGUSR2
};
151 
/*
 * Linux -> FreeBSD signal number translation, the inverse direction of
 * bsd_to_linux_signal above and the same length (LINUX_SIGTBLSZ).
 * It is not referenced in this file; consumers live elsewhere.
 * NOTE(review): presumably indexed by "Linux signal number - 1" like its
 * sibling -- confirm against the users of this table.
 */
int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
	SIGHUP, SIGINT, SIGQUIT, SIGILL,
	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
	SIGIO, SIGURG, SIGSYS
};
162 
/*
 * FreeBSD trap number -> Linux trap number.  The array is indexed by the
 * FreeBSD T_* trap value (noted in each row's comment); slots with no
 * counterpart hold LINUX_T_UNKNOWN.
 */
#define LINUX_T_UNKNOWN  255
static int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/* 0 */
	6,			/* 1  T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/* 2 */
	3,			/* 3  T_BPTFLT */
	LINUX_T_UNKNOWN,	/* 4 */
	LINUX_T_UNKNOWN,	/* 5 */
	16,			/* 6  T_ARITHTRAP */
	254,			/* 7  T_ASTFLT */
	LINUX_T_UNKNOWN,	/* 8 */
	13,			/* 9  T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};
/*
 * Bounds-checked lookup: a code at or beyond the end of the table yields
 * LINUX_T_UNKNOWN.  The argument is evaluated twice, so it must not have
 * side effects.
 */
#define bsd_to_linux_trapcode(code) \
    ((code)<sizeof(_bsd_to_linux_trapcode)/sizeof(*_bsd_to_linux_trapcode)? \
     _bsd_to_linux_trapcode[(code)]: \
     LINUX_T_UNKNOWN)
201 
202 /*
203  * If FreeBSD & Linux have a difference of opinion about what a trap
204  * means, deal with it here.
205  *
206  * MPSAFE
207  */
208 static int
209 translate_traps(int signal, int trap_code)
210 {
211 	if (signal != SIGBUS)
212 		return signal;
213 	switch (trap_code) {
214 	case T_PROTFLT:
215 	case T_TSSFLT:
216 	case T_DOUBLEFLT:
217 	case T_PAGEFLT:
218 		return SIGSEGV;
219 	default:
220 		return signal;
221 	}
222 }
223 
224 static int
225 linux_fixup(register_t **stack_base, struct image_params *imgp)
226 {
227 	register_t *argv, *envp;
228 
229 	argv = *stack_base;
230 	envp = *stack_base + (imgp->args->argc + 1);
231 	(*stack_base)--;
232 	**stack_base = (intptr_t)(void *)envp;
233 	(*stack_base)--;
234 	**stack_base = (intptr_t)(void *)argv;
235 	(*stack_base)--;
236 	**stack_base = imgp->args->argc;
237 	return 0;
238 }
239 
240 static int
241 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
242 {
243 	Elf32_Auxargs *args;
244 	register_t *pos;
245 
246 	KASSERT(curthread->td_proc == imgp->proc &&
247 	    (curthread->td_proc->p_flag & P_SA) == 0,
248 	    ("unsafe elf_linux_fixup(), should be curproc"));
249 	args = (Elf32_Auxargs *)imgp->auxargs;
250 	pos = *stack_base + (imgp->args->argc + imgp->args->envc + 2);
251 
252 	if (args->trace)
253 		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
254 	if (args->execfd != -1)
255 		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
256 	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
257 	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
258 	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
259 	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
260 	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
261 	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
262 	AUXARGS_ENTRY(pos, AT_BASE, args->base);
263 	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
264 	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
265 	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
266 	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
267 	AUXARGS_ENTRY(pos, AT_NULL, 0);
268 
269 	free(imgp->auxargs, M_TEMP);
270 	imgp->auxargs = NULL;
271 
272 	(*stack_base)--;
273 	**stack_base = (register_t)imgp->args->argc;
274 	return 0;
275 }
276 
277 extern int _ucodesel, _udatasel;
278 extern unsigned long linux_sznonrtsigcode;
279 
280 static void
281 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
282 {
283 	struct thread *td = curthread;
284 	struct proc *p = td->td_proc;
285 	struct sigacts *psp;
286 	struct trapframe *regs;
287 	struct l_rt_sigframe *fp, frame;
288 	int sig, code;
289 	int oonstack;
290 
291 	sig = ksi->ksi_signo;
292 	code = ksi->ksi_code;
293 	PROC_LOCK_ASSERT(p, MA_OWNED);
294 	psp = p->p_sigacts;
295 	mtx_assert(&psp->ps_mtx, MA_OWNED);
296 	regs = td->td_frame;
297 	oonstack = sigonstack(regs->tf_esp);
298 
299 #ifdef DEBUG
300 	if (ldebug(rt_sendsig))
301 		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
302 		    catcher, sig, (void*)mask, code);
303 #endif
304 	/*
305 	 * Allocate space for the signal handler context.
306 	 */
307 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
308 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
309 		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
310 		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
311 	} else
312 		fp = (struct l_rt_sigframe *)regs->tf_esp - 1;
313 	mtx_unlock(&psp->ps_mtx);
314 
315 	/*
316 	 * Build the argument list for the signal handler.
317 	 */
318 	if (p->p_sysent->sv_sigtbl)
319 		if (sig <= p->p_sysent->sv_sigsize)
320 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
321 
322 	bzero(&frame, sizeof(frame));
323 
324 	frame.sf_handler = catcher;
325 	frame.sf_sig = sig;
326 	frame.sf_siginfo = &fp->sf_si;
327 	frame.sf_ucontext = &fp->sf_sc;
328 
329 	/* Fill in POSIX parts */
330 	frame.sf_si.lsi_signo = sig;
331 	frame.sf_si.lsi_code = code;
332 	frame.sf_si.lsi_addr = ksi->ksi_addr;
333 
334 	/*
335 	 * Build the signal context to be used by sigreturn.
336 	 */
337 	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
338 	frame.sf_sc.uc_link = NULL;		/* XXX ??? */
339 
340 	frame.sf_sc.uc_stack.ss_sp = td->td_sigstk.ss_sp;
341 	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
342 	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
343 	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
344 	PROC_UNLOCK(p);
345 
346 	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
347 
348 	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
349 	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
350 	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
351 	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
352 	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
353 	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_edi;
354 	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_esi;
355 	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_ebp;
356 	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_ebx;
357 	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_edx;
358 	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_ecx;
359 	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_eax;
360 	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_eip;
361 	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
362 	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags;
363 	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp;
364 	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
365 	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
366 	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
367 
368 #ifdef DEBUG
369 	if (ldebug(rt_sendsig))
370 		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
371 		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
372 		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
373 #endif
374 
375 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
376 		/*
377 		 * Process has trashed its stack; give it an illegal
378 		 * instruction to halt it in its tracks.
379 		 */
380 #ifdef DEBUG
381 		if (ldebug(rt_sendsig))
382 			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
383 			    fp, oonstack);
384 #endif
385 		PROC_LOCK(p);
386 		sigexit(td, SIGILL);
387 	}
388 
389 	/*
390 	 * Build context to run handler in.
391 	 */
392 	regs->tf_esp = (int)fp;
393 	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
394 	    linux_sznonrtsigcode;
395 	regs->tf_eflags &= ~(PSL_T | PSL_VM);
396 	regs->tf_cs = _ucodesel;
397 	regs->tf_ds = _udatasel;
398 	regs->tf_es = _udatasel;
399 	regs->tf_fs = _udatasel;
400 	regs->tf_ss = _udatasel;
401 	PROC_LOCK(p);
402 	mtx_lock(&psp->ps_mtx);
403 }
404 
405 
406 /*
407  * Send an interrupt to process.
408  *
409  * Stack is set up to allow sigcode stored
410  * in u. to call routine, followed by kcall
411  * to sigreturn routine below.  After sigreturn
412  * resets the signal mask, the stack, and the
413  * frame pointer, it returns to the user
414  * specified pc, psl.
415  */
416 static void
417 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
418 {
419 	struct thread *td = curthread;
420 	struct proc *p = td->td_proc;
421 	struct sigacts *psp;
422 	struct trapframe *regs;
423 	struct l_sigframe *fp, frame;
424 	l_sigset_t lmask;
425 	int sig, code;
426 	int oonstack, i;
427 
428 	PROC_LOCK_ASSERT(p, MA_OWNED);
429 	psp = p->p_sigacts;
430 	sig = ksi->ksi_signo;
431 	code = ksi->ksi_code;
432 	mtx_assert(&psp->ps_mtx, MA_OWNED);
433 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
434 		/* Signal handler installed with SA_SIGINFO. */
435 		linux_rt_sendsig(catcher, ksi, mask);
436 		return;
437 	}
438 	regs = td->td_frame;
439 	oonstack = sigonstack(regs->tf_esp);
440 
441 #ifdef DEBUG
442 	if (ldebug(sendsig))
443 		printf(ARGS(sendsig, "%p, %d, %p, %u"),
444 		    catcher, sig, (void*)mask, code);
445 #endif
446 
447 	/*
448 	 * Allocate space for the signal handler context.
449 	 */
450 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
451 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
452 		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
453 		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
454 	} else
455 		fp = (struct l_sigframe *)regs->tf_esp - 1;
456 	mtx_unlock(&psp->ps_mtx);
457 	PROC_UNLOCK(p);
458 
459 	/*
460 	 * Build the argument list for the signal handler.
461 	 */
462 	if (p->p_sysent->sv_sigtbl)
463 		if (sig <= p->p_sysent->sv_sigsize)
464 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
465 
466 	bzero(&frame, sizeof(frame));
467 
468 	frame.sf_handler = catcher;
469 	frame.sf_sig = sig;
470 
471 	bsd_to_linux_sigset(mask, &lmask);
472 
473 	/*
474 	 * Build the signal context to be used by sigreturn.
475 	 */
476 	frame.sf_sc.sc_mask   = lmask.__bits[0];
477 	frame.sf_sc.sc_gs     = rgs();
478 	frame.sf_sc.sc_fs     = regs->tf_fs;
479 	frame.sf_sc.sc_es     = regs->tf_es;
480 	frame.sf_sc.sc_ds     = regs->tf_ds;
481 	frame.sf_sc.sc_edi    = regs->tf_edi;
482 	frame.sf_sc.sc_esi    = regs->tf_esi;
483 	frame.sf_sc.sc_ebp    = regs->tf_ebp;
484 	frame.sf_sc.sc_ebx    = regs->tf_ebx;
485 	frame.sf_sc.sc_edx    = regs->tf_edx;
486 	frame.sf_sc.sc_ecx    = regs->tf_ecx;
487 	frame.sf_sc.sc_eax    = regs->tf_eax;
488 	frame.sf_sc.sc_eip    = regs->tf_eip;
489 	frame.sf_sc.sc_cs     = regs->tf_cs;
490 	frame.sf_sc.sc_eflags = regs->tf_eflags;
491 	frame.sf_sc.sc_esp_at_signal = regs->tf_esp;
492 	frame.sf_sc.sc_ss     = regs->tf_ss;
493 	frame.sf_sc.sc_err    = regs->tf_err;
494 	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(ksi->ksi_trapno);
495 
496 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
497 		frame.sf_extramask[i] = lmask.__bits[i+1];
498 
499 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
500 		/*
501 		 * Process has trashed its stack; give it an illegal
502 		 * instruction to halt it in its tracks.
503 		 */
504 		PROC_LOCK(p);
505 		sigexit(td, SIGILL);
506 	}
507 
508 	/*
509 	 * Build context to run handler in.
510 	 */
511 	regs->tf_esp = (int)fp;
512 	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
513 	regs->tf_eflags &= ~(PSL_T | PSL_VM);
514 	regs->tf_cs = _ucodesel;
515 	regs->tf_ds = _udatasel;
516 	regs->tf_es = _udatasel;
517 	regs->tf_fs = _udatasel;
518 	regs->tf_ss = _udatasel;
519 	PROC_LOCK(p);
520 	mtx_lock(&psp->ps_mtx);
521 }
522 
523 /*
524  * System call to cleanup state after a signal
525  * has been taken.  Reset signal mask and
526  * stack state from context left by sendsig (above).
527  * Return to previous pc and psl as specified by
528  * context left by sendsig. Check carefully to
529  * make sure that the user has not modified the
530  * psl to gain improper privileges or to cause
531  * a machine fault.
532  */
533 int
534 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
535 {
536 	struct proc *p = td->td_proc;
537 	struct l_sigframe frame;
538 	struct trapframe *regs;
539 	l_sigset_t lmask;
540 	int eflags, i;
541 	ksiginfo_t ksi;
542 
543 	regs = td->td_frame;
544 
545 #ifdef DEBUG
546 	if (ldebug(sigreturn))
547 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
548 #endif
549 	/*
550 	 * The trampoline code hands us the sigframe.
551 	 * It is unsafe to keep track of it ourselves, in the event that a
552 	 * program jumps out of a signal handler.
553 	 */
554 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
555 		return (EFAULT);
556 
557 	/*
558 	 * Check for security violations.
559 	 */
560 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
561 	eflags = frame.sf_sc.sc_eflags;
562 	/*
563 	 * XXX do allow users to change the privileged flag PSL_RF.  The
564 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
565 	 * sometimes set it there too.  tf_eflags is kept in the signal
566 	 * context during signal handling and there is no other place
567 	 * to remember it, so the PSL_RF bit may be corrupted by the
568 	 * signal handler without us knowing.  Corruption of the PSL_RF
569 	 * bit at worst causes one more or one less debugger trap, so
570 	 * allowing it is fairly harmless.
571 	 */
572 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
573 		return(EINVAL);
574 
575 	/*
576 	 * Don't allow users to load a valid privileged %cs.  Let the
577 	 * hardware check for invalid selectors, excess privilege in
578 	 * other selectors, invalid %eip's and invalid %esp's.
579 	 */
580 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
581 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
582 		ksiginfo_init_trap(&ksi);
583 		ksi.ksi_signo = SIGBUS;
584 		ksi.ksi_code = BUS_OBJERR;
585 		ksi.ksi_trapno = T_PROTFLT;
586 		ksi.ksi_addr = (void *)regs->tf_eip;
587 		trapsignal(td, &ksi);
588 		return(EINVAL);
589 	}
590 
591 	lmask.__bits[0] = frame.sf_sc.sc_mask;
592 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
593 		lmask.__bits[i+1] = frame.sf_extramask[i];
594 	PROC_LOCK(p);
595 	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
596 	SIG_CANTMASK(td->td_sigmask);
597 	signotify(td);
598 	PROC_UNLOCK(p);
599 
600 	/*
601 	 * Restore signal context.
602 	 */
603 	/* %gs was restored by the trampoline. */
604 	regs->tf_fs     = frame.sf_sc.sc_fs;
605 	regs->tf_es     = frame.sf_sc.sc_es;
606 	regs->tf_ds     = frame.sf_sc.sc_ds;
607 	regs->tf_edi    = frame.sf_sc.sc_edi;
608 	regs->tf_esi    = frame.sf_sc.sc_esi;
609 	regs->tf_ebp    = frame.sf_sc.sc_ebp;
610 	regs->tf_ebx    = frame.sf_sc.sc_ebx;
611 	regs->tf_edx    = frame.sf_sc.sc_edx;
612 	regs->tf_ecx    = frame.sf_sc.sc_ecx;
613 	regs->tf_eax    = frame.sf_sc.sc_eax;
614 	regs->tf_eip    = frame.sf_sc.sc_eip;
615 	regs->tf_cs     = frame.sf_sc.sc_cs;
616 	regs->tf_eflags = eflags;
617 	regs->tf_esp    = frame.sf_sc.sc_esp_at_signal;
618 	regs->tf_ss     = frame.sf_sc.sc_ss;
619 
620 	return (EJUSTRETURN);
621 }
622 
623 /*
624  * System call to cleanup state after a signal
625  * has been taken.  Reset signal mask and
626  * stack state from context left by rt_sendsig (above).
627  * Return to previous pc and psl as specified by
628  * context left by sendsig. Check carefully to
629  * make sure that the user has not modified the
630  * psl to gain improper privileges or to cause
631  * a machine fault.
632  */
633 int
634 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
635 {
636 	struct proc *p = td->td_proc;
637 	struct l_ucontext uc;
638 	struct l_sigcontext *context;
639 	l_stack_t *lss;
640 	stack_t ss;
641 	struct trapframe *regs;
642 	int eflags;
643 	ksiginfo_t ksi;
644 
645 	regs = td->td_frame;
646 
647 #ifdef DEBUG
648 	if (ldebug(rt_sigreturn))
649 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
650 #endif
651 	/*
652 	 * The trampoline code hands us the ucontext.
653 	 * It is unsafe to keep track of it ourselves, in the event that a
654 	 * program jumps out of a signal handler.
655 	 */
656 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
657 		return (EFAULT);
658 
659 	context = &uc.uc_mcontext;
660 
661 	/*
662 	 * Check for security violations.
663 	 */
664 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
665 	eflags = context->sc_eflags;
666 	/*
667 	 * XXX do allow users to change the privileged flag PSL_RF.  The
668 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
669 	 * sometimes set it there too.  tf_eflags is kept in the signal
670 	 * context during signal handling and there is no other place
671 	 * to remember it, so the PSL_RF bit may be corrupted by the
672 	 * signal handler without us knowing.  Corruption of the PSL_RF
673 	 * bit at worst causes one more or one less debugger trap, so
674 	 * allowing it is fairly harmless.
675 	 */
676 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
677 		return(EINVAL);
678 
679 	/*
680 	 * Don't allow users to load a valid privileged %cs.  Let the
681 	 * hardware check for invalid selectors, excess privilege in
682 	 * other selectors, invalid %eip's and invalid %esp's.
683 	 */
684 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
685 	if (!CS_SECURE(context->sc_cs)) {
686 		ksiginfo_init_trap(&ksi);
687 		ksi.ksi_signo = SIGBUS;
688 		ksi.ksi_code = BUS_OBJERR;
689 		ksi.ksi_trapno = T_PROTFLT;
690 		ksi.ksi_addr = (void *)regs->tf_eip;
691 		trapsignal(td, &ksi);
692 		return(EINVAL);
693 	}
694 
695 	PROC_LOCK(p);
696 	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
697 	SIG_CANTMASK(td->td_sigmask);
698 	signotify(td);
699 	PROC_UNLOCK(p);
700 
701 	/*
702 	 * Restore signal context
703 	 */
704 	/* %gs was restored by the trampoline. */
705 	regs->tf_fs     = context->sc_fs;
706 	regs->tf_es     = context->sc_es;
707 	regs->tf_ds     = context->sc_ds;
708 	regs->tf_edi    = context->sc_edi;
709 	regs->tf_esi    = context->sc_esi;
710 	regs->tf_ebp    = context->sc_ebp;
711 	regs->tf_ebx    = context->sc_ebx;
712 	regs->tf_edx    = context->sc_edx;
713 	regs->tf_ecx    = context->sc_ecx;
714 	regs->tf_eax    = context->sc_eax;
715 	regs->tf_eip    = context->sc_eip;
716 	regs->tf_cs     = context->sc_cs;
717 	regs->tf_eflags = eflags;
718 	regs->tf_esp    = context->sc_esp_at_signal;
719 	regs->tf_ss     = context->sc_ss;
720 
721 	/*
722 	 * call sigaltstack & ignore results..
723 	 */
724 	lss = &uc.uc_stack;
725 	ss.ss_sp = lss->ss_sp;
726 	ss.ss_size = lss->ss_size;
727 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
728 
729 #ifdef DEBUG
730 	if (ldebug(rt_sigreturn))
731 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
732 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
733 #endif
734 	(void)kern_sigaltstack(td, &ss, NULL);
735 
736 	return (EJUSTRETURN);
737 }
738 
739 /*
740  * MPSAFE
741  */
742 static void
743 linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
744 {
745 	args[0] = tf->tf_ebx;
746 	args[1] = tf->tf_ecx;
747 	args[2] = tf->tf_edx;
748 	args[3] = tf->tf_esi;
749 	args[4] = tf->tf_edi;
750 	args[5] = tf->tf_ebp;	/* Unconfirmed */
751 	*params = NULL;		/* no copyin */
752 }
753 
754 /*
755  * If a linux binary is exec'ing something, try this image activator
756  * first.  We override standard shell script execution in order to
757  * be able to modify the interpreter path.  We only do this if a linux
758  * binary is doing the exec, so we do not create an EXEC module for it.
759  */
760 static int	exec_linux_imgact_try(struct image_params *iparams);
761 
762 static int
763 exec_linux_imgact_try(struct image_params *imgp)
764 {
765     const char *head = (const char *)imgp->image_header;
766     char *rpath;
767     int error = -1, len;
768 
769     /*
770      * The interpreter for shell scripts run from a linux binary needs
771      * to be located in /compat/linux if possible in order to recursively
772      * maintain linux path emulation.
773      */
774     if (((const short *)head)[0] == SHELLMAGIC) {
775 	    /*
776 	     * Run our normal shell image activator.  If it succeeds attempt
777 	     * to use the alternate path for the interpreter.  If an alternate
778 	     * path is found, use our stringspace to store it.
779 	     */
780 	    if ((error = exec_shell_imgact(imgp)) == 0) {
781 		    linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
782 			imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0);
783 		    if (rpath != NULL) {
784 			    len = strlen(rpath) + 1;
785 
786 			    if (len <= MAXSHELLCMDLEN) {
787 				    memcpy(imgp->interpreter_name, rpath, len);
788 			    }
789 			    free(rpath, M_TEMP);
790 		    }
791 	    }
792     }
793     return(error);
794 }
795 
796 /*
797  * exec_setregs may initialize some registers differently than Linux
798  * does, thus potentially confusing Linux binaries. If necessary, we
799  * override the exec_setregs default(s) here.
800  */
801 static void
802 exec_linux_setregs(struct thread *td, u_long entry,
803 		   u_long stack, u_long ps_strings)
804 {
805 	static const u_short control = __LINUX_NPXCW__;
806 	struct pcb *pcb = td->td_pcb;
807 
808 	exec_setregs(td, entry, stack, ps_strings);
809 
810 	/* Linux sets %gs to 0, we default to _udatasel */
811 	pcb->pcb_gs = 0; load_gs(0);
812 
813 	/* Linux sets the i387 to extended precision. */
814 	fldcw(&control);
815 }
816 
/*
 * System entry vector for Linux a.out binaries.
 *
 * This is a positional initializer, so the order of the values must match
 * the member order of struct sysentvec exactly.  The per-line notes
 * describe the value being supplied; member names are given only where
 * this file uses them (sv_sigsize, sv_sigtbl, sv_szsigcode).
 * NOTE(review): check the remaining slots against <sys/sysent.h>.
 */
struct sysentvec linux_sysvec = {
	LINUX_SYS_MAXSYSCALL,	/* dimension of linux_sysent[] */
	linux_sysent,		/* Linux system call table */
	0,			/* NOTE(review): meaning not visible here */
	LINUX_SIGTBLSZ,		/* sv_sigsize: entries in the signal table */
	bsd_to_linux_signal,	/* sv_sigtbl: FreeBSD -> Linux signals */
	ELAST + 1,		/* entries in the errno table */
	bsd_to_linux_errno,	/* FreeBSD -> Linux errno values */
	translate_traps,	/* trap/signal reconciliation */
	linux_fixup,		/* a.out stack fixup */
	linux_sendsig,		/* signal delivery */
	linux_sigcode,		/* signal trampoline code */
	&linux_szsigcode,	/* sv_szsigcode: size of the trampoline */
	linux_prepsyscall,	/* syscall argument gathering */
	"Linux a.out",		/* name of this ABI */
	NULL,			/* elf_linux_sysvec has elf32_coredump here */
	exec_linux_imgact_try,	/* image activator tried first */
	LINUX_MINSIGSTKSZ,
	PAGE_SIZE,
	VM_MIN_ADDRESS,
	VM_MAXUSER_ADDRESS,
	USRSTACK,
	PS_STRINGS,
	VM_PROT_ALL,
	exec_copyout_strings,
	exec_linux_setregs,	/* exec-time register setup */
	NULL			/* NOTE(review): last slot left unset */
};
845 
/*
 * System entry vector for Linux ELF binaries.  It differs from
 * linux_sysvec only in the stack fixup routine, the ABI name and in
 * supplying elf32_coredump.
 *
 * This is a positional initializer, so the order of the values must match
 * the member order of struct sysentvec exactly.  The per-line notes
 * describe the value being supplied; member names are given only where
 * this file uses them (sv_sigsize, sv_sigtbl, sv_szsigcode).
 * NOTE(review): check the remaining slots against <sys/sysent.h>.
 */
struct sysentvec elf_linux_sysvec = {
	LINUX_SYS_MAXSYSCALL,	/* dimension of linux_sysent[] */
	linux_sysent,		/* Linux system call table */
	0,			/* NOTE(review): meaning not visible here */
	LINUX_SIGTBLSZ,		/* sv_sigsize: entries in the signal table */
	bsd_to_linux_signal,	/* sv_sigtbl: FreeBSD -> Linux signals */
	ELAST + 1,		/* entries in the errno table */
	bsd_to_linux_errno,	/* FreeBSD -> Linux errno values */
	translate_traps,	/* trap/signal reconciliation */
	elf_linux_fixup,	/* ELF stack fixup (aux vector) */
	linux_sendsig,		/* signal delivery */
	linux_sigcode,		/* signal trampoline code */
	&linux_szsigcode,	/* sv_szsigcode: size of the trampoline */
	linux_prepsyscall,	/* syscall argument gathering */
	"Linux ELF",		/* name of this ABI */
	elf32_coredump,		/* linux_sysvec has NULL here */
	exec_linux_imgact_try,	/* image activator tried first */
	LINUX_MINSIGSTKSZ,
	PAGE_SIZE,
	VM_MIN_ADDRESS,
	VM_MAXUSER_ADDRESS,
	USRSTACK,
	PS_STRINGS,
	VM_PROT_ALL,
	exec_copyout_strings,
	exec_linux_setregs,	/* exec-time register setup */
	NULL			/* NOTE(review): last slot left unset */
};
874 
/*
 * ELF brand descriptions handed to the ELF image activator by
 * linux_elf_modevent().  The two brands are identical apart from the
 * interpreter path (ld-linux.so.1 versus ld-linux.so.2); both select
 * elf_linux_sysvec.  The initializers are positional.
 * NOTE(review): the meaning of the NULL slot is not visible in this file;
 * check the Elf32_Brandinfo definition.
 */
static Elf32_Brandinfo linux_brand = {
					ELFOSABI_LINUX,
					EM_386,
					"Linux",
					"/compat/linux",
					"/lib/ld-linux.so.1",
					&elf_linux_sysvec,
					NULL,
					BI_CAN_EXEC_DYN,
				 };

static Elf32_Brandinfo linux_glibc2brand = {
					ELFOSABI_LINUX,
					EM_386,
					"Linux",
					"/compat/linux",
					"/lib/ld-linux.so.2",
					&elf_linux_sysvec,
					NULL,
					BI_CAN_EXEC_DYN,
				 };

/* NULL-terminated list of every brand this module inserts and removes. */
Elf32_Brandinfo *linux_brandlist[] = {
					&linux_brand,
					&linux_glibc2brand,
					NULL
				};
902 
903 static int
904 linux_elf_modevent(module_t mod, int type, void *data)
905 {
906 	Elf32_Brandinfo **brandinfo;
907 	int error;
908 	struct linux_ioctl_handler **lihp;
909 	struct linux_device_handler **ldhp;
910 
911 	error = 0;
912 
913 	switch(type) {
914 	case MOD_LOAD:
915 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
916 		     ++brandinfo)
917 			if (elf32_insert_brand_entry(*brandinfo) < 0)
918 				error = EINVAL;
919 		if (error == 0) {
920 			SET_FOREACH(lihp, linux_ioctl_handler_set)
921 				linux_ioctl_register_handler(*lihp);
922 			SET_FOREACH(ldhp, linux_device_handler_set)
923 				linux_device_register_handler(*ldhp);
924 			sx_init(&emul_lock, "emuldata lock");
925 			sx_init(&emul_shared_lock, "emuldata->shared lock");
926 			LIST_INIT(&futex_list);
927 			mtx_init(&futex_mtx, "futex protection lock", NULL, MTX_DEF);
928 			linux_exit_tag = EVENTHANDLER_REGISTER(process_exit, linux_proc_exit,
929 			      NULL, 1000);
930 			linux_schedtail_tag = EVENTHANDLER_REGISTER(schedtail, linux_schedtail,
931 			      NULL, 1000);
932 			linux_exec_tag = EVENTHANDLER_REGISTER(process_exec, linux_proc_exec,
933 			      NULL, 1000);
934 			if (bootverbose)
935 				printf("Linux ELF exec handler installed\n");
936 		} else
937 			printf("cannot insert Linux ELF brand handler\n");
938 		break;
939 	case MOD_UNLOAD:
940 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
941 		     ++brandinfo)
942 			if (elf32_brand_inuse(*brandinfo))
943 				error = EBUSY;
944 		if (error == 0) {
945 			for (brandinfo = &linux_brandlist[0];
946 			     *brandinfo != NULL; ++brandinfo)
947 				if (elf32_remove_brand_entry(*brandinfo) < 0)
948 					error = EINVAL;
949 		}
950 		if (error == 0) {
951 			SET_FOREACH(lihp, linux_ioctl_handler_set)
952 				linux_ioctl_unregister_handler(*lihp);
953 			SET_FOREACH(ldhp, linux_device_handler_set)
954 				linux_device_unregister_handler(*ldhp);
955 			sx_destroy(&emul_lock);
956 			sx_destroy(&emul_shared_lock);
957 			mtx_destroy(&futex_mtx);
958 			EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
959 			EVENTHANDLER_DEREGISTER(schedtail, linux_schedtail_tag);
960 			EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
961 			if (bootverbose)
962 				printf("Linux ELF exec handler removed\n");
963 		} else
964 			printf("Could not deinstall ELF interpreter entry\n");
965 		break;
966 	default:
967 		return EOPNOTSUPP;
968 	}
969 	return error;
970 }
971 
/*
 * Module description: name, event handler, and a third slot left as 0.
 * DECLARE_MODULE registers it under the name "linuxelf" with the
 * SI_SUB_EXEC / SI_ORDER_ANY placement.
 */
static moduledata_t linux_elf_mod = {
	"linuxelf",
	linux_elf_modevent,
	0
};

DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
979