xref: /freebsd/sys/i386/linux/linux_sysvec.c (revision 1669d8afc64812c8d2d1d147ae1fd42ff441e1b1)
1 /*-
2  * Copyright (c) 1994-1996 Søren Schmidt
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer
10  *    in this position and unchanged.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. The name of the author may not be used to endorse or promote products
15  *    derived from this software without specific prior written permission
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/exec.h>
35 #include <sys/imgact.h>
36 #include <sys/imgact_aout.h>
37 #include <sys/imgact_elf.h>
38 #include <sys/kernel.h>
39 #include <sys/lock.h>
40 #include <sys/malloc.h>
41 #include <sys/module.h>
42 #include <sys/mutex.h>
43 #include <sys/proc.h>
44 #include <sys/signalvar.h>
45 #include <sys/syscallsubr.h>
46 #include <sys/sysent.h>
47 #include <sys/sysproto.h>
48 #include <sys/vnode.h>
49 #include <sys/eventhandler.h>
50 
51 #include <vm/vm.h>
52 #include <vm/pmap.h>
53 #include <vm/vm_extern.h>
54 #include <vm/vm_map.h>
55 #include <vm/vm_object.h>
56 #include <vm/vm_page.h>
57 #include <vm/vm_param.h>
58 
59 #include <machine/cpu.h>
60 #include <machine/md_var.h>
61 #include <machine/pcb.h>
62 
63 #include <i386/linux/linux.h>
64 #include <i386/linux/linux_proto.h>
65 #include <compat/linux/linux_emul.h>
66 #include <compat/linux/linux_mib.h>
67 #include <compat/linux/linux_signal.h>
68 #include <compat/linux/linux_util.h>
69 
70 MODULE_VERSION(linux, 1);
71 
72 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
73 
/*
 * The two characters "#!" that open an interpreter script, expressed as
 * the 16-bit value they form when the first two bytes of the image
 * header are read as a short in host byte order.
 */
#if BYTE_ORDER != LITTLE_ENDIAN
#define	SHELLMAGIC	0x2321
#else
#define	SHELLMAGIC	0x2123	/* #! */
#endif
79 
/*
 * linux_sendsig() and linux_rt_sendsig() are not system calls, but they
 * still want to use the ldebug() facility.  Give each of them a
 * LINUX_SYS_ name that maps to syscall 0, which is slightly less bogus
 * than borrowing ldebug(sigreturn).
 */
#define	LINUX_SYS_linux_sendsig		0
#define	LINUX_SYS_linux_rt_sendsig	0

/*
 * x87 control word loaded by exec_linux_setregs(), and the instruction
 * wrapper used to load it from memory.
 */
#define	__LINUX_NPXCW__		0x37f
#define	fldcw(addr)		__asm("fldcw %0" : : "m" (*(addr)))
91 
92 extern char linux_sigcode[];
93 extern int linux_szsigcode;
94 
95 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
96 
97 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
98 SET_DECLARE(linux_device_handler_set, struct linux_device_handler);
99 
100 static int	linux_fixup(register_t **stack_base,
101 		    struct image_params *iparams);
102 static int	elf_linux_fixup(register_t **stack_base,
103 		    struct image_params *iparams);
104 static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
105 		    caddr_t *params);
106 static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
107 static void	exec_linux_setregs(struct thread *td, u_long entry,
108 				   u_long stack, u_long ps_strings);
109 
110 extern LIST_HEAD(futex_list, futex) futex_list;
111 extern struct sx futex_sx;
112 
113 static eventhandler_tag linux_exit_tag;
114 static eventhandler_tag linux_schedtail_tag;
115 static eventhandler_tag linux_exec_tag;
116 
117 /*
118  * Linux syscalls return negative errno's, we do positive and map them
119  * Reference:
120  *   FreeBSD: src/sys/sys/errno.h
121  *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
122  *            linux-2.6.17.8/include/asm-generic/errno.h
123  */
124 static int bsd_to_linux_errno[ELAST + 1] = {
125 	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
126 	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
127 	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
128 	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
129 	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
130 	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
131 	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
132 	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
133 	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,
134 	 -72, -67, -71
135 };
136 
137 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
138 	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
139 	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
140 	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
141 	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
142 	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
143 	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
144 	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
145 	0, LINUX_SIGUSR1, LINUX_SIGUSR2
146 };
147 
148 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
149 	SIGHUP, SIGINT, SIGQUIT, SIGILL,
150 	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
151 	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
152 	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
153 	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
154 	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
155 	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
156 	SIGIO, SIGURG, SIGSYS
157 };
158 
/* Value reported to Linux when a FreeBSD trap code has no translation. */
#define LINUX_T_UNKNOWN  255

/*
 * FreeBSD trap code -> Linux trap number, indexed by FreeBSD trap code.
 * The table is read-only; slots without a Linux counterpart hold
 * LINUX_T_UNKNOWN.
 */
static const int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/* 0 */
	6,			/* 1  T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/* 2 */
	3,			/* 3  T_BPTFLT */
	LINUX_T_UNKNOWN,	/* 4 */
	LINUX_T_UNKNOWN,	/* 5 */
	16,			/* 6  T_ARITHTRAP */
	254,			/* 7  T_ASTFLT */
	LINUX_T_UNKNOWN,	/* 8 */
	13,			/* 9  T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};

/*
 * Translate a FreeBSD trap code into the Linux trap number.
 *
 * code: FreeBSD trap code; any int is accepted.
 * Returns the table entry for code, or LINUX_T_UNKNOWN when code is
 * negative or lies past the end of the table.
 *
 * This is a function rather than a macro so that the argument is
 * evaluated exactly once and the range check does not depend on an
 * implicit signed-to-unsigned conversion.
 */
static __inline int
bsd_to_linux_trapcode(int code)
{
	if (code < 0 ||
	    (unsigned int)code >= sizeof(_bsd_to_linux_trapcode) /
	    sizeof(_bsd_to_linux_trapcode[0]))
		return (LINUX_T_UNKNOWN);
	return (_bsd_to_linux_trapcode[code]);
}
197 
198 /*
199  * If FreeBSD & Linux have a difference of opinion about what a trap
200  * means, deal with it here.
201  *
202  * MPSAFE
203  */
204 static int
205 translate_traps(int signal, int trap_code)
206 {
207 	if (signal != SIGBUS)
208 		return signal;
209 	switch (trap_code) {
210 	case T_PROTFLT:
211 	case T_TSSFLT:
212 	case T_DOUBLEFLT:
213 	case T_PAGEFLT:
214 		return SIGSEGV;
215 	default:
216 		return signal;
217 	}
218 }
219 
220 static int
221 linux_fixup(register_t **stack_base, struct image_params *imgp)
222 {
223 	register_t *argv, *envp;
224 
225 	argv = *stack_base;
226 	envp = *stack_base + (imgp->args->argc + 1);
227 	(*stack_base)--;
228 	**stack_base = (intptr_t)(void *)envp;
229 	(*stack_base)--;
230 	**stack_base = (intptr_t)(void *)argv;
231 	(*stack_base)--;
232 	**stack_base = imgp->args->argc;
233 	return 0;
234 }
235 
236 static int
237 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
238 {
239 	Elf32_Auxargs *args;
240 	register_t *pos;
241 
242 	KASSERT(curthread->td_proc == imgp->proc &&
243 	    (curthread->td_proc->p_flag & P_SA) == 0,
244 	    ("unsafe elf_linux_fixup(), should be curproc"));
245 	args = (Elf32_Auxargs *)imgp->auxargs;
246 	pos = *stack_base + (imgp->args->argc + imgp->args->envc + 2);
247 
248 	if (args->trace)
249 		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
250 	if (args->execfd != -1)
251 		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
252 	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
253 	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
254 	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
255 	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
256 	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
257 	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
258 	AUXARGS_ENTRY(pos, AT_BASE, args->base);
259 	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
260 	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
261 	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
262 	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
263 	AUXARGS_ENTRY(pos, AT_NULL, 0);
264 
265 	free(imgp->auxargs, M_TEMP);
266 	imgp->auxargs = NULL;
267 
268 	(*stack_base)--;
269 	**stack_base = (register_t)imgp->args->argc;
270 	return 0;
271 }
272 
273 extern int _ucodesel, _udatasel;
274 extern unsigned long linux_sznonrtsigcode;
275 
276 static void
277 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
278 {
279 	struct thread *td = curthread;
280 	struct proc *p = td->td_proc;
281 	struct sigacts *psp;
282 	struct trapframe *regs;
283 	struct l_rt_sigframe *fp, frame;
284 	int sig, code;
285 	int oonstack;
286 
287 	sig = ksi->ksi_signo;
288 	code = ksi->ksi_code;
289 	PROC_LOCK_ASSERT(p, MA_OWNED);
290 	psp = p->p_sigacts;
291 	mtx_assert(&psp->ps_mtx, MA_OWNED);
292 	regs = td->td_frame;
293 	oonstack = sigonstack(regs->tf_esp);
294 
295 #ifdef DEBUG
296 	if (ldebug(rt_sendsig))
297 		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
298 		    catcher, sig, (void*)mask, code);
299 #endif
300 	/*
301 	 * Allocate space for the signal handler context.
302 	 */
303 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
304 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
305 		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
306 		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
307 	} else
308 		fp = (struct l_rt_sigframe *)regs->tf_esp - 1;
309 	mtx_unlock(&psp->ps_mtx);
310 
311 	/*
312 	 * Build the argument list for the signal handler.
313 	 */
314 	if (p->p_sysent->sv_sigtbl)
315 		if (sig <= p->p_sysent->sv_sigsize)
316 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
317 
318 	bzero(&frame, sizeof(frame));
319 
320 	frame.sf_handler = catcher;
321 	frame.sf_sig = sig;
322 	frame.sf_siginfo = &fp->sf_si;
323 	frame.sf_ucontext = &fp->sf_sc;
324 
325 	/* Fill in POSIX parts */
326 	frame.sf_si.lsi_signo = sig;
327 	frame.sf_si.lsi_code = code;
328 	frame.sf_si.lsi_addr = ksi->ksi_addr;
329 
330 	/*
331 	 * Build the signal context to be used by sigreturn.
332 	 */
333 	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
334 	frame.sf_sc.uc_link = NULL;		/* XXX ??? */
335 
336 	frame.sf_sc.uc_stack.ss_sp = td->td_sigstk.ss_sp;
337 	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
338 	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
339 	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
340 	PROC_UNLOCK(p);
341 
342 	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
343 
344 	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
345 	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
346 	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
347 	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
348 	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
349 	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_edi;
350 	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_esi;
351 	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_ebp;
352 	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_ebx;
353 	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_edx;
354 	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_ecx;
355 	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_eax;
356 	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_eip;
357 	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
358 	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags;
359 	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp;
360 	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
361 	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
362 	frame.sf_sc.uc_mcontext.sc_cr2    = (register_t)ksi->ksi_addr;
363 	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
364 
365 #ifdef DEBUG
366 	if (ldebug(rt_sendsig))
367 		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
368 		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
369 		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
370 #endif
371 
372 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
373 		/*
374 		 * Process has trashed its stack; give it an illegal
375 		 * instruction to halt it in its tracks.
376 		 */
377 #ifdef DEBUG
378 		if (ldebug(rt_sendsig))
379 			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
380 			    fp, oonstack);
381 #endif
382 		PROC_LOCK(p);
383 		sigexit(td, SIGILL);
384 	}
385 
386 	/*
387 	 * Build context to run handler in.
388 	 */
389 	regs->tf_esp = (int)fp;
390 	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
391 	    linux_sznonrtsigcode;
392 	regs->tf_eflags &= ~(PSL_T | PSL_VM);
393 	regs->tf_cs = _ucodesel;
394 	regs->tf_ds = _udatasel;
395 	regs->tf_es = _udatasel;
396 	regs->tf_fs = _udatasel;
397 	regs->tf_ss = _udatasel;
398 	PROC_LOCK(p);
399 	mtx_lock(&psp->ps_mtx);
400 }
401 
402 
403 /*
404  * Send an interrupt to process.
405  *
406  * Stack is set up to allow sigcode stored
407  * in u. to call routine, followed by kcall
408  * to sigreturn routine below.  After sigreturn
409  * resets the signal mask, the stack, and the
410  * frame pointer, it returns to the user
411  * specified pc, psl.
412  */
413 static void
414 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
415 {
416 	struct thread *td = curthread;
417 	struct proc *p = td->td_proc;
418 	struct sigacts *psp;
419 	struct trapframe *regs;
420 	struct l_sigframe *fp, frame;
421 	l_sigset_t lmask;
422 	int sig, code;
423 	int oonstack, i;
424 
425 	PROC_LOCK_ASSERT(p, MA_OWNED);
426 	psp = p->p_sigacts;
427 	sig = ksi->ksi_signo;
428 	code = ksi->ksi_code;
429 	mtx_assert(&psp->ps_mtx, MA_OWNED);
430 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
431 		/* Signal handler installed with SA_SIGINFO. */
432 		linux_rt_sendsig(catcher, ksi, mask);
433 		return;
434 	}
435 	regs = td->td_frame;
436 	oonstack = sigonstack(regs->tf_esp);
437 
438 #ifdef DEBUG
439 	if (ldebug(sendsig))
440 		printf(ARGS(sendsig, "%p, %d, %p, %u"),
441 		    catcher, sig, (void*)mask, code);
442 #endif
443 
444 	/*
445 	 * Allocate space for the signal handler context.
446 	 */
447 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
448 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
449 		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
450 		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
451 	} else
452 		fp = (struct l_sigframe *)regs->tf_esp - 1;
453 	mtx_unlock(&psp->ps_mtx);
454 	PROC_UNLOCK(p);
455 
456 	/*
457 	 * Build the argument list for the signal handler.
458 	 */
459 	if (p->p_sysent->sv_sigtbl)
460 		if (sig <= p->p_sysent->sv_sigsize)
461 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
462 
463 	bzero(&frame, sizeof(frame));
464 
465 	frame.sf_handler = catcher;
466 	frame.sf_sig = sig;
467 
468 	bsd_to_linux_sigset(mask, &lmask);
469 
470 	/*
471 	 * Build the signal context to be used by sigreturn.
472 	 */
473 	frame.sf_sc.sc_mask   = lmask.__bits[0];
474 	frame.sf_sc.sc_gs     = rgs();
475 	frame.sf_sc.sc_fs     = regs->tf_fs;
476 	frame.sf_sc.sc_es     = regs->tf_es;
477 	frame.sf_sc.sc_ds     = regs->tf_ds;
478 	frame.sf_sc.sc_edi    = regs->tf_edi;
479 	frame.sf_sc.sc_esi    = regs->tf_esi;
480 	frame.sf_sc.sc_ebp    = regs->tf_ebp;
481 	frame.sf_sc.sc_ebx    = regs->tf_ebx;
482 	frame.sf_sc.sc_edx    = regs->tf_edx;
483 	frame.sf_sc.sc_ecx    = regs->tf_ecx;
484 	frame.sf_sc.sc_eax    = regs->tf_eax;
485 	frame.sf_sc.sc_eip    = regs->tf_eip;
486 	frame.sf_sc.sc_cs     = regs->tf_cs;
487 	frame.sf_sc.sc_eflags = regs->tf_eflags;
488 	frame.sf_sc.sc_esp_at_signal = regs->tf_esp;
489 	frame.sf_sc.sc_ss     = regs->tf_ss;
490 	frame.sf_sc.sc_err    = regs->tf_err;
491 	frame.sf_sc.sc_cr2    = (register_t)ksi->ksi_addr;
492 	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(ksi->ksi_trapno);
493 
494 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
495 		frame.sf_extramask[i] = lmask.__bits[i+1];
496 
497 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
498 		/*
499 		 * Process has trashed its stack; give it an illegal
500 		 * instruction to halt it in its tracks.
501 		 */
502 		PROC_LOCK(p);
503 		sigexit(td, SIGILL);
504 	}
505 
506 	/*
507 	 * Build context to run handler in.
508 	 */
509 	regs->tf_esp = (int)fp;
510 	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
511 	regs->tf_eflags &= ~(PSL_T | PSL_VM);
512 	regs->tf_cs = _ucodesel;
513 	regs->tf_ds = _udatasel;
514 	regs->tf_es = _udatasel;
515 	regs->tf_fs = _udatasel;
516 	regs->tf_ss = _udatasel;
517 	PROC_LOCK(p);
518 	mtx_lock(&psp->ps_mtx);
519 }
520 
521 /*
522  * System call to cleanup state after a signal
523  * has been taken.  Reset signal mask and
524  * stack state from context left by sendsig (above).
525  * Return to previous pc and psl as specified by
526  * context left by sendsig. Check carefully to
527  * make sure that the user has not modified the
528  * psl to gain improper privileges or to cause
529  * a machine fault.
530  */
531 int
532 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
533 {
534 	struct proc *p = td->td_proc;
535 	struct l_sigframe frame;
536 	struct trapframe *regs;
537 	l_sigset_t lmask;
538 	int eflags, i;
539 	ksiginfo_t ksi;
540 
541 	regs = td->td_frame;
542 
543 #ifdef DEBUG
544 	if (ldebug(sigreturn))
545 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
546 #endif
547 	/*
548 	 * The trampoline code hands us the sigframe.
549 	 * It is unsafe to keep track of it ourselves, in the event that a
550 	 * program jumps out of a signal handler.
551 	 */
552 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
553 		return (EFAULT);
554 
555 	/*
556 	 * Check for security violations.
557 	 */
558 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
559 	eflags = frame.sf_sc.sc_eflags;
560 	/*
561 	 * XXX do allow users to change the privileged flag PSL_RF.  The
562 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
563 	 * sometimes set it there too.  tf_eflags is kept in the signal
564 	 * context during signal handling and there is no other place
565 	 * to remember it, so the PSL_RF bit may be corrupted by the
566 	 * signal handler without us knowing.  Corruption of the PSL_RF
567 	 * bit at worst causes one more or one less debugger trap, so
568 	 * allowing it is fairly harmless.
569 	 */
570 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
571 		return(EINVAL);
572 
573 	/*
574 	 * Don't allow users to load a valid privileged %cs.  Let the
575 	 * hardware check for invalid selectors, excess privilege in
576 	 * other selectors, invalid %eip's and invalid %esp's.
577 	 */
578 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
579 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
580 		ksiginfo_init_trap(&ksi);
581 		ksi.ksi_signo = SIGBUS;
582 		ksi.ksi_code = BUS_OBJERR;
583 		ksi.ksi_trapno = T_PROTFLT;
584 		ksi.ksi_addr = (void *)regs->tf_eip;
585 		trapsignal(td, &ksi);
586 		return(EINVAL);
587 	}
588 
589 	lmask.__bits[0] = frame.sf_sc.sc_mask;
590 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
591 		lmask.__bits[i+1] = frame.sf_extramask[i];
592 	PROC_LOCK(p);
593 	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
594 	SIG_CANTMASK(td->td_sigmask);
595 	signotify(td);
596 	PROC_UNLOCK(p);
597 
598 	/*
599 	 * Restore signal context.
600 	 */
601 	/* %gs was restored by the trampoline. */
602 	regs->tf_fs     = frame.sf_sc.sc_fs;
603 	regs->tf_es     = frame.sf_sc.sc_es;
604 	regs->tf_ds     = frame.sf_sc.sc_ds;
605 	regs->tf_edi    = frame.sf_sc.sc_edi;
606 	regs->tf_esi    = frame.sf_sc.sc_esi;
607 	regs->tf_ebp    = frame.sf_sc.sc_ebp;
608 	regs->tf_ebx    = frame.sf_sc.sc_ebx;
609 	regs->tf_edx    = frame.sf_sc.sc_edx;
610 	regs->tf_ecx    = frame.sf_sc.sc_ecx;
611 	regs->tf_eax    = frame.sf_sc.sc_eax;
612 	regs->tf_eip    = frame.sf_sc.sc_eip;
613 	regs->tf_cs     = frame.sf_sc.sc_cs;
614 	regs->tf_eflags = eflags;
615 	regs->tf_esp    = frame.sf_sc.sc_esp_at_signal;
616 	regs->tf_ss     = frame.sf_sc.sc_ss;
617 
618 	return (EJUSTRETURN);
619 }
620 
621 /*
622  * System call to cleanup state after a signal
623  * has been taken.  Reset signal mask and
624  * stack state from context left by rt_sendsig (above).
625  * Return to previous pc and psl as specified by
626  * context left by sendsig. Check carefully to
627  * make sure that the user has not modified the
628  * psl to gain improper privileges or to cause
629  * a machine fault.
630  */
631 int
632 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
633 {
634 	struct proc *p = td->td_proc;
635 	struct l_ucontext uc;
636 	struct l_sigcontext *context;
637 	l_stack_t *lss;
638 	stack_t ss;
639 	struct trapframe *regs;
640 	int eflags;
641 	ksiginfo_t ksi;
642 
643 	regs = td->td_frame;
644 
645 #ifdef DEBUG
646 	if (ldebug(rt_sigreturn))
647 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
648 #endif
649 	/*
650 	 * The trampoline code hands us the ucontext.
651 	 * It is unsafe to keep track of it ourselves, in the event that a
652 	 * program jumps out of a signal handler.
653 	 */
654 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
655 		return (EFAULT);
656 
657 	context = &uc.uc_mcontext;
658 
659 	/*
660 	 * Check for security violations.
661 	 */
662 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
663 	eflags = context->sc_eflags;
664 	/*
665 	 * XXX do allow users to change the privileged flag PSL_RF.  The
666 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
667 	 * sometimes set it there too.  tf_eflags is kept in the signal
668 	 * context during signal handling and there is no other place
669 	 * to remember it, so the PSL_RF bit may be corrupted by the
670 	 * signal handler without us knowing.  Corruption of the PSL_RF
671 	 * bit at worst causes one more or one less debugger trap, so
672 	 * allowing it is fairly harmless.
673 	 */
674 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
675 		return(EINVAL);
676 
677 	/*
678 	 * Don't allow users to load a valid privileged %cs.  Let the
679 	 * hardware check for invalid selectors, excess privilege in
680 	 * other selectors, invalid %eip's and invalid %esp's.
681 	 */
682 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
683 	if (!CS_SECURE(context->sc_cs)) {
684 		ksiginfo_init_trap(&ksi);
685 		ksi.ksi_signo = SIGBUS;
686 		ksi.ksi_code = BUS_OBJERR;
687 		ksi.ksi_trapno = T_PROTFLT;
688 		ksi.ksi_addr = (void *)regs->tf_eip;
689 		trapsignal(td, &ksi);
690 		return(EINVAL);
691 	}
692 
693 	PROC_LOCK(p);
694 	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
695 	SIG_CANTMASK(td->td_sigmask);
696 	signotify(td);
697 	PROC_UNLOCK(p);
698 
699 	/*
700 	 * Restore signal context
701 	 */
702 	/* %gs was restored by the trampoline. */
703 	regs->tf_fs     = context->sc_fs;
704 	regs->tf_es     = context->sc_es;
705 	regs->tf_ds     = context->sc_ds;
706 	regs->tf_edi    = context->sc_edi;
707 	regs->tf_esi    = context->sc_esi;
708 	regs->tf_ebp    = context->sc_ebp;
709 	regs->tf_ebx    = context->sc_ebx;
710 	regs->tf_edx    = context->sc_edx;
711 	regs->tf_ecx    = context->sc_ecx;
712 	regs->tf_eax    = context->sc_eax;
713 	regs->tf_eip    = context->sc_eip;
714 	regs->tf_cs     = context->sc_cs;
715 	regs->tf_eflags = eflags;
716 	regs->tf_esp    = context->sc_esp_at_signal;
717 	regs->tf_ss     = context->sc_ss;
718 
719 	/*
720 	 * call sigaltstack & ignore results..
721 	 */
722 	lss = &uc.uc_stack;
723 	ss.ss_sp = lss->ss_sp;
724 	ss.ss_size = lss->ss_size;
725 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
726 
727 #ifdef DEBUG
728 	if (ldebug(rt_sigreturn))
729 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
730 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
731 #endif
732 	(void)kern_sigaltstack(td, &ss, NULL);
733 
734 	return (EJUSTRETURN);
735 }
736 
737 /*
738  * MPSAFE
739  */
740 static void
741 linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
742 {
743 	args[0] = tf->tf_ebx;
744 	args[1] = tf->tf_ecx;
745 	args[2] = tf->tf_edx;
746 	args[3] = tf->tf_esi;
747 	args[4] = tf->tf_edi;
748 	args[5] = tf->tf_ebp;	/* Unconfirmed */
749 	*params = NULL;		/* no copyin */
750 }
751 
752 /*
753  * If a linux binary is exec'ing something, try this image activator
754  * first.  We override standard shell script execution in order to
755  * be able to modify the interpreter path.  We only do this if a linux
756  * binary is doing the exec, so we do not create an EXEC module for it.
757  */
758 static int	exec_linux_imgact_try(struct image_params *iparams);
759 
760 static int
761 exec_linux_imgact_try(struct image_params *imgp)
762 {
763     const char *head = (const char *)imgp->image_header;
764     char *rpath;
765     int error = -1, len;
766 
767     /*
768      * The interpreter for shell scripts run from a linux binary needs
769      * to be located in /compat/linux if possible in order to recursively
770      * maintain linux path emulation.
771      */
772     if (((const short *)head)[0] == SHELLMAGIC) {
773 	    /*
774 	     * Run our normal shell image activator.  If it succeeds attempt
775 	     * to use the alternate path for the interpreter.  If an alternate
776 	     * path is found, use our stringspace to store it.
777 	     */
778 	    if ((error = exec_shell_imgact(imgp)) == 0) {
779 		    linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
780 			imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0);
781 		    if (rpath != NULL) {
782 			    len = strlen(rpath) + 1;
783 
784 			    if (len <= MAXSHELLCMDLEN) {
785 				    memcpy(imgp->interpreter_name, rpath, len);
786 			    }
787 			    free(rpath, M_TEMP);
788 		    }
789 	    }
790     }
791     return(error);
792 }
793 
794 /*
795  * exec_setregs may initialize some registers differently than Linux
796  * does, thus potentially confusing Linux binaries. If necessary, we
797  * override the exec_setregs default(s) here.
798  */
799 static void
800 exec_linux_setregs(struct thread *td, u_long entry,
801 		   u_long stack, u_long ps_strings)
802 {
803 	static const u_short control = __LINUX_NPXCW__;
804 	struct pcb *pcb = td->td_pcb;
805 
806 	exec_setregs(td, entry, stack, ps_strings);
807 
808 	/* Linux sets %gs to 0, we default to _udatasel */
809 	pcb->pcb_gs = 0; load_gs(0);
810 
811 	/* Linux sets the i387 to extended precision. */
812 	fldcw(&control);
813 }
814 
815 struct sysentvec linux_sysvec = {
816 	LINUX_SYS_MAXSYSCALL,
817 	linux_sysent,
818 	0,
819 	LINUX_SIGTBLSZ,
820 	bsd_to_linux_signal,
821 	ELAST + 1,
822 	bsd_to_linux_errno,
823 	translate_traps,
824 	linux_fixup,
825 	linux_sendsig,
826 	linux_sigcode,
827 	&linux_szsigcode,
828 	linux_prepsyscall,
829 	"Linux a.out",
830 	NULL,
831 	exec_linux_imgact_try,
832 	LINUX_MINSIGSTKSZ,
833 	PAGE_SIZE,
834 	VM_MIN_ADDRESS,
835 	VM_MAXUSER_ADDRESS,
836 	USRSTACK,
837 	PS_STRINGS,
838 	VM_PROT_ALL,
839 	exec_copyout_strings,
840 	exec_linux_setregs,
841 	NULL
842 };
843 
844 struct sysentvec elf_linux_sysvec = {
845 	LINUX_SYS_MAXSYSCALL,
846 	linux_sysent,
847 	0,
848 	LINUX_SIGTBLSZ,
849 	bsd_to_linux_signal,
850 	ELAST + 1,
851 	bsd_to_linux_errno,
852 	translate_traps,
853 	elf_linux_fixup,
854 	linux_sendsig,
855 	linux_sigcode,
856 	&linux_szsigcode,
857 	linux_prepsyscall,
858 	"Linux ELF",
859 	elf32_coredump,
860 	exec_linux_imgact_try,
861 	LINUX_MINSIGSTKSZ,
862 	PAGE_SIZE,
863 	VM_MIN_ADDRESS,
864 	VM_MAXUSER_ADDRESS,
865 	USRSTACK,
866 	PS_STRINGS,
867 	VM_PROT_ALL,
868 	exec_copyout_strings,
869 	exec_linux_setregs,
870 	NULL
871 };
872 
873 static Elf32_Brandinfo linux_brand = {
874 					ELFOSABI_LINUX,
875 					EM_386,
876 					"Linux",
877 					"/compat/linux",
878 					"/lib/ld-linux.so.1",
879 					&elf_linux_sysvec,
880 					NULL,
881 					BI_CAN_EXEC_DYN,
882 				 };
883 
884 static Elf32_Brandinfo linux_glibc2brand = {
885 					ELFOSABI_LINUX,
886 					EM_386,
887 					"Linux",
888 					"/compat/linux",
889 					"/lib/ld-linux.so.2",
890 					&elf_linux_sysvec,
891 					NULL,
892 					BI_CAN_EXEC_DYN,
893 				 };
894 
895 Elf32_Brandinfo *linux_brandlist[] = {
896 					&linux_brand,
897 					&linux_glibc2brand,
898 					NULL
899 				};
900 
901 static int
902 linux_elf_modevent(module_t mod, int type, void *data)
903 {
904 	Elf32_Brandinfo **brandinfo;
905 	int error;
906 	struct linux_ioctl_handler **lihp;
907 	struct linux_device_handler **ldhp;
908 
909 	error = 0;
910 
911 	switch(type) {
912 	case MOD_LOAD:
913 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
914 		     ++brandinfo)
915 			if (elf32_insert_brand_entry(*brandinfo) < 0)
916 				error = EINVAL;
917 		if (error == 0) {
918 			SET_FOREACH(lihp, linux_ioctl_handler_set)
919 				linux_ioctl_register_handler(*lihp);
920 			SET_FOREACH(ldhp, linux_device_handler_set)
921 				linux_device_register_handler(*ldhp);
922 			mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF);
923 			sx_init(&emul_shared_lock, "emuldata->shared lock");
924 			LIST_INIT(&futex_list);
925 			sx_init(&futex_sx, "futex protection lock");
926 			linux_exit_tag = EVENTHANDLER_REGISTER(process_exit, linux_proc_exit,
927 			      NULL, 1000);
928 			linux_schedtail_tag = EVENTHANDLER_REGISTER(schedtail, linux_schedtail,
929 			      NULL, 1000);
930 			linux_exec_tag = EVENTHANDLER_REGISTER(process_exec, linux_proc_exec,
931 			      NULL, 1000);
932 			if (bootverbose)
933 				printf("Linux ELF exec handler installed\n");
934 		} else
935 			printf("cannot insert Linux ELF brand handler\n");
936 		break;
937 	case MOD_UNLOAD:
938 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
939 		     ++brandinfo)
940 			if (elf32_brand_inuse(*brandinfo))
941 				error = EBUSY;
942 		if (error == 0) {
943 			for (brandinfo = &linux_brandlist[0];
944 			     *brandinfo != NULL; ++brandinfo)
945 				if (elf32_remove_brand_entry(*brandinfo) < 0)
946 					error = EINVAL;
947 		}
948 		if (error == 0) {
949 			SET_FOREACH(lihp, linux_ioctl_handler_set)
950 				linux_ioctl_unregister_handler(*lihp);
951 			SET_FOREACH(ldhp, linux_device_handler_set)
952 				linux_device_unregister_handler(*ldhp);
953 			mtx_destroy(&emul_lock);
954 			sx_destroy(&emul_shared_lock);
955 			sx_destroy(&futex_sx);
956 			EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
957 			EVENTHANDLER_DEREGISTER(schedtail, linux_schedtail_tag);
958 			EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
959 			if (bootverbose)
960 				printf("Linux ELF exec handler removed\n");
961 		} else
962 			printf("Could not deinstall ELF interpreter entry\n");
963 		break;
964 	default:
965 		return EOPNOTSUPP;
966 	}
967 	return error;
968 }
969 
970 static moduledata_t linux_elf_mod = {
971 	"linuxelf",
972 	linux_elf_modevent,
973 	0
974 };
975 
976 DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
977