xref: /freebsd/sys/i386/linux/linux_sysvec.c (revision 6b3455a7665208c366849f0b2b3bc916fb97516e)
1 /*-
2  * Copyright (c) 1994-1996 Søren Schmidt
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer
10  *    in this position and unchanged.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. The name of the author may not be used to endorse or promote products
15  *    derived from this software without specific prior written permission
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 /* XXX we use functions that might not exist. */
33 #include "opt_compat.h"
34 
35 #ifndef COMPAT_43
36 #error "Unable to compile Linux-emulator due to missing COMPAT_43 option!"
37 #endif
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/exec.h>
42 #include <sys/imgact.h>
43 #include <sys/imgact_aout.h>
44 #include <sys/imgact_elf.h>
45 #include <sys/kernel.h>
46 #include <sys/lock.h>
47 #include <sys/malloc.h>
48 #include <sys/module.h>
49 #include <sys/mutex.h>
50 #include <sys/proc.h>
51 #include <sys/signalvar.h>
52 #include <sys/syscallsubr.h>
53 #include <sys/sysent.h>
54 #include <sys/sysproto.h>
55 #include <sys/user.h>
56 #include <sys/vnode.h>
57 
58 #include <vm/vm.h>
59 #include <vm/pmap.h>
60 #include <vm/vm_extern.h>
61 #include <vm/vm_map.h>
62 #include <vm/vm_object.h>
63 #include <vm/vm_page.h>
64 #include <vm/vm_param.h>
65 
66 #include <machine/cpu.h>
67 #include <machine/md_var.h>
68 
69 #include <i386/linux/linux.h>
70 #include <i386/linux/linux_proto.h>
71 #include <compat/linux/linux_mib.h>
72 #include <compat/linux/linux_signal.h>
73 #include <compat/linux/linux_util.h>
74 
75 MODULE_VERSION(linux, 1);
76 MODULE_DEPEND(linux, sysvmsg, 1, 1, 1);
77 MODULE_DEPEND(linux, sysvsem, 1, 1, 1);
78 MODULE_DEPEND(linux, sysvshm, 1, 1, 1);
79 
80 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
81 
82 #if BYTE_ORDER == LITTLE_ENDIAN
83 #define SHELLMAGIC      0x2123 /* #! */
84 #else
85 #define SHELLMAGIC      0x2321
86 #endif
87 
88 /*
89  * Allow the sendsig functions to use the ldebug() facility
90  * even though they are not syscalls themselves. Map them
91  * to syscall 0. This is slightly less bogus than using
92  * ldebug(sigreturn).
93  */
94 #define	LINUX_SYS_linux_rt_sendsig	0
95 #define	LINUX_SYS_linux_sendsig		0
96 
97 extern char linux_sigcode[];
98 extern int linux_szsigcode;
99 
100 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
101 
102 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
103 
104 static int	linux_fixup(register_t **stack_base,
105 		    struct image_params *iparams);
106 static int	elf_linux_fixup(register_t **stack_base,
107 		    struct image_params *iparams);
108 static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
109 		    caddr_t *params);
110 static void     linux_sendsig(sig_t catcher, int sig, sigset_t *mask,
111 		    u_long code);
112 static void	exec_linux_setregs(struct thread *td, u_long entry,
113 				   u_long stack, u_long ps_strings);
114 
115 /*
116  * Linux syscalls return negative errno's, we do positive and map them
117  */
118 static int bsd_to_linux_errno[ELAST + 1] = {
119 	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
120 	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
121 	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
122 	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
123 	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
124 	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
125 	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
126 	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
127 	-6, -6, -43, -42, -75, -6, -84
128 };
129 
130 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
131 	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
132 	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
133 	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
134 	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
135 	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
136 	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
137 	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
138 	0, LINUX_SIGUSR1, LINUX_SIGUSR2
139 };
140 
141 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
142 	SIGHUP, SIGINT, SIGQUIT, SIGILL,
143 	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
144 	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
145 	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
146 	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
147 	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
148 	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
149 	SIGIO, SIGURG, SIGSYS
150 };
151 
/* Linux trap number reported when a FreeBSD trap code has no mapping. */
#define	LINUX_T_UNKNOWN	255

/*
 * FreeBSD trap code -> Linux trap number, indexed by the FreeBSD value.
 * The table is read-only; slots without a counterpart hold
 * LINUX_T_UNKNOWN.
 */
static const int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/* 0 */
	6,			/* 1  T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/* 2 */
	3,			/* 3  T_BPTFLT */
	LINUX_T_UNKNOWN,	/* 4 */
	LINUX_T_UNKNOWN,	/* 5 */
	16,			/* 6  T_ARITHTRAP */
	254,			/* 7  T_ASTFLT */
	LINUX_T_UNKNOWN,	/* 8 */
	13,			/* 9  T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};

/*
 * Translate a FreeBSD trap code to the Linux trap number.
 *
 * This is a function rather than a macro so that the argument is
 * evaluated exactly once.  Codes past the end of the table (including
 * negative values, which convert to very large unsigned ones) yield
 * LINUX_T_UNKNOWN.
 */
static __inline int
bsd_to_linux_trapcode(unsigned long code)
{
	const unsigned long nentries =
	    sizeof(_bsd_to_linux_trapcode) / sizeof(_bsd_to_linux_trapcode[0]);

	if (code < nentries)
		return (_bsd_to_linux_trapcode[code]);
	return (LINUX_T_UNKNOWN);
}
190 
191 /*
192  * If FreeBSD & Linux have a difference of opinion about what a trap
193  * means, deal with it here.
194  *
195  * MPSAFE
196  */
197 static int
198 translate_traps(int signal, int trap_code)
199 {
200 	if (signal != SIGBUS)
201 		return signal;
202 	switch (trap_code) {
203 	case T_PROTFLT:
204 	case T_TSSFLT:
205 	case T_DOUBLEFLT:
206 	case T_PAGEFLT:
207 		return SIGSEGV;
208 	default:
209 		return signal;
210 	}
211 }
212 
213 static int
214 linux_fixup(register_t **stack_base, struct image_params *imgp)
215 {
216 	register_t *argv, *envp;
217 
218 	argv = *stack_base;
219 	envp = *stack_base + (imgp->argc + 1);
220 	(*stack_base)--;
221 	**stack_base = (intptr_t)(void *)envp;
222 	(*stack_base)--;
223 	**stack_base = (intptr_t)(void *)argv;
224 	(*stack_base)--;
225 	**stack_base = imgp->argc;
226 	return 0;
227 }
228 
229 static int
230 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
231 {
232 	Elf32_Auxargs *args;
233 	register_t *pos;
234 
235 	KASSERT(curthread->td_proc == imgp->proc &&
236 	    (curthread->td_proc->p_flag & P_SA) == 0,
237 	    ("unsafe elf_linux_fixup(), should be curproc"));
238 	args = (Elf32_Auxargs *)imgp->auxargs;
239 	pos = *stack_base + (imgp->argc + imgp->envc + 2);
240 
241 	if (args->trace)
242 		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
243 	if (args->execfd != -1)
244 		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
245 	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
246 	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
247 	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
248 	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
249 	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
250 	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
251 	AUXARGS_ENTRY(pos, AT_BASE, args->base);
252 	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
253 	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
254 	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
255 	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
256 	AUXARGS_ENTRY(pos, AT_NULL, 0);
257 
258 	free(imgp->auxargs, M_TEMP);
259 	imgp->auxargs = NULL;
260 
261 	(*stack_base)--;
262 	**stack_base = (register_t)imgp->argc;
263 	return 0;
264 }
265 
266 extern int _ucodesel, _udatasel;
267 extern unsigned long linux_sznonrtsigcode;
268 
269 static void
270 linux_rt_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
271 {
272 	struct thread *td = curthread;
273 	struct proc *p = td->td_proc;
274 	struct sigacts *psp;
275 	struct trapframe *regs;
276 	struct l_rt_sigframe *fp, frame;
277 	int oonstack;
278 
279 	PROC_LOCK_ASSERT(p, MA_OWNED);
280 	psp = p->p_sigacts;
281 	mtx_assert(&psp->ps_mtx, MA_OWNED);
282 	regs = td->td_frame;
283 	oonstack = sigonstack(regs->tf_esp);
284 
285 #ifdef DEBUG
286 	if (ldebug(rt_sendsig))
287 		printf(ARGS(rt_sendsig, "%p, %d, %p, %lu"),
288 		    catcher, sig, (void*)mask, code);
289 #endif
290 	/*
291 	 * Allocate space for the signal handler context.
292 	 */
293 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
294 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
295 		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
296 		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
297 	} else
298 		fp = (struct l_rt_sigframe *)regs->tf_esp - 1;
299 	mtx_unlock(&psp->ps_mtx);
300 
301 	/*
302 	 * Build the argument list for the signal handler.
303 	 */
304 	if (p->p_sysent->sv_sigtbl)
305 		if (sig <= p->p_sysent->sv_sigsize)
306 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
307 
308 	bzero(&frame, sizeof(frame));
309 
310 	frame.sf_handler = catcher;
311 	frame.sf_sig = sig;
312 	frame.sf_siginfo = &fp->sf_si;
313 	frame.sf_ucontext = &fp->sf_sc;
314 
315 	/* Fill in POSIX parts */
316 	frame.sf_si.lsi_signo = sig;
317 	frame.sf_si.lsi_code = code;
318 	frame.sf_si.lsi_addr = (void *)regs->tf_err;
319 
320 	/*
321 	 * Build the signal context to be used by sigreturn.
322 	 */
323 	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
324 	frame.sf_sc.uc_link = NULL;		/* XXX ??? */
325 
326 	frame.sf_sc.uc_stack.ss_sp = td->td_sigstk.ss_sp;
327 	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
328 	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
329 	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
330 	PROC_UNLOCK(p);
331 
332 	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
333 
334 	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
335 	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
336 	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
337 	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
338 	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
339 	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_edi;
340 	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_esi;
341 	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_ebp;
342 	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_ebx;
343 	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_edx;
344 	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_ecx;
345 	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_eax;
346 	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_eip;
347 	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
348 	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags;
349 	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp;
350 	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
351 	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
352 	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
353 
354 #ifdef DEBUG
355 	if (ldebug(rt_sendsig))
356 		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
357 		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
358 		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
359 #endif
360 
361 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
362 		/*
363 		 * Process has trashed its stack; give it an illegal
364 		 * instruction to halt it in its tracks.
365 		 */
366 #ifdef DEBUG
367 		if (ldebug(rt_sendsig))
368 			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
369 			    fp, oonstack);
370 #endif
371 		PROC_LOCK(p);
372 		sigexit(td, SIGILL);
373 	}
374 
375 	/*
376 	 * Build context to run handler in.
377 	 */
378 	regs->tf_esp = (int)fp;
379 	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
380 	    linux_sznonrtsigcode;
381 	regs->tf_eflags &= ~(PSL_T | PSL_VM);
382 	regs->tf_cs = _ucodesel;
383 	regs->tf_ds = _udatasel;
384 	regs->tf_es = _udatasel;
385 	regs->tf_fs = _udatasel;
386 	regs->tf_ss = _udatasel;
387 	PROC_LOCK(p);
388 	mtx_lock(&psp->ps_mtx);
389 }
390 
391 
392 /*
393  * Send an interrupt to process.
394  *
395  * Stack is set up to allow sigcode stored
396  * in u. to call routine, followed by kcall
397  * to sigreturn routine below.  After sigreturn
398  * resets the signal mask, the stack, and the
399  * frame pointer, it returns to the user
400  * specified pc, psl.
401  */
402 static void
403 linux_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
404 {
405 	struct thread *td = curthread;
406 	struct proc *p = td->td_proc;
407 	struct sigacts *psp;
408 	struct trapframe *regs;
409 	struct l_sigframe *fp, frame;
410 	l_sigset_t lmask;
411 	int oonstack, i;
412 
413 	PROC_LOCK_ASSERT(p, MA_OWNED);
414 	psp = p->p_sigacts;
415 	mtx_assert(&psp->ps_mtx, MA_OWNED);
416 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
417 		/* Signal handler installed with SA_SIGINFO. */
418 		linux_rt_sendsig(catcher, sig, mask, code);
419 		return;
420 	}
421 
422 	regs = td->td_frame;
423 	oonstack = sigonstack(regs->tf_esp);
424 
425 #ifdef DEBUG
426 	if (ldebug(sendsig))
427 		printf(ARGS(sendsig, "%p, %d, %p, %lu"),
428 		    catcher, sig, (void*)mask, code);
429 #endif
430 
431 	/*
432 	 * Allocate space for the signal handler context.
433 	 */
434 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
435 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
436 		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
437 		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
438 	} else
439 		fp = (struct l_sigframe *)regs->tf_esp - 1;
440 	mtx_unlock(&psp->ps_mtx);
441 	PROC_UNLOCK(p);
442 
443 	/*
444 	 * Build the argument list for the signal handler.
445 	 */
446 	if (p->p_sysent->sv_sigtbl)
447 		if (sig <= p->p_sysent->sv_sigsize)
448 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
449 
450 	bzero(&frame, sizeof(frame));
451 
452 	frame.sf_handler = catcher;
453 	frame.sf_sig = sig;
454 
455 	bsd_to_linux_sigset(mask, &lmask);
456 
457 	/*
458 	 * Build the signal context to be used by sigreturn.
459 	 */
460 	frame.sf_sc.sc_mask   = lmask.__bits[0];
461 	frame.sf_sc.sc_gs     = rgs();
462 	frame.sf_sc.sc_fs     = regs->tf_fs;
463 	frame.sf_sc.sc_es     = regs->tf_es;
464 	frame.sf_sc.sc_ds     = regs->tf_ds;
465 	frame.sf_sc.sc_edi    = regs->tf_edi;
466 	frame.sf_sc.sc_esi    = regs->tf_esi;
467 	frame.sf_sc.sc_ebp    = regs->tf_ebp;
468 	frame.sf_sc.sc_ebx    = regs->tf_ebx;
469 	frame.sf_sc.sc_edx    = regs->tf_edx;
470 	frame.sf_sc.sc_ecx    = regs->tf_ecx;
471 	frame.sf_sc.sc_eax    = regs->tf_eax;
472 	frame.sf_sc.sc_eip    = regs->tf_eip;
473 	frame.sf_sc.sc_cs     = regs->tf_cs;
474 	frame.sf_sc.sc_eflags = regs->tf_eflags;
475 	frame.sf_sc.sc_esp_at_signal = regs->tf_esp;
476 	frame.sf_sc.sc_ss     = regs->tf_ss;
477 	frame.sf_sc.sc_err    = regs->tf_err;
478 	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
479 
480 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
481 		frame.sf_extramask[i] = lmask.__bits[i+1];
482 
483 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
484 		/*
485 		 * Process has trashed its stack; give it an illegal
486 		 * instruction to halt it in its tracks.
487 		 */
488 		PROC_LOCK(p);
489 		sigexit(td, SIGILL);
490 	}
491 
492 	/*
493 	 * Build context to run handler in.
494 	 */
495 	regs->tf_esp = (int)fp;
496 	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
497 	regs->tf_eflags &= ~(PSL_T | PSL_VM);
498 	regs->tf_cs = _ucodesel;
499 	regs->tf_ds = _udatasel;
500 	regs->tf_es = _udatasel;
501 	regs->tf_fs = _udatasel;
502 	regs->tf_ss = _udatasel;
503 	PROC_LOCK(p);
504 	mtx_lock(&psp->ps_mtx);
505 }
506 
507 /*
508  * System call to cleanup state after a signal
509  * has been taken.  Reset signal mask and
510  * stack state from context left by sendsig (above).
511  * Return to previous pc and psl as specified by
512  * context left by sendsig. Check carefully to
513  * make sure that the user has not modified the
514  * psl to gain improper privileges or to cause
515  * a machine fault.
516  */
517 int
518 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
519 {
520 	struct proc *p = td->td_proc;
521 	struct l_sigframe frame;
522 	struct trapframe *regs;
523 	l_sigset_t lmask;
524 	int eflags, i;
525 
526 	regs = td->td_frame;
527 
528 #ifdef DEBUG
529 	if (ldebug(sigreturn))
530 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
531 #endif
532 	/*
533 	 * The trampoline code hands us the sigframe.
534 	 * It is unsafe to keep track of it ourselves, in the event that a
535 	 * program jumps out of a signal handler.
536 	 */
537 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
538 		return (EFAULT);
539 
540 	/*
541 	 * Check for security violations.
542 	 */
543 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
544 	eflags = frame.sf_sc.sc_eflags;
545 	/*
546 	 * XXX do allow users to change the privileged flag PSL_RF.  The
547 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
548 	 * sometimes set it there too.  tf_eflags is kept in the signal
549 	 * context during signal handling and there is no other place
550 	 * to remember it, so the PSL_RF bit may be corrupted by the
551 	 * signal handler without us knowing.  Corruption of the PSL_RF
552 	 * bit at worst causes one more or one less debugger trap, so
553 	 * allowing it is fairly harmless.
554 	 */
555 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
556 		return(EINVAL);
557 
558 	/*
559 	 * Don't allow users to load a valid privileged %cs.  Let the
560 	 * hardware check for invalid selectors, excess privilege in
561 	 * other selectors, invalid %eip's and invalid %esp's.
562 	 */
563 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
564 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
565 		trapsignal(td, SIGBUS, T_PROTFLT);
566 		return(EINVAL);
567 	}
568 
569 	lmask.__bits[0] = frame.sf_sc.sc_mask;
570 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
571 		lmask.__bits[i+1] = frame.sf_extramask[i];
572 	PROC_LOCK(p);
573 	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
574 	SIG_CANTMASK(td->td_sigmask);
575 	signotify(td);
576 	PROC_UNLOCK(p);
577 
578 	/*
579 	 * Restore signal context.
580 	 */
581 	/* %gs was restored by the trampoline. */
582 	regs->tf_fs     = frame.sf_sc.sc_fs;
583 	regs->tf_es     = frame.sf_sc.sc_es;
584 	regs->tf_ds     = frame.sf_sc.sc_ds;
585 	regs->tf_edi    = frame.sf_sc.sc_edi;
586 	regs->tf_esi    = frame.sf_sc.sc_esi;
587 	regs->tf_ebp    = frame.sf_sc.sc_ebp;
588 	regs->tf_ebx    = frame.sf_sc.sc_ebx;
589 	regs->tf_edx    = frame.sf_sc.sc_edx;
590 	regs->tf_ecx    = frame.sf_sc.sc_ecx;
591 	regs->tf_eax    = frame.sf_sc.sc_eax;
592 	regs->tf_eip    = frame.sf_sc.sc_eip;
593 	regs->tf_cs     = frame.sf_sc.sc_cs;
594 	regs->tf_eflags = eflags;
595 	regs->tf_esp    = frame.sf_sc.sc_esp_at_signal;
596 	regs->tf_ss     = frame.sf_sc.sc_ss;
597 
598 	return (EJUSTRETURN);
599 }
600 
601 /*
602  * System call to cleanup state after a signal
603  * has been taken.  Reset signal mask and
604  * stack state from context left by rt_sendsig (above).
605  * Return to previous pc and psl as specified by
606  * context left by sendsig. Check carefully to
607  * make sure that the user has not modified the
608  * psl to gain improper privileges or to cause
609  * a machine fault.
610  */
611 int
612 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
613 {
614 	struct proc *p = td->td_proc;
615 	struct l_ucontext uc;
616 	struct l_sigcontext *context;
617 	l_stack_t *lss;
618 	stack_t ss;
619 	struct trapframe *regs;
620 	int eflags;
621 
622 	regs = td->td_frame;
623 
624 #ifdef DEBUG
625 	if (ldebug(rt_sigreturn))
626 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
627 #endif
628 	/*
629 	 * The trampoline code hands us the ucontext.
630 	 * It is unsafe to keep track of it ourselves, in the event that a
631 	 * program jumps out of a signal handler.
632 	 */
633 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
634 		return (EFAULT);
635 
636 	context = &uc.uc_mcontext;
637 
638 	/*
639 	 * Check for security violations.
640 	 */
641 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
642 	eflags = context->sc_eflags;
643 	/*
644 	 * XXX do allow users to change the privileged flag PSL_RF.  The
645 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
646 	 * sometimes set it there too.  tf_eflags is kept in the signal
647 	 * context during signal handling and there is no other place
648 	 * to remember it, so the PSL_RF bit may be corrupted by the
649 	 * signal handler without us knowing.  Corruption of the PSL_RF
650 	 * bit at worst causes one more or one less debugger trap, so
651 	 * allowing it is fairly harmless.
652 	 */
653 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
654 		return(EINVAL);
655 
656 	/*
657 	 * Don't allow users to load a valid privileged %cs.  Let the
658 	 * hardware check for invalid selectors, excess privilege in
659 	 * other selectors, invalid %eip's and invalid %esp's.
660 	 */
661 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
662 	if (!CS_SECURE(context->sc_cs)) {
663 		trapsignal(td, SIGBUS, T_PROTFLT);
664 		return(EINVAL);
665 	}
666 
667 	PROC_LOCK(p);
668 	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
669 	SIG_CANTMASK(td->td_sigmask);
670 	signotify(td);
671 	PROC_UNLOCK(p);
672 
673 	/*
674 	 * Restore signal context
675 	 */
676 	/* %gs was restored by the trampoline. */
677 	regs->tf_fs     = context->sc_fs;
678 	regs->tf_es     = context->sc_es;
679 	regs->tf_ds     = context->sc_ds;
680 	regs->tf_edi    = context->sc_edi;
681 	regs->tf_esi    = context->sc_esi;
682 	regs->tf_ebp    = context->sc_ebp;
683 	regs->tf_ebx    = context->sc_ebx;
684 	regs->tf_edx    = context->sc_edx;
685 	regs->tf_ecx    = context->sc_ecx;
686 	regs->tf_eax    = context->sc_eax;
687 	regs->tf_eip    = context->sc_eip;
688 	regs->tf_cs     = context->sc_cs;
689 	regs->tf_eflags = eflags;
690 	regs->tf_esp    = context->sc_esp_at_signal;
691 	regs->tf_ss     = context->sc_ss;
692 
693 	/*
694 	 * call sigaltstack & ignore results..
695 	 */
696 	lss = &uc.uc_stack;
697 	ss.ss_sp = lss->ss_sp;
698 	ss.ss_size = lss->ss_size;
699 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
700 
701 #ifdef DEBUG
702 	if (ldebug(rt_sigreturn))
703 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
704 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
705 #endif
706 	(void)kern_sigaltstack(td, &ss, NULL);
707 
708 	return (EJUSTRETURN);
709 }
710 
711 /*
712  * MPSAFE
713  */
714 static void
715 linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
716 {
717 	args[0] = tf->tf_ebx;
718 	args[1] = tf->tf_ecx;
719 	args[2] = tf->tf_edx;
720 	args[3] = tf->tf_esi;
721 	args[4] = tf->tf_edi;
722 	args[5] = tf->tf_ebp;	/* Unconfirmed */
723 	*params = NULL;		/* no copyin */
724 }
725 
726 
727 
728 /*
729  * Dump core, into a file named as described in the comments for
730  * expand_name(), unless the process was setuid/setgid.
731  */
732 static int
733 linux_aout_coredump(struct thread *td, struct vnode *vp, off_t limit)
734 {
735 	struct proc *p = td->td_proc;
736 	struct ucred *cred = td->td_ucred;
737 	struct vmspace *vm = p->p_vmspace;
738 	char *tempuser;
739 	int error;
740 
741 	if (ctob((uarea_pages + kstack_pages) +
742 	    vm->vm_dsize + vm->vm_ssize) >= limit)
743 		return (EFAULT);
744 	tempuser = malloc(ctob(uarea_pages + kstack_pages), M_TEMP,
745 	    M_WAITOK | M_ZERO);
746 	if (tempuser == NULL)
747 		return (ENOMEM);
748 	PROC_LOCK(p);
749 	fill_kinfo_proc(p, &p->p_uarea->u_kproc);
750 	PROC_UNLOCK(p);
751 	bcopy(p->p_uarea, tempuser, sizeof(struct user));
752 	bcopy(td->td_frame,
753 	    tempuser + ctob(uarea_pages) +
754 	    ((caddr_t)td->td_frame - (caddr_t)td->td_kstack),
755 	    sizeof(struct trapframe));
756 	error = vn_rdwr(UIO_WRITE, vp, (caddr_t)tempuser,
757 	    ctob(uarea_pages + kstack_pages),
758 	    (off_t)0, UIO_SYSSPACE, IO_UNIT, cred, NOCRED,
759 	    (int *)NULL, td);
760 	free(tempuser, M_TEMP);
761 	if (error == 0)
762 		error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr,
763 		    (int)ctob(vm->vm_dsize),
764 		    (off_t)ctob(uarea_pages + kstack_pages), UIO_USERSPACE,
765 		    IO_UNIT | IO_DIRECT, cred, NOCRED, (int *) NULL, td);
766 	if (error == 0)
767 		error = vn_rdwr_inchunks(UIO_WRITE, vp,
768 		    (caddr_t)trunc_page(USRSTACK - ctob(vm->vm_ssize)),
769 		    round_page(ctob(vm->vm_ssize)),
770 		    (off_t)ctob(uarea_pages + kstack_pages) +
771 			ctob(vm->vm_dsize), UIO_USERSPACE,
772 		    IO_UNIT | IO_DIRECT, cred, NOCRED, NULL, td);
773 	return (error);
774 }
775 /*
776  * If a linux binary is exec'ing something, try this image activator
777  * first.  We override standard shell script execution in order to
778  * be able to modify the interpreter path.  We only do this if a linux
779  * binary is doing the exec, so we do not create an EXEC module for it.
780  */
781 static int	exec_linux_imgact_try(struct image_params *iparams);
782 
783 static int
784 exec_linux_imgact_try(struct image_params *imgp)
785 {
786     const char *head = (const char *)imgp->image_header;
787     int error = -1;
788 
789     /*
790      * The interpreter for shell scripts run from a linux binary needs
791      * to be located in /compat/linux if possible in order to recursively
792      * maintain linux path emulation.
793      */
794     if (((const short *)head)[0] == SHELLMAGIC) {
795 	    /*
796 	     * Run our normal shell image activator.  If it succeeds attempt
797 	     * to use the alternate path for the interpreter.  If an alternate
798 	     * path is found, use our stringspace to store it.
799 	     */
800 	    if ((error = exec_shell_imgact(imgp)) == 0) {
801 		    char *rpath = NULL;
802 
803 		    linux_emul_find(FIRST_THREAD_IN_PROC(imgp->proc), NULL,
804 			imgp->interpreter_name, &rpath, 0);
805 		    if (rpath != imgp->interpreter_name) {
806 			    int len = strlen(rpath) + 1;
807 
808 			    if (len <= MAXSHELLCMDLEN) {
809 				    memcpy(imgp->interpreter_name, rpath, len);
810 			    }
811 			    free(rpath, M_TEMP);
812 		    }
813 	    }
814     }
815     return(error);
816 }
817 
818 /*
819  * exec_setregs may initialize some registers differently than Linux
820  * does, thus potentially confusing Linux binaries. If necessary, we
821  * override the exec_setregs default(s) here.
822  */
823 static void
824 exec_linux_setregs(struct thread *td, u_long entry,
825 		   u_long stack, u_long ps_strings)
826 {
827 	struct pcb *pcb = td->td_pcb;
828 
829 	exec_setregs(td, entry, stack, ps_strings);
830 
831 	/* Linux sets %gs to 0, we default to _udatasel */
832 	pcb->pcb_gs = 0; load_gs(0);
833 }
834 
835 struct sysentvec linux_sysvec = {
836 	LINUX_SYS_MAXSYSCALL,
837 	linux_sysent,
838 	0xff,
839 	LINUX_SIGTBLSZ,
840 	bsd_to_linux_signal,
841 	ELAST + 1,
842 	bsd_to_linux_errno,
843 	translate_traps,
844 	linux_fixup,
845 	linux_sendsig,
846 	linux_sigcode,
847 	&linux_szsigcode,
848 	linux_prepsyscall,
849 	"Linux a.out",
850 	linux_aout_coredump,
851 	exec_linux_imgact_try,
852 	LINUX_MINSIGSTKSZ,
853 	PAGE_SIZE,
854 	VM_MIN_ADDRESS,
855 	VM_MAXUSER_ADDRESS,
856 	USRSTACK,
857 	PS_STRINGS,
858 	VM_PROT_ALL,
859 	exec_copyout_strings,
860 	exec_linux_setregs,
861 	NULL
862 };
863 
864 struct sysentvec elf_linux_sysvec = {
865 	LINUX_SYS_MAXSYSCALL,
866 	linux_sysent,
867 	0xff,
868 	LINUX_SIGTBLSZ,
869 	bsd_to_linux_signal,
870 	ELAST + 1,
871 	bsd_to_linux_errno,
872 	translate_traps,
873 	elf_linux_fixup,
874 	linux_sendsig,
875 	linux_sigcode,
876 	&linux_szsigcode,
877 	linux_prepsyscall,
878 	"Linux ELF",
879 	elf32_coredump,
880 	exec_linux_imgact_try,
881 	LINUX_MINSIGSTKSZ,
882 	PAGE_SIZE,
883 	VM_MIN_ADDRESS,
884 	VM_MAXUSER_ADDRESS,
885 	USRSTACK,
886 	PS_STRINGS,
887 	VM_PROT_ALL,
888 	exec_copyout_strings,
889 	exec_linux_setregs,
890 	NULL
891 };
892 
893 static Elf32_Brandinfo linux_brand = {
894 					ELFOSABI_LINUX,
895 					EM_386,
896 					"Linux",
897 					"/compat/linux",
898 					"/lib/ld-linux.so.1",
899 					&elf_linux_sysvec,
900 					NULL,
901 				 };
902 
903 static Elf32_Brandinfo linux_glibc2brand = {
904 					ELFOSABI_LINUX,
905 					EM_386,
906 					"Linux",
907 					"/compat/linux",
908 					"/lib/ld-linux.so.2",
909 					&elf_linux_sysvec,
910 					NULL,
911 				 };
912 
913 Elf32_Brandinfo *linux_brandlist[] = {
914 					&linux_brand,
915 					&linux_glibc2brand,
916 					NULL
917 				};
918 
919 static int
920 linux_elf_modevent(module_t mod, int type, void *data)
921 {
922 	Elf32_Brandinfo **brandinfo;
923 	int error;
924 	struct linux_ioctl_handler **lihp;
925 
926 	error = 0;
927 
928 	switch(type) {
929 	case MOD_LOAD:
930 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
931 		     ++brandinfo)
932 			if (elf32_insert_brand_entry(*brandinfo) < 0)
933 				error = EINVAL;
934 		if (error == 0) {
935 			SET_FOREACH(lihp, linux_ioctl_handler_set)
936 				linux_ioctl_register_handler(*lihp);
937 			if (bootverbose)
938 				printf("Linux ELF exec handler installed\n");
939 		} else
940 			printf("cannot insert Linux ELF brand handler\n");
941 		break;
942 	case MOD_UNLOAD:
943 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
944 		     ++brandinfo)
945 			if (elf32_brand_inuse(*brandinfo))
946 				error = EBUSY;
947 		if (error == 0) {
948 			for (brandinfo = &linux_brandlist[0];
949 			     *brandinfo != NULL; ++brandinfo)
950 				if (elf32_remove_brand_entry(*brandinfo) < 0)
951 					error = EINVAL;
952 		}
953 		if (error == 0) {
954 			SET_FOREACH(lihp, linux_ioctl_handler_set)
955 				linux_ioctl_unregister_handler(*lihp);
956 			if (bootverbose)
957 				printf("Linux ELF exec handler removed\n");
958 			linux_mib_destroy();
959 		} else
960 			printf("Could not deinstall ELF interpreter entry\n");
961 		break;
962 	default:
963 		return EOPNOTSUPP;
964 	}
965 	return error;
966 }
967 
968 static moduledata_t linux_elf_mod = {
969 	"linuxelf",
970 	linux_elf_modevent,
971 	0
972 };
973 
974 DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
975