xref: /freebsd/sys/i386/linux/linux_sysvec.c (revision 7b54cdda4e5d7bf71170366d59f76889ead62c9e)
1 /*-
2  * Copyright (c) 1994-1996 Søren Schmidt
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer
10  *    in this position and unchanged.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. The name of the author may not be used to endorse or promote products
15  *    derived from this software without specific prior written permission
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 /* XXX we use functions that might not exist. */
33 #include "opt_compat.h"
34 
35 #ifndef COMPAT_43
36 #error "Unable to compile Linux-emulator due to missing COMPAT_43 option!"
37 #endif
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/exec.h>
42 #include <sys/imgact.h>
43 #include <sys/imgact_aout.h>
44 #include <sys/imgact_elf.h>
45 #include <sys/kernel.h>
46 #include <sys/lock.h>
47 #include <sys/malloc.h>
48 #include <sys/module.h>
49 #include <sys/mutex.h>
50 #include <sys/proc.h>
51 #include <sys/signalvar.h>
52 #include <sys/syscallsubr.h>
53 #include <sys/sysent.h>
54 #include <sys/sysproto.h>
55 #include <sys/user.h>
56 #include <sys/vnode.h>
57 
58 #include <vm/vm.h>
59 #include <vm/pmap.h>
60 #include <vm/vm_extern.h>
61 #include <vm/vm_map.h>
62 #include <vm/vm_object.h>
63 #include <vm/vm_page.h>
64 #include <vm/vm_param.h>
65 
66 #include <machine/cpu.h>
67 #include <machine/md_var.h>
68 
69 #include <i386/linux/linux.h>
70 #include <i386/linux/linux_proto.h>
71 #include <compat/linux/linux_mib.h>
72 #include <compat/linux/linux_signal.h>
73 #include <compat/linux/linux_util.h>
74 
75 MODULE_VERSION(linux, 1);
76 MODULE_DEPEND(linux, sysvmsg, 1, 1, 1);
77 MODULE_DEPEND(linux, sysvsem, 1, 1, 1);
78 MODULE_DEPEND(linux, sysvshm, 1, 1, 1);
79 
80 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
81 
82 #if BYTE_ORDER == LITTLE_ENDIAN
83 #define SHELLMAGIC      0x2123 /* #! */
84 #else
85 #define SHELLMAGIC      0x2321
86 #endif
87 
88 /*
89  * Allow the sendsig functions to use the ldebug() facility
90  * even though they are not syscalls themselves. Map them
91  * to syscall 0. This is slightly less bogus than using
92  * ldebug(sigreturn).
93  */
94 #define	LINUX_SYS_linux_rt_sendsig	0
95 #define	LINUX_SYS_linux_sendsig		0
96 
97 #define	uarea_pages	1
98 
99 extern char linux_sigcode[];
100 extern int linux_szsigcode;
101 
102 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
103 
104 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
105 
106 static int	linux_fixup(register_t **stack_base,
107 		    struct image_params *iparams);
108 static int	elf_linux_fixup(register_t **stack_base,
109 		    struct image_params *iparams);
110 static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
111 		    caddr_t *params);
112 static void     linux_sendsig(sig_t catcher, int sig, sigset_t *mask,
113 		    u_long code);
114 static void	exec_linux_setregs(struct thread *td, u_long entry,
115 				   u_long stack, u_long ps_strings);
116 
117 /*
118  * Linux syscalls return negative errno's, we do positive and map them
119  */
120 static int bsd_to_linux_errno[ELAST + 1] = {
121 	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
122 	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
123 	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
124 	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
125 	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
126 	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
127 	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
128 	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
129 	-6, -6, -43, -42, -75, -6, -84
130 };
131 
132 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
133 	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
134 	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
135 	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
136 	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
137 	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
138 	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
139 	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
140 	0, LINUX_SIGUSR1, LINUX_SIGUSR2
141 };
142 
143 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
144 	SIGHUP, SIGINT, SIGQUIT, SIGILL,
145 	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
146 	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
147 	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
148 	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
149 	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
150 	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
151 	SIGIO, SIGURG, SIGSYS
152 };
153 
/* Value reported for any trap code that has no entry in the table below. */
#define LINUX_T_UNKNOWN  255

/*
 * Trap number translation table, indexed by the native trap code.  The
 * table is never written to, so it is const.
 */
static const int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/* 0 */
	6,			/* 1  T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/* 2 */
	3,			/* 3  T_BPTFLT */
	LINUX_T_UNKNOWN,	/* 4 */
	LINUX_T_UNKNOWN,	/* 5 */
	16,			/* 6  T_ARITHTRAP */
	254,			/* 7  T_ASTFLT */
	LINUX_T_UNKNOWN,	/* 8 */
	13,			/* 9  T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};

/*
 * Translate a native trap code into the Linux trap number.
 *
 * Codes beyond the end of the table yield LINUX_T_UNKNOWN.  The argument
 * is unsigned, so a negative value passed by a caller converts to a
 * large number and also lands on LINUX_T_UNKNOWN.  This is a function
 * rather than a macro so the argument is evaluated exactly once.
 */
static __inline int
bsd_to_linux_trapcode(unsigned long code)
{
	if (code < sizeof(_bsd_to_linux_trapcode) /
	    sizeof(_bsd_to_linux_trapcode[0]))
		return (_bsd_to_linux_trapcode[code]);
	return (LINUX_T_UNKNOWN);
}
192 
193 /*
194  * If FreeBSD & Linux have a difference of opinion about what a trap
195  * means, deal with it here.
196  *
197  * MPSAFE
198  */
199 static int
200 translate_traps(int signal, int trap_code)
201 {
202 	if (signal != SIGBUS)
203 		return signal;
204 	switch (trap_code) {
205 	case T_PROTFLT:
206 	case T_TSSFLT:
207 	case T_DOUBLEFLT:
208 	case T_PAGEFLT:
209 		return SIGSEGV;
210 	default:
211 		return signal;
212 	}
213 }
214 
215 static int
216 linux_fixup(register_t **stack_base, struct image_params *imgp)
217 {
218 	register_t *argv, *envp;
219 
220 	argv = *stack_base;
221 	envp = *stack_base + (imgp->argc + 1);
222 	(*stack_base)--;
223 	**stack_base = (intptr_t)(void *)envp;
224 	(*stack_base)--;
225 	**stack_base = (intptr_t)(void *)argv;
226 	(*stack_base)--;
227 	**stack_base = imgp->argc;
228 	return 0;
229 }
230 
231 static int
232 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
233 {
234 	Elf32_Auxargs *args;
235 	register_t *pos;
236 
237 	KASSERT(curthread->td_proc == imgp->proc &&
238 	    (curthread->td_proc->p_flag & P_SA) == 0,
239 	    ("unsafe elf_linux_fixup(), should be curproc"));
240 	args = (Elf32_Auxargs *)imgp->auxargs;
241 	pos = *stack_base + (imgp->argc + imgp->envc + 2);
242 
243 	if (args->trace)
244 		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
245 	if (args->execfd != -1)
246 		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
247 	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
248 	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
249 	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
250 	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
251 	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
252 	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
253 	AUXARGS_ENTRY(pos, AT_BASE, args->base);
254 	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
255 	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
256 	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
257 	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
258 	AUXARGS_ENTRY(pos, AT_NULL, 0);
259 
260 	free(imgp->auxargs, M_TEMP);
261 	imgp->auxargs = NULL;
262 
263 	(*stack_base)--;
264 	**stack_base = (register_t)imgp->argc;
265 	return 0;
266 }
267 
268 extern int _ucodesel, _udatasel;
269 extern unsigned long linux_sznonrtsigcode;
270 
271 static void
272 linux_rt_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
273 {
274 	struct thread *td = curthread;
275 	struct proc *p = td->td_proc;
276 	struct sigacts *psp;
277 	struct trapframe *regs;
278 	struct l_rt_sigframe *fp, frame;
279 	int oonstack;
280 
281 	PROC_LOCK_ASSERT(p, MA_OWNED);
282 	psp = p->p_sigacts;
283 	mtx_assert(&psp->ps_mtx, MA_OWNED);
284 	regs = td->td_frame;
285 	oonstack = sigonstack(regs->tf_esp);
286 
287 #ifdef DEBUG
288 	if (ldebug(rt_sendsig))
289 		printf(ARGS(rt_sendsig, "%p, %d, %p, %lu"),
290 		    catcher, sig, (void*)mask, code);
291 #endif
292 	/*
293 	 * Allocate space for the signal handler context.
294 	 */
295 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
296 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
297 		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
298 		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
299 	} else
300 		fp = (struct l_rt_sigframe *)regs->tf_esp - 1;
301 	mtx_unlock(&psp->ps_mtx);
302 
303 	/*
304 	 * Build the argument list for the signal handler.
305 	 */
306 	if (p->p_sysent->sv_sigtbl)
307 		if (sig <= p->p_sysent->sv_sigsize)
308 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
309 
310 	bzero(&frame, sizeof(frame));
311 
312 	frame.sf_handler = catcher;
313 	frame.sf_sig = sig;
314 	frame.sf_siginfo = &fp->sf_si;
315 	frame.sf_ucontext = &fp->sf_sc;
316 
317 	/* Fill in POSIX parts */
318 	frame.sf_si.lsi_signo = sig;
319 	frame.sf_si.lsi_code = code;
320 	frame.sf_si.lsi_addr = (void *)regs->tf_err;
321 
322 	/*
323 	 * Build the signal context to be used by sigreturn.
324 	 */
325 	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
326 	frame.sf_sc.uc_link = NULL;		/* XXX ??? */
327 
328 	frame.sf_sc.uc_stack.ss_sp = td->td_sigstk.ss_sp;
329 	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
330 	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
331 	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
332 	PROC_UNLOCK(p);
333 
334 	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
335 
336 	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
337 	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
338 	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
339 	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
340 	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
341 	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_edi;
342 	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_esi;
343 	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_ebp;
344 	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_ebx;
345 	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_edx;
346 	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_ecx;
347 	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_eax;
348 	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_eip;
349 	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
350 	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags;
351 	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp;
352 	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
353 	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
354 	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
355 
356 #ifdef DEBUG
357 	if (ldebug(rt_sendsig))
358 		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
359 		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
360 		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
361 #endif
362 
363 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
364 		/*
365 		 * Process has trashed its stack; give it an illegal
366 		 * instruction to halt it in its tracks.
367 		 */
368 #ifdef DEBUG
369 		if (ldebug(rt_sendsig))
370 			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
371 			    fp, oonstack);
372 #endif
373 		PROC_LOCK(p);
374 		sigexit(td, SIGILL);
375 	}
376 
377 	/*
378 	 * Build context to run handler in.
379 	 */
380 	regs->tf_esp = (int)fp;
381 	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
382 	    linux_sznonrtsigcode;
383 	regs->tf_eflags &= ~(PSL_T | PSL_VM);
384 	regs->tf_cs = _ucodesel;
385 	regs->tf_ds = _udatasel;
386 	regs->tf_es = _udatasel;
387 	regs->tf_fs = _udatasel;
388 	regs->tf_ss = _udatasel;
389 	PROC_LOCK(p);
390 	mtx_lock(&psp->ps_mtx);
391 }
392 
393 
394 /*
395  * Send an interrupt to process.
396  *
397  * Stack is set up to allow sigcode stored
398  * in u. to call routine, followed by kcall
399  * to sigreturn routine below.  After sigreturn
400  * resets the signal mask, the stack, and the
401  * frame pointer, it returns to the user
402  * specified pc, psl.
403  */
404 static void
405 linux_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
406 {
407 	struct thread *td = curthread;
408 	struct proc *p = td->td_proc;
409 	struct sigacts *psp;
410 	struct trapframe *regs;
411 	struct l_sigframe *fp, frame;
412 	l_sigset_t lmask;
413 	int oonstack, i;
414 
415 	PROC_LOCK_ASSERT(p, MA_OWNED);
416 	psp = p->p_sigacts;
417 	mtx_assert(&psp->ps_mtx, MA_OWNED);
418 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
419 		/* Signal handler installed with SA_SIGINFO. */
420 		linux_rt_sendsig(catcher, sig, mask, code);
421 		return;
422 	}
423 
424 	regs = td->td_frame;
425 	oonstack = sigonstack(regs->tf_esp);
426 
427 #ifdef DEBUG
428 	if (ldebug(sendsig))
429 		printf(ARGS(sendsig, "%p, %d, %p, %lu"),
430 		    catcher, sig, (void*)mask, code);
431 #endif
432 
433 	/*
434 	 * Allocate space for the signal handler context.
435 	 */
436 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
437 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
438 		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
439 		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
440 	} else
441 		fp = (struct l_sigframe *)regs->tf_esp - 1;
442 	mtx_unlock(&psp->ps_mtx);
443 	PROC_UNLOCK(p);
444 
445 	/*
446 	 * Build the argument list for the signal handler.
447 	 */
448 	if (p->p_sysent->sv_sigtbl)
449 		if (sig <= p->p_sysent->sv_sigsize)
450 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
451 
452 	bzero(&frame, sizeof(frame));
453 
454 	frame.sf_handler = catcher;
455 	frame.sf_sig = sig;
456 
457 	bsd_to_linux_sigset(mask, &lmask);
458 
459 	/*
460 	 * Build the signal context to be used by sigreturn.
461 	 */
462 	frame.sf_sc.sc_mask   = lmask.__bits[0];
463 	frame.sf_sc.sc_gs     = rgs();
464 	frame.sf_sc.sc_fs     = regs->tf_fs;
465 	frame.sf_sc.sc_es     = regs->tf_es;
466 	frame.sf_sc.sc_ds     = regs->tf_ds;
467 	frame.sf_sc.sc_edi    = regs->tf_edi;
468 	frame.sf_sc.sc_esi    = regs->tf_esi;
469 	frame.sf_sc.sc_ebp    = regs->tf_ebp;
470 	frame.sf_sc.sc_ebx    = regs->tf_ebx;
471 	frame.sf_sc.sc_edx    = regs->tf_edx;
472 	frame.sf_sc.sc_ecx    = regs->tf_ecx;
473 	frame.sf_sc.sc_eax    = regs->tf_eax;
474 	frame.sf_sc.sc_eip    = regs->tf_eip;
475 	frame.sf_sc.sc_cs     = regs->tf_cs;
476 	frame.sf_sc.sc_eflags = regs->tf_eflags;
477 	frame.sf_sc.sc_esp_at_signal = regs->tf_esp;
478 	frame.sf_sc.sc_ss     = regs->tf_ss;
479 	frame.sf_sc.sc_err    = regs->tf_err;
480 	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
481 
482 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
483 		frame.sf_extramask[i] = lmask.__bits[i+1];
484 
485 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
486 		/*
487 		 * Process has trashed its stack; give it an illegal
488 		 * instruction to halt it in its tracks.
489 		 */
490 		PROC_LOCK(p);
491 		sigexit(td, SIGILL);
492 	}
493 
494 	/*
495 	 * Build context to run handler in.
496 	 */
497 	regs->tf_esp = (int)fp;
498 	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
499 	regs->tf_eflags &= ~(PSL_T | PSL_VM);
500 	regs->tf_cs = _ucodesel;
501 	regs->tf_ds = _udatasel;
502 	regs->tf_es = _udatasel;
503 	regs->tf_fs = _udatasel;
504 	regs->tf_ss = _udatasel;
505 	PROC_LOCK(p);
506 	mtx_lock(&psp->ps_mtx);
507 }
508 
509 /*
510  * System call to cleanup state after a signal
511  * has been taken.  Reset signal mask and
512  * stack state from context left by sendsig (above).
513  * Return to previous pc and psl as specified by
514  * context left by sendsig. Check carefully to
515  * make sure that the user has not modified the
516  * psl to gain improper privileges or to cause
517  * a machine fault.
518  */
519 int
520 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
521 {
522 	struct proc *p = td->td_proc;
523 	struct l_sigframe frame;
524 	struct trapframe *regs;
525 	l_sigset_t lmask;
526 	int eflags, i;
527 
528 	regs = td->td_frame;
529 
530 #ifdef DEBUG
531 	if (ldebug(sigreturn))
532 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
533 #endif
534 	/*
535 	 * The trampoline code hands us the sigframe.
536 	 * It is unsafe to keep track of it ourselves, in the event that a
537 	 * program jumps out of a signal handler.
538 	 */
539 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
540 		return (EFAULT);
541 
542 	/*
543 	 * Check for security violations.
544 	 */
545 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
546 	eflags = frame.sf_sc.sc_eflags;
547 	/*
548 	 * XXX do allow users to change the privileged flag PSL_RF.  The
549 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
550 	 * sometimes set it there too.  tf_eflags is kept in the signal
551 	 * context during signal handling and there is no other place
552 	 * to remember it, so the PSL_RF bit may be corrupted by the
553 	 * signal handler without us knowing.  Corruption of the PSL_RF
554 	 * bit at worst causes one more or one less debugger trap, so
555 	 * allowing it is fairly harmless.
556 	 */
557 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
558 		return(EINVAL);
559 
560 	/*
561 	 * Don't allow users to load a valid privileged %cs.  Let the
562 	 * hardware check for invalid selectors, excess privilege in
563 	 * other selectors, invalid %eip's and invalid %esp's.
564 	 */
565 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
566 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
567 		trapsignal(td, SIGBUS, T_PROTFLT);
568 		return(EINVAL);
569 	}
570 
571 	lmask.__bits[0] = frame.sf_sc.sc_mask;
572 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
573 		lmask.__bits[i+1] = frame.sf_extramask[i];
574 	PROC_LOCK(p);
575 	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
576 	SIG_CANTMASK(td->td_sigmask);
577 	signotify(td);
578 	PROC_UNLOCK(p);
579 
580 	/*
581 	 * Restore signal context.
582 	 */
583 	/* %gs was restored by the trampoline. */
584 	regs->tf_fs     = frame.sf_sc.sc_fs;
585 	regs->tf_es     = frame.sf_sc.sc_es;
586 	regs->tf_ds     = frame.sf_sc.sc_ds;
587 	regs->tf_edi    = frame.sf_sc.sc_edi;
588 	regs->tf_esi    = frame.sf_sc.sc_esi;
589 	regs->tf_ebp    = frame.sf_sc.sc_ebp;
590 	regs->tf_ebx    = frame.sf_sc.sc_ebx;
591 	regs->tf_edx    = frame.sf_sc.sc_edx;
592 	regs->tf_ecx    = frame.sf_sc.sc_ecx;
593 	regs->tf_eax    = frame.sf_sc.sc_eax;
594 	regs->tf_eip    = frame.sf_sc.sc_eip;
595 	regs->tf_cs     = frame.sf_sc.sc_cs;
596 	regs->tf_eflags = eflags;
597 	regs->tf_esp    = frame.sf_sc.sc_esp_at_signal;
598 	regs->tf_ss     = frame.sf_sc.sc_ss;
599 
600 	return (EJUSTRETURN);
601 }
602 
603 /*
604  * System call to cleanup state after a signal
605  * has been taken.  Reset signal mask and
606  * stack state from context left by rt_sendsig (above).
607  * Return to previous pc and psl as specified by
608  * context left by sendsig. Check carefully to
609  * make sure that the user has not modified the
610  * psl to gain improper privileges or to cause
611  * a machine fault.
612  */
613 int
614 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
615 {
616 	struct proc *p = td->td_proc;
617 	struct l_ucontext uc;
618 	struct l_sigcontext *context;
619 	l_stack_t *lss;
620 	stack_t ss;
621 	struct trapframe *regs;
622 	int eflags;
623 
624 	regs = td->td_frame;
625 
626 #ifdef DEBUG
627 	if (ldebug(rt_sigreturn))
628 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
629 #endif
630 	/*
631 	 * The trampoline code hands us the ucontext.
632 	 * It is unsafe to keep track of it ourselves, in the event that a
633 	 * program jumps out of a signal handler.
634 	 */
635 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
636 		return (EFAULT);
637 
638 	context = &uc.uc_mcontext;
639 
640 	/*
641 	 * Check for security violations.
642 	 */
643 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
644 	eflags = context->sc_eflags;
645 	/*
646 	 * XXX do allow users to change the privileged flag PSL_RF.  The
647 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
648 	 * sometimes set it there too.  tf_eflags is kept in the signal
649 	 * context during signal handling and there is no other place
650 	 * to remember it, so the PSL_RF bit may be corrupted by the
651 	 * signal handler without us knowing.  Corruption of the PSL_RF
652 	 * bit at worst causes one more or one less debugger trap, so
653 	 * allowing it is fairly harmless.
654 	 */
655 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
656 		return(EINVAL);
657 
658 	/*
659 	 * Don't allow users to load a valid privileged %cs.  Let the
660 	 * hardware check for invalid selectors, excess privilege in
661 	 * other selectors, invalid %eip's and invalid %esp's.
662 	 */
663 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
664 	if (!CS_SECURE(context->sc_cs)) {
665 		trapsignal(td, SIGBUS, T_PROTFLT);
666 		return(EINVAL);
667 	}
668 
669 	PROC_LOCK(p);
670 	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
671 	SIG_CANTMASK(td->td_sigmask);
672 	signotify(td);
673 	PROC_UNLOCK(p);
674 
675 	/*
676 	 * Restore signal context
677 	 */
678 	/* %gs was restored by the trampoline. */
679 	regs->tf_fs     = context->sc_fs;
680 	regs->tf_es     = context->sc_es;
681 	regs->tf_ds     = context->sc_ds;
682 	regs->tf_edi    = context->sc_edi;
683 	regs->tf_esi    = context->sc_esi;
684 	regs->tf_ebp    = context->sc_ebp;
685 	regs->tf_ebx    = context->sc_ebx;
686 	regs->tf_edx    = context->sc_edx;
687 	regs->tf_ecx    = context->sc_ecx;
688 	regs->tf_eax    = context->sc_eax;
689 	regs->tf_eip    = context->sc_eip;
690 	regs->tf_cs     = context->sc_cs;
691 	regs->tf_eflags = eflags;
692 	regs->tf_esp    = context->sc_esp_at_signal;
693 	regs->tf_ss     = context->sc_ss;
694 
695 	/*
696 	 * call sigaltstack & ignore results..
697 	 */
698 	lss = &uc.uc_stack;
699 	ss.ss_sp = lss->ss_sp;
700 	ss.ss_size = lss->ss_size;
701 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
702 
703 #ifdef DEBUG
704 	if (ldebug(rt_sigreturn))
705 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
706 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
707 #endif
708 	(void)kern_sigaltstack(td, &ss, NULL);
709 
710 	return (EJUSTRETURN);
711 }
712 
713 /*
714  * MPSAFE
715  */
716 static void
717 linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
718 {
719 	args[0] = tf->tf_ebx;
720 	args[1] = tf->tf_ecx;
721 	args[2] = tf->tf_edx;
722 	args[3] = tf->tf_esi;
723 	args[4] = tf->tf_edi;
724 	args[5] = tf->tf_ebp;	/* Unconfirmed */
725 	*params = NULL;		/* no copyin */
726 }
727 
728 
729 
730 /*
731  * Dump core, into a file named as described in the comments for
732  * expand_name(), unless the process was setuid/setgid.
733  */
734 static int
735 linux_aout_coredump(struct thread *td, struct vnode *vp, off_t limit)
736 {
737 	struct proc *p = td->td_proc;
738 	struct ucred *cred = td->td_ucred;
739 	struct vmspace *vm = p->p_vmspace;
740 	char *tempuser;
741 	int error;
742 
743 	if (ctob((uarea_pages + kstack_pages) +
744 	    vm->vm_dsize + vm->vm_ssize) >= limit)
745 		return (EFAULT);
746 	tempuser = malloc(ctob(uarea_pages + kstack_pages), M_TEMP,
747 	    M_WAITOK | M_ZERO);
748 	if (tempuser == NULL)
749 		return (ENOMEM);
750 	PROC_LOCK(p);
751 	fill_user(p, (struct user *)tempuser);
752 	PROC_UNLOCK(p);
753 	bcopy(td->td_frame,
754 	    tempuser + ctob(uarea_pages) +
755 	    ((caddr_t)td->td_frame - (caddr_t)td->td_kstack),
756 	    sizeof(struct trapframe));
757 	error = vn_rdwr(UIO_WRITE, vp, (caddr_t)tempuser,
758 	    ctob(uarea_pages + kstack_pages),
759 	    (off_t)0, UIO_SYSSPACE, IO_UNIT, cred, NOCRED,
760 	    (int *)NULL, td);
761 	free(tempuser, M_TEMP);
762 	if (error == 0)
763 		error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr,
764 		    (int)ctob(vm->vm_dsize),
765 		    (off_t)ctob(uarea_pages + kstack_pages), UIO_USERSPACE,
766 		    IO_UNIT | IO_DIRECT, cred, NOCRED, (int *) NULL, td);
767 	if (error == 0)
768 		error = vn_rdwr_inchunks(UIO_WRITE, vp,
769 		    (caddr_t)trunc_page(USRSTACK - ctob(vm->vm_ssize)),
770 		    round_page(ctob(vm->vm_ssize)),
771 		    (off_t)ctob(uarea_pages + kstack_pages) +
772 			ctob(vm->vm_dsize), UIO_USERSPACE,
773 		    IO_UNIT | IO_DIRECT, cred, NOCRED, NULL, td);
774 	return (error);
775 }
776 /*
777  * If a linux binary is exec'ing something, try this image activator
778  * first.  We override standard shell script execution in order to
779  * be able to modify the interpreter path.  We only do this if a linux
780  * binary is doing the exec, so we do not create an EXEC module for it.
781  */
782 static int	exec_linux_imgact_try(struct image_params *iparams);
783 
784 static int
785 exec_linux_imgact_try(struct image_params *imgp)
786 {
787     const char *head = (const char *)imgp->image_header;
788     int error = -1;
789 
790     /*
791      * The interpreter for shell scripts run from a linux binary needs
792      * to be located in /compat/linux if possible in order to recursively
793      * maintain linux path emulation.
794      */
795     if (((const short *)head)[0] == SHELLMAGIC) {
796 	    /*
797 	     * Run our normal shell image activator.  If it succeeds attempt
798 	     * to use the alternate path for the interpreter.  If an alternate
799 	     * path is found, use our stringspace to store it.
800 	     */
801 	    if ((error = exec_shell_imgact(imgp)) == 0) {
802 		    char *rpath = NULL;
803 
804 		    linux_emul_find(FIRST_THREAD_IN_PROC(imgp->proc), NULL,
805 			imgp->interpreter_name, &rpath, 0);
806 		    if (rpath != imgp->interpreter_name) {
807 			    int len = strlen(rpath) + 1;
808 
809 			    if (len <= MAXSHELLCMDLEN) {
810 				    memcpy(imgp->interpreter_name, rpath, len);
811 			    }
812 			    free(rpath, M_TEMP);
813 		    }
814 	    }
815     }
816     return(error);
817 }
818 
819 /*
820  * exec_setregs may initialize some registers differently than Linux
821  * does, thus potentially confusing Linux binaries. If necessary, we
822  * override the exec_setregs default(s) here.
823  */
824 static void
825 exec_linux_setregs(struct thread *td, u_long entry,
826 		   u_long stack, u_long ps_strings)
827 {
828 	struct pcb *pcb = td->td_pcb;
829 
830 	exec_setregs(td, entry, stack, ps_strings);
831 
832 	/* Linux sets %gs to 0, we default to _udatasel */
833 	pcb->pcb_gs = 0; load_gs(0);
834 }
835 
836 struct sysentvec linux_sysvec = {
837 	LINUX_SYS_MAXSYSCALL,
838 	linux_sysent,
839 	0xff,
840 	LINUX_SIGTBLSZ,
841 	bsd_to_linux_signal,
842 	ELAST + 1,
843 	bsd_to_linux_errno,
844 	translate_traps,
845 	linux_fixup,
846 	linux_sendsig,
847 	linux_sigcode,
848 	&linux_szsigcode,
849 	linux_prepsyscall,
850 	"Linux a.out",
851 	linux_aout_coredump,
852 	exec_linux_imgact_try,
853 	LINUX_MINSIGSTKSZ,
854 	PAGE_SIZE,
855 	VM_MIN_ADDRESS,
856 	VM_MAXUSER_ADDRESS,
857 	USRSTACK,
858 	PS_STRINGS,
859 	VM_PROT_ALL,
860 	exec_copyout_strings,
861 	exec_linux_setregs,
862 	NULL
863 };
864 
865 struct sysentvec elf_linux_sysvec = {
866 	LINUX_SYS_MAXSYSCALL,
867 	linux_sysent,
868 	0xff,
869 	LINUX_SIGTBLSZ,
870 	bsd_to_linux_signal,
871 	ELAST + 1,
872 	bsd_to_linux_errno,
873 	translate_traps,
874 	elf_linux_fixup,
875 	linux_sendsig,
876 	linux_sigcode,
877 	&linux_szsigcode,
878 	linux_prepsyscall,
879 	"Linux ELF",
880 	elf32_coredump,
881 	exec_linux_imgact_try,
882 	LINUX_MINSIGSTKSZ,
883 	PAGE_SIZE,
884 	VM_MIN_ADDRESS,
885 	VM_MAXUSER_ADDRESS,
886 	USRSTACK,
887 	PS_STRINGS,
888 	VM_PROT_ALL,
889 	exec_copyout_strings,
890 	exec_linux_setregs,
891 	NULL
892 };
893 
894 static Elf32_Brandinfo linux_brand = {
895 					ELFOSABI_LINUX,
896 					EM_386,
897 					"Linux",
898 					"/compat/linux",
899 					"/lib/ld-linux.so.1",
900 					&elf_linux_sysvec,
901 					NULL,
902 				 };
903 
904 static Elf32_Brandinfo linux_glibc2brand = {
905 					ELFOSABI_LINUX,
906 					EM_386,
907 					"Linux",
908 					"/compat/linux",
909 					"/lib/ld-linux.so.2",
910 					&elf_linux_sysvec,
911 					NULL,
912 				 };
913 
914 Elf32_Brandinfo *linux_brandlist[] = {
915 					&linux_brand,
916 					&linux_glibc2brand,
917 					NULL
918 				};
919 
920 static int
921 linux_elf_modevent(module_t mod, int type, void *data)
922 {
923 	Elf32_Brandinfo **brandinfo;
924 	int error;
925 	struct linux_ioctl_handler **lihp;
926 
927 	error = 0;
928 
929 	switch(type) {
930 	case MOD_LOAD:
931 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
932 		     ++brandinfo)
933 			if (elf32_insert_brand_entry(*brandinfo) < 0)
934 				error = EINVAL;
935 		if (error == 0) {
936 			SET_FOREACH(lihp, linux_ioctl_handler_set)
937 				linux_ioctl_register_handler(*lihp);
938 			if (bootverbose)
939 				printf("Linux ELF exec handler installed\n");
940 		} else
941 			printf("cannot insert Linux ELF brand handler\n");
942 		break;
943 	case MOD_UNLOAD:
944 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
945 		     ++brandinfo)
946 			if (elf32_brand_inuse(*brandinfo))
947 				error = EBUSY;
948 		if (error == 0) {
949 			for (brandinfo = &linux_brandlist[0];
950 			     *brandinfo != NULL; ++brandinfo)
951 				if (elf32_remove_brand_entry(*brandinfo) < 0)
952 					error = EINVAL;
953 		}
954 		if (error == 0) {
955 			SET_FOREACH(lihp, linux_ioctl_handler_set)
956 				linux_ioctl_unregister_handler(*lihp);
957 			if (bootverbose)
958 				printf("Linux ELF exec handler removed\n");
959 			linux_mib_destroy();
960 		} else
961 			printf("Could not deinstall ELF interpreter entry\n");
962 		break;
963 	default:
964 		return EOPNOTSUPP;
965 	}
966 	return error;
967 }
968 
969 static moduledata_t linux_elf_mod = {
970 	"linuxelf",
971 	linux_elf_modevent,
972 	0
973 };
974 
975 DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
976