xref: /freebsd/sys/i386/linux/linux_sysvec.c (revision 8847579c57d6aff2b3371c707dce7a2cee8389aa)
1 /*-
2  * Copyright (c) 1994-1996 Søren Schmidt
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer
10  *    in this position and unchanged.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. The name of the author may not be used to endorse or promote products
15  *    derived from this software without specific prior written permission
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/exec.h>
35 #include <sys/imgact.h>
36 #include <sys/imgact_aout.h>
37 #include <sys/imgact_elf.h>
38 #include <sys/kernel.h>
39 #include <sys/lock.h>
40 #include <sys/malloc.h>
41 #include <sys/module.h>
42 #include <sys/mutex.h>
43 #include <sys/proc.h>
44 #include <sys/signalvar.h>
45 #include <sys/syscallsubr.h>
46 #include <sys/sysent.h>
47 #include <sys/sysproto.h>
48 #include <sys/vnode.h>
49 
50 #include <vm/vm.h>
51 #include <vm/pmap.h>
52 #include <vm/vm_extern.h>
53 #include <vm/vm_map.h>
54 #include <vm/vm_object.h>
55 #include <vm/vm_page.h>
56 #include <vm/vm_param.h>
57 
58 #include <machine/cpu.h>
59 #include <machine/md_var.h>
60 #include <machine/pcb.h>
61 
62 #include <i386/linux/linux.h>
63 #include <i386/linux/linux_proto.h>
64 #include <compat/linux/linux_mib.h>
65 #include <compat/linux/linux_signal.h>
66 #include <compat/linux/linux_util.h>
67 
68 MODULE_VERSION(linux, 1);
69 
70 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
71 
72 #if BYTE_ORDER == LITTLE_ENDIAN
73 #define SHELLMAGIC      0x2123 /* #! */
74 #else
75 #define SHELLMAGIC      0x2321
76 #endif
77 
78 /*
79  * Allow the sendsig functions to use the ldebug() facility
80  * even though they are not syscalls themselves. Map them
81  * to syscall 0. This is slightly less bogus than using
82  * ldebug(sigreturn).
83  */
84 #define	LINUX_SYS_linux_rt_sendsig	0
85 #define	LINUX_SYS_linux_sendsig		0
86 
87 #define	fldcw(addr)		__asm("fldcw %0" : : "m" (*(addr)))
88 #define	__LINUX_NPXCW__		0x37f
89 
90 extern char linux_sigcode[];
91 extern int linux_szsigcode;
92 
93 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
94 
95 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
96 SET_DECLARE(linux_device_handler_set, struct linux_device_handler);
97 
98 static int	linux_fixup(register_t **stack_base,
99 		    struct image_params *iparams);
100 static int	elf_linux_fixup(register_t **stack_base,
101 		    struct image_params *iparams);
102 static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
103 		    caddr_t *params);
104 static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
105 static void	exec_linux_setregs(struct thread *td, u_long entry,
106 				   u_long stack, u_long ps_strings);
107 
108 /*
109  * Linux syscalls return negative errno's, we do positive and map them
110  */
111 static int bsd_to_linux_errno[ELAST + 1] = {
112 	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
113 	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
114 	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
115 	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
116 	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
117 	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
118 	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
119 	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
120 	-6, -6, -43, -42, -75, -6, -84
121 };
122 
123 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
124 	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
125 	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
126 	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
127 	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
128 	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
129 	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
130 	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
131 	0, LINUX_SIGUSR1, LINUX_SIGUSR2
132 };
133 
134 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
135 	SIGHUP, SIGINT, SIGQUIT, SIGILL,
136 	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
137 	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
138 	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
139 	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
140 	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
141 	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
142 	SIGIO, SIGURG, SIGSYS
143 };
144 
/* Returned for any FreeBSD trap type that has no entry below. */
#define LINUX_T_UNKNOWN  255

/*
 * Trap numbers reported to Linux binaries, indexed by FreeBSD trap
 * type.  The table is read-only; slots without a counterpart hold
 * LINUX_T_UNKNOWN.
 */
static const int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/* 0 */
	6,			/* 1  T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/* 2 */
	3,			/* 3  T_BPTFLT */
	LINUX_T_UNKNOWN,	/* 4 */
	LINUX_T_UNKNOWN,	/* 5 */
	16,			/* 6  T_ARITHTRAP */
	254,			/* 7  T_ASTFLT */
	LINUX_T_UNKNOWN,	/* 8 */
	13,			/* 9  T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};

/*
 * Translate a FreeBSD trap type into the trap number stored in a Linux
 * signal context.
 *
 * code:    FreeBSD trap type.
 * returns: the table entry for code, or LINUX_T_UNKNOWN when code is
 *          negative or past the end of the table.
 *
 * This is a function rather than a macro so that the argument is
 * evaluated once and the range check is explicit instead of relying on
 * an implicit signed-to-unsigned conversion.
 */
static __inline int
bsd_to_linux_trapcode(int code)
{
	const size_t ntraps = sizeof(_bsd_to_linux_trapcode) /
	    sizeof(_bsd_to_linux_trapcode[0]);

	if (code < 0 || (size_t)code >= ntraps)
		return (LINUX_T_UNKNOWN);
	return (_bsd_to_linux_trapcode[code]);
}
183 
184 /*
185  * If FreeBSD & Linux have a difference of opinion about what a trap
186  * means, deal with it here.
187  *
188  * MPSAFE
189  */
190 static int
191 translate_traps(int signal, int trap_code)
192 {
193 	if (signal != SIGBUS)
194 		return signal;
195 	switch (trap_code) {
196 	case T_PROTFLT:
197 	case T_TSSFLT:
198 	case T_DOUBLEFLT:
199 	case T_PAGEFLT:
200 		return SIGSEGV;
201 	default:
202 		return signal;
203 	}
204 }
205 
206 static int
207 linux_fixup(register_t **stack_base, struct image_params *imgp)
208 {
209 	register_t *argv, *envp;
210 
211 	argv = *stack_base;
212 	envp = *stack_base + (imgp->args->argc + 1);
213 	(*stack_base)--;
214 	**stack_base = (intptr_t)(void *)envp;
215 	(*stack_base)--;
216 	**stack_base = (intptr_t)(void *)argv;
217 	(*stack_base)--;
218 	**stack_base = imgp->args->argc;
219 	return 0;
220 }
221 
222 static int
223 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
224 {
225 	Elf32_Auxargs *args;
226 	register_t *pos;
227 
228 	KASSERT(curthread->td_proc == imgp->proc &&
229 	    (curthread->td_proc->p_flag & P_SA) == 0,
230 	    ("unsafe elf_linux_fixup(), should be curproc"));
231 	args = (Elf32_Auxargs *)imgp->auxargs;
232 	pos = *stack_base + (imgp->args->argc + imgp->args->envc + 2);
233 
234 	if (args->trace)
235 		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
236 	if (args->execfd != -1)
237 		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
238 	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
239 	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
240 	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
241 	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
242 	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
243 	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
244 	AUXARGS_ENTRY(pos, AT_BASE, args->base);
245 	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
246 	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
247 	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
248 	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
249 	AUXARGS_ENTRY(pos, AT_NULL, 0);
250 
251 	free(imgp->auxargs, M_TEMP);
252 	imgp->auxargs = NULL;
253 
254 	(*stack_base)--;
255 	**stack_base = (register_t)imgp->args->argc;
256 	return 0;
257 }
258 
259 extern int _ucodesel, _udatasel;
260 extern unsigned long linux_sznonrtsigcode;
261 
262 static void
263 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
264 {
265 	struct thread *td = curthread;
266 	struct proc *p = td->td_proc;
267 	struct sigacts *psp;
268 	struct trapframe *regs;
269 	struct l_rt_sigframe *fp, frame;
270 	int sig, code;
271 	int oonstack;
272 
273 	sig = ksi->ksi_signo;
274 	code = ksi->ksi_code;
275 	PROC_LOCK_ASSERT(p, MA_OWNED);
276 	psp = p->p_sigacts;
277 	mtx_assert(&psp->ps_mtx, MA_OWNED);
278 	regs = td->td_frame;
279 	oonstack = sigonstack(regs->tf_esp);
280 
281 #ifdef DEBUG
282 	if (ldebug(rt_sendsig))
283 		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
284 		    catcher, sig, (void*)mask, code);
285 #endif
286 	/*
287 	 * Allocate space for the signal handler context.
288 	 */
289 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
290 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
291 		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
292 		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
293 	} else
294 		fp = (struct l_rt_sigframe *)regs->tf_esp - 1;
295 	mtx_unlock(&psp->ps_mtx);
296 
297 	/*
298 	 * Build the argument list for the signal handler.
299 	 */
300 	if (p->p_sysent->sv_sigtbl)
301 		if (sig <= p->p_sysent->sv_sigsize)
302 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
303 
304 	bzero(&frame, sizeof(frame));
305 
306 	frame.sf_handler = catcher;
307 	frame.sf_sig = sig;
308 	frame.sf_siginfo = &fp->sf_si;
309 	frame.sf_ucontext = &fp->sf_sc;
310 
311 	/* Fill in POSIX parts */
312 	frame.sf_si.lsi_signo = sig;
313 	frame.sf_si.lsi_code = code;
314 	frame.sf_si.lsi_addr = ksi->ksi_addr;
315 
316 	/*
317 	 * Build the signal context to be used by sigreturn.
318 	 */
319 	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
320 	frame.sf_sc.uc_link = NULL;		/* XXX ??? */
321 
322 	frame.sf_sc.uc_stack.ss_sp = td->td_sigstk.ss_sp;
323 	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
324 	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
325 	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
326 	PROC_UNLOCK(p);
327 
328 	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
329 
330 	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
331 	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
332 	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
333 	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
334 	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
335 	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_edi;
336 	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_esi;
337 	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_ebp;
338 	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_ebx;
339 	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_edx;
340 	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_ecx;
341 	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_eax;
342 	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_eip;
343 	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
344 	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags;
345 	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp;
346 	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
347 	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
348 	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
349 
350 #ifdef DEBUG
351 	if (ldebug(rt_sendsig))
352 		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
353 		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
354 		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
355 #endif
356 
357 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
358 		/*
359 		 * Process has trashed its stack; give it an illegal
360 		 * instruction to halt it in its tracks.
361 		 */
362 #ifdef DEBUG
363 		if (ldebug(rt_sendsig))
364 			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
365 			    fp, oonstack);
366 #endif
367 		PROC_LOCK(p);
368 		sigexit(td, SIGILL);
369 	}
370 
371 	/*
372 	 * Build context to run handler in.
373 	 */
374 	regs->tf_esp = (int)fp;
375 	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
376 	    linux_sznonrtsigcode;
377 	regs->tf_eflags &= ~(PSL_T | PSL_VM);
378 	regs->tf_cs = _ucodesel;
379 	regs->tf_ds = _udatasel;
380 	regs->tf_es = _udatasel;
381 	regs->tf_fs = _udatasel;
382 	regs->tf_ss = _udatasel;
383 	PROC_LOCK(p);
384 	mtx_lock(&psp->ps_mtx);
385 }
386 
387 
388 /*
389  * Send an interrupt to process.
390  *
391  * Stack is set up to allow sigcode stored
392  * in u. to call routine, followed by kcall
393  * to sigreturn routine below.  After sigreturn
394  * resets the signal mask, the stack, and the
395  * frame pointer, it returns to the user
396  * specified pc, psl.
397  */
398 static void
399 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
400 {
401 	struct thread *td = curthread;
402 	struct proc *p = td->td_proc;
403 	struct sigacts *psp;
404 	struct trapframe *regs;
405 	struct l_sigframe *fp, frame;
406 	l_sigset_t lmask;
407 	int sig, code;
408 	int oonstack, i;
409 
410 	PROC_LOCK_ASSERT(p, MA_OWNED);
411 	psp = p->p_sigacts;
412 	sig = ksi->ksi_signo;
413 	code = ksi->ksi_code;
414 	mtx_assert(&psp->ps_mtx, MA_OWNED);
415 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
416 		/* Signal handler installed with SA_SIGINFO. */
417 		linux_rt_sendsig(catcher, ksi, mask);
418 		return;
419 	}
420 	regs = td->td_frame;
421 	oonstack = sigonstack(regs->tf_esp);
422 
423 #ifdef DEBUG
424 	if (ldebug(sendsig))
425 		printf(ARGS(sendsig, "%p, %d, %p, %u"),
426 		    catcher, sig, (void*)mask, code);
427 #endif
428 
429 	/*
430 	 * Allocate space for the signal handler context.
431 	 */
432 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
433 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
434 		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
435 		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
436 	} else
437 		fp = (struct l_sigframe *)regs->tf_esp - 1;
438 	mtx_unlock(&psp->ps_mtx);
439 	PROC_UNLOCK(p);
440 
441 	/*
442 	 * Build the argument list for the signal handler.
443 	 */
444 	if (p->p_sysent->sv_sigtbl)
445 		if (sig <= p->p_sysent->sv_sigsize)
446 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
447 
448 	bzero(&frame, sizeof(frame));
449 
450 	frame.sf_handler = catcher;
451 	frame.sf_sig = sig;
452 
453 	bsd_to_linux_sigset(mask, &lmask);
454 
455 	/*
456 	 * Build the signal context to be used by sigreturn.
457 	 */
458 	frame.sf_sc.sc_mask   = lmask.__bits[0];
459 	frame.sf_sc.sc_gs     = rgs();
460 	frame.sf_sc.sc_fs     = regs->tf_fs;
461 	frame.sf_sc.sc_es     = regs->tf_es;
462 	frame.sf_sc.sc_ds     = regs->tf_ds;
463 	frame.sf_sc.sc_edi    = regs->tf_edi;
464 	frame.sf_sc.sc_esi    = regs->tf_esi;
465 	frame.sf_sc.sc_ebp    = regs->tf_ebp;
466 	frame.sf_sc.sc_ebx    = regs->tf_ebx;
467 	frame.sf_sc.sc_edx    = regs->tf_edx;
468 	frame.sf_sc.sc_ecx    = regs->tf_ecx;
469 	frame.sf_sc.sc_eax    = regs->tf_eax;
470 	frame.sf_sc.sc_eip    = regs->tf_eip;
471 	frame.sf_sc.sc_cs     = regs->tf_cs;
472 	frame.sf_sc.sc_eflags = regs->tf_eflags;
473 	frame.sf_sc.sc_esp_at_signal = regs->tf_esp;
474 	frame.sf_sc.sc_ss     = regs->tf_ss;
475 	frame.sf_sc.sc_err    = regs->tf_err;
476 	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(ksi->ksi_trapno);
477 
478 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
479 		frame.sf_extramask[i] = lmask.__bits[i+1];
480 
481 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
482 		/*
483 		 * Process has trashed its stack; give it an illegal
484 		 * instruction to halt it in its tracks.
485 		 */
486 		PROC_LOCK(p);
487 		sigexit(td, SIGILL);
488 	}
489 
490 	/*
491 	 * Build context to run handler in.
492 	 */
493 	regs->tf_esp = (int)fp;
494 	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
495 	regs->tf_eflags &= ~(PSL_T | PSL_VM);
496 	regs->tf_cs = _ucodesel;
497 	regs->tf_ds = _udatasel;
498 	regs->tf_es = _udatasel;
499 	regs->tf_fs = _udatasel;
500 	regs->tf_ss = _udatasel;
501 	PROC_LOCK(p);
502 	mtx_lock(&psp->ps_mtx);
503 }
504 
505 /*
506  * System call to cleanup state after a signal
507  * has been taken.  Reset signal mask and
508  * stack state from context left by sendsig (above).
509  * Return to previous pc and psl as specified by
510  * context left by sendsig. Check carefully to
511  * make sure that the user has not modified the
512  * psl to gain improper privileges or to cause
513  * a machine fault.
514  */
515 int
516 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
517 {
518 	struct proc *p = td->td_proc;
519 	struct l_sigframe frame;
520 	struct trapframe *regs;
521 	l_sigset_t lmask;
522 	int eflags, i;
523 	ksiginfo_t ksi;
524 
525 	regs = td->td_frame;
526 
527 #ifdef DEBUG
528 	if (ldebug(sigreturn))
529 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
530 #endif
531 	/*
532 	 * The trampoline code hands us the sigframe.
533 	 * It is unsafe to keep track of it ourselves, in the event that a
534 	 * program jumps out of a signal handler.
535 	 */
536 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
537 		return (EFAULT);
538 
539 	/*
540 	 * Check for security violations.
541 	 */
542 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
543 	eflags = frame.sf_sc.sc_eflags;
544 	/*
545 	 * XXX do allow users to change the privileged flag PSL_RF.  The
546 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
547 	 * sometimes set it there too.  tf_eflags is kept in the signal
548 	 * context during signal handling and there is no other place
549 	 * to remember it, so the PSL_RF bit may be corrupted by the
550 	 * signal handler without us knowing.  Corruption of the PSL_RF
551 	 * bit at worst causes one more or one less debugger trap, so
552 	 * allowing it is fairly harmless.
553 	 */
554 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
555 		return(EINVAL);
556 
557 	/*
558 	 * Don't allow users to load a valid privileged %cs.  Let the
559 	 * hardware check for invalid selectors, excess privilege in
560 	 * other selectors, invalid %eip's and invalid %esp's.
561 	 */
562 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
563 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
564 		ksiginfo_init_trap(&ksi);
565 		ksi.ksi_signo = SIGBUS;
566 		ksi.ksi_code = BUS_OBJERR;
567 		ksi.ksi_trapno = T_PROTFLT;
568 		ksi.ksi_addr = (void *)regs->tf_eip;
569 		trapsignal(td, &ksi);
570 		return(EINVAL);
571 	}
572 
573 	lmask.__bits[0] = frame.sf_sc.sc_mask;
574 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
575 		lmask.__bits[i+1] = frame.sf_extramask[i];
576 	PROC_LOCK(p);
577 	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
578 	SIG_CANTMASK(td->td_sigmask);
579 	signotify(td);
580 	PROC_UNLOCK(p);
581 
582 	/*
583 	 * Restore signal context.
584 	 */
585 	/* %gs was restored by the trampoline. */
586 	regs->tf_fs     = frame.sf_sc.sc_fs;
587 	regs->tf_es     = frame.sf_sc.sc_es;
588 	regs->tf_ds     = frame.sf_sc.sc_ds;
589 	regs->tf_edi    = frame.sf_sc.sc_edi;
590 	regs->tf_esi    = frame.sf_sc.sc_esi;
591 	regs->tf_ebp    = frame.sf_sc.sc_ebp;
592 	regs->tf_ebx    = frame.sf_sc.sc_ebx;
593 	regs->tf_edx    = frame.sf_sc.sc_edx;
594 	regs->tf_ecx    = frame.sf_sc.sc_ecx;
595 	regs->tf_eax    = frame.sf_sc.sc_eax;
596 	regs->tf_eip    = frame.sf_sc.sc_eip;
597 	regs->tf_cs     = frame.sf_sc.sc_cs;
598 	regs->tf_eflags = eflags;
599 	regs->tf_esp    = frame.sf_sc.sc_esp_at_signal;
600 	regs->tf_ss     = frame.sf_sc.sc_ss;
601 
602 	return (EJUSTRETURN);
603 }
604 
605 /*
606  * System call to cleanup state after a signal
607  * has been taken.  Reset signal mask and
608  * stack state from context left by rt_sendsig (above).
609  * Return to previous pc and psl as specified by
610  * context left by sendsig. Check carefully to
611  * make sure that the user has not modified the
612  * psl to gain improper privileges or to cause
613  * a machine fault.
614  */
615 int
616 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
617 {
618 	struct proc *p = td->td_proc;
619 	struct l_ucontext uc;
620 	struct l_sigcontext *context;
621 	l_stack_t *lss;
622 	stack_t ss;
623 	struct trapframe *regs;
624 	int eflags;
625 	ksiginfo_t ksi;
626 
627 	regs = td->td_frame;
628 
629 #ifdef DEBUG
630 	if (ldebug(rt_sigreturn))
631 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
632 #endif
633 	/*
634 	 * The trampoline code hands us the ucontext.
635 	 * It is unsafe to keep track of it ourselves, in the event that a
636 	 * program jumps out of a signal handler.
637 	 */
638 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
639 		return (EFAULT);
640 
641 	context = &uc.uc_mcontext;
642 
643 	/*
644 	 * Check for security violations.
645 	 */
646 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
647 	eflags = context->sc_eflags;
648 	/*
649 	 * XXX do allow users to change the privileged flag PSL_RF.  The
650 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
651 	 * sometimes set it there too.  tf_eflags is kept in the signal
652 	 * context during signal handling and there is no other place
653 	 * to remember it, so the PSL_RF bit may be corrupted by the
654 	 * signal handler without us knowing.  Corruption of the PSL_RF
655 	 * bit at worst causes one more or one less debugger trap, so
656 	 * allowing it is fairly harmless.
657 	 */
658 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
659 		return(EINVAL);
660 
661 	/*
662 	 * Don't allow users to load a valid privileged %cs.  Let the
663 	 * hardware check for invalid selectors, excess privilege in
664 	 * other selectors, invalid %eip's and invalid %esp's.
665 	 */
666 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
667 	if (!CS_SECURE(context->sc_cs)) {
668 		ksiginfo_init_trap(&ksi);
669 		ksi.ksi_signo = SIGBUS;
670 		ksi.ksi_code = BUS_OBJERR;
671 		ksi.ksi_trapno = T_PROTFLT;
672 		ksi.ksi_addr = (void *)regs->tf_eip;
673 		trapsignal(td, &ksi);
674 		return(EINVAL);
675 	}
676 
677 	PROC_LOCK(p);
678 	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
679 	SIG_CANTMASK(td->td_sigmask);
680 	signotify(td);
681 	PROC_UNLOCK(p);
682 
683 	/*
684 	 * Restore signal context
685 	 */
686 	/* %gs was restored by the trampoline. */
687 	regs->tf_fs     = context->sc_fs;
688 	regs->tf_es     = context->sc_es;
689 	regs->tf_ds     = context->sc_ds;
690 	regs->tf_edi    = context->sc_edi;
691 	regs->tf_esi    = context->sc_esi;
692 	regs->tf_ebp    = context->sc_ebp;
693 	regs->tf_ebx    = context->sc_ebx;
694 	regs->tf_edx    = context->sc_edx;
695 	regs->tf_ecx    = context->sc_ecx;
696 	regs->tf_eax    = context->sc_eax;
697 	regs->tf_eip    = context->sc_eip;
698 	regs->tf_cs     = context->sc_cs;
699 	regs->tf_eflags = eflags;
700 	regs->tf_esp    = context->sc_esp_at_signal;
701 	regs->tf_ss     = context->sc_ss;
702 
703 	/*
704 	 * call sigaltstack & ignore results..
705 	 */
706 	lss = &uc.uc_stack;
707 	ss.ss_sp = lss->ss_sp;
708 	ss.ss_size = lss->ss_size;
709 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
710 
711 #ifdef DEBUG
712 	if (ldebug(rt_sigreturn))
713 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
714 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
715 #endif
716 	(void)kern_sigaltstack(td, &ss, NULL);
717 
718 	return (EJUSTRETURN);
719 }
720 
721 /*
722  * MPSAFE
723  */
724 static void
725 linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
726 {
727 	args[0] = tf->tf_ebx;
728 	args[1] = tf->tf_ecx;
729 	args[2] = tf->tf_edx;
730 	args[3] = tf->tf_esi;
731 	args[4] = tf->tf_edi;
732 	args[5] = tf->tf_ebp;	/* Unconfirmed */
733 	*params = NULL;		/* no copyin */
734 }
735 
736 /*
737  * If a linux binary is exec'ing something, try this image activator
738  * first.  We override standard shell script execution in order to
739  * be able to modify the interpreter path.  We only do this if a linux
740  * binary is doing the exec, so we do not create an EXEC module for it.
741  */
742 static int	exec_linux_imgact_try(struct image_params *iparams);
743 
744 static int
745 exec_linux_imgact_try(struct image_params *imgp)
746 {
747     const char *head = (const char *)imgp->image_header;
748     char *rpath;
749     int error = -1, len;
750 
751     /*
752      * The interpreter for shell scripts run from a linux binary needs
753      * to be located in /compat/linux if possible in order to recursively
754      * maintain linux path emulation.
755      */
756     if (((const short *)head)[0] == SHELLMAGIC) {
757 	    /*
758 	     * Run our normal shell image activator.  If it succeeds attempt
759 	     * to use the alternate path for the interpreter.  If an alternate
760 	     * path is found, use our stringspace to store it.
761 	     */
762 	    if ((error = exec_shell_imgact(imgp)) == 0) {
763 		    linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
764 			imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0);
765 		    if (rpath != NULL) {
766 			    len = strlen(rpath) + 1;
767 
768 			    if (len <= MAXSHELLCMDLEN) {
769 				    memcpy(imgp->interpreter_name, rpath, len);
770 			    }
771 			    free(rpath, M_TEMP);
772 		    }
773 	    }
774     }
775     return(error);
776 }
777 
778 /*
779  * exec_setregs may initialize some registers differently than Linux
780  * does, thus potentially confusing Linux binaries. If necessary, we
781  * override the exec_setregs default(s) here.
782  */
783 static void
784 exec_linux_setregs(struct thread *td, u_long entry,
785 		   u_long stack, u_long ps_strings)
786 {
787 	static const u_short control = __LINUX_NPXCW__;
788 	struct pcb *pcb = td->td_pcb;
789 
790 	exec_setregs(td, entry, stack, ps_strings);
791 
792 	/* Linux sets %gs to 0, we default to _udatasel */
793 	pcb->pcb_gs = 0; load_gs(0);
794 
795 	/* Linux sets the i387 to extended precision. */
796 	fldcw(&control);
797 }
798 
799 struct sysentvec linux_sysvec = {
800 	LINUX_SYS_MAXSYSCALL,
801 	linux_sysent,
802 	0xff,
803 	LINUX_SIGTBLSZ,
804 	bsd_to_linux_signal,
805 	ELAST + 1,
806 	bsd_to_linux_errno,
807 	translate_traps,
808 	linux_fixup,
809 	linux_sendsig,
810 	linux_sigcode,
811 	&linux_szsigcode,
812 	linux_prepsyscall,
813 	"Linux a.out",
814 	NULL,
815 	exec_linux_imgact_try,
816 	LINUX_MINSIGSTKSZ,
817 	PAGE_SIZE,
818 	VM_MIN_ADDRESS,
819 	VM_MAXUSER_ADDRESS,
820 	USRSTACK,
821 	PS_STRINGS,
822 	VM_PROT_ALL,
823 	exec_copyout_strings,
824 	exec_linux_setregs,
825 	NULL
826 };
827 
828 struct sysentvec elf_linux_sysvec = {
829 	LINUX_SYS_MAXSYSCALL,
830 	linux_sysent,
831 	0xff,
832 	LINUX_SIGTBLSZ,
833 	bsd_to_linux_signal,
834 	ELAST + 1,
835 	bsd_to_linux_errno,
836 	translate_traps,
837 	elf_linux_fixup,
838 	linux_sendsig,
839 	linux_sigcode,
840 	&linux_szsigcode,
841 	linux_prepsyscall,
842 	"Linux ELF",
843 	elf32_coredump,
844 	exec_linux_imgact_try,
845 	LINUX_MINSIGSTKSZ,
846 	PAGE_SIZE,
847 	VM_MIN_ADDRESS,
848 	VM_MAXUSER_ADDRESS,
849 	USRSTACK,
850 	PS_STRINGS,
851 	VM_PROT_ALL,
852 	exec_copyout_strings,
853 	exec_linux_setregs,
854 	NULL
855 };
856 
857 static Elf32_Brandinfo linux_brand = {
858 					ELFOSABI_LINUX,
859 					EM_386,
860 					"Linux",
861 					"/compat/linux",
862 					"/lib/ld-linux.so.1",
863 					&elf_linux_sysvec,
864 					NULL,
865 					BI_CAN_EXEC_DYN,
866 				 };
867 
868 static Elf32_Brandinfo linux_glibc2brand = {
869 					ELFOSABI_LINUX,
870 					EM_386,
871 					"Linux",
872 					"/compat/linux",
873 					"/lib/ld-linux.so.2",
874 					&elf_linux_sysvec,
875 					NULL,
876 					BI_CAN_EXEC_DYN,
877 				 };
878 
879 Elf32_Brandinfo *linux_brandlist[] = {
880 					&linux_brand,
881 					&linux_glibc2brand,
882 					NULL
883 				};
884 
885 static int
886 linux_elf_modevent(module_t mod, int type, void *data)
887 {
888 	Elf32_Brandinfo **brandinfo;
889 	int error;
890 	struct linux_ioctl_handler **lihp;
891 	struct linux_device_handler **ldhp;
892 
893 	error = 0;
894 
895 	switch(type) {
896 	case MOD_LOAD:
897 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
898 		     ++brandinfo)
899 			if (elf32_insert_brand_entry(*brandinfo) < 0)
900 				error = EINVAL;
901 		if (error == 0) {
902 			SET_FOREACH(lihp, linux_ioctl_handler_set)
903 				linux_ioctl_register_handler(*lihp);
904 			SET_FOREACH(ldhp, linux_device_handler_set)
905 				linux_device_register_handler(*ldhp);
906 			if (bootverbose)
907 				printf("Linux ELF exec handler installed\n");
908 		} else
909 			printf("cannot insert Linux ELF brand handler\n");
910 		break;
911 	case MOD_UNLOAD:
912 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
913 		     ++brandinfo)
914 			if (elf32_brand_inuse(*brandinfo))
915 				error = EBUSY;
916 		if (error == 0) {
917 			for (brandinfo = &linux_brandlist[0];
918 			     *brandinfo != NULL; ++brandinfo)
919 				if (elf32_remove_brand_entry(*brandinfo) < 0)
920 					error = EINVAL;
921 		}
922 		if (error == 0) {
923 			SET_FOREACH(lihp, linux_ioctl_handler_set)
924 				linux_ioctl_unregister_handler(*lihp);
925 			SET_FOREACH(ldhp, linux_device_handler_set)
926 				linux_device_unregister_handler(*ldhp);
927 			if (bootverbose)
928 				printf("Linux ELF exec handler removed\n");
929 		} else
930 			printf("Could not deinstall ELF interpreter entry\n");
931 		break;
932 	default:
933 		return EOPNOTSUPP;
934 	}
935 	return error;
936 }
937 
938 static moduledata_t linux_elf_mod = {
939 	"linuxelf",
940 	linux_elf_modevent,
941 	0
942 };
943 
944 DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
945