xref: /freebsd/sys/i386/linux/linux_sysvec.c (revision 10f0bcab61ef441cb5af32fb706688d8cbd55dc0)
1 /*-
2  * Copyright (c) 1994-1996 Søren Schmidt
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer
10  *    in this position and unchanged.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. The name of the author may not be used to endorse or promote products
15  *    derived from this software without specific prior written permission
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/exec.h>
35 #include <sys/imgact.h>
36 #include <sys/imgact_aout.h>
37 #include <sys/imgact_elf.h>
38 #include <sys/kernel.h>
39 #include <sys/lock.h>
40 #include <sys/malloc.h>
41 #include <sys/module.h>
42 #include <sys/mutex.h>
43 #include <sys/proc.h>
44 #include <sys/signalvar.h>
45 #include <sys/syscallsubr.h>
46 #include <sys/sysent.h>
47 #include <sys/sysproto.h>
48 #include <sys/vnode.h>
49 #include <sys/eventhandler.h>
50 
51 #include <vm/vm.h>
52 #include <vm/pmap.h>
53 #include <vm/vm_extern.h>
54 #include <vm/vm_map.h>
55 #include <vm/vm_object.h>
56 #include <vm/vm_page.h>
57 #include <vm/vm_param.h>
58 
59 #include <machine/cpu.h>
60 #include <machine/md_var.h>
61 #include <machine/pcb.h>
62 
63 #include <i386/linux/linux.h>
64 #include <i386/linux/linux_proto.h>
65 #include <compat/linux/linux_emul.h>
66 #include <compat/linux/linux_mib.h>
67 #include <compat/linux/linux_signal.h>
68 #include <compat/linux/linux_util.h>
69 
70 MODULE_VERSION(linux, 1);
71 
72 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
73 
74 #if BYTE_ORDER == LITTLE_ENDIAN
75 #define SHELLMAGIC      0x2123 /* #! */
76 #else
77 #define SHELLMAGIC      0x2321
78 #endif
79 
80 /*
81  * Allow the sendsig functions to use the ldebug() facility
82  * even though they are not syscalls themselves. Map them
83  * to syscall 0. This is slightly less bogus than using
84  * ldebug(sigreturn).
85  */
86 #define	LINUX_SYS_linux_rt_sendsig	0
87 #define	LINUX_SYS_linux_sendsig		0
88 
89 #define	fldcw(addr)		__asm("fldcw %0" : : "m" (*(addr)))
90 #define	__LINUX_NPXCW__		0x37f
91 
92 extern char linux_sigcode[];
93 extern int linux_szsigcode;
94 
95 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
96 
97 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
98 SET_DECLARE(linux_device_handler_set, struct linux_device_handler);
99 
100 static int	linux_fixup(register_t **stack_base,
101 		    struct image_params *iparams);
102 static int	elf_linux_fixup(register_t **stack_base,
103 		    struct image_params *iparams);
104 static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
105 		    caddr_t *params);
106 static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
107 static void	exec_linux_setregs(struct thread *td, u_long entry,
108 				   u_long stack, u_long ps_strings);
109 
110 extern LIST_HEAD(futex_list, futex) futex_list;
111 extern struct sx futex_sx;
112 
113 static eventhandler_tag linux_exit_tag;
114 static eventhandler_tag linux_schedtail_tag;
115 static eventhandler_tag linux_exec_tag;
116 
117 /*
118  * Linux syscalls return negative errno's, we do positive and map them
119  * Reference:
120  *   FreeBSD: src/sys/sys/errno.h
121  *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
122  *            linux-2.6.17.8/include/asm-generic/errno.h
123  */
124 static int bsd_to_linux_errno[ELAST + 1] = {
125 	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
126 	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
127 	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
128 	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
129 	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
130 	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
131 	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
132 	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
133 	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,
134 	 -72, -67, -71
135 };
136 
137 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
138 	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
139 	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
140 	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
141 	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
142 	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
143 	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
144 	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
145 	0, LINUX_SIGUSR1, LINUX_SIGUSR2
146 };
147 
148 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
149 	SIGHUP, SIGINT, SIGQUIT, SIGILL,
150 	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
151 	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
152 	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
153 	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
154 	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
155 	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
156 	SIGIO, SIGURG, SIGSYS
157 };
158 
/* Value reported when a FreeBSD trap number has no Linux counterpart. */
#define LINUX_T_UNKNOWN  255

/*
 * FreeBSD trap number -> Linux trap number, indexed by the FreeBSD
 * value.  Slots without a named FreeBSD trap map to LINUX_T_UNKNOWN.
 */
static int _bsd_to_linux_trapcode[] = {
	/*  0              */ LINUX_T_UNKNOWN,
	/*  1 T_PRIVINFLT  */ 6,
	/*  2              */ LINUX_T_UNKNOWN,
	/*  3 T_BPTFLT     */ 3,
	/*  4              */ LINUX_T_UNKNOWN,
	/*  5              */ LINUX_T_UNKNOWN,
	/*  6 T_ARITHTRAP  */ 16,
	/*  7 T_ASTFLT     */ 254,
	/*  8              */ LINUX_T_UNKNOWN,
	/*  9 T_PROTFLT    */ 13,
	/* 10 T_TRCTRAP    */ 1,
	/* 11              */ LINUX_T_UNKNOWN,
	/* 12 T_PAGEFLT    */ 14,
	/* 13              */ LINUX_T_UNKNOWN,
	/* 14 T_ALIGNFLT   */ 17,
	/* 15              */ LINUX_T_UNKNOWN,
	/* 16              */ LINUX_T_UNKNOWN,
	/* 17              */ LINUX_T_UNKNOWN,
	/* 18 T_DIVIDE     */ 0,
	/* 19 T_NMI        */ 2,
	/* 20 T_OFLOW      */ 4,
	/* 21 T_BOUND      */ 5,
	/* 22 T_DNA        */ 7,
	/* 23 T_DOUBLEFLT  */ 8,
	/* 24 T_FPOPFLT    */ 9,
	/* 25 T_TSSFLT     */ 10,
	/* 26 T_SEGNPFLT   */ 11,
	/* 27 T_STKFLT     */ 12,
	/* 28 T_MCHK       */ 18,
	/* 29 T_XMMFLT     */ 19,
	/* 30 T_RESERVED   */ 15
};

/*
 * Bounds-checked lookup in the table above.  Any index past the end of
 * the table yields LINUX_T_UNKNOWN.  Note that the argument is
 * evaluated twice, so it must be free of side effects.
 */
#define bsd_to_linux_trapcode(code)					\
    ((code) < sizeof(_bsd_to_linux_trapcode) /				\
	sizeof(_bsd_to_linux_trapcode[0]) ?				\
     _bsd_to_linux_trapcode[(code)] :					\
     LINUX_T_UNKNOWN)
197 
198 /*
199  * If FreeBSD & Linux have a difference of opinion about what a trap
200  * means, deal with it here.
201  *
202  * MPSAFE
203  */
204 static int
205 translate_traps(int signal, int trap_code)
206 {
207 	if (signal != SIGBUS)
208 		return signal;
209 	switch (trap_code) {
210 	case T_PROTFLT:
211 	case T_TSSFLT:
212 	case T_DOUBLEFLT:
213 	case T_PAGEFLT:
214 		return SIGSEGV;
215 	default:
216 		return signal;
217 	}
218 }
219 
220 static int
221 linux_fixup(register_t **stack_base, struct image_params *imgp)
222 {
223 	register_t *argv, *envp;
224 
225 	argv = *stack_base;
226 	envp = *stack_base + (imgp->args->argc + 1);
227 	(*stack_base)--;
228 	**stack_base = (intptr_t)(void *)envp;
229 	(*stack_base)--;
230 	**stack_base = (intptr_t)(void *)argv;
231 	(*stack_base)--;
232 	**stack_base = imgp->args->argc;
233 	return 0;
234 }
235 
236 static int
237 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
238 {
239 	Elf32_Auxargs *args;
240 	register_t *pos;
241 
242 	KASSERT(curthread->td_proc == imgp->proc,
243 	    ("unsafe elf_linux_fixup(), should be curproc"));
244 	args = (Elf32_Auxargs *)imgp->auxargs;
245 	pos = *stack_base + (imgp->args->argc + imgp->args->envc + 2);
246 
247 	if (args->trace)
248 		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
249 	if (args->execfd != -1)
250 		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
251 	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
252 	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
253 	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
254 	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
255 	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
256 	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
257 	AUXARGS_ENTRY(pos, AT_BASE, args->base);
258 	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
259 	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
260 	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
261 	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
262 	AUXARGS_ENTRY(pos, AT_NULL, 0);
263 
264 	free(imgp->auxargs, M_TEMP);
265 	imgp->auxargs = NULL;
266 
267 	(*stack_base)--;
268 	**stack_base = (register_t)imgp->args->argc;
269 	return 0;
270 }
271 
272 extern int _ucodesel, _udatasel;
273 extern unsigned long linux_sznonrtsigcode;
274 
275 static void
276 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
277 {
278 	struct thread *td = curthread;
279 	struct proc *p = td->td_proc;
280 	struct sigacts *psp;
281 	struct trapframe *regs;
282 	struct l_rt_sigframe *fp, frame;
283 	int sig, code;
284 	int oonstack;
285 
286 	sig = ksi->ksi_signo;
287 	code = ksi->ksi_code;
288 	PROC_LOCK_ASSERT(p, MA_OWNED);
289 	psp = p->p_sigacts;
290 	mtx_assert(&psp->ps_mtx, MA_OWNED);
291 	regs = td->td_frame;
292 	oonstack = sigonstack(regs->tf_esp);
293 
294 #ifdef DEBUG
295 	if (ldebug(rt_sendsig))
296 		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
297 		    catcher, sig, (void*)mask, code);
298 #endif
299 	/*
300 	 * Allocate space for the signal handler context.
301 	 */
302 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
303 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
304 		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
305 		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
306 	} else
307 		fp = (struct l_rt_sigframe *)regs->tf_esp - 1;
308 	mtx_unlock(&psp->ps_mtx);
309 
310 	/*
311 	 * Build the argument list for the signal handler.
312 	 */
313 	if (p->p_sysent->sv_sigtbl)
314 		if (sig <= p->p_sysent->sv_sigsize)
315 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
316 
317 	bzero(&frame, sizeof(frame));
318 
319 	frame.sf_handler = catcher;
320 	frame.sf_sig = sig;
321 	frame.sf_siginfo = &fp->sf_si;
322 	frame.sf_ucontext = &fp->sf_sc;
323 
324 	/* Fill in POSIX parts */
325 	frame.sf_si.lsi_signo = sig;
326 	frame.sf_si.lsi_code = code;
327 	frame.sf_si.lsi_addr = ksi->ksi_addr;
328 
329 	/*
330 	 * Build the signal context to be used by sigreturn.
331 	 */
332 	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
333 	frame.sf_sc.uc_link = NULL;		/* XXX ??? */
334 
335 	frame.sf_sc.uc_stack.ss_sp = td->td_sigstk.ss_sp;
336 	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
337 	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
338 	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
339 	PROC_UNLOCK(p);
340 
341 	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
342 
343 	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
344 	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
345 	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
346 	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
347 	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
348 	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_edi;
349 	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_esi;
350 	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_ebp;
351 	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_ebx;
352 	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_edx;
353 	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_ecx;
354 	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_eax;
355 	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_eip;
356 	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
357 	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags;
358 	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp;
359 	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
360 	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
361 	frame.sf_sc.uc_mcontext.sc_cr2    = (register_t)ksi->ksi_addr;
362 	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
363 
364 #ifdef DEBUG
365 	if (ldebug(rt_sendsig))
366 		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
367 		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
368 		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
369 #endif
370 
371 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
372 		/*
373 		 * Process has trashed its stack; give it an illegal
374 		 * instruction to halt it in its tracks.
375 		 */
376 #ifdef DEBUG
377 		if (ldebug(rt_sendsig))
378 			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
379 			    fp, oonstack);
380 #endif
381 		PROC_LOCK(p);
382 		sigexit(td, SIGILL);
383 	}
384 
385 	/*
386 	 * Build context to run handler in.
387 	 */
388 	regs->tf_esp = (int)fp;
389 	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
390 	    linux_sznonrtsigcode;
391 	regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D);
392 	regs->tf_cs = _ucodesel;
393 	regs->tf_ds = _udatasel;
394 	regs->tf_es = _udatasel;
395 	regs->tf_fs = _udatasel;
396 	regs->tf_ss = _udatasel;
397 	PROC_LOCK(p);
398 	mtx_lock(&psp->ps_mtx);
399 }
400 
401 
402 /*
403  * Send an interrupt to process.
404  *
405  * Stack is set up to allow sigcode stored
406  * in u. to call routine, followed by kcall
407  * to sigreturn routine below.  After sigreturn
408  * resets the signal mask, the stack, and the
409  * frame pointer, it returns to the user
410  * specified pc, psl.
411  */
412 static void
413 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
414 {
415 	struct thread *td = curthread;
416 	struct proc *p = td->td_proc;
417 	struct sigacts *psp;
418 	struct trapframe *regs;
419 	struct l_sigframe *fp, frame;
420 	l_sigset_t lmask;
421 	int sig, code;
422 	int oonstack, i;
423 
424 	PROC_LOCK_ASSERT(p, MA_OWNED);
425 	psp = p->p_sigacts;
426 	sig = ksi->ksi_signo;
427 	code = ksi->ksi_code;
428 	mtx_assert(&psp->ps_mtx, MA_OWNED);
429 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
430 		/* Signal handler installed with SA_SIGINFO. */
431 		linux_rt_sendsig(catcher, ksi, mask);
432 		return;
433 	}
434 	regs = td->td_frame;
435 	oonstack = sigonstack(regs->tf_esp);
436 
437 #ifdef DEBUG
438 	if (ldebug(sendsig))
439 		printf(ARGS(sendsig, "%p, %d, %p, %u"),
440 		    catcher, sig, (void*)mask, code);
441 #endif
442 
443 	/*
444 	 * Allocate space for the signal handler context.
445 	 */
446 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
447 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
448 		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
449 		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
450 	} else
451 		fp = (struct l_sigframe *)regs->tf_esp - 1;
452 	mtx_unlock(&psp->ps_mtx);
453 	PROC_UNLOCK(p);
454 
455 	/*
456 	 * Build the argument list for the signal handler.
457 	 */
458 	if (p->p_sysent->sv_sigtbl)
459 		if (sig <= p->p_sysent->sv_sigsize)
460 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
461 
462 	bzero(&frame, sizeof(frame));
463 
464 	frame.sf_handler = catcher;
465 	frame.sf_sig = sig;
466 
467 	bsd_to_linux_sigset(mask, &lmask);
468 
469 	/*
470 	 * Build the signal context to be used by sigreturn.
471 	 */
472 	frame.sf_sc.sc_mask   = lmask.__bits[0];
473 	frame.sf_sc.sc_gs     = rgs();
474 	frame.sf_sc.sc_fs     = regs->tf_fs;
475 	frame.sf_sc.sc_es     = regs->tf_es;
476 	frame.sf_sc.sc_ds     = regs->tf_ds;
477 	frame.sf_sc.sc_edi    = regs->tf_edi;
478 	frame.sf_sc.sc_esi    = regs->tf_esi;
479 	frame.sf_sc.sc_ebp    = regs->tf_ebp;
480 	frame.sf_sc.sc_ebx    = regs->tf_ebx;
481 	frame.sf_sc.sc_edx    = regs->tf_edx;
482 	frame.sf_sc.sc_ecx    = regs->tf_ecx;
483 	frame.sf_sc.sc_eax    = regs->tf_eax;
484 	frame.sf_sc.sc_eip    = regs->tf_eip;
485 	frame.sf_sc.sc_cs     = regs->tf_cs;
486 	frame.sf_sc.sc_eflags = regs->tf_eflags;
487 	frame.sf_sc.sc_esp_at_signal = regs->tf_esp;
488 	frame.sf_sc.sc_ss     = regs->tf_ss;
489 	frame.sf_sc.sc_err    = regs->tf_err;
490 	frame.sf_sc.sc_cr2    = (register_t)ksi->ksi_addr;
491 	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(ksi->ksi_trapno);
492 
493 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
494 		frame.sf_extramask[i] = lmask.__bits[i+1];
495 
496 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
497 		/*
498 		 * Process has trashed its stack; give it an illegal
499 		 * instruction to halt it in its tracks.
500 		 */
501 		PROC_LOCK(p);
502 		sigexit(td, SIGILL);
503 	}
504 
505 	/*
506 	 * Build context to run handler in.
507 	 */
508 	regs->tf_esp = (int)fp;
509 	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
510 	regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D);
511 	regs->tf_cs = _ucodesel;
512 	regs->tf_ds = _udatasel;
513 	regs->tf_es = _udatasel;
514 	regs->tf_fs = _udatasel;
515 	regs->tf_ss = _udatasel;
516 	PROC_LOCK(p);
517 	mtx_lock(&psp->ps_mtx);
518 }
519 
520 /*
521  * System call to cleanup state after a signal
522  * has been taken.  Reset signal mask and
523  * stack state from context left by sendsig (above).
524  * Return to previous pc and psl as specified by
525  * context left by sendsig. Check carefully to
526  * make sure that the user has not modified the
527  * psl to gain improper privileges or to cause
528  * a machine fault.
529  */
530 int
531 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
532 {
533 	struct proc *p = td->td_proc;
534 	struct l_sigframe frame;
535 	struct trapframe *regs;
536 	l_sigset_t lmask;
537 	int eflags, i;
538 	ksiginfo_t ksi;
539 
540 	regs = td->td_frame;
541 
542 #ifdef DEBUG
543 	if (ldebug(sigreturn))
544 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
545 #endif
546 	/*
547 	 * The trampoline code hands us the sigframe.
548 	 * It is unsafe to keep track of it ourselves, in the event that a
549 	 * program jumps out of a signal handler.
550 	 */
551 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
552 		return (EFAULT);
553 
554 	/*
555 	 * Check for security violations.
556 	 */
557 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
558 	eflags = frame.sf_sc.sc_eflags;
559 	/*
560 	 * XXX do allow users to change the privileged flag PSL_RF.  The
561 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
562 	 * sometimes set it there too.  tf_eflags is kept in the signal
563 	 * context during signal handling and there is no other place
564 	 * to remember it, so the PSL_RF bit may be corrupted by the
565 	 * signal handler without us knowing.  Corruption of the PSL_RF
566 	 * bit at worst causes one more or one less debugger trap, so
567 	 * allowing it is fairly harmless.
568 	 */
569 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
570 		return(EINVAL);
571 
572 	/*
573 	 * Don't allow users to load a valid privileged %cs.  Let the
574 	 * hardware check for invalid selectors, excess privilege in
575 	 * other selectors, invalid %eip's and invalid %esp's.
576 	 */
577 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
578 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
579 		ksiginfo_init_trap(&ksi);
580 		ksi.ksi_signo = SIGBUS;
581 		ksi.ksi_code = BUS_OBJERR;
582 		ksi.ksi_trapno = T_PROTFLT;
583 		ksi.ksi_addr = (void *)regs->tf_eip;
584 		trapsignal(td, &ksi);
585 		return(EINVAL);
586 	}
587 
588 	lmask.__bits[0] = frame.sf_sc.sc_mask;
589 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
590 		lmask.__bits[i+1] = frame.sf_extramask[i];
591 	PROC_LOCK(p);
592 	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
593 	SIG_CANTMASK(td->td_sigmask);
594 	signotify(td);
595 	PROC_UNLOCK(p);
596 
597 	/*
598 	 * Restore signal context.
599 	 */
600 	/* %gs was restored by the trampoline. */
601 	regs->tf_fs     = frame.sf_sc.sc_fs;
602 	regs->tf_es     = frame.sf_sc.sc_es;
603 	regs->tf_ds     = frame.sf_sc.sc_ds;
604 	regs->tf_edi    = frame.sf_sc.sc_edi;
605 	regs->tf_esi    = frame.sf_sc.sc_esi;
606 	regs->tf_ebp    = frame.sf_sc.sc_ebp;
607 	regs->tf_ebx    = frame.sf_sc.sc_ebx;
608 	regs->tf_edx    = frame.sf_sc.sc_edx;
609 	regs->tf_ecx    = frame.sf_sc.sc_ecx;
610 	regs->tf_eax    = frame.sf_sc.sc_eax;
611 	regs->tf_eip    = frame.sf_sc.sc_eip;
612 	regs->tf_cs     = frame.sf_sc.sc_cs;
613 	regs->tf_eflags = eflags;
614 	regs->tf_esp    = frame.sf_sc.sc_esp_at_signal;
615 	regs->tf_ss     = frame.sf_sc.sc_ss;
616 
617 	return (EJUSTRETURN);
618 }
619 
620 /*
621  * System call to cleanup state after a signal
622  * has been taken.  Reset signal mask and
623  * stack state from context left by rt_sendsig (above).
624  * Return to previous pc and psl as specified by
625  * context left by sendsig. Check carefully to
626  * make sure that the user has not modified the
627  * psl to gain improper privileges or to cause
628  * a machine fault.
629  */
630 int
631 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
632 {
633 	struct proc *p = td->td_proc;
634 	struct l_ucontext uc;
635 	struct l_sigcontext *context;
636 	l_stack_t *lss;
637 	stack_t ss;
638 	struct trapframe *regs;
639 	int eflags;
640 	ksiginfo_t ksi;
641 
642 	regs = td->td_frame;
643 
644 #ifdef DEBUG
645 	if (ldebug(rt_sigreturn))
646 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
647 #endif
648 	/*
649 	 * The trampoline code hands us the ucontext.
650 	 * It is unsafe to keep track of it ourselves, in the event that a
651 	 * program jumps out of a signal handler.
652 	 */
653 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
654 		return (EFAULT);
655 
656 	context = &uc.uc_mcontext;
657 
658 	/*
659 	 * Check for security violations.
660 	 */
661 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
662 	eflags = context->sc_eflags;
663 	/*
664 	 * XXX do allow users to change the privileged flag PSL_RF.  The
665 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
666 	 * sometimes set it there too.  tf_eflags is kept in the signal
667 	 * context during signal handling and there is no other place
668 	 * to remember it, so the PSL_RF bit may be corrupted by the
669 	 * signal handler without us knowing.  Corruption of the PSL_RF
670 	 * bit at worst causes one more or one less debugger trap, so
671 	 * allowing it is fairly harmless.
672 	 */
673 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
674 		return(EINVAL);
675 
676 	/*
677 	 * Don't allow users to load a valid privileged %cs.  Let the
678 	 * hardware check for invalid selectors, excess privilege in
679 	 * other selectors, invalid %eip's and invalid %esp's.
680 	 */
681 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
682 	if (!CS_SECURE(context->sc_cs)) {
683 		ksiginfo_init_trap(&ksi);
684 		ksi.ksi_signo = SIGBUS;
685 		ksi.ksi_code = BUS_OBJERR;
686 		ksi.ksi_trapno = T_PROTFLT;
687 		ksi.ksi_addr = (void *)regs->tf_eip;
688 		trapsignal(td, &ksi);
689 		return(EINVAL);
690 	}
691 
692 	PROC_LOCK(p);
693 	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
694 	SIG_CANTMASK(td->td_sigmask);
695 	signotify(td);
696 	PROC_UNLOCK(p);
697 
698 	/*
699 	 * Restore signal context
700 	 */
701 	/* %gs was restored by the trampoline. */
702 	regs->tf_fs     = context->sc_fs;
703 	regs->tf_es     = context->sc_es;
704 	regs->tf_ds     = context->sc_ds;
705 	regs->tf_edi    = context->sc_edi;
706 	regs->tf_esi    = context->sc_esi;
707 	regs->tf_ebp    = context->sc_ebp;
708 	regs->tf_ebx    = context->sc_ebx;
709 	regs->tf_edx    = context->sc_edx;
710 	regs->tf_ecx    = context->sc_ecx;
711 	regs->tf_eax    = context->sc_eax;
712 	regs->tf_eip    = context->sc_eip;
713 	regs->tf_cs     = context->sc_cs;
714 	regs->tf_eflags = eflags;
715 	regs->tf_esp    = context->sc_esp_at_signal;
716 	regs->tf_ss     = context->sc_ss;
717 
718 	/*
719 	 * call sigaltstack & ignore results..
720 	 */
721 	lss = &uc.uc_stack;
722 	ss.ss_sp = lss->ss_sp;
723 	ss.ss_size = lss->ss_size;
724 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
725 
726 #ifdef DEBUG
727 	if (ldebug(rt_sigreturn))
728 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
729 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
730 #endif
731 	(void)kern_sigaltstack(td, &ss, NULL);
732 
733 	return (EJUSTRETURN);
734 }
735 
736 /*
737  * MPSAFE
738  */
739 static void
740 linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
741 {
742 	args[0] = tf->tf_ebx;
743 	args[1] = tf->tf_ecx;
744 	args[2] = tf->tf_edx;
745 	args[3] = tf->tf_esi;
746 	args[4] = tf->tf_edi;
747 	args[5] = tf->tf_ebp;	/* Unconfirmed */
748 	*params = NULL;		/* no copyin */
749 }
750 
751 /*
752  * If a linux binary is exec'ing something, try this image activator
753  * first.  We override standard shell script execution in order to
754  * be able to modify the interpreter path.  We only do this if a linux
755  * binary is doing the exec, so we do not create an EXEC module for it.
756  */
757 static int	exec_linux_imgact_try(struct image_params *iparams);
758 
759 static int
760 exec_linux_imgact_try(struct image_params *imgp)
761 {
762     const char *head = (const char *)imgp->image_header;
763     char *rpath;
764     int error = -1, len;
765 
766     /*
767      * The interpreter for shell scripts run from a linux binary needs
768      * to be located in /compat/linux if possible in order to recursively
769      * maintain linux path emulation.
770      */
771     if (((const short *)head)[0] == SHELLMAGIC) {
772 	    /*
773 	     * Run our normal shell image activator.  If it succeeds attempt
774 	     * to use the alternate path for the interpreter.  If an alternate
775 	     * path is found, use our stringspace to store it.
776 	     */
777 	    if ((error = exec_shell_imgact(imgp)) == 0) {
778 		    linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
779 			imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0);
780 		    if (rpath != NULL) {
781 			    len = strlen(rpath) + 1;
782 
783 			    if (len <= MAXSHELLCMDLEN) {
784 				    memcpy(imgp->interpreter_name, rpath, len);
785 			    }
786 			    free(rpath, M_TEMP);
787 		    }
788 	    }
789     }
790     return(error);
791 }
792 
793 /*
794  * exec_setregs may initialize some registers differently than Linux
795  * does, thus potentially confusing Linux binaries. If necessary, we
796  * override the exec_setregs default(s) here.
797  */
798 static void
799 exec_linux_setregs(struct thread *td, u_long entry,
800 		   u_long stack, u_long ps_strings)
801 {
802 	static const u_short control = __LINUX_NPXCW__;
803 	struct pcb *pcb = td->td_pcb;
804 
805 	exec_setregs(td, entry, stack, ps_strings);
806 
807 	/* Linux sets %gs to 0, we default to _udatasel */
808 	pcb->pcb_gs = 0; load_gs(0);
809 
810 	/* Linux sets the i387 to extended precision. */
811 	fldcw(&control);
812 }
813 
814 struct sysentvec linux_sysvec = {
815 	LINUX_SYS_MAXSYSCALL,
816 	linux_sysent,
817 	0,
818 	LINUX_SIGTBLSZ,
819 	bsd_to_linux_signal,
820 	ELAST + 1,
821 	bsd_to_linux_errno,
822 	translate_traps,
823 	linux_fixup,
824 	linux_sendsig,
825 	linux_sigcode,
826 	&linux_szsigcode,
827 	linux_prepsyscall,
828 	"Linux a.out",
829 	NULL,
830 	exec_linux_imgact_try,
831 	LINUX_MINSIGSTKSZ,
832 	PAGE_SIZE,
833 	VM_MIN_ADDRESS,
834 	VM_MAXUSER_ADDRESS,
835 	USRSTACK,
836 	PS_STRINGS,
837 	VM_PROT_ALL,
838 	exec_copyout_strings,
839 	exec_linux_setregs,
840 	NULL
841 };
842 
843 struct sysentvec elf_linux_sysvec = {
844 	LINUX_SYS_MAXSYSCALL,
845 	linux_sysent,
846 	0,
847 	LINUX_SIGTBLSZ,
848 	bsd_to_linux_signal,
849 	ELAST + 1,
850 	bsd_to_linux_errno,
851 	translate_traps,
852 	elf_linux_fixup,
853 	linux_sendsig,
854 	linux_sigcode,
855 	&linux_szsigcode,
856 	linux_prepsyscall,
857 	"Linux ELF",
858 	elf32_coredump,
859 	exec_linux_imgact_try,
860 	LINUX_MINSIGSTKSZ,
861 	PAGE_SIZE,
862 	VM_MIN_ADDRESS,
863 	VM_MAXUSER_ADDRESS,
864 	USRSTACK,
865 	PS_STRINGS,
866 	VM_PROT_ALL,
867 	exec_copyout_strings,
868 	exec_linux_setregs,
869 	NULL
870 };
871 
872 static Elf32_Brandinfo linux_brand = {
873 					ELFOSABI_LINUX,
874 					EM_386,
875 					"Linux",
876 					"/compat/linux",
877 					"/lib/ld-linux.so.1",
878 					&elf_linux_sysvec,
879 					NULL,
880 					BI_CAN_EXEC_DYN,
881 				 };
882 
883 static Elf32_Brandinfo linux_glibc2brand = {
884 					ELFOSABI_LINUX,
885 					EM_386,
886 					"Linux",
887 					"/compat/linux",
888 					"/lib/ld-linux.so.2",
889 					&elf_linux_sysvec,
890 					NULL,
891 					BI_CAN_EXEC_DYN,
892 				 };
893 
894 Elf32_Brandinfo *linux_brandlist[] = {
895 					&linux_brand,
896 					&linux_glibc2brand,
897 					NULL
898 				};
899 
900 static int
901 linux_elf_modevent(module_t mod, int type, void *data)
902 {
903 	Elf32_Brandinfo **brandinfo;
904 	int error;
905 	struct linux_ioctl_handler **lihp;
906 	struct linux_device_handler **ldhp;
907 
908 	error = 0;
909 
910 	switch(type) {
911 	case MOD_LOAD:
912 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
913 		     ++brandinfo)
914 			if (elf32_insert_brand_entry(*brandinfo) < 0)
915 				error = EINVAL;
916 		if (error == 0) {
917 			SET_FOREACH(lihp, linux_ioctl_handler_set)
918 				linux_ioctl_register_handler(*lihp);
919 			SET_FOREACH(ldhp, linux_device_handler_set)
920 				linux_device_register_handler(*ldhp);
921 			mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF);
922 			sx_init(&emul_shared_lock, "emuldata->shared lock");
923 			LIST_INIT(&futex_list);
924 			sx_init(&futex_sx, "futex protection lock");
925 			linux_exit_tag = EVENTHANDLER_REGISTER(process_exit, linux_proc_exit,
926 			      NULL, 1000);
927 			linux_schedtail_tag = EVENTHANDLER_REGISTER(schedtail, linux_schedtail,
928 			      NULL, 1000);
929 			linux_exec_tag = EVENTHANDLER_REGISTER(process_exec, linux_proc_exec,
930 			      NULL, 1000);
931 			if (bootverbose)
932 				printf("Linux ELF exec handler installed\n");
933 		} else
934 			printf("cannot insert Linux ELF brand handler\n");
935 		break;
936 	case MOD_UNLOAD:
937 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
938 		     ++brandinfo)
939 			if (elf32_brand_inuse(*brandinfo))
940 				error = EBUSY;
941 		if (error == 0) {
942 			for (brandinfo = &linux_brandlist[0];
943 			     *brandinfo != NULL; ++brandinfo)
944 				if (elf32_remove_brand_entry(*brandinfo) < 0)
945 					error = EINVAL;
946 		}
947 		if (error == 0) {
948 			SET_FOREACH(lihp, linux_ioctl_handler_set)
949 				linux_ioctl_unregister_handler(*lihp);
950 			SET_FOREACH(ldhp, linux_device_handler_set)
951 				linux_device_unregister_handler(*ldhp);
952 			mtx_destroy(&emul_lock);
953 			sx_destroy(&emul_shared_lock);
954 			sx_destroy(&futex_sx);
955 			EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
956 			EVENTHANDLER_DEREGISTER(schedtail, linux_schedtail_tag);
957 			EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
958 			if (bootverbose)
959 				printf("Linux ELF exec handler removed\n");
960 		} else
961 			printf("Could not deinstall ELF interpreter entry\n");
962 		break;
963 	default:
964 		return EOPNOTSUPP;
965 	}
966 	return error;
967 }
968 
969 static moduledata_t linux_elf_mod = {
970 	"linuxelf",
971 	linux_elf_modevent,
972 	0
973 };
974 
975 DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
976