xref: /freebsd/sys/amd64/linux32/linux32_sysvec.c (revision e168b357aa7fe7ae2bb9b56373a3aada3ebf56d7)
1 /*-
2  * Copyright (c) 2004 Tim J. Robbins
3  * Copyright (c) 2003 Peter Wemm
4  * Copyright (c) 2002 Doug Rabson
5  * Copyright (c) 1998-1999 Andrew Gallatin
6  * Copyright (c) 1994-1996 Søren Schmidt
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer
14  *    in this position and unchanged.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. The name of the author may not be used to endorse or promote products
19  *    derived from this software without specific prior written permission
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 
36 /* XXX we use functions that might not exist. */
37 #include "opt_compat.h"
38 
39 #ifndef COMPAT_43
40 #error "Unable to compile Linux-emulator due to missing COMPAT_43 option!"
41 #endif
42 #ifndef COMPAT_IA32
43 #error "Unable to compile Linux-emulator due to missing COMPAT_IA32 option!"
44 #endif
45 
46 #define	__ELF_WORD_SIZE	32
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/exec.h>
51 #include <sys/imgact.h>
52 #include <sys/imgact_elf.h>
53 #include <sys/kernel.h>
54 #include <sys/lock.h>
55 #include <sys/malloc.h>
56 #include <sys/module.h>
57 #include <sys/mutex.h>
58 #include <sys/proc.h>
59 #include <sys/resourcevar.h>
60 #include <sys/signalvar.h>
61 #include <sys/sysctl.h>
62 #include <sys/syscallsubr.h>
63 #include <sys/sysent.h>
64 #include <sys/sysproto.h>
65 #include <sys/vnode.h>
66 
67 #include <vm/vm.h>
68 #include <vm/pmap.h>
69 #include <vm/vm_extern.h>
70 #include <vm/vm_map.h>
71 #include <vm/vm_object.h>
72 #include <vm/vm_page.h>
73 #include <vm/vm_param.h>
74 
75 #include <machine/cpu.h>
76 #include <machine/md_var.h>
77 #include <machine/pcb.h>
78 #include <machine/specialreg.h>
79 
80 #include <amd64/linux32/linux.h>
81 #include <amd64/linux32/linux32_proto.h>
82 #include <compat/linux/linux_mib.h>
83 #include <compat/linux/linux_signal.h>
84 #include <compat/linux/linux_util.h>
85 
86 MODULE_VERSION(linux, 1);
87 
88 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
89 
/*
 * Store one auxiliary-vector entry at pos: first the id word, then the
 * value word, each written with suword32().  pos is post-incremented
 * twice, so it must be a modifiable pointer lvalue and is left pointing
 * just past the entry.
 */
#define	AUXARGS_ENTRY_32(pos, id, val)	\
	do {				\
		suword32(pos++, id);	\
		suword32(pos++, val);	\
	} while (0)
95 
/*
 * The two characters "#!" read as one 16-bit value in host byte order.
 * exec_linux_imgact_try() compares it against the first short of the
 * image header to recognise interpreter scripts.
 */
#if BYTE_ORDER == LITTLE_ENDIAN
#define SHELLMAGIC      0x2123 /* #! */
#else
#define SHELLMAGIC      0x2321
#endif
101 
102 /*
103  * Allow the sendsig functions to use the ldebug() facility
104  * even though they are not syscalls themselves. Map them
105  * to syscall 0. This is slightly less bogus than using
106  * ldebug(sigreturn).
107  */
108 #define	LINUX_SYS_linux_rt_sendsig	0
109 #define	LINUX_SYS_linux_sendsig		0
110 
111 extern char linux_sigcode[];
112 extern int linux_szsigcode;
113 
114 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
115 
116 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
117 
118 static int	elf_linux_fixup(register_t **stack_base,
119 		    struct image_params *iparams);
120 static register_t *linux_copyout_strings(struct image_params *imgp);
121 static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
122 		    caddr_t *params);
123 static void     linux_sendsig(sig_t catcher, int sig, sigset_t *mask,
124 		    u_long code);
125 static void	exec_linux_setregs(struct thread *td, u_long entry,
126 				   u_long stack, u_long ps_strings);
127 static void	linux32_fixlimits(struct image_params *imgp);
128 
/*
 * Errno translation table, installed in elf_linux_sysvec below.
 *
 * Linux syscalls return negative errno's, we do positive and map them:
 * the table is indexed by the FreeBSD errno (0 .. ELAST) and each entry
 * is the already-negated value to hand back to the Linux binary.
 * Several FreeBSD errnos share one entry value (for example -6 and -11
 * each appear more than once).
 */
static int bsd_to_linux_errno[ELAST + 1] = {
	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
	-6, -6, -43, -42, -75, -6, -84
};
143 
/*
 * FreeBSD -> Linux signal number translation.  Installed as the signal
 * table of elf_linux_sysvec; the sendsig routines index it with
 * _SIG_IDX(sig).  A 0 entry means the table provides no Linux signal
 * for that slot (presumably the slots of SIGEMT and SIGINFO -- confirm
 * against _SIG_IDX() and <sys/signal.h>).
 */
int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
	0, LINUX_SIGUSR1, LINUX_SIGUSR2
};
154 
/*
 * Linux -> FreeBSD signal number translation, the counterpart of
 * bsd_to_linux_signal[].  It is not referenced in this file; it has
 * external linkage, presumably for the shared Linux signal code --
 * TODO confirm the users and the indexing convention there.
 * Note that SIGBUS and SIGURG each appear twice.
 */
int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
	SIGHUP, SIGINT, SIGQUIT, SIGILL,
	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
	SIGIO, SIGURG, SIGSYS
};
165 
/*
 * Trap number translation used to fill sc_trapno in the Linux signal
 * context.  The table is indexed by the FreeBSD trap type (the T_*
 * names in the per-entry comments); slots without a T_* name map to
 * LINUX_T_UNKNOWN.
 */
#define LINUX_T_UNKNOWN  255
static int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/* 0 */
	6,			/* 1  T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/* 2 */
	3,			/* 3  T_BPTFLT */
	LINUX_T_UNKNOWN,	/* 4 */
	LINUX_T_UNKNOWN,	/* 5 */
	16,			/* 6  T_ARITHTRAP */
	254,			/* 7  T_ASTFLT */
	LINUX_T_UNKNOWN,	/* 8 */
	13,			/* 9  T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};
/*
 * Bounds-checked lookup: codes past the end of the table yield
 * LINUX_T_UNKNOWN instead of reading out of range.  The argument is
 * evaluated more than once.
 */
#define bsd_to_linux_trapcode(code) \
    ((code)<sizeof(_bsd_to_linux_trapcode)/sizeof(*_bsd_to_linux_trapcode)? \
     _bsd_to_linux_trapcode[(code)]: \
     LINUX_T_UNKNOWN)
204 
/*
 * Layout of the ps_strings record that linux_copyout_strings() fills in
 * at LINUX32_PS_STRINGS.  The two address fields are stored as 32-bit
 * values (u_int32_t) rather than native pointers.
 */
struct linux32_ps_strings {
	u_int32_t ps_argvstr;	/* first of 0 or more argument strings */
	u_int ps_nargvstr;	/* the number of argument strings */
	u_int32_t ps_envstr;	/* first of 0 or more environment strings */
	u_int ps_nenvstr;	/* the number of environment strings */
};
211 
212 /*
213  * If FreeBSD & Linux have a difference of opinion about what a trap
214  * means, deal with it here.
215  *
216  * MPSAFE
217  */
218 static int
219 translate_traps(int signal, int trap_code)
220 {
221 	if (signal != SIGBUS)
222 		return signal;
223 	switch (trap_code) {
224 	case T_PROTFLT:
225 	case T_TSSFLT:
226 	case T_DOUBLEFLT:
227 	case T_PAGEFLT:
228 		return SIGSEGV;
229 	default:
230 		return signal;
231 	}
232 }
233 
234 static int
235 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
236 {
237 	Elf32_Auxargs *args;
238 	Elf32_Addr *base;
239 	Elf32_Addr *pos;
240 
241 	KASSERT(curthread->td_proc == imgp->proc &&
242 	    (curthread->td_proc->p_flag & P_SA) == 0,
243 	    ("unsafe elf_linux_fixup(), should be curproc"));
244 	base = (Elf32_Addr *)*stack_base;
245 	args = (Elf32_Auxargs *)imgp->auxargs;
246 	pos = base + (imgp->args->argc + imgp->args->envc + 2);
247 
248 	if (args->trace)
249 		AUXARGS_ENTRY_32(pos, AT_DEBUG, 1);
250 	if (args->execfd != -1)
251 		AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
252 	AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
253 	AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
254 	AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
255 	AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
256 	AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
257 	AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
258 	AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
259 	AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
260 	AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
261 	AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
262 	AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
263 	AUXARGS_ENTRY_32(pos, AT_NULL, 0);
264 
265 	free(imgp->auxargs, M_TEMP);
266 	imgp->auxargs = NULL;
267 
268 	base--;
269 	suword32(base, (uint32_t)imgp->args->argc);
270 	*stack_base = (register_t *)base;
271 	return 0;
272 }
273 
274 extern int _ucodesel, _ucode32sel, _udatasel;
275 extern unsigned long linux_sznonrtsigcode;
276 
277 static void
278 linux_rt_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
279 {
280 	struct thread *td = curthread;
281 	struct proc *p = td->td_proc;
282 	struct sigacts *psp;
283 	struct trapframe *regs;
284 	struct l_rt_sigframe *fp, frame;
285 	int oonstack;
286 
287 	PROC_LOCK_ASSERT(p, MA_OWNED);
288 	psp = p->p_sigacts;
289 	mtx_assert(&psp->ps_mtx, MA_OWNED);
290 	regs = td->td_frame;
291 	oonstack = sigonstack(regs->tf_rsp);
292 
293 #ifdef DEBUG
294 	if (ldebug(rt_sendsig))
295 		printf(ARGS(rt_sendsig, "%p, %d, %p, %lu"),
296 		    catcher, sig, (void*)mask, code);
297 #endif
298 	/*
299 	 * Allocate space for the signal handler context.
300 	 */
301 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
302 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
303 		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
304 		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
305 	} else
306 		fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
307 	mtx_unlock(&psp->ps_mtx);
308 
309 	/*
310 	 * Build the argument list for the signal handler.
311 	 */
312 	if (p->p_sysent->sv_sigtbl)
313 		if (sig <= p->p_sysent->sv_sigsize)
314 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
315 
316 	bzero(&frame, sizeof(frame));
317 
318 	frame.sf_handler = PTROUT(catcher);
319 	frame.sf_sig = sig;
320 	frame.sf_siginfo = PTROUT(&fp->sf_si);
321 	frame.sf_ucontext = PTROUT(&fp->sf_sc);
322 
323 	/* Fill in POSIX parts */
324 	frame.sf_si.lsi_signo = sig;
325 	frame.sf_si.lsi_code = code;
326 	frame.sf_si.lsi_addr = PTROUT(regs->tf_err);
327 
328 	/*
329 	 * Build the signal context to be used by sigreturn.
330 	 */
331 	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
332 	frame.sf_sc.uc_link = 0;		/* XXX ??? */
333 
334 	frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
335 	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
336 	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
337 	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
338 	PROC_UNLOCK(p);
339 
340 	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
341 
342 	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
343         frame.sf_sc.uc_mcontext.sc_gs     = rgs();
344         frame.sf_sc.uc_mcontext.sc_fs     = rfs();
345         __asm __volatile("movl %%es,%0" :
346 	    "=rm" (frame.sf_sc.uc_mcontext.sc_es));
347         __asm __volatile("movl %%ds,%0" :
348 	    "=rm" (frame.sf_sc.uc_mcontext.sc_ds));
349 	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_rdi;
350 	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_rsi;
351 	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_rbp;
352 	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_rbx;
353 	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_rdx;
354 	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_rcx;
355 	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_rax;
356 	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_rip;
357 	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
358 	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
359 	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
360 	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
361 	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
362 	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
363 
364 #ifdef DEBUG
365 	if (ldebug(rt_sendsig))
366 		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
367 		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
368 		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
369 #endif
370 
371 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
372 		/*
373 		 * Process has trashed its stack; give it an illegal
374 		 * instruction to halt it in its tracks.
375 		 */
376 #ifdef DEBUG
377 		if (ldebug(rt_sendsig))
378 			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
379 			    fp, oonstack);
380 #endif
381 		PROC_LOCK(p);
382 		sigexit(td, SIGILL);
383 	}
384 
385 	/*
386 	 * Build context to run handler in.
387 	 */
388 	regs->tf_rsp = PTROUT(fp);
389 	regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
390 	    linux_sznonrtsigcode;
391 	regs->tf_rflags &= ~PSL_T;
392 	regs->tf_cs = _ucode32sel;
393 	regs->tf_ss = _udatasel;
394 	load_ds(_udatasel);
395 	td->td_pcb->pcb_ds = _udatasel;
396 	load_es(_udatasel);
397 	td->td_pcb->pcb_es = _udatasel;
398 	PROC_LOCK(p);
399 	mtx_lock(&psp->ps_mtx);
400 }
401 
402 
403 /*
404  * Send an interrupt to process.
405  *
406  * Stack is set up to allow sigcode stored
407  * in u. to call routine, followed by kcall
408  * to sigreturn routine below.  After sigreturn
409  * resets the signal mask, the stack, and the
410  * frame pointer, it returns to the user
411  * specified pc, psl.
412  */
413 static void
414 linux_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
415 {
416 	struct thread *td = curthread;
417 	struct proc *p = td->td_proc;
418 	struct sigacts *psp;
419 	struct trapframe *regs;
420 	struct l_sigframe *fp, frame;
421 	l_sigset_t lmask;
422 	int oonstack, i;
423 
424 	PROC_LOCK_ASSERT(p, MA_OWNED);
425 	psp = p->p_sigacts;
426 	mtx_assert(&psp->ps_mtx, MA_OWNED);
427 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
428 		/* Signal handler installed with SA_SIGINFO. */
429 		linux_rt_sendsig(catcher, sig, mask, code);
430 		return;
431 	}
432 
433 	regs = td->td_frame;
434 	oonstack = sigonstack(regs->tf_rsp);
435 
436 #ifdef DEBUG
437 	if (ldebug(sendsig))
438 		printf(ARGS(sendsig, "%p, %d, %p, %lu"),
439 		    catcher, sig, (void*)mask, code);
440 #endif
441 
442 	/*
443 	 * Allocate space for the signal handler context.
444 	 */
445 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
446 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
447 		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
448 		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
449 	} else
450 		fp = (struct l_sigframe *)regs->tf_rsp - 1;
451 	mtx_unlock(&psp->ps_mtx);
452 	PROC_UNLOCK(p);
453 
454 	/*
455 	 * Build the argument list for the signal handler.
456 	 */
457 	if (p->p_sysent->sv_sigtbl)
458 		if (sig <= p->p_sysent->sv_sigsize)
459 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
460 
461 	bzero(&frame, sizeof(frame));
462 
463 	frame.sf_handler = PTROUT(catcher);
464 	frame.sf_sig = sig;
465 
466 	bsd_to_linux_sigset(mask, &lmask);
467 
468 	/*
469 	 * Build the signal context to be used by sigreturn.
470 	 */
471 	frame.sf_sc.sc_mask   = lmask.__bits[0];
472         frame.sf_sc.sc_gs     = rgs();
473         frame.sf_sc.sc_fs     = rfs();
474         __asm __volatile("movl %%es,%0" : "=rm" (frame.sf_sc.sc_es));
475         __asm __volatile("movl %%ds,%0" : "=rm" (frame.sf_sc.sc_ds));
476 	frame.sf_sc.sc_edi    = regs->tf_rdi;
477 	frame.sf_sc.sc_esi    = regs->tf_rsi;
478 	frame.sf_sc.sc_ebp    = regs->tf_rbp;
479 	frame.sf_sc.sc_ebx    = regs->tf_rbx;
480 	frame.sf_sc.sc_edx    = regs->tf_rdx;
481 	frame.sf_sc.sc_ecx    = regs->tf_rcx;
482 	frame.sf_sc.sc_eax    = regs->tf_rax;
483 	frame.sf_sc.sc_eip    = regs->tf_rip;
484 	frame.sf_sc.sc_cs     = regs->tf_cs;
485 	frame.sf_sc.sc_eflags = regs->tf_rflags;
486 	frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
487 	frame.sf_sc.sc_ss     = regs->tf_ss;
488 	frame.sf_sc.sc_err    = regs->tf_err;
489 	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
490 
491 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
492 		frame.sf_extramask[i] = lmask.__bits[i+1];
493 
494 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
495 		/*
496 		 * Process has trashed its stack; give it an illegal
497 		 * instruction to halt it in its tracks.
498 		 */
499 		PROC_LOCK(p);
500 		sigexit(td, SIGILL);
501 	}
502 
503 	/*
504 	 * Build context to run handler in.
505 	 */
506 	regs->tf_rsp = PTROUT(fp);
507 	regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode);
508 	regs->tf_rflags &= ~PSL_T;
509 	regs->tf_cs = _ucode32sel;
510 	regs->tf_ss = _udatasel;
511 	load_ds(_udatasel);
512 	td->td_pcb->pcb_ds = _udatasel;
513 	load_es(_udatasel);
514 	td->td_pcb->pcb_es = _udatasel;
515 	PROC_LOCK(p);
516 	mtx_lock(&psp->ps_mtx);
517 }
518 
519 /*
520  * System call to cleanup state after a signal
521  * has been taken.  Reset signal mask and
522  * stack state from context left by sendsig (above).
523  * Return to previous pc and psl as specified by
524  * context left by sendsig. Check carefully to
525  * make sure that the user has not modified the
526  * psl to gain improper privileges or to cause
527  * a machine fault.
528  */
529 int
530 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
531 {
532 	struct proc *p = td->td_proc;
533 	struct l_sigframe frame;
534 	struct trapframe *regs;
535 	l_sigset_t lmask;
536 	int eflags, i;
537 
538 	regs = td->td_frame;
539 
540 #ifdef DEBUG
541 	if (ldebug(sigreturn))
542 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
543 #endif
544 	/*
545 	 * The trampoline code hands us the sigframe.
546 	 * It is unsafe to keep track of it ourselves, in the event that a
547 	 * program jumps out of a signal handler.
548 	 */
549 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
550 		return (EFAULT);
551 
552 	/*
553 	 * Check for security violations.
554 	 */
555 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
556 	eflags = frame.sf_sc.sc_eflags;
557 	/*
558 	 * XXX do allow users to change the privileged flag PSL_RF.  The
559 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
560 	 * sometimes set it there too.  tf_eflags is kept in the signal
561 	 * context during signal handling and there is no other place
562 	 * to remember it, so the PSL_RF bit may be corrupted by the
563 	 * signal handler without us knowing.  Corruption of the PSL_RF
564 	 * bit at worst causes one more or one less debugger trap, so
565 	 * allowing it is fairly harmless.
566 	 */
567 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
568 		return(EINVAL);
569 
570 	/*
571 	 * Don't allow users to load a valid privileged %cs.  Let the
572 	 * hardware check for invalid selectors, excess privilege in
573 	 * other selectors, invalid %eip's and invalid %esp's.
574 	 */
575 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
576 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
577 		trapsignal(td, SIGBUS, T_PROTFLT);
578 		return(EINVAL);
579 	}
580 
581 	lmask.__bits[0] = frame.sf_sc.sc_mask;
582 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
583 		lmask.__bits[i+1] = frame.sf_extramask[i];
584 	PROC_LOCK(p);
585 	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
586 	SIG_CANTMASK(td->td_sigmask);
587 	signotify(td);
588 	PROC_UNLOCK(p);
589 
590 	/*
591 	 * Restore signal context.
592 	 */
593 	/* Selectors were restored by the trampoline. */
594 	regs->tf_rdi    = frame.sf_sc.sc_edi;
595 	regs->tf_rsi    = frame.sf_sc.sc_esi;
596 	regs->tf_rbp    = frame.sf_sc.sc_ebp;
597 	regs->tf_rbx    = frame.sf_sc.sc_ebx;
598 	regs->tf_rdx    = frame.sf_sc.sc_edx;
599 	regs->tf_rcx    = frame.sf_sc.sc_ecx;
600 	regs->tf_rax    = frame.sf_sc.sc_eax;
601 	regs->tf_rip    = frame.sf_sc.sc_eip;
602 	regs->tf_cs     = frame.sf_sc.sc_cs;
603 	regs->tf_rflags = eflags;
604 	regs->tf_rsp    = frame.sf_sc.sc_esp_at_signal;
605 	regs->tf_ss     = frame.sf_sc.sc_ss;
606 
607 	return (EJUSTRETURN);
608 }
609 
610 /*
611  * System call to cleanup state after a signal
612  * has been taken.  Reset signal mask and
613  * stack state from context left by rt_sendsig (above).
614  * Return to previous pc and psl as specified by
615  * context left by sendsig. Check carefully to
616  * make sure that the user has not modified the
617  * psl to gain improper privileges or to cause
618  * a machine fault.
619  */
620 int
621 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
622 {
623 	struct proc *p = td->td_proc;
624 	struct l_ucontext uc;
625 	struct l_sigcontext *context;
626 	l_stack_t *lss;
627 	stack_t ss;
628 	struct trapframe *regs;
629 	int eflags;
630 
631 	regs = td->td_frame;
632 
633 #ifdef DEBUG
634 	if (ldebug(rt_sigreturn))
635 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
636 #endif
637 	/*
638 	 * The trampoline code hands us the ucontext.
639 	 * It is unsafe to keep track of it ourselves, in the event that a
640 	 * program jumps out of a signal handler.
641 	 */
642 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
643 		return (EFAULT);
644 
645 	context = &uc.uc_mcontext;
646 
647 	/*
648 	 * Check for security violations.
649 	 */
650 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
651 	eflags = context->sc_eflags;
652 	/*
653 	 * XXX do allow users to change the privileged flag PSL_RF.  The
654 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
655 	 * sometimes set it there too.  tf_eflags is kept in the signal
656 	 * context during signal handling and there is no other place
657 	 * to remember it, so the PSL_RF bit may be corrupted by the
658 	 * signal handler without us knowing.  Corruption of the PSL_RF
659 	 * bit at worst causes one more or one less debugger trap, so
660 	 * allowing it is fairly harmless.
661 	 */
662 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
663 		return(EINVAL);
664 
665 	/*
666 	 * Don't allow users to load a valid privileged %cs.  Let the
667 	 * hardware check for invalid selectors, excess privilege in
668 	 * other selectors, invalid %eip's and invalid %esp's.
669 	 */
670 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
671 	if (!CS_SECURE(context->sc_cs)) {
672 		trapsignal(td, SIGBUS, T_PROTFLT);
673 		return(EINVAL);
674 	}
675 
676 	PROC_LOCK(p);
677 	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
678 	SIG_CANTMASK(td->td_sigmask);
679 	signotify(td);
680 	PROC_UNLOCK(p);
681 
682 	/*
683 	 * Restore signal context
684 	 */
685 	/* Selectors were restored by the trampoline. */
686 	regs->tf_rdi    = context->sc_edi;
687 	regs->tf_rsi    = context->sc_esi;
688 	regs->tf_rbp    = context->sc_ebp;
689 	regs->tf_rbx    = context->sc_ebx;
690 	regs->tf_rdx    = context->sc_edx;
691 	regs->tf_rcx    = context->sc_ecx;
692 	regs->tf_rax    = context->sc_eax;
693 	regs->tf_rip    = context->sc_eip;
694 	regs->tf_cs     = context->sc_cs;
695 	regs->tf_rflags = eflags;
696 	regs->tf_rsp    = context->sc_esp_at_signal;
697 	regs->tf_ss     = context->sc_ss;
698 
699 	/*
700 	 * call sigaltstack & ignore results..
701 	 */
702 	lss = &uc.uc_stack;
703 	ss.ss_sp = PTRIN(lss->ss_sp);
704 	ss.ss_size = lss->ss_size;
705 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
706 
707 #ifdef DEBUG
708 	if (ldebug(rt_sigreturn))
709 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
710 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
711 #endif
712 	(void)kern_sigaltstack(td, &ss, NULL);
713 
714 	return (EJUSTRETURN);
715 }
716 
717 /*
718  * MPSAFE
719  */
720 static void
721 linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
722 {
723 	args[0] = tf->tf_rbx;
724 	args[1] = tf->tf_rcx;
725 	args[2] = tf->tf_rdx;
726 	args[3] = tf->tf_rsi;
727 	args[4] = tf->tf_rdi;
728 	args[5] = tf->tf_rbp;	/* Unconfirmed */
729 	*params = NULL;		/* no copyin */
730 }
731 
732 /*
733  * If a linux binary is exec'ing something, try this image activator
734  * first.  We override standard shell script execution in order to
735  * be able to modify the interpreter path.  We only do this if a linux
736  * binary is doing the exec, so we do not create an EXEC module for it.
737  */
738 static int	exec_linux_imgact_try(struct image_params *iparams);
739 
740 static int
741 exec_linux_imgact_try(struct image_params *imgp)
742 {
743     const char *head = (const char *)imgp->image_header;
744     char *rpath;
745     int error = -1, len;
746 
747     /*
748      * The interpreter for shell scripts run from a linux binary needs
749      * to be located in /compat/linux if possible in order to recursively
750      * maintain linux path emulation.
751      */
752     if (((const short *)head)[0] == SHELLMAGIC) {
753 	    /*
754 	     * Run our normal shell image activator.  If it succeeds attempt
755 	     * to use the alternate path for the interpreter.  If an alternate
756 	     * path is found, use our stringspace to store it.
757 	     */
758 	    if ((error = exec_shell_imgact(imgp)) == 0) {
759 		    linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
760 			imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0);
761 		    if (rpath != NULL) {
762 			    len = strlen(rpath) + 1;
763 
764 			    if (len <= MAXSHELLCMDLEN) {
765 				    memcpy(imgp->interpreter_name, rpath, len);
766 			    }
767 			    free(rpath, M_TEMP);
768 		    }
769 	    }
770     }
771     return(error);
772 }
773 
774 /*
775  * Clear registers on exec
776  * XXX copied from ia32_signal.c.
777  */
778 static void
779 exec_linux_setregs(td, entry, stack, ps_strings)
780 	struct thread *td;
781 	u_long entry;
782 	u_long stack;
783 	u_long ps_strings;
784 {
785 	struct trapframe *regs = td->td_frame;
786 	struct pcb *pcb = td->td_pcb;
787 
788 	wrmsr(MSR_FSBASE, 0);
789 	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
790 	pcb->pcb_fsbase = 0;
791 	pcb->pcb_gsbase = 0;
792 	load_ds(_udatasel);
793 	load_es(_udatasel);
794 	load_fs(_udatasel);
795 	load_gs(0);
796 	pcb->pcb_ds = _udatasel;
797 	pcb->pcb_es = _udatasel;
798 	pcb->pcb_fs = _udatasel;
799 	pcb->pcb_gs = 0;
800 
801 	bzero((char *)regs, sizeof(struct trapframe));
802 	regs->tf_rip = entry;
803 	regs->tf_rsp = stack;
804 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
805 	regs->tf_ss = _udatasel;
806 	regs->tf_cs = _ucode32sel;
807 	regs->tf_rbx = ps_strings;
808 	load_cr0(rcr0() | CR0_MP | CR0_TS);
809 
810 	/* Return via doreti so that we can change to a different %cs */
811 	pcb->pcb_flags |= PCB_FULLCTX;
812 	td->td_retval[1] = 0;
813 }
814 
815 /*
816  * XXX copied from ia32_sysvec.c.
817  */
818 static register_t *
819 linux_copyout_strings(struct image_params *imgp)
820 {
821 	int argc, envc;
822 	u_int32_t *vectp;
823 	char *stringp, *destp;
824 	u_int32_t *stack_base;
825 	struct linux32_ps_strings *arginfo;
826 	int sigcodesz;
827 
828 	/*
829 	 * Calculate string base and vector table pointers.
830 	 * Also deal with signal trampoline code for this exec type.
831 	 */
832 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
833 	sigcodesz = *(imgp->proc->p_sysent->sv_szsigcode);
834 	destp =	(caddr_t)arginfo - sigcodesz - SPARE_USRSPACE -
835 		roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
836 
837 	/*
838 	 * install sigcode
839 	 */
840 	if (sigcodesz)
841 		copyout(imgp->proc->p_sysent->sv_sigcode,
842 			((caddr_t)arginfo - sigcodesz), szsigcode);
843 
844 	/*
845 	 * If we have a valid auxargs ptr, prepare some room
846 	 * on the stack.
847 	 */
848 	if (imgp->auxargs) {
849 		/*
850 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
851 		 * lower compatibility.
852 		 */
853 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size
854 			: (AT_COUNT * 2);
855 		/*
856 		 * The '+ 2' is for the null pointers at the end of each of
857 		 * the arg and env vector sets,and imgp->auxarg_size is room
858 		 * for argument of Runtime loader.
859 		 */
860 		vectp = (u_int32_t *) (destp - (imgp->args->argc + imgp->args->envc + 2 +
861 				       imgp->auxarg_size) * sizeof(u_int32_t));
862 
863 	} else
864 		/*
865 		 * The '+ 2' is for the null pointers at the end of each of
866 		 * the arg and env vector sets
867 		 */
868 		vectp = (u_int32_t *)
869 			(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(u_int32_t));
870 
871 	/*
872 	 * vectp also becomes our initial stack base
873 	 */
874 	stack_base = vectp;
875 
876 	stringp = imgp->args->begin_argv;
877 	argc = imgp->args->argc;
878 	envc = imgp->args->envc;
879 	/*
880 	 * Copy out strings - arguments and environment.
881 	 */
882 	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
883 
884 	/*
885 	 * Fill in "ps_strings" struct for ps, w, etc.
886 	 */
887 	suword32(&arginfo->ps_argvstr, (u_int32_t)(intptr_t)vectp);
888 	suword32(&arginfo->ps_nargvstr, argc);
889 
890 	/*
891 	 * Fill in argument portion of vector table.
892 	 */
893 	for (; argc > 0; --argc) {
894 		suword32(vectp++, (u_int32_t)(intptr_t)destp);
895 		while (*stringp++ != 0)
896 			destp++;
897 		destp++;
898 	}
899 
900 	/* a null vector table pointer separates the argp's from the envp's */
901 	suword32(vectp++, 0);
902 
903 	suword32(&arginfo->ps_envstr, (u_int32_t)(intptr_t)vectp);
904 	suword32(&arginfo->ps_nenvstr, envc);
905 
906 	/*
907 	 * Fill in environment portion of vector table.
908 	 */
909 	for (; envc > 0; --envc) {
910 		suword32(vectp++, (u_int32_t)(intptr_t)destp);
911 		while (*stringp++ != 0)
912 			destp++;
913 		destp++;
914 	}
915 
916 	/* end of vector table is a null pointer */
917 	suword32(vectp, 0);
918 
919 	return ((register_t *)stack_base);
920 }
921 
/*
 * compat.linux32 sysctl knobs.  Each value is a ceiling that
 * linux32_fixlimits() applies to the matching resource limit (data
 * size, stack size, virtual memory) of an image; a value of 0 leaves
 * that limit untouched.
 */
SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
    "32-bit Linux emulation");

static u_long	linux32_maxdsiz = LINUX32_MAXDSIZ;
SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
    &linux32_maxdsiz, 0, "");
static u_long	linux32_maxssiz = LINUX32_MAXSSIZ;
SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
    &linux32_maxssiz, 0, "");
static u_long	linux32_maxvmem = LINUX32_MAXVMEM;
SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
    &linux32_maxvmem, 0, "");
934 
935 /*
936  * XXX copied from ia32_sysvec.c.
937  */
938 static void
939 linux32_fixlimits(struct image_params *imgp)
940 {
941 	struct proc *p = imgp->proc;
942 	struct plimit *oldlim, *newlim;
943 
944 	if (linux32_maxdsiz == 0 && linux32_maxssiz == 0 &&
945 	    linux32_maxvmem == 0)
946 		return;
947 	newlim = lim_alloc();
948 	PROC_LOCK(p);
949 	oldlim = p->p_limit;
950 	lim_copy(newlim, oldlim);
951 	if (linux32_maxdsiz != 0) {
952 		if (newlim->pl_rlimit[RLIMIT_DATA].rlim_cur > linux32_maxdsiz)
953 		    newlim->pl_rlimit[RLIMIT_DATA].rlim_cur = linux32_maxdsiz;
954 		if (newlim->pl_rlimit[RLIMIT_DATA].rlim_max > linux32_maxdsiz)
955 		    newlim->pl_rlimit[RLIMIT_DATA].rlim_max = linux32_maxdsiz;
956 	}
957 	if (linux32_maxssiz != 0) {
958 		if (newlim->pl_rlimit[RLIMIT_STACK].rlim_cur > linux32_maxssiz)
959 		    newlim->pl_rlimit[RLIMIT_STACK].rlim_cur = linux32_maxssiz;
960 		if (newlim->pl_rlimit[RLIMIT_STACK].rlim_max > linux32_maxssiz)
961 		    newlim->pl_rlimit[RLIMIT_STACK].rlim_max = linux32_maxssiz;
962 	}
963 	if (linux32_maxvmem != 0) {
964 		if (newlim->pl_rlimit[RLIMIT_VMEM].rlim_cur > linux32_maxvmem)
965 		    newlim->pl_rlimit[RLIMIT_VMEM].rlim_cur = linux32_maxvmem;
966 		if (newlim->pl_rlimit[RLIMIT_VMEM].rlim_max > linux32_maxvmem)
967 		    newlim->pl_rlimit[RLIMIT_VMEM].rlim_max = linux32_maxvmem;
968 	}
969 	p->p_limit = newlim;
970 	PROC_UNLOCK(p);
971 	lim_free(oldlim);
972 }
973 
/*
 * System entry vector for 32-bit Linux ELF images; both brand entries
 * below point at it.  The initializer is positional, so the order must
 * follow the member order of struct sysentvec.
 * NOTE(review): the sv_* slot names in the comments are taken from
 * <sys/sysent.h>, which is not visible here -- confirm against that
 * header before relying on them.
 */
struct sysentvec elf_linux_sysvec = {
	LINUX_SYS_MAXSYSCALL,	/* sv_size: number of syscall entries */
	linux_sysent,		/* sv_table: syscall table */
	0xff,			/* sv_mask */
	LINUX_SIGTBLSZ,		/* sv_sigsize: signal table size */
	bsd_to_linux_signal,	/* sv_sigtbl: signal translation table */
	ELAST + 1,		/* sv_errsize: errno table size */
	bsd_to_linux_errno,	/* sv_errtbl: errno translation table */
	translate_traps,	/* sv_transtrap */
	elf_linux_fixup,	/* sv_fixup */
	linux_sendsig,		/* sv_sendsig */
	linux_sigcode,		/* sv_sigcode: signal trampoline */
	&linux_szsigcode,	/* sv_szsigcode: trampoline size */
	linux_prepsyscall,	/* sv_prepsyscall */
	"Linux ELF32",		/* sv_name */
	elf32_coredump,		/* sv_coredump */
	exec_linux_imgact_try,	/* sv_imgact_try */
	LINUX_MINSIGSTKSZ,	/* sv_minsigstksz */
	PAGE_SIZE,		/* sv_pagesize */
	VM_MIN_ADDRESS,		/* sv_minuser */
	LINUX32_USRSTACK,	/* sv_maxuser */
	LINUX32_USRSTACK,	/* sv_usrstack */
	LINUX32_PS_STRINGS,	/* sv_psstrings */
	VM_PROT_ALL,		/* sv_stackprot */
	linux_copyout_strings,	/* sv_copyout_strings */
	exec_linux_setregs,	/* sv_setregs */
	linux32_fixlimits	/* sv_fixlimits */
};
1002 
/*
 * ELF brand descriptors for Linux/i386 binaries.  The two entries
 * differ only in the interpreter path: ld-linux.so.1 versus
 * ld-linux.so.2.  Both select elf_linux_sysvec and the /compat/linux
 * emulation root.
 * NOTE(review): the initializers are positional; the meaning of each
 * slot comes from the Elf32_Brandinfo definition, which is not visible
 * here -- confirm against <sys/imgact_elf.h>.
 */
static Elf32_Brandinfo linux_brand = {
					ELFOSABI_LINUX,
					EM_386,
					"Linux",
					"/compat/linux",
					"/lib/ld-linux.so.1",
					&elf_linux_sysvec,
					NULL,
				 };

static Elf32_Brandinfo linux_glibc2brand = {
					ELFOSABI_LINUX,
					EM_386,
					"Linux",
					"/compat/linux",
					"/lib/ld-linux.so.2",
					&elf_linux_sysvec,
					NULL,
				 };

/*
 * NULL-terminated list of the brands above, walked by
 * linux_elf_modevent() on module load and unload.
 */
Elf32_Brandinfo *linux_brandlist[] = {
					&linux_brand,
					&linux_glibc2brand,
					NULL
				};
1028 
1029 static int
1030 linux_elf_modevent(module_t mod, int type, void *data)
1031 {
1032 	Elf32_Brandinfo **brandinfo;
1033 	int error;
1034 	struct linux_ioctl_handler **lihp;
1035 
1036 	error = 0;
1037 
1038 	switch(type) {
1039 	case MOD_LOAD:
1040 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1041 		     ++brandinfo)
1042 			if (elf32_insert_brand_entry(*brandinfo) < 0)
1043 				error = EINVAL;
1044 		if (error == 0) {
1045 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1046 				linux_ioctl_register_handler(*lihp);
1047 			if (bootverbose)
1048 				printf("Linux ELF exec handler installed\n");
1049 		} else
1050 			printf("cannot insert Linux ELF brand handler\n");
1051 		break;
1052 	case MOD_UNLOAD:
1053 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1054 		     ++brandinfo)
1055 			if (elf32_brand_inuse(*brandinfo))
1056 				error = EBUSY;
1057 		if (error == 0) {
1058 			for (brandinfo = &linux_brandlist[0];
1059 			     *brandinfo != NULL; ++brandinfo)
1060 				if (elf32_remove_brand_entry(*brandinfo) < 0)
1061 					error = EINVAL;
1062 		}
1063 		if (error == 0) {
1064 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1065 				linux_ioctl_unregister_handler(*lihp);
1066 			if (bootverbose)
1067 				printf("Linux ELF exec handler removed\n");
1068 			linux_mib_destroy();
1069 		} else
1070 			printf("Could not deinstall ELF interpreter entry\n");
1071 		break;
1072 	default:
1073 		break;
1074 	}
1075 	return error;
1076 }
1077 
/*
 * Module glue: registers linux_elf_modevent() as the event handler of
 * the "linuxelf" module, declared at SI_SUB_EXEC / SI_ORDER_ANY.
 */
static moduledata_t linux_elf_mod = {
	"linuxelf",
	linux_elf_modevent,
	0
};

DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1085