xref: /freebsd/sys/amd64/linux32/linux32_sysvec.c (revision 6af83ee0d2941d18880b6aaa2b4facd1d30c6106)
1 /*-
2  * Copyright (c) 2004 Tim J. Robbins
3  * Copyright (c) 2003 Peter Wemm
4  * Copyright (c) 2002 Doug Rabson
5  * Copyright (c) 1998-1999 Andrew Gallatin
6  * Copyright (c) 1994-1996 Søren Schmidt
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer
14  *    in this position and unchanged.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. The name of the author may not be used to endorse or promote products
19  *    derived from this software without specific prior written permission
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 
36 /* XXX we use functions that might not exist. */
37 #include "opt_compat.h"
38 
39 #ifndef COMPAT_43
40 #error "Unable to compile Linux-emulator due to missing COMPAT_43 option!"
41 #endif
42 #ifndef COMPAT_IA32
43 #error "Unable to compile Linux-emulator due to missing COMPAT_IA32 option!"
44 #endif
45 
46 #define	__ELF_WORD_SIZE	32
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/exec.h>
51 #include <sys/imgact.h>
52 #include <sys/imgact_elf.h>
53 #include <sys/kernel.h>
54 #include <sys/lock.h>
55 #include <sys/malloc.h>
56 #include <sys/module.h>
57 #include <sys/mutex.h>
58 #include <sys/proc.h>
59 #include <sys/resourcevar.h>
60 #include <sys/signalvar.h>
61 #include <sys/sysctl.h>
62 #include <sys/syscallsubr.h>
63 #include <sys/sysent.h>
64 #include <sys/sysproto.h>
65 #include <sys/vnode.h>
66 
67 #include <vm/vm.h>
68 #include <vm/pmap.h>
69 #include <vm/vm_extern.h>
70 #include <vm/vm_map.h>
71 #include <vm/vm_object.h>
72 #include <vm/vm_page.h>
73 #include <vm/vm_param.h>
74 
75 #include <machine/cpu.h>
76 #include <machine/md_var.h>
77 #include <machine/pcb.h>
78 #include <machine/specialreg.h>
79 
80 #include <amd64/linux32/linux.h>
81 #include <amd64/linux32/linux32_proto.h>
82 #include <compat/linux/linux_mib.h>
83 #include <compat/linux/linux_signal.h>
84 #include <compat/linux/linux_util.h>
85 
86 MODULE_VERSION(linux, 1);
87 MODULE_DEPEND(linux, sysvmsg, 1, 1, 1);
88 MODULE_DEPEND(linux, sysvsem, 1, 1, 1);
89 MODULE_DEPEND(linux, sysvshm, 1, 1, 1);
90 
91 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
92 
/*
 * Emit one auxiliary-vector entry with two suword32() stores: the id
 * first, then the value.  pos is post-incremented after each store, so it
 * must be a modifiable pointer lvalue.
 */
#define	AUXARGS_ENTRY_32(pos, id, val)					\
	do {								\
		suword32((pos)++, (id));				\
		suword32((pos)++, (val));				\
	} while (0)
98 
/*
 * The two characters "#!" ('#' is 0x23, '!' is 0x21) as they appear when
 * the first two bytes of an image are read as one 16-bit value in host
 * byte order.
 */
#if BYTE_ORDER != LITTLE_ENDIAN
#define	SHELLMAGIC	0x2321
#else
#define	SHELLMAGIC	0x2123	/* #! */
#endif
104 
/*
 * linux_sendsig() and linux_rt_sendsig() are not system calls, yet they
 * want to use the ldebug() facility.  Map both of them to syscall 0 for
 * that purpose; this is slightly less bogus than using ldebug(sigreturn).
 */
#define	LINUX_SYS_linux_sendsig		0
#define	LINUX_SYS_linux_rt_sendsig	0
113 
114 extern char linux_sigcode[];
115 extern int linux_szsigcode;
116 
117 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
118 
119 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
120 
121 static int	elf_linux_fixup(register_t **stack_base,
122 		    struct image_params *iparams);
123 static register_t *linux_copyout_strings(struct image_params *imgp);
124 static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
125 		    caddr_t *params);
126 static void     linux_sendsig(sig_t catcher, int sig, sigset_t *mask,
127 		    u_long code);
128 static void	exec_linux_setregs(struct thread *td, u_long entry,
129 				   u_long stack, u_long ps_strings);
130 static void	linux32_fixlimits(struct image_params *imgp);
131 
132 /*
133  * Linux syscalls return negative errno's, we do positive and map them
134  */
135 static int bsd_to_linux_errno[ELAST + 1] = {
136 	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
137 	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
138 	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
139 	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
140 	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
141 	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
142 	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
143 	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
144 	-6, -6, -43, -42, -75, -6, -84
145 };
146 
147 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
148 	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
149 	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
150 	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
151 	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
152 	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
153 	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
154 	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
155 	0, LINUX_SIGUSR1, LINUX_SIGUSR2
156 };
157 
158 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
159 	SIGHUP, SIGINT, SIGQUIT, SIGILL,
160 	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
161 	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
162 	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
163 	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
164 	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
165 	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
166 	SIGIO, SIGURG, SIGSYS
167 };
168 
/* Value reported for any trap code that has no entry in the table below. */
#define	LINUX_T_UNKNOWN	255

/*
 * Trap number translation, indexed by the native trap code.  Use the
 * bsd_to_linux_trapcode() macro rather than indexing this array directly:
 * the macro also handles codes beyond the end of the table.
 */
static int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/*  0 */
	6,			/*  1: T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/*  2 */
	3,			/*  3: T_BPTFLT */
	LINUX_T_UNKNOWN,	/*  4 */
	LINUX_T_UNKNOWN,	/*  5 */
	16,			/*  6: T_ARITHTRAP */
	254,			/*  7: T_ASTFLT */
	LINUX_T_UNKNOWN,	/*  8 */
	13,			/*  9: T_PROTFLT */
	1,			/* 10: T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12: T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14: T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18: T_DIVIDE */
	2,			/* 19: T_NMI */
	4,			/* 20: T_OFLOW */
	5,			/* 21: T_BOUND */
	7,			/* 22: T_DNA */
	8,			/* 23: T_DOUBLEFLT */
	9,			/* 24: T_FPOPFLT */
	10,			/* 25: T_TSSFLT */
	11,			/* 26: T_SEGNPFLT */
	12,			/* 27: T_STKFLT */
	18,			/* 28: T_MCHK */
	19,			/* 29: T_XMMFLT */
	15			/* 30: T_RESERVED */
};

/*
 * Bounds-checked lookup in _bsd_to_linux_trapcode[]; yields
 * LINUX_T_UNKNOWN when code is not a valid index.  Note that code is
 * evaluated twice.
 */
#define	bsd_to_linux_trapcode(code)					\
	((code) < sizeof(_bsd_to_linux_trapcode) /			\
	    sizeof(_bsd_to_linux_trapcode[0]) ?				\
	    _bsd_to_linux_trapcode[(code)] : LINUX_T_UNKNOWN)
207 
208 struct linux32_ps_strings {
209 	u_int32_t ps_argvstr;	/* first of 0 or more argument strings */
210 	int	ps_nargvstr;	/* the number of argument strings */
211 	u_int32_t ps_envstr;	/* first of 0 or more environment strings */
212 	int	ps_nenvstr;	/* the number of environment strings */
213 };
214 
215 /*
216  * If FreeBSD & Linux have a difference of opinion about what a trap
217  * means, deal with it here.
218  *
219  * MPSAFE
220  */
221 static int
222 translate_traps(int signal, int trap_code)
223 {
224 	if (signal != SIGBUS)
225 		return signal;
226 	switch (trap_code) {
227 	case T_PROTFLT:
228 	case T_TSSFLT:
229 	case T_DOUBLEFLT:
230 	case T_PAGEFLT:
231 		return SIGSEGV;
232 	default:
233 		return signal;
234 	}
235 }
236 
237 static int
238 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
239 {
240 	Elf32_Auxargs *args;
241 	Elf32_Addr *base;
242 	Elf32_Addr *pos;
243 
244 	KASSERT(curthread->td_proc == imgp->proc &&
245 	    (curthread->td_proc->p_flag & P_SA) == 0,
246 	    ("unsafe elf_linux_fixup(), should be curproc"));
247 	base = (Elf32_Addr *)*stack_base;
248 	args = (Elf32_Auxargs *)imgp->auxargs;
249 	pos = base + (imgp->args->argc + imgp->args->envc + 2);
250 
251 	if (args->trace)
252 		AUXARGS_ENTRY_32(pos, AT_DEBUG, 1);
253 	if (args->execfd != -1)
254 		AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
255 	AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
256 	AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
257 	AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
258 	AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
259 	AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
260 	AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
261 	AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
262 	AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
263 	AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
264 	AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
265 	AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
266 	AUXARGS_ENTRY_32(pos, AT_NULL, 0);
267 
268 	free(imgp->auxargs, M_TEMP);
269 	imgp->auxargs = NULL;
270 
271 	base--;
272 	suword32(base, (uint32_t)imgp->args->argc);
273 	*stack_base = (register_t *)base;
274 	return 0;
275 }
276 
/* Segment selectors loaded into the trapframe / segment registers below. */
extern int		_ucodesel;
extern int		_ucode32sel;
extern int		_udatasel;

/* Offset added to the trampoline start for SA_SIGINFO deliveries. */
extern unsigned long	linux_sznonrtsigcode;
279 
280 static void
281 linux_rt_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
282 {
283 	struct thread *td = curthread;
284 	struct proc *p = td->td_proc;
285 	struct sigacts *psp;
286 	struct trapframe *regs;
287 	struct l_rt_sigframe *fp, frame;
288 	int oonstack;
289 
290 	PROC_LOCK_ASSERT(p, MA_OWNED);
291 	psp = p->p_sigacts;
292 	mtx_assert(&psp->ps_mtx, MA_OWNED);
293 	regs = td->td_frame;
294 	oonstack = sigonstack(regs->tf_rsp);
295 
296 #ifdef DEBUG
297 	if (ldebug(rt_sendsig))
298 		printf(ARGS(rt_sendsig, "%p, %d, %p, %lu"),
299 		    catcher, sig, (void*)mask, code);
300 #endif
301 	/*
302 	 * Allocate space for the signal handler context.
303 	 */
304 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
305 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
306 		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
307 		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
308 	} else
309 		fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
310 	mtx_unlock(&psp->ps_mtx);
311 
312 	/*
313 	 * Build the argument list for the signal handler.
314 	 */
315 	if (p->p_sysent->sv_sigtbl)
316 		if (sig <= p->p_sysent->sv_sigsize)
317 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
318 
319 	bzero(&frame, sizeof(frame));
320 
321 	frame.sf_handler = PTROUT(catcher);
322 	frame.sf_sig = sig;
323 	frame.sf_siginfo = PTROUT(&fp->sf_si);
324 	frame.sf_ucontext = PTROUT(&fp->sf_sc);
325 
326 	/* Fill in POSIX parts */
327 	frame.sf_si.lsi_signo = sig;
328 	frame.sf_si.lsi_code = code;
329 	frame.sf_si.lsi_addr = PTROUT(regs->tf_err);
330 
331 	/*
332 	 * Build the signal context to be used by sigreturn.
333 	 */
334 	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
335 	frame.sf_sc.uc_link = 0;		/* XXX ??? */
336 
337 	frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
338 	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
339 	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
340 	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
341 	PROC_UNLOCK(p);
342 
343 	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
344 
345 	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
346         frame.sf_sc.uc_mcontext.sc_gs     = rgs();
347         frame.sf_sc.uc_mcontext.sc_fs     = rfs();
348         __asm __volatile("movl %%es,%0" :
349 	    "=rm" (frame.sf_sc.uc_mcontext.sc_es));
350         __asm __volatile("movl %%ds,%0" :
351 	    "=rm" (frame.sf_sc.uc_mcontext.sc_ds));
352 	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_rdi;
353 	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_rsi;
354 	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_rbp;
355 	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_rbx;
356 	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_rdx;
357 	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_rcx;
358 	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_rax;
359 	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_rip;
360 	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
361 	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
362 	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
363 	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
364 	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
365 	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
366 
367 #ifdef DEBUG
368 	if (ldebug(rt_sendsig))
369 		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
370 		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
371 		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
372 #endif
373 
374 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
375 		/*
376 		 * Process has trashed its stack; give it an illegal
377 		 * instruction to halt it in its tracks.
378 		 */
379 #ifdef DEBUG
380 		if (ldebug(rt_sendsig))
381 			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
382 			    fp, oonstack);
383 #endif
384 		PROC_LOCK(p);
385 		sigexit(td, SIGILL);
386 	}
387 
388 	/*
389 	 * Build context to run handler in.
390 	 */
391 	regs->tf_rsp = PTROUT(fp);
392 	regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
393 	    linux_sznonrtsigcode;
394 	regs->tf_rflags &= ~PSL_T;
395 	regs->tf_cs = _ucode32sel;
396 	regs->tf_ss = _udatasel;
397 	load_ds(_udatasel);
398 	td->td_pcb->pcb_ds = _udatasel;
399 	load_es(_udatasel);
400 	td->td_pcb->pcb_es = _udatasel;
401 	PROC_LOCK(p);
402 	mtx_lock(&psp->ps_mtx);
403 }
404 
405 
406 /*
407  * Send an interrupt to process.
408  *
409  * Stack is set up to allow sigcode stored
410  * in u. to call routine, followed by kcall
411  * to sigreturn routine below.  After sigreturn
412  * resets the signal mask, the stack, and the
413  * frame pointer, it returns to the user
414  * specified pc, psl.
415  */
416 static void
417 linux_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
418 {
419 	struct thread *td = curthread;
420 	struct proc *p = td->td_proc;
421 	struct sigacts *psp;
422 	struct trapframe *regs;
423 	struct l_sigframe *fp, frame;
424 	l_sigset_t lmask;
425 	int oonstack, i;
426 
427 	PROC_LOCK_ASSERT(p, MA_OWNED);
428 	psp = p->p_sigacts;
429 	mtx_assert(&psp->ps_mtx, MA_OWNED);
430 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
431 		/* Signal handler installed with SA_SIGINFO. */
432 		linux_rt_sendsig(catcher, sig, mask, code);
433 		return;
434 	}
435 
436 	regs = td->td_frame;
437 	oonstack = sigonstack(regs->tf_rsp);
438 
439 #ifdef DEBUG
440 	if (ldebug(sendsig))
441 		printf(ARGS(sendsig, "%p, %d, %p, %lu"),
442 		    catcher, sig, (void*)mask, code);
443 #endif
444 
445 	/*
446 	 * Allocate space for the signal handler context.
447 	 */
448 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
449 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
450 		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
451 		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
452 	} else
453 		fp = (struct l_sigframe *)regs->tf_rsp - 1;
454 	mtx_unlock(&psp->ps_mtx);
455 	PROC_UNLOCK(p);
456 
457 	/*
458 	 * Build the argument list for the signal handler.
459 	 */
460 	if (p->p_sysent->sv_sigtbl)
461 		if (sig <= p->p_sysent->sv_sigsize)
462 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
463 
464 	bzero(&frame, sizeof(frame));
465 
466 	frame.sf_handler = PTROUT(catcher);
467 	frame.sf_sig = sig;
468 
469 	bsd_to_linux_sigset(mask, &lmask);
470 
471 	/*
472 	 * Build the signal context to be used by sigreturn.
473 	 */
474 	frame.sf_sc.sc_mask   = lmask.__bits[0];
475         frame.sf_sc.sc_gs     = rgs();
476         frame.sf_sc.sc_fs     = rfs();
477         __asm __volatile("movl %%es,%0" : "=rm" (frame.sf_sc.sc_es));
478         __asm __volatile("movl %%ds,%0" : "=rm" (frame.sf_sc.sc_ds));
479 	frame.sf_sc.sc_edi    = regs->tf_rdi;
480 	frame.sf_sc.sc_esi    = regs->tf_rsi;
481 	frame.sf_sc.sc_ebp    = regs->tf_rbp;
482 	frame.sf_sc.sc_ebx    = regs->tf_rbx;
483 	frame.sf_sc.sc_edx    = regs->tf_rdx;
484 	frame.sf_sc.sc_ecx    = regs->tf_rcx;
485 	frame.sf_sc.sc_eax    = regs->tf_rax;
486 	frame.sf_sc.sc_eip    = regs->tf_rip;
487 	frame.sf_sc.sc_cs     = regs->tf_cs;
488 	frame.sf_sc.sc_eflags = regs->tf_rflags;
489 	frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
490 	frame.sf_sc.sc_ss     = regs->tf_ss;
491 	frame.sf_sc.sc_err    = regs->tf_err;
492 	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
493 
494 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
495 		frame.sf_extramask[i] = lmask.__bits[i+1];
496 
497 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
498 		/*
499 		 * Process has trashed its stack; give it an illegal
500 		 * instruction to halt it in its tracks.
501 		 */
502 		PROC_LOCK(p);
503 		sigexit(td, SIGILL);
504 	}
505 
506 	/*
507 	 * Build context to run handler in.
508 	 */
509 	regs->tf_rsp = PTROUT(fp);
510 	regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode);
511 	regs->tf_rflags &= ~PSL_T;
512 	regs->tf_cs = _ucode32sel;
513 	regs->tf_ss = _udatasel;
514 	load_ds(_udatasel);
515 	td->td_pcb->pcb_ds = _udatasel;
516 	load_es(_udatasel);
517 	td->td_pcb->pcb_es = _udatasel;
518 	PROC_LOCK(p);
519 	mtx_lock(&psp->ps_mtx);
520 }
521 
522 /*
523  * System call to cleanup state after a signal
524  * has been taken.  Reset signal mask and
525  * stack state from context left by sendsig (above).
526  * Return to previous pc and psl as specified by
527  * context left by sendsig. Check carefully to
528  * make sure that the user has not modified the
529  * psl to gain improper privileges or to cause
530  * a machine fault.
531  */
532 int
533 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
534 {
535 	struct proc *p = td->td_proc;
536 	struct l_sigframe frame;
537 	struct trapframe *regs;
538 	l_sigset_t lmask;
539 	int eflags, i;
540 
541 	regs = td->td_frame;
542 
543 #ifdef DEBUG
544 	if (ldebug(sigreturn))
545 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
546 #endif
547 	/*
548 	 * The trampoline code hands us the sigframe.
549 	 * It is unsafe to keep track of it ourselves, in the event that a
550 	 * program jumps out of a signal handler.
551 	 */
552 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
553 		return (EFAULT);
554 
555 	/*
556 	 * Check for security violations.
557 	 */
558 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
559 	eflags = frame.sf_sc.sc_eflags;
560 	/*
561 	 * XXX do allow users to change the privileged flag PSL_RF.  The
562 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
563 	 * sometimes set it there too.  tf_eflags is kept in the signal
564 	 * context during signal handling and there is no other place
565 	 * to remember it, so the PSL_RF bit may be corrupted by the
566 	 * signal handler without us knowing.  Corruption of the PSL_RF
567 	 * bit at worst causes one more or one less debugger trap, so
568 	 * allowing it is fairly harmless.
569 	 */
570 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
571 		return(EINVAL);
572 
573 	/*
574 	 * Don't allow users to load a valid privileged %cs.  Let the
575 	 * hardware check for invalid selectors, excess privilege in
576 	 * other selectors, invalid %eip's and invalid %esp's.
577 	 */
578 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
579 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
580 		trapsignal(td, SIGBUS, T_PROTFLT);
581 		return(EINVAL);
582 	}
583 
584 	lmask.__bits[0] = frame.sf_sc.sc_mask;
585 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
586 		lmask.__bits[i+1] = frame.sf_extramask[i];
587 	PROC_LOCK(p);
588 	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
589 	SIG_CANTMASK(td->td_sigmask);
590 	signotify(td);
591 	PROC_UNLOCK(p);
592 
593 	/*
594 	 * Restore signal context.
595 	 */
596 	/* Selectors were restored by the trampoline. */
597 	regs->tf_rdi    = frame.sf_sc.sc_edi;
598 	regs->tf_rsi    = frame.sf_sc.sc_esi;
599 	regs->tf_rbp    = frame.sf_sc.sc_ebp;
600 	regs->tf_rbx    = frame.sf_sc.sc_ebx;
601 	regs->tf_rdx    = frame.sf_sc.sc_edx;
602 	regs->tf_rcx    = frame.sf_sc.sc_ecx;
603 	regs->tf_rax    = frame.sf_sc.sc_eax;
604 	regs->tf_rip    = frame.sf_sc.sc_eip;
605 	regs->tf_cs     = frame.sf_sc.sc_cs;
606 	regs->tf_rflags = eflags;
607 	regs->tf_rsp    = frame.sf_sc.sc_esp_at_signal;
608 	regs->tf_ss     = frame.sf_sc.sc_ss;
609 
610 	return (EJUSTRETURN);
611 }
612 
613 /*
614  * System call to cleanup state after a signal
615  * has been taken.  Reset signal mask and
616  * stack state from context left by rt_sendsig (above).
617  * Return to previous pc and psl as specified by
618  * context left by sendsig. Check carefully to
619  * make sure that the user has not modified the
620  * psl to gain improper privileges or to cause
621  * a machine fault.
622  */
623 int
624 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
625 {
626 	struct proc *p = td->td_proc;
627 	struct l_ucontext uc;
628 	struct l_sigcontext *context;
629 	l_stack_t *lss;
630 	stack_t ss;
631 	struct trapframe *regs;
632 	int eflags;
633 
634 	regs = td->td_frame;
635 
636 #ifdef DEBUG
637 	if (ldebug(rt_sigreturn))
638 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
639 #endif
640 	/*
641 	 * The trampoline code hands us the ucontext.
642 	 * It is unsafe to keep track of it ourselves, in the event that a
643 	 * program jumps out of a signal handler.
644 	 */
645 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
646 		return (EFAULT);
647 
648 	context = &uc.uc_mcontext;
649 
650 	/*
651 	 * Check for security violations.
652 	 */
653 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
654 	eflags = context->sc_eflags;
655 	/*
656 	 * XXX do allow users to change the privileged flag PSL_RF.  The
657 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
658 	 * sometimes set it there too.  tf_eflags is kept in the signal
659 	 * context during signal handling and there is no other place
660 	 * to remember it, so the PSL_RF bit may be corrupted by the
661 	 * signal handler without us knowing.  Corruption of the PSL_RF
662 	 * bit at worst causes one more or one less debugger trap, so
663 	 * allowing it is fairly harmless.
664 	 */
665 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
666 		return(EINVAL);
667 
668 	/*
669 	 * Don't allow users to load a valid privileged %cs.  Let the
670 	 * hardware check for invalid selectors, excess privilege in
671 	 * other selectors, invalid %eip's and invalid %esp's.
672 	 */
673 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
674 	if (!CS_SECURE(context->sc_cs)) {
675 		trapsignal(td, SIGBUS, T_PROTFLT);
676 		return(EINVAL);
677 	}
678 
679 	PROC_LOCK(p);
680 	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
681 	SIG_CANTMASK(td->td_sigmask);
682 	signotify(td);
683 	PROC_UNLOCK(p);
684 
685 	/*
686 	 * Restore signal context
687 	 */
688 	/* Selectors were restored by the trampoline. */
689 	regs->tf_rdi    = context->sc_edi;
690 	regs->tf_rsi    = context->sc_esi;
691 	regs->tf_rbp    = context->sc_ebp;
692 	regs->tf_rbx    = context->sc_ebx;
693 	regs->tf_rdx    = context->sc_edx;
694 	regs->tf_rcx    = context->sc_ecx;
695 	regs->tf_rax    = context->sc_eax;
696 	regs->tf_rip    = context->sc_eip;
697 	regs->tf_cs     = context->sc_cs;
698 	regs->tf_rflags = eflags;
699 	regs->tf_rsp    = context->sc_esp_at_signal;
700 	regs->tf_ss     = context->sc_ss;
701 
702 	/*
703 	 * call sigaltstack & ignore results..
704 	 */
705 	lss = &uc.uc_stack;
706 	ss.ss_sp = PTRIN(lss->ss_sp);
707 	ss.ss_size = lss->ss_size;
708 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
709 
710 #ifdef DEBUG
711 	if (ldebug(rt_sigreturn))
712 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
713 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
714 #endif
715 	(void)kern_sigaltstack(td, &ss, NULL);
716 
717 	return (EJUSTRETURN);
718 }
719 
720 /*
721  * MPSAFE
722  */
723 static void
724 linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
725 {
726 	args[0] = tf->tf_rbx;
727 	args[1] = tf->tf_rcx;
728 	args[2] = tf->tf_rdx;
729 	args[3] = tf->tf_rsi;
730 	args[4] = tf->tf_rdi;
731 	args[5] = tf->tf_rbp;	/* Unconfirmed */
732 	*params = NULL;		/* no copyin */
733 }
734 
735 /*
736  * If a linux binary is exec'ing something, try this image activator
737  * first.  We override standard shell script execution in order to
738  * be able to modify the interpreter path.  We only do this if a linux
739  * binary is doing the exec, so we do not create an EXEC module for it.
740  */
741 static int	exec_linux_imgact_try(struct image_params *iparams);
742 
743 static int
744 exec_linux_imgact_try(struct image_params *imgp)
745 {
746     const char *head = (const char *)imgp->image_header;
747     int error = -1;
748 
749     /*
750      * The interpreter for shell scripts run from a linux binary needs
751      * to be located in /compat/linux if possible in order to recursively
752      * maintain linux path emulation.
753      */
754     if (((const short *)head)[0] == SHELLMAGIC) {
755 	    /*
756 	     * Run our normal shell image activator.  If it succeeds attempt
757 	     * to use the alternate path for the interpreter.  If an alternate
758 	     * path is found, use our stringspace to store it.
759 	     */
760 	    if ((error = exec_shell_imgact(imgp)) == 0) {
761 		    char *rpath = NULL;
762 
763 		    linux_emul_find(FIRST_THREAD_IN_PROC(imgp->proc), NULL,
764 			imgp->interpreter_name, &rpath, 0);
765 		    if (rpath != imgp->interpreter_name) {
766 			    int len = strlen(rpath) + 1;
767 
768 			    if (len <= MAXSHELLCMDLEN) {
769 				    memcpy(imgp->interpreter_name, rpath, len);
770 			    }
771 			    free(rpath, M_TEMP);
772 		    }
773 	    }
774     }
775     return(error);
776 }
777 
778 /*
779  * Clear registers on exec
780  * XXX copied from ia32_signal.c.
781  */
782 static void
783 exec_linux_setregs(td, entry, stack, ps_strings)
784 	struct thread *td;
785 	u_long entry;
786 	u_long stack;
787 	u_long ps_strings;
788 {
789 	struct trapframe *regs = td->td_frame;
790 	struct pcb *pcb = td->td_pcb;
791 
792 	wrmsr(MSR_FSBASE, 0);
793 	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
794 	pcb->pcb_fsbase = 0;
795 	pcb->pcb_gsbase = 0;
796 	load_ds(_udatasel);
797 	load_es(_udatasel);
798 	load_fs(_udatasel);
799 	load_gs(0);
800 	pcb->pcb_ds = _udatasel;
801 	pcb->pcb_es = _udatasel;
802 	pcb->pcb_fs = _udatasel;
803 	pcb->pcb_gs = 0;
804 
805 	bzero((char *)regs, sizeof(struct trapframe));
806 	regs->tf_rip = entry;
807 	regs->tf_rsp = stack;
808 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
809 	regs->tf_ss = _udatasel;
810 	regs->tf_cs = _ucode32sel;
811 	regs->tf_rbx = ps_strings;
812 	load_cr0(rcr0() | CR0_MP | CR0_TS);
813 
814 	/* Return via doreti so that we can change to a different %cs */
815 	pcb->pcb_flags |= PCB_FULLCTX;
816 	td->td_retval[1] = 0;
817 }
818 
819 /*
820  * XXX copied from ia32_sysvec.c.
821  */
822 static register_t *
823 linux_copyout_strings(struct image_params *imgp)
824 {
825 	int argc, envc;
826 	u_int32_t *vectp;
827 	char *stringp, *destp;
828 	u_int32_t *stack_base;
829 	struct linux32_ps_strings *arginfo;
830 	int sigcodesz;
831 
832 	/*
833 	 * Calculate string base and vector table pointers.
834 	 * Also deal with signal trampoline code for this exec type.
835 	 */
836 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
837 	sigcodesz = *(imgp->proc->p_sysent->sv_szsigcode);
838 	destp =	(caddr_t)arginfo - sigcodesz - SPARE_USRSPACE -
839 		roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
840 
841 	/*
842 	 * install sigcode
843 	 */
844 	if (sigcodesz)
845 		copyout(imgp->proc->p_sysent->sv_sigcode,
846 			((caddr_t)arginfo - sigcodesz), szsigcode);
847 
848 	/*
849 	 * If we have a valid auxargs ptr, prepare some room
850 	 * on the stack.
851 	 */
852 	if (imgp->auxargs) {
853 		/*
854 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
855 		 * lower compatibility.
856 		 */
857 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size
858 			: (AT_COUNT * 2);
859 		/*
860 		 * The '+ 2' is for the null pointers at the end of each of
861 		 * the arg and env vector sets,and imgp->auxarg_size is room
862 		 * for argument of Runtime loader.
863 		 */
864 		vectp = (u_int32_t *) (destp - (imgp->args->argc + imgp->args->envc + 2 +
865 				       imgp->auxarg_size) * sizeof(u_int32_t));
866 
867 	} else
868 		/*
869 		 * The '+ 2' is for the null pointers at the end of each of
870 		 * the arg and env vector sets
871 		 */
872 		vectp = (u_int32_t *)
873 			(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(u_int32_t));
874 
875 	/*
876 	 * vectp also becomes our initial stack base
877 	 */
878 	stack_base = vectp;
879 
880 	stringp = imgp->args->begin_argv;
881 	argc = imgp->args->argc;
882 	envc = imgp->args->envc;
883 	/*
884 	 * Copy out strings - arguments and environment.
885 	 */
886 	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
887 
888 	/*
889 	 * Fill in "ps_strings" struct for ps, w, etc.
890 	 */
891 	suword32(&arginfo->ps_argvstr, (u_int32_t)(intptr_t)vectp);
892 	suword32(&arginfo->ps_nargvstr, argc);
893 
894 	/*
895 	 * Fill in argument portion of vector table.
896 	 */
897 	for (; argc > 0; --argc) {
898 		suword32(vectp++, (u_int32_t)(intptr_t)destp);
899 		while (*stringp++ != 0)
900 			destp++;
901 		destp++;
902 	}
903 
904 	/* a null vector table pointer separates the argp's from the envp's */
905 	suword32(vectp++, 0);
906 
907 	suword32(&arginfo->ps_envstr, (u_int32_t)(intptr_t)vectp);
908 	suword32(&arginfo->ps_nenvstr, envc);
909 
910 	/*
911 	 * Fill in environment portion of vector table.
912 	 */
913 	for (; envc > 0; --envc) {
914 		suword32(vectp++, (u_int32_t)(intptr_t)destp);
915 		while (*stringp++ != 0)
916 			destp++;
917 		destp++;
918 	}
919 
920 	/* end of vector table is a null pointer */
921 	suword32(vectp, 0);
922 
923 	return ((register_t *)stack_base);
924 }
925 
926 SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
927     "32-bit Linux emulation");
928 
929 static u_long	linux32_maxdsiz = LINUX32_MAXDSIZ;
930 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
931     &linux32_maxdsiz, 0, "");
932 static u_long	linux32_maxssiz = LINUX32_MAXSSIZ;
933 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
934     &linux32_maxssiz, 0, "");
935 static u_long	linux32_maxvmem = LINUX32_MAXVMEM;
936 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
937     &linux32_maxvmem, 0, "");
938 
939 /*
940  * XXX copied from ia32_sysvec.c.
941  */
942 static void
943 linux32_fixlimits(struct image_params *imgp)
944 {
945 	struct proc *p = imgp->proc;
946 	struct plimit *oldlim, *newlim;
947 
948 	if (linux32_maxdsiz == 0 && linux32_maxssiz == 0 &&
949 	    linux32_maxvmem == 0)
950 		return;
951 	newlim = lim_alloc();
952 	PROC_LOCK(p);
953 	oldlim = p->p_limit;
954 	lim_copy(newlim, oldlim);
955 	if (linux32_maxdsiz != 0) {
956 		if (newlim->pl_rlimit[RLIMIT_DATA].rlim_cur > linux32_maxdsiz)
957 		    newlim->pl_rlimit[RLIMIT_DATA].rlim_cur = linux32_maxdsiz;
958 		if (newlim->pl_rlimit[RLIMIT_DATA].rlim_max > linux32_maxdsiz)
959 		    newlim->pl_rlimit[RLIMIT_DATA].rlim_max = linux32_maxdsiz;
960 	}
961 	if (linux32_maxssiz != 0) {
962 		if (newlim->pl_rlimit[RLIMIT_STACK].rlim_cur > linux32_maxssiz)
963 		    newlim->pl_rlimit[RLIMIT_STACK].rlim_cur = linux32_maxssiz;
964 		if (newlim->pl_rlimit[RLIMIT_STACK].rlim_max > linux32_maxssiz)
965 		    newlim->pl_rlimit[RLIMIT_STACK].rlim_max = linux32_maxssiz;
966 	}
967 	if (linux32_maxvmem != 0) {
968 		if (newlim->pl_rlimit[RLIMIT_VMEM].rlim_cur > linux32_maxvmem)
969 		    newlim->pl_rlimit[RLIMIT_VMEM].rlim_cur = linux32_maxvmem;
970 		if (newlim->pl_rlimit[RLIMIT_VMEM].rlim_max > linux32_maxvmem)
971 		    newlim->pl_rlimit[RLIMIT_VMEM].rlim_max = linux32_maxvmem;
972 	}
973 	p->p_limit = newlim;
974 	PROC_UNLOCK(p);
975 	lim_free(oldlim);
976 }
977 
978 struct sysentvec elf_linux_sysvec = {
979 	LINUX_SYS_MAXSYSCALL,
980 	linux_sysent,
981 	0xff,
982 	LINUX_SIGTBLSZ,
983 	bsd_to_linux_signal,
984 	ELAST + 1,
985 	bsd_to_linux_errno,
986 	translate_traps,
987 	elf_linux_fixup,
988 	linux_sendsig,
989 	linux_sigcode,
990 	&linux_szsigcode,
991 	linux_prepsyscall,
992 	"Linux ELF32",
993 	elf32_coredump,
994 	exec_linux_imgact_try,
995 	LINUX_MINSIGSTKSZ,
996 	PAGE_SIZE,
997 	VM_MIN_ADDRESS,
998 	LINUX32_USRSTACK,
999 	LINUX32_USRSTACK,
1000 	LINUX32_PS_STRINGS,
1001 	VM_PROT_ALL,
1002 	linux_copyout_strings,
1003 	exec_linux_setregs,
1004 	linux32_fixlimits
1005 };
1006 
1007 static Elf32_Brandinfo linux_brand = {
1008 					ELFOSABI_LINUX,
1009 					EM_386,
1010 					"Linux",
1011 					"/compat/linux",
1012 					"/lib/ld-linux.so.1",
1013 					&elf_linux_sysvec,
1014 					NULL,
1015 				 };
1016 
1017 static Elf32_Brandinfo linux_glibc2brand = {
1018 					ELFOSABI_LINUX,
1019 					EM_386,
1020 					"Linux",
1021 					"/compat/linux",
1022 					"/lib/ld-linux.so.2",
1023 					&elf_linux_sysvec,
1024 					NULL,
1025 				 };
1026 
1027 Elf32_Brandinfo *linux_brandlist[] = {
1028 					&linux_brand,
1029 					&linux_glibc2brand,
1030 					NULL
1031 				};
1032 
1033 static int
1034 linux_elf_modevent(module_t mod, int type, void *data)
1035 {
1036 	Elf32_Brandinfo **brandinfo;
1037 	int error;
1038 	struct linux_ioctl_handler **lihp;
1039 
1040 	error = 0;
1041 
1042 	switch(type) {
1043 	case MOD_LOAD:
1044 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1045 		     ++brandinfo)
1046 			if (elf32_insert_brand_entry(*brandinfo) < 0)
1047 				error = EINVAL;
1048 		if (error == 0) {
1049 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1050 				linux_ioctl_register_handler(*lihp);
1051 			if (bootverbose)
1052 				printf("Linux ELF exec handler installed\n");
1053 		} else
1054 			printf("cannot insert Linux ELF brand handler\n");
1055 		break;
1056 	case MOD_UNLOAD:
1057 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1058 		     ++brandinfo)
1059 			if (elf32_brand_inuse(*brandinfo))
1060 				error = EBUSY;
1061 		if (error == 0) {
1062 			for (brandinfo = &linux_brandlist[0];
1063 			     *brandinfo != NULL; ++brandinfo)
1064 				if (elf32_remove_brand_entry(*brandinfo) < 0)
1065 					error = EINVAL;
1066 		}
1067 		if (error == 0) {
1068 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1069 				linux_ioctl_unregister_handler(*lihp);
1070 			if (bootverbose)
1071 				printf("Linux ELF exec handler removed\n");
1072 			linux_mib_destroy();
1073 		} else
1074 			printf("Could not deinstall ELF interpreter entry\n");
1075 		break;
1076 	default:
1077 		break;
1078 	}
1079 	return error;
1080 }
1081 
1082 static moduledata_t linux_elf_mod = {
1083 	"linuxelf",
1084 	linux_elf_modevent,
1085 	0
1086 };
1087 
1088 DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1089