xref: /freebsd/sys/amd64/linux32/linux32_sysvec.c (revision c680f6b12d27019c2c3e2943396cdea7951535e6)
1 /*-
2  * Copyright (c) 2004 Tim J. Robbins
3  * Copyright (c) 2003 Peter Wemm
4  * Copyright (c) 2002 Doug Rabson
5  * Copyright (c) 1998-1999 Andrew Gallatin
6  * Copyright (c) 1994-1996 Søren Schmidt
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer
14  *    in this position and unchanged.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. The name of the author may not be used to endorse or promote products
19  *    derived from this software without specific prior written permission
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 
36 /* XXX we use functions that might not exist. */
37 #include "opt_compat.h"
38 
39 #ifndef COMPAT_43
40 #error "Unable to compile Linux-emulator due to missing COMPAT_43 option!"
41 #endif
42 #ifndef COMPAT_IA32
43 #error "Unable to compile Linux-emulator due to missing IA32 option!"
44 #endif
45 
46 #define	__ELF_WORD_SIZE	32
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/exec.h>
51 #include <sys/imgact.h>
52 #include <sys/imgact_elf.h>
53 #include <sys/kernel.h>
54 #include <sys/lock.h>
55 #include <sys/malloc.h>
56 #include <sys/module.h>
57 #include <sys/mutex.h>
58 #include <sys/proc.h>
59 #include <sys/signalvar.h>
60 #include <sys/sysctl.h>
61 #include <sys/syscallsubr.h>
62 #include <sys/sysent.h>
63 #include <sys/sysproto.h>
64 #include <sys/user.h>
65 #include <sys/vnode.h>
66 
67 #include <vm/vm.h>
68 #include <vm/pmap.h>
69 #include <vm/vm_extern.h>
70 #include <vm/vm_map.h>
71 #include <vm/vm_object.h>
72 #include <vm/vm_page.h>
73 #include <vm/vm_param.h>
74 
75 #include <machine/cpu.h>
76 #include <machine/md_var.h>
77 #include <machine/specialreg.h>
78 
79 #include <amd64/linux32/linux.h>
80 #include <amd64/linux32/linux32_proto.h>
81 #include <compat/linux/linux_mib.h>
82 #include <compat/linux/linux_signal.h>
83 #include <compat/linux/linux_util.h>
84 
85 MODULE_VERSION(linux, 1);
86 MODULE_DEPEND(linux, sysvmsg, 1, 1, 1);
87 MODULE_DEPEND(linux, sysvsem, 1, 1, 1);
88 MODULE_DEPEND(linux, sysvshm, 1, 1, 1);
89 
90 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
91 
/*
 * Emit one ELF auxiliary-vector entry through suword32(): first the tag
 * `id', then `val', each as one 32-bit word.  `pos' must be a modifiable
 * pointer lvalue; it is advanced past both words as a side effect.
 */
#define	AUXARGS_ENTRY_32(pos, id, val)	do {				\
	suword32((pos), (id));						\
	(pos)++;							\
	suword32((pos), (val));						\
	(pos)++;							\
} while (0)
97 
/*
 * The "#!" marker that opens an interpreter script, expressed as the value
 * obtained when the first two bytes of the image header are read as a
 * 16-bit integer in host byte order.
 */
#if BYTE_ORDER != LITTLE_ENDIAN
#define	SHELLMAGIC	0x2321
#else
#define	SHELLMAGIC	0x2123	/* #! */
#endif
103 
/*
 * linux_sendsig() and linux_rt_sendsig() are not system calls, yet they
 * want to use the ldebug() facility.  Give both the pseudo syscall
 * number 0; that is slightly less bogus than borrowing ldebug(sigreturn).
 */
#define	LINUX_SYS_linux_sendsig		0
#define	LINUX_SYS_linux_rt_sendsig	0
112 
113 extern char linux_sigcode[];
114 extern int linux_szsigcode;
115 
116 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
117 
118 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
119 
120 static int	elf_linux_fixup(register_t **stack_base,
121 		    struct image_params *iparams);
122 static register_t *linux_copyout_strings(struct image_params *imgp);
123 static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
124 		    caddr_t *params);
125 static void     linux_sendsig(sig_t catcher, int sig, sigset_t *mask,
126 		    u_long code);
127 static void	exec_linux_setregs(struct thread *td, u_long entry,
128 				   u_long stack, u_long ps_strings);
129 static void	linux32_fixlimits(struct image_params *imgp);
130 
131 /*
132  * Linux syscalls return negative errno's, we do positive and map them
133  */
134 static int bsd_to_linux_errno[ELAST + 1] = {
135 	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
136 	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
137 	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
138 	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
139 	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
140 	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
141 	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
142 	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
143 	-6, -6, -43, -42, -75, -6, -84
144 };
145 
146 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
147 	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
148 	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
149 	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
150 	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
151 	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
152 	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
153 	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
154 	0, LINUX_SIGUSR1, LINUX_SIGUSR2
155 };
156 
157 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
158 	SIGHUP, SIGINT, SIGQUIT, SIGILL,
159 	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
160 	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
161 	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
162 	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
163 	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
164 	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
165 	SIGIO, SIGURG, SIGSYS
166 };
167 
/* Value reported for trap codes that have no Linux counterpart. */
#define LINUX_T_UNKNOWN  255

/*
 * FreeBSD trap code -> Linux trap number, indexed by the FreeBSD code.
 * The table is never written, so it is const.
 */
static const int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/* 0 */
	6,			/* 1  T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/* 2 */
	3,			/* 3  T_BPTFLT */
	LINUX_T_UNKNOWN,	/* 4 */
	LINUX_T_UNKNOWN,	/* 5 */
	16,			/* 6  T_ARITHTRAP */
	254,			/* 7  T_ASTFLT */
	LINUX_T_UNKNOWN,	/* 8 */
	13,			/* 9  T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};

/*
 * Translate a FreeBSD trap code; anything outside the table yields
 * LINUX_T_UNKNOWN.  The bound is an unsigned (sizeof-based) comparison, so
 * a negative argument is also treated as out of range.
 *
 * The argument is evaluated twice: do not pass an expression with side
 * effects.
 */
#define bsd_to_linux_trapcode(code) \
    ((code)<sizeof(_bsd_to_linux_trapcode)/sizeof(*_bsd_to_linux_trapcode)? \
     _bsd_to_linux_trapcode[(code)]: \
     LINUX_T_UNKNOWN)
206 
207 struct linux32_ps_strings {
208 	u_int32_t ps_argvstr;	/* first of 0 or more argument strings */
209 	int	ps_nargvstr;	/* the number of argument strings */
210 	u_int32_t ps_envstr;	/* first of 0 or more environment strings */
211 	int	ps_nenvstr;	/* the number of environment strings */
212 };
213 
214 /*
215  * If FreeBSD & Linux have a difference of opinion about what a trap
216  * means, deal with it here.
217  *
218  * MPSAFE
219  */
220 static int
221 translate_traps(int signal, int trap_code)
222 {
223 	if (signal != SIGBUS)
224 		return signal;
225 	switch (trap_code) {
226 	case T_PROTFLT:
227 	case T_TSSFLT:
228 	case T_DOUBLEFLT:
229 	case T_PAGEFLT:
230 		return SIGSEGV;
231 	default:
232 		return signal;
233 	}
234 }
235 
236 static int
237 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
238 {
239 	Elf32_Auxargs *args;
240 	Elf32_Addr *base;
241 	Elf32_Addr *pos;
242 
243 	KASSERT(curthread->td_proc == imgp->proc &&
244 	    (curthread->td_proc->p_flag & P_SA) == 0,
245 	    ("unsafe elf_linux_fixup(), should be curproc"));
246 	base = (Elf32_Addr *)*stack_base;
247 	args = (Elf32_Auxargs *)imgp->auxargs;
248 	pos = base + (imgp->argc + imgp->envc + 2);
249 
250 	if (args->trace)
251 		AUXARGS_ENTRY_32(pos, AT_DEBUG, 1);
252 	if (args->execfd != -1)
253 		AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
254 	AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
255 	AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
256 	AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
257 	AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
258 	AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
259 	AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
260 	AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
261 	AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
262 	AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
263 	AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
264 	AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
265 	AUXARGS_ENTRY_32(pos, AT_NULL, 0);
266 
267 	free(imgp->auxargs, M_TEMP);
268 	imgp->auxargs = NULL;
269 
270 	base--;
271 	suword32(base, (uint32_t)imgp->argc);
272 	*stack_base = (register_t *)base;
273 	return 0;
274 }
275 
/* Segment selector values loaded into user contexts; defined elsewhere. */
extern int _ucodesel, _ucode32sel, _udatasel;
/* Added to the sigcode base to obtain the entry used by linux_rt_sendsig(). */
extern unsigned long linux_sznonrtsigcode;
278 
279 static void
280 linux_rt_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
281 {
282 	struct thread *td = curthread;
283 	struct proc *p = td->td_proc;
284 	struct sigacts *psp;
285 	struct trapframe *regs;
286 	struct l_rt_sigframe *fp, frame;
287 	int oonstack;
288 
289 	PROC_LOCK_ASSERT(p, MA_OWNED);
290 	psp = p->p_sigacts;
291 	mtx_assert(&psp->ps_mtx, MA_OWNED);
292 	regs = td->td_frame;
293 	oonstack = sigonstack(regs->tf_rsp);
294 
295 #ifdef DEBUG
296 	if (ldebug(rt_sendsig))
297 		printf(ARGS(rt_sendsig, "%p, %d, %p, %lu"),
298 		    catcher, sig, (void*)mask, code);
299 #endif
300 	/*
301 	 * Allocate space for the signal handler context.
302 	 */
303 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
304 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
305 		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
306 		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
307 	} else
308 		fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
309 	mtx_unlock(&psp->ps_mtx);
310 
311 	/*
312 	 * Build the argument list for the signal handler.
313 	 */
314 	if (p->p_sysent->sv_sigtbl)
315 		if (sig <= p->p_sysent->sv_sigsize)
316 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
317 
318 	bzero(&frame, sizeof(frame));
319 
320 	frame.sf_handler = PTROUT(catcher);
321 	frame.sf_sig = sig;
322 	frame.sf_siginfo = PTROUT(&fp->sf_si);
323 	frame.sf_ucontext = PTROUT(&fp->sf_sc);
324 
325 	/* Fill in POSIX parts */
326 	frame.sf_si.lsi_signo = sig;
327 	frame.sf_si.lsi_code = code;
328 	frame.sf_si.lsi_addr = PTROUT(regs->tf_err);
329 
330 	/*
331 	 * Build the signal context to be used by sigreturn.
332 	 */
333 	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
334 	frame.sf_sc.uc_link = 0;		/* XXX ??? */
335 
336 	frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
337 	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
338 	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
339 	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
340 	PROC_UNLOCK(p);
341 
342 	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
343 
344 	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
345         frame.sf_sc.uc_mcontext.sc_gs     = rgs();
346         frame.sf_sc.uc_mcontext.sc_fs     = rfs();
347         __asm __volatile("movl %%es,%0" :
348 	    "=rm" (frame.sf_sc.uc_mcontext.sc_es));
349         __asm __volatile("movl %%ds,%0" :
350 	    "=rm" (frame.sf_sc.uc_mcontext.sc_ds));
351 	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_rdi;
352 	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_rsi;
353 	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_rbp;
354 	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_rbx;
355 	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_rdx;
356 	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_rcx;
357 	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_rax;
358 	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_rip;
359 	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
360 	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
361 	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
362 	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
363 	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
364 	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
365 
366 #ifdef DEBUG
367 	if (ldebug(rt_sendsig))
368 		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
369 		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
370 		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
371 #endif
372 
373 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
374 		/*
375 		 * Process has trashed its stack; give it an illegal
376 		 * instruction to halt it in its tracks.
377 		 */
378 #ifdef DEBUG
379 		if (ldebug(rt_sendsig))
380 			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
381 			    fp, oonstack);
382 #endif
383 		PROC_LOCK(p);
384 		sigexit(td, SIGILL);
385 	}
386 
387 	/*
388 	 * Build context to run handler in.
389 	 */
390 	regs->tf_rsp = PTROUT(fp);
391 	regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
392 	    linux_sznonrtsigcode;
393 	regs->tf_rflags &= ~PSL_T;
394 	regs->tf_cs = _ucode32sel;
395 	regs->tf_ss = _udatasel;
396 	load_ds(_udatasel);
397 	td->td_pcb->pcb_ds = _udatasel;
398 	load_es(_udatasel);
399 	td->td_pcb->pcb_es = _udatasel;
400 	PROC_LOCK(p);
401 	mtx_lock(&psp->ps_mtx);
402 }
403 
404 
405 /*
406  * Send an interrupt to process.
407  *
408  * Stack is set up to allow sigcode stored
409  * in u. to call routine, followed by kcall
410  * to sigreturn routine below.  After sigreturn
411  * resets the signal mask, the stack, and the
412  * frame pointer, it returns to the user
413  * specified pc, psl.
414  */
415 static void
416 linux_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
417 {
418 	struct thread *td = curthread;
419 	struct proc *p = td->td_proc;
420 	struct sigacts *psp;
421 	struct trapframe *regs;
422 	struct l_sigframe *fp, frame;
423 	l_sigset_t lmask;
424 	int oonstack, i;
425 
426 	PROC_LOCK_ASSERT(p, MA_OWNED);
427 	psp = p->p_sigacts;
428 	mtx_assert(&psp->ps_mtx, MA_OWNED);
429 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
430 		/* Signal handler installed with SA_SIGINFO. */
431 		linux_rt_sendsig(catcher, sig, mask, code);
432 		return;
433 	}
434 
435 	regs = td->td_frame;
436 	oonstack = sigonstack(regs->tf_rsp);
437 
438 #ifdef DEBUG
439 	if (ldebug(sendsig))
440 		printf(ARGS(sendsig, "%p, %d, %p, %lu"),
441 		    catcher, sig, (void*)mask, code);
442 #endif
443 
444 	/*
445 	 * Allocate space for the signal handler context.
446 	 */
447 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
448 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
449 		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
450 		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
451 	} else
452 		fp = (struct l_sigframe *)regs->tf_rsp - 1;
453 	mtx_unlock(&psp->ps_mtx);
454 	PROC_UNLOCK(p);
455 
456 	/*
457 	 * Build the argument list for the signal handler.
458 	 */
459 	if (p->p_sysent->sv_sigtbl)
460 		if (sig <= p->p_sysent->sv_sigsize)
461 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
462 
463 	bzero(&frame, sizeof(frame));
464 
465 	frame.sf_handler = PTROUT(catcher);
466 	frame.sf_sig = sig;
467 
468 	bsd_to_linux_sigset(mask, &lmask);
469 
470 	/*
471 	 * Build the signal context to be used by sigreturn.
472 	 */
473 	frame.sf_sc.sc_mask   = lmask.__bits[0];
474         frame.sf_sc.sc_gs     = rgs();
475         frame.sf_sc.sc_fs     = rfs();
476         __asm __volatile("movl %%es,%0" : "=rm" (frame.sf_sc.sc_es));
477         __asm __volatile("movl %%ds,%0" : "=rm" (frame.sf_sc.sc_ds));
478 	frame.sf_sc.sc_edi    = regs->tf_rdi;
479 	frame.sf_sc.sc_esi    = regs->tf_rsi;
480 	frame.sf_sc.sc_ebp    = regs->tf_rbp;
481 	frame.sf_sc.sc_ebx    = regs->tf_rbx;
482 	frame.sf_sc.sc_edx    = regs->tf_rdx;
483 	frame.sf_sc.sc_ecx    = regs->tf_rcx;
484 	frame.sf_sc.sc_eax    = regs->tf_rax;
485 	frame.sf_sc.sc_eip    = regs->tf_rip;
486 	frame.sf_sc.sc_cs     = regs->tf_cs;
487 	frame.sf_sc.sc_eflags = regs->tf_rflags;
488 	frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
489 	frame.sf_sc.sc_ss     = regs->tf_ss;
490 	frame.sf_sc.sc_err    = regs->tf_err;
491 	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
492 
493 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
494 		frame.sf_extramask[i] = lmask.__bits[i+1];
495 
496 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
497 		/*
498 		 * Process has trashed its stack; give it an illegal
499 		 * instruction to halt it in its tracks.
500 		 */
501 		PROC_LOCK(p);
502 		sigexit(td, SIGILL);
503 	}
504 
505 	/*
506 	 * Build context to run handler in.
507 	 */
508 	regs->tf_rsp = PTROUT(fp);
509 	regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode);
510 	regs->tf_rflags &= ~PSL_T;
511 	regs->tf_cs = _ucode32sel;
512 	regs->tf_ss = _udatasel;
513 	load_ds(_udatasel);
514 	td->td_pcb->pcb_ds = _udatasel;
515 	load_es(_udatasel);
516 	td->td_pcb->pcb_es = _udatasel;
517 	PROC_LOCK(p);
518 	mtx_lock(&psp->ps_mtx);
519 }
520 
521 /*
522  * System call to cleanup state after a signal
523  * has been taken.  Reset signal mask and
524  * stack state from context left by sendsig (above).
525  * Return to previous pc and psl as specified by
526  * context left by sendsig. Check carefully to
527  * make sure that the user has not modified the
528  * psl to gain improper privileges or to cause
529  * a machine fault.
530  */
531 int
532 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
533 {
534 	struct proc *p = td->td_proc;
535 	struct l_sigframe frame;
536 	struct trapframe *regs;
537 	l_sigset_t lmask;
538 	int eflags, i;
539 
540 	regs = td->td_frame;
541 
542 #ifdef DEBUG
543 	if (ldebug(sigreturn))
544 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
545 #endif
546 	/*
547 	 * The trampoline code hands us the sigframe.
548 	 * It is unsafe to keep track of it ourselves, in the event that a
549 	 * program jumps out of a signal handler.
550 	 */
551 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
552 		return (EFAULT);
553 
554 	/*
555 	 * Check for security violations.
556 	 */
557 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
558 	eflags = frame.sf_sc.sc_eflags;
559 	/*
560 	 * XXX do allow users to change the privileged flag PSL_RF.  The
561 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
562 	 * sometimes set it there too.  tf_eflags is kept in the signal
563 	 * context during signal handling and there is no other place
564 	 * to remember it, so the PSL_RF bit may be corrupted by the
565 	 * signal handler without us knowing.  Corruption of the PSL_RF
566 	 * bit at worst causes one more or one less debugger trap, so
567 	 * allowing it is fairly harmless.
568 	 */
569 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
570 		return(EINVAL);
571 
572 	/*
573 	 * Don't allow users to load a valid privileged %cs.  Let the
574 	 * hardware check for invalid selectors, excess privilege in
575 	 * other selectors, invalid %eip's and invalid %esp's.
576 	 */
577 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
578 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
579 		trapsignal(td, SIGBUS, T_PROTFLT);
580 		return(EINVAL);
581 	}
582 
583 	lmask.__bits[0] = frame.sf_sc.sc_mask;
584 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
585 		lmask.__bits[i+1] = frame.sf_extramask[i];
586 	PROC_LOCK(p);
587 	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
588 	SIG_CANTMASK(td->td_sigmask);
589 	signotify(td);
590 	PROC_UNLOCK(p);
591 
592 	/*
593 	 * Restore signal context.
594 	 */
595 	/* Selectors were restored by the trampoline. */
596 	regs->tf_rdi    = frame.sf_sc.sc_edi;
597 	regs->tf_rsi    = frame.sf_sc.sc_esi;
598 	regs->tf_rbp    = frame.sf_sc.sc_ebp;
599 	regs->tf_rbx    = frame.sf_sc.sc_ebx;
600 	regs->tf_rdx    = frame.sf_sc.sc_edx;
601 	regs->tf_rcx    = frame.sf_sc.sc_ecx;
602 	regs->tf_rax    = frame.sf_sc.sc_eax;
603 	regs->tf_rip    = frame.sf_sc.sc_eip;
604 	regs->tf_cs     = frame.sf_sc.sc_cs;
605 	regs->tf_rflags = eflags;
606 	regs->tf_rsp    = frame.sf_sc.sc_esp_at_signal;
607 	regs->tf_ss     = frame.sf_sc.sc_ss;
608 
609 	return (EJUSTRETURN);
610 }
611 
612 /*
613  * System call to cleanup state after a signal
614  * has been taken.  Reset signal mask and
615  * stack state from context left by rt_sendsig (above).
616  * Return to previous pc and psl as specified by
617  * context left by sendsig. Check carefully to
618  * make sure that the user has not modified the
619  * psl to gain improper privileges or to cause
620  * a machine fault.
621  */
622 int
623 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
624 {
625 	struct proc *p = td->td_proc;
626 	struct l_ucontext uc;
627 	struct l_sigcontext *context;
628 	l_stack_t *lss;
629 	stack_t ss;
630 	struct trapframe *regs;
631 	int eflags;
632 
633 	regs = td->td_frame;
634 
635 #ifdef DEBUG
636 	if (ldebug(rt_sigreturn))
637 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
638 #endif
639 	/*
640 	 * The trampoline code hands us the ucontext.
641 	 * It is unsafe to keep track of it ourselves, in the event that a
642 	 * program jumps out of a signal handler.
643 	 */
644 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
645 		return (EFAULT);
646 
647 	context = &uc.uc_mcontext;
648 
649 	/*
650 	 * Check for security violations.
651 	 */
652 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
653 	eflags = context->sc_eflags;
654 	/*
655 	 * XXX do allow users to change the privileged flag PSL_RF.  The
656 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
657 	 * sometimes set it there too.  tf_eflags is kept in the signal
658 	 * context during signal handling and there is no other place
659 	 * to remember it, so the PSL_RF bit may be corrupted by the
660 	 * signal handler without us knowing.  Corruption of the PSL_RF
661 	 * bit at worst causes one more or one less debugger trap, so
662 	 * allowing it is fairly harmless.
663 	 */
664 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
665 		return(EINVAL);
666 
667 	/*
668 	 * Don't allow users to load a valid privileged %cs.  Let the
669 	 * hardware check for invalid selectors, excess privilege in
670 	 * other selectors, invalid %eip's and invalid %esp's.
671 	 */
672 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
673 	if (!CS_SECURE(context->sc_cs)) {
674 		trapsignal(td, SIGBUS, T_PROTFLT);
675 		return(EINVAL);
676 	}
677 
678 	PROC_LOCK(p);
679 	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
680 	SIG_CANTMASK(td->td_sigmask);
681 	signotify(td);
682 	PROC_UNLOCK(p);
683 
684 	/*
685 	 * Restore signal context
686 	 */
687 	/* Selectors were restored by the trampoline. */
688 	regs->tf_rdi    = context->sc_edi;
689 	regs->tf_rsi    = context->sc_esi;
690 	regs->tf_rbp    = context->sc_ebp;
691 	regs->tf_rbx    = context->sc_ebx;
692 	regs->tf_rdx    = context->sc_edx;
693 	regs->tf_rcx    = context->sc_ecx;
694 	regs->tf_rax    = context->sc_eax;
695 	regs->tf_rip    = context->sc_eip;
696 	regs->tf_cs     = context->sc_cs;
697 	regs->tf_rflags = eflags;
698 	regs->tf_rsp    = context->sc_esp_at_signal;
699 	regs->tf_ss     = context->sc_ss;
700 
701 	/*
702 	 * call sigaltstack & ignore results..
703 	 */
704 	lss = &uc.uc_stack;
705 	ss.ss_sp = PTRIN(lss->ss_sp);
706 	ss.ss_size = lss->ss_size;
707 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
708 
709 #ifdef DEBUG
710 	if (ldebug(rt_sigreturn))
711 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
712 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
713 #endif
714 	(void)kern_sigaltstack(td, &ss, NULL);
715 
716 	return (EJUSTRETURN);
717 }
718 
719 /*
720  * MPSAFE
721  */
722 static void
723 linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
724 {
725 	args[0] = tf->tf_rbx;
726 	args[1] = tf->tf_rcx;
727 	args[2] = tf->tf_rdx;
728 	args[3] = tf->tf_rsi;
729 	args[4] = tf->tf_rdi;
730 	args[5] = tf->tf_rbp;	/* Unconfirmed */
731 	*params = NULL;		/* no copyin */
732 }
733 
734 /*
735  * If a linux binary is exec'ing something, try this image activator
736  * first.  We override standard shell script execution in order to
737  * be able to modify the interpreter path.  We only do this if a linux
738  * binary is doing the exec, so we do not create an EXEC module for it.
739  */
740 static int	exec_linux_imgact_try(struct image_params *iparams);
741 
742 static int
743 exec_linux_imgact_try(struct image_params *imgp)
744 {
745     const char *head = (const char *)imgp->image_header;
746     int error = -1;
747 
748     /*
749      * The interpreter for shell scripts run from a linux binary needs
750      * to be located in /compat/linux if possible in order to recursively
751      * maintain linux path emulation.
752      */
753     if (((const short *)head)[0] == SHELLMAGIC) {
754 	    /*
755 	     * Run our normal shell image activator.  If it succeeds attempt
756 	     * to use the alternate path for the interpreter.  If an alternate
757 	     * path is found, use our stringspace to store it.
758 	     */
759 	    if ((error = exec_shell_imgact(imgp)) == 0) {
760 		    char *rpath = NULL;
761 
762 		    linux_emul_find(FIRST_THREAD_IN_PROC(imgp->proc), NULL,
763 			imgp->interpreter_name, &rpath, 0);
764 		    if (rpath != imgp->interpreter_name) {
765 			    int len = strlen(rpath) + 1;
766 
767 			    if (len <= MAXSHELLCMDLEN) {
768 				    memcpy(imgp->interpreter_name, rpath, len);
769 			    }
770 			    free(rpath, M_TEMP);
771 		    }
772 	    }
773     }
774     return(error);
775 }
776 
777 /*
778  * Clear registers on exec
779  * XXX copied from ia32_signal.c.
780  */
781 static void
782 exec_linux_setregs(td, entry, stack, ps_strings)
783 	struct thread *td;
784 	u_long entry;
785 	u_long stack;
786 	u_long ps_strings;
787 {
788 	struct trapframe *regs = td->td_frame;
789 	struct pcb *pcb = td->td_pcb;
790 
791 	wrmsr(MSR_FSBASE, 0);
792 	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
793 	pcb->pcb_fsbase = 0;
794 	pcb->pcb_gsbase = 0;
795 	load_ds(_udatasel);
796 	load_es(_udatasel);
797 	load_fs(_udatasel);
798 	load_gs(0);
799 	pcb->pcb_ds = _udatasel;
800 	pcb->pcb_es = _udatasel;
801 	pcb->pcb_fs = _udatasel;
802 	pcb->pcb_gs = 0;
803 
804 	bzero((char *)regs, sizeof(struct trapframe));
805 	regs->tf_rip = entry;
806 	regs->tf_rsp = stack;
807 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
808 	regs->tf_ss = _udatasel;
809 	regs->tf_cs = _ucode32sel;
810 	regs->tf_rbx = ps_strings;
811 	load_cr0(rcr0() | CR0_MP | CR0_TS);
812 
813 	/* Return via doreti so that we can change to a different %cs */
814 	pcb->pcb_flags |= PCB_FULLCTX;
815 	td->td_retval[1] = 0;
816 }
817 
818 /*
819  * XXX copied from ia32_sysvec.c.
820  */
821 static register_t *
822 linux_copyout_strings(struct image_params *imgp)
823 {
824 	int argc, envc;
825 	u_int32_t *vectp;
826 	char *stringp, *destp;
827 	u_int32_t *stack_base;
828 	struct linux32_ps_strings *arginfo;
829 	int sigcodesz;
830 
831 	/*
832 	 * Calculate string base and vector table pointers.
833 	 * Also deal with signal trampoline code for this exec type.
834 	 */
835 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
836 	sigcodesz = *(imgp->proc->p_sysent->sv_szsigcode);
837 	destp =	(caddr_t)arginfo - sigcodesz - SPARE_USRSPACE -
838 		roundup((ARG_MAX - imgp->stringspace), sizeof(char *));
839 
840 	/*
841 	 * install sigcode
842 	 */
843 	if (sigcodesz)
844 		copyout(imgp->proc->p_sysent->sv_sigcode,
845 			((caddr_t)arginfo - sigcodesz), szsigcode);
846 
847 	/*
848 	 * If we have a valid auxargs ptr, prepare some room
849 	 * on the stack.
850 	 */
851 	if (imgp->auxargs) {
852 		/*
853 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
854 		 * lower compatibility.
855 		 */
856 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size
857 			: (AT_COUNT * 2);
858 		/*
859 		 * The '+ 2' is for the null pointers at the end of each of
860 		 * the arg and env vector sets,and imgp->auxarg_size is room
861 		 * for argument of Runtime loader.
862 		 */
863 		vectp = (u_int32_t *) (destp - (imgp->argc + imgp->envc + 2 +
864 				       imgp->auxarg_size) * sizeof(u_int32_t));
865 
866 	} else
867 		/*
868 		 * The '+ 2' is for the null pointers at the end of each of
869 		 * the arg and env vector sets
870 		 */
871 		vectp = (u_int32_t *)
872 			(destp - (imgp->argc + imgp->envc + 2) * sizeof(u_int32_t));
873 
874 	/*
875 	 * vectp also becomes our initial stack base
876 	 */
877 	stack_base = vectp;
878 
879 	stringp = imgp->stringbase;
880 	argc = imgp->argc;
881 	envc = imgp->envc;
882 	/*
883 	 * Copy out strings - arguments and environment.
884 	 */
885 	copyout(stringp, destp, ARG_MAX - imgp->stringspace);
886 
887 	/*
888 	 * Fill in "ps_strings" struct for ps, w, etc.
889 	 */
890 	suword32(&arginfo->ps_argvstr, (u_int32_t)(intptr_t)vectp);
891 	suword32(&arginfo->ps_nargvstr, argc);
892 
893 	/*
894 	 * Fill in argument portion of vector table.
895 	 */
896 	for (; argc > 0; --argc) {
897 		suword32(vectp++, (u_int32_t)(intptr_t)destp);
898 		while (*stringp++ != 0)
899 			destp++;
900 		destp++;
901 	}
902 
903 	/* a null vector table pointer separates the argp's from the envp's */
904 	suword32(vectp++, 0);
905 
906 	suword32(&arginfo->ps_envstr, (u_int32_t)(intptr_t)vectp);
907 	suword32(&arginfo->ps_nenvstr, envc);
908 
909 	/*
910 	 * Fill in environment portion of vector table.
911 	 */
912 	for (; envc > 0; --envc) {
913 		suword32(vectp++, (u_int32_t)(intptr_t)destp);
914 		while (*stringp++ != 0)
915 			destp++;
916 		destp++;
917 	}
918 
919 	/* end of vector table is a null pointer */
920 	suword32(vectp, 0);
921 
922 	return ((register_t *)stack_base);
923 }
924 
925 SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
926     "32-bit Linux emulation");
927 
928 static u_long	linux32_maxdsiz = LINUX32_MAXDSIZ;
929 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
930     &linux32_maxdsiz, 0, "");
931 static u_long	linux32_maxssiz = LINUX32_MAXSSIZ;
932 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
933     &linux32_maxssiz, 0, "");
934 static u_long	linux32_maxvmem = LINUX32_MAXVMEM;
935 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
936     &linux32_maxvmem, 0, "");
937 
938 /*
939  * XXX copied from ia32_sysvec.c.
940  */
941 static void
942 linux32_fixlimits(struct image_params *imgp)
943 {
944 	struct proc *p = imgp->proc;
945 	struct plimit *oldlim, *newlim;
946 
947 	if (linux32_maxdsiz == 0 && linux32_maxssiz == 0 &&
948 	    linux32_maxvmem == 0)
949 		return;
950 	newlim = lim_alloc();
951 	PROC_LOCK(p);
952 	oldlim = p->p_limit;
953 	lim_copy(newlim, oldlim);
954 	if (linux32_maxdsiz != 0) {
955 		if (newlim->pl_rlimit[RLIMIT_DATA].rlim_cur > linux32_maxdsiz)
956 		    newlim->pl_rlimit[RLIMIT_DATA].rlim_cur = linux32_maxdsiz;
957 		if (newlim->pl_rlimit[RLIMIT_DATA].rlim_max > linux32_maxdsiz)
958 		    newlim->pl_rlimit[RLIMIT_DATA].rlim_max = linux32_maxdsiz;
959 	}
960 	if (linux32_maxssiz != 0) {
961 		if (newlim->pl_rlimit[RLIMIT_STACK].rlim_cur > linux32_maxssiz)
962 		    newlim->pl_rlimit[RLIMIT_STACK].rlim_cur = linux32_maxssiz;
963 		if (newlim->pl_rlimit[RLIMIT_STACK].rlim_max > linux32_maxssiz)
964 		    newlim->pl_rlimit[RLIMIT_STACK].rlim_max = linux32_maxssiz;
965 	}
966 	if (linux32_maxvmem != 0) {
967 		if (newlim->pl_rlimit[RLIMIT_VMEM].rlim_cur > linux32_maxvmem)
968 		    newlim->pl_rlimit[RLIMIT_VMEM].rlim_cur = linux32_maxvmem;
969 		if (newlim->pl_rlimit[RLIMIT_VMEM].rlim_max > linux32_maxvmem)
970 		    newlim->pl_rlimit[RLIMIT_VMEM].rlim_max = linux32_maxvmem;
971 	}
972 	p->p_limit = newlim;
973 	PROC_UNLOCK(p);
974 	lim_free(oldlim);
975 }
976 
977 struct sysentvec elf_linux_sysvec = {
978 	LINUX_SYS_MAXSYSCALL,
979 	linux_sysent,
980 	0xff,
981 	LINUX_SIGTBLSZ,
982 	bsd_to_linux_signal,
983 	ELAST + 1,
984 	bsd_to_linux_errno,
985 	translate_traps,
986 	elf_linux_fixup,
987 	linux_sendsig,
988 	linux_sigcode,
989 	&linux_szsigcode,
990 	linux_prepsyscall,
991 	"Linux ELF32",
992 	elf32_coredump,
993 	exec_linux_imgact_try,
994 	LINUX_MINSIGSTKSZ,
995 	PAGE_SIZE,
996 	VM_MIN_ADDRESS,
997 	LINUX32_USRSTACK,
998 	LINUX32_USRSTACK,
999 	LINUX32_PS_STRINGS,
1000 	VM_PROT_ALL,
1001 	linux_copyout_strings,
1002 	exec_linux_setregs,
1003 	linux32_fixlimits
1004 };
1005 
1006 static Elf32_Brandinfo linux_brand = {
1007 					ELFOSABI_LINUX,
1008 					EM_386,
1009 					"Linux",
1010 					"/compat/linux",
1011 					"/lib/ld-linux.so.1",
1012 					&elf_linux_sysvec,
1013 					NULL,
1014 				 };
1015 
1016 static Elf32_Brandinfo linux_glibc2brand = {
1017 					ELFOSABI_LINUX,
1018 					EM_386,
1019 					"Linux",
1020 					"/compat/linux",
1021 					"/lib/ld-linux.so.2",
1022 					&elf_linux_sysvec,
1023 					NULL,
1024 				 };
1025 
1026 Elf32_Brandinfo *linux_brandlist[] = {
1027 					&linux_brand,
1028 					&linux_glibc2brand,
1029 					NULL
1030 				};
1031 
1032 static int
1033 linux_elf_modevent(module_t mod, int type, void *data)
1034 {
1035 	Elf32_Brandinfo **brandinfo;
1036 	int error;
1037 	struct linux_ioctl_handler **lihp;
1038 
1039 	error = 0;
1040 
1041 	switch(type) {
1042 	case MOD_LOAD:
1043 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1044 		     ++brandinfo)
1045 			if (elf32_insert_brand_entry(*brandinfo) < 0)
1046 				error = EINVAL;
1047 		if (error == 0) {
1048 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1049 				linux_ioctl_register_handler(*lihp);
1050 			if (bootverbose)
1051 				printf("Linux ELF exec handler installed\n");
1052 		} else
1053 			printf("cannot insert Linux ELF brand handler\n");
1054 		break;
1055 	case MOD_UNLOAD:
1056 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1057 		     ++brandinfo)
1058 			if (elf32_brand_inuse(*brandinfo))
1059 				error = EBUSY;
1060 		if (error == 0) {
1061 			for (brandinfo = &linux_brandlist[0];
1062 			     *brandinfo != NULL; ++brandinfo)
1063 				if (elf32_remove_brand_entry(*brandinfo) < 0)
1064 					error = EINVAL;
1065 		}
1066 		if (error == 0) {
1067 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1068 				linux_ioctl_unregister_handler(*lihp);
1069 			if (bootverbose)
1070 				printf("Linux ELF exec handler removed\n");
1071 			linux_mib_destroy();
1072 		} else
1073 			printf("Could not deinstall ELF interpreter entry\n");
1074 		break;
1075 	default:
1076 		break;
1077 	}
1078 	return error;
1079 }
1080 
1081 static moduledata_t linux_elf_mod = {
1082 	"linuxelf",
1083 	linux_elf_modevent,
1084 	0
1085 };
1086 
1087 DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1088