xref: /freebsd/sys/amd64/linux32/linux32_sysvec.c (revision 1d15fdd97b0b56f4321bc3ebe7271f5f7ef29f28)
1 /*-
2  * Copyright (c) 2004 Tim J. Robbins
3  * Copyright (c) 2003 Peter Wemm
4  * Copyright (c) 2002 Doug Rabson
5  * Copyright (c) 1998-1999 Andrew Gallatin
6  * Copyright (c) 1994-1996 Søren Schmidt
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer
14  *    in this position and unchanged.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. The name of the author may not be used to endorse or promote products
19  *    derived from this software without specific prior written permission
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 
36 /* XXX we use functions that might not exist. */
37 #include "opt_compat.h"
38 
39 #ifndef COMPAT_43
40 #error "Unable to compile Linux-emulator due to missing COMPAT_43 option!"
41 #endif
42 #ifndef COMPAT_IA32
43 #error "Unable to compile Linux-emulator due to missing COMPAT_IA32 option!"
44 #endif
45 
46 #define	__ELF_WORD_SIZE	32
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/exec.h>
51 #include <sys/imgact.h>
52 #include <sys/imgact_elf.h>
53 #include <sys/kernel.h>
54 #include <sys/lock.h>
55 #include <sys/malloc.h>
56 #include <sys/module.h>
57 #include <sys/mutex.h>
58 #include <sys/proc.h>
59 #include <sys/resourcevar.h>
60 #include <sys/signalvar.h>
61 #include <sys/sysctl.h>
62 #include <sys/syscallsubr.h>
63 #include <sys/sysent.h>
64 #include <sys/sysproto.h>
65 #include <sys/vnode.h>
66 
67 #include <vm/vm.h>
68 #include <vm/pmap.h>
69 #include <vm/vm_extern.h>
70 #include <vm/vm_map.h>
71 #include <vm/vm_object.h>
72 #include <vm/vm_page.h>
73 #include <vm/vm_param.h>
74 
75 #include <machine/cpu.h>
76 #include <machine/md_var.h>
77 #include <machine/pcb.h>
78 #include <machine/specialreg.h>
79 
80 #include <amd64/linux32/linux.h>
81 #include <amd64/linux32/linux32_proto.h>
82 #include <compat/linux/linux_mib.h>
83 #include <compat/linux/linux_signal.h>
84 #include <compat/linux/linux_util.h>
85 
/*
 * Module metadata: publish this module as "linux", version 1, and declare
 * its dependencies on the sysvmsg, sysvsem and sysvshm modules.
 */
MODULE_VERSION(linux, 1);
MODULE_DEPEND(linux, sysvmsg, 1, 1, 1);
MODULE_DEPEND(linux, sysvsem, 1, 1, 1);
MODULE_DEPEND(linux, sysvshm, 1, 1, 1);

/* Allocation type M_LINUX, described as "Linux mode structures". */
MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
92 
/*
 * Emit one ELF auxiliary-vector entry: store the 32-bit 'id' and then the
 * 32-bit 'val' through 'pos' with suword32(), post-incrementing 'pos' after
 * each store.  'pos' is modified in place and appears twice in the expansion,
 * so it must be a plain lvalue without side effects.
 */
#define	AUXARGS_ENTRY_32(pos, id, val)	\
	do {				\
		suword32(pos++, id);	\
		suword32(pos++, val);	\
	} while (0)
98 
/*
 * The two characters "#!" that open an interpreter script, expressed as the
 * 16-bit value obtained when they are read as a single short in host byte
 * order.  exec_linux_imgact_try() compares it with the first short of the
 * image header.
 */
#if BYTE_ORDER == LITTLE_ENDIAN
#define SHELLMAGIC      0x2123 /* #! */
#else
#define SHELLMAGIC      0x2321
#endif

/*
 * Allow the sendsig functions to use the ldebug() facility
 * even though they are not syscalls themselves.  Both names are
 * mapped to syscall number 0, which is slightly less bogus than
 * using ldebug(sigreturn).
 */
#define	LINUX_SYS_linux_rt_sendsig	0
#define	LINUX_SYS_linux_sendsig		0
113 
/*
 * Linux signal trampoline code and its size.  Both are defined elsewhere
 * and are published through elf_linux_sysvec below.
 */
extern char linux_sigcode[];
extern int linux_szsigcode;

/* Linux system call table with LINUX_SYS_MAXSYSCALL slots (defined elsewhere). */
extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];

/* Linker set of ioctl handlers, walked by linux_elf_modevent(). */
SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);

/* Forward declarations of the hooks installed in elf_linux_sysvec. */
static int	elf_linux_fixup(register_t **stack_base,
		    struct image_params *iparams);
static register_t *linux_copyout_strings(struct image_params *imgp);
static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
		    caddr_t *params);
static void     linux_sendsig(sig_t catcher, int sig, sigset_t *mask,
		    u_long code);
static void	exec_linux_setregs(struct thread *td, u_long entry,
				   u_long stack, u_long ps_strings);
static void	linux32_fixlimits(struct image_params *imgp);
131 
/*
 * Linux syscalls return negative errno values, whereas FreeBSD uses
 * positive ones.  This table has one slot per FreeBSD errno (0 .. ELAST)
 * and holds the negated Linux errno to report instead.  It is published
 * through elf_linux_sysvec below.  The trailing comment on each row gives
 * the range of FreeBSD errno values (array indices) that row covers.
 */
static int bsd_to_linux_errno[ELAST + 1] = {
	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,	/*  0 -  9 */
	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,	/* 10 - 19 */
	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,	/* 20 - 29 */
	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,	/* 30 - 39 */
	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,	/* 40 - 49 */
	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,	/* 50 - 59 */
	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,	/* 60 - 69 */
	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,	/* 70 - 79 */
	-6, -6, -43, -42, -75, -6, -84				/* 80 - 86 */
};
146 
/*
 * FreeBSD -> Linux signal number translation.  The table is installed in
 * elf_linux_sysvec, and the sendsig routines below index the sysentvec's
 * signal table with _SIG_IDX(sig).
 * NOTE(review): the 0 entries presumably mark FreeBSD signals that have no
 * Linux counterpart -- confirm against the consumers of this table.
 */
int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
	0, LINUX_SIGUSR1, LINUX_SIGUSR2
};

/*
 * Linux -> FreeBSD signal number translation.  Not referenced in this file.
 * NOTE(review): presumably indexed the same way as the table above, i.e. by
 * Linux signal number minus one -- confirm against its users.
 */
int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
	SIGHUP, SIGINT, SIGQUIT, SIGILL,
	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
	SIGIO, SIGURG, SIGSYS
};
168 
/*
 * FreeBSD trap number -> Linux trap number.  The array index is the FreeBSD
 * trap code; each row's comment gives that index and, where a mapping
 * exists, the FreeBSD T_* name.  Slots without a mapping hold
 * LINUX_T_UNKNOWN.
 */
#define LINUX_T_UNKNOWN  255
static int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/* 0 */
	6,			/* 1  T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/* 2 */
	3,			/* 3  T_BPTFLT */
	LINUX_T_UNKNOWN,	/* 4 */
	LINUX_T_UNKNOWN,	/* 5 */
	16,			/* 6  T_ARITHTRAP */
	254,			/* 7  T_ASTFLT */
	LINUX_T_UNKNOWN,	/* 8 */
	13,			/* 9  T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};
/*
 * Bounds-checked lookup in _bsd_to_linux_trapcode[]: a code at or beyond the
 * end of the table yields LINUX_T_UNKNOWN.  'code' is evaluated twice.
 */
#define bsd_to_linux_trapcode(code) \
    ((code)<sizeof(_bsd_to_linux_trapcode)/sizeof(*_bsd_to_linux_trapcode)? \
     _bsd_to_linux_trapcode[(code)]: \
     LINUX_T_UNKNOWN)
207 
/*
 * Layout of the "ps_strings" record that linux_copyout_strings() fills in
 * at LINUX32_PS_STRINGS.  The two vector addresses are stored as 32-bit
 * values rather than native pointers.
 */
struct linux32_ps_strings {
	u_int32_t ps_argvstr;	/* first of 0 or more argument strings */
	int	ps_nargvstr;	/* the number of argument strings */
	u_int32_t ps_envstr;	/* first of 0 or more environment strings */
	int	ps_nenvstr;	/* the number of environment strings */
};
214 
215 /*
216  * If FreeBSD & Linux have a difference of opinion about what a trap
217  * means, deal with it here.
218  *
219  * MPSAFE
220  */
221 static int
222 translate_traps(int signal, int trap_code)
223 {
224 	if (signal != SIGBUS)
225 		return signal;
226 	switch (trap_code) {
227 	case T_PROTFLT:
228 	case T_TSSFLT:
229 	case T_DOUBLEFLT:
230 	case T_PAGEFLT:
231 		return SIGSEGV;
232 	default:
233 		return signal;
234 	}
235 }
236 
237 static int
238 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
239 {
240 	Elf32_Auxargs *args;
241 	Elf32_Addr *base;
242 	Elf32_Addr *pos;
243 
244 	KASSERT(curthread->td_proc == imgp->proc &&
245 	    (curthread->td_proc->p_flag & P_SA) == 0,
246 	    ("unsafe elf_linux_fixup(), should be curproc"));
247 	base = (Elf32_Addr *)*stack_base;
248 	args = (Elf32_Auxargs *)imgp->auxargs;
249 	pos = base + (imgp->args->argc + imgp->args->envc + 2);
250 
251 	if (args->trace)
252 		AUXARGS_ENTRY_32(pos, AT_DEBUG, 1);
253 	if (args->execfd != -1)
254 		AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
255 	AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
256 	AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
257 	AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
258 	AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
259 	AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
260 	AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
261 	AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
262 	AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
263 	AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
264 	AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
265 	AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
266 	AUXARGS_ENTRY_32(pos, AT_NULL, 0);
267 
268 	free(imgp->auxargs, M_TEMP);
269 	imgp->auxargs = NULL;
270 
271 	base--;
272 	suword32(base, (uint32_t)imgp->args->argc);
273 	*stack_base = (register_t *)base;
274 	return 0;
275 }
276 
/*
 * Segment selectors loaded into the trapframe and segment registers by the
 * signal-delivery and exec code below (defined elsewhere).
 */
extern int _ucodesel, _ucode32sel, _udatasel;
/*
 * Offset added to the start of the signal trampoline by linux_rt_sendsig()
 * to reach the entry point used for SA_SIGINFO handlers (defined elsewhere).
 */
extern unsigned long linux_sznonrtsigcode;
279 
280 static void
281 linux_rt_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
282 {
283 	struct thread *td = curthread;
284 	struct proc *p = td->td_proc;
285 	struct sigacts *psp;
286 	struct trapframe *regs;
287 	struct l_rt_sigframe *fp, frame;
288 	int oonstack;
289 
290 	PROC_LOCK_ASSERT(p, MA_OWNED);
291 	psp = p->p_sigacts;
292 	mtx_assert(&psp->ps_mtx, MA_OWNED);
293 	regs = td->td_frame;
294 	oonstack = sigonstack(regs->tf_rsp);
295 
296 #ifdef DEBUG
297 	if (ldebug(rt_sendsig))
298 		printf(ARGS(rt_sendsig, "%p, %d, %p, %lu"),
299 		    catcher, sig, (void*)mask, code);
300 #endif
301 	/*
302 	 * Allocate space for the signal handler context.
303 	 */
304 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
305 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
306 		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
307 		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
308 	} else
309 		fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
310 	mtx_unlock(&psp->ps_mtx);
311 
312 	/*
313 	 * Build the argument list for the signal handler.
314 	 */
315 	if (p->p_sysent->sv_sigtbl)
316 		if (sig <= p->p_sysent->sv_sigsize)
317 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
318 
319 	bzero(&frame, sizeof(frame));
320 
321 	frame.sf_handler = PTROUT(catcher);
322 	frame.sf_sig = sig;
323 	frame.sf_siginfo = PTROUT(&fp->sf_si);
324 	frame.sf_ucontext = PTROUT(&fp->sf_sc);
325 
326 	/* Fill in POSIX parts */
327 	frame.sf_si.lsi_signo = sig;
328 	frame.sf_si.lsi_code = code;
329 	frame.sf_si.lsi_addr = PTROUT(regs->tf_err);
330 
331 	/*
332 	 * Build the signal context to be used by sigreturn.
333 	 */
334 	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
335 	frame.sf_sc.uc_link = 0;		/* XXX ??? */
336 
337 	frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
338 	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
339 	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
340 	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
341 	PROC_UNLOCK(p);
342 
343 	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
344 
345 	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
346         frame.sf_sc.uc_mcontext.sc_gs     = rgs();
347         frame.sf_sc.uc_mcontext.sc_fs     = rfs();
348         __asm __volatile("movl %%es,%0" :
349 	    "=rm" (frame.sf_sc.uc_mcontext.sc_es));
350         __asm __volatile("movl %%ds,%0" :
351 	    "=rm" (frame.sf_sc.uc_mcontext.sc_ds));
352 	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_rdi;
353 	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_rsi;
354 	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_rbp;
355 	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_rbx;
356 	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_rdx;
357 	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_rcx;
358 	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_rax;
359 	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_rip;
360 	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
361 	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
362 	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
363 	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
364 	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
365 	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
366 
367 #ifdef DEBUG
368 	if (ldebug(rt_sendsig))
369 		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
370 		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
371 		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
372 #endif
373 
374 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
375 		/*
376 		 * Process has trashed its stack; give it an illegal
377 		 * instruction to halt it in its tracks.
378 		 */
379 #ifdef DEBUG
380 		if (ldebug(rt_sendsig))
381 			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
382 			    fp, oonstack);
383 #endif
384 		PROC_LOCK(p);
385 		sigexit(td, SIGILL);
386 	}
387 
388 	/*
389 	 * Build context to run handler in.
390 	 */
391 	regs->tf_rsp = PTROUT(fp);
392 	regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
393 	    linux_sznonrtsigcode;
394 	regs->tf_rflags &= ~PSL_T;
395 	regs->tf_cs = _ucode32sel;
396 	regs->tf_ss = _udatasel;
397 	load_ds(_udatasel);
398 	td->td_pcb->pcb_ds = _udatasel;
399 	load_es(_udatasel);
400 	td->td_pcb->pcb_es = _udatasel;
401 	PROC_LOCK(p);
402 	mtx_lock(&psp->ps_mtx);
403 }
404 
405 
406 /*
407  * Send an interrupt to process.
408  *
409  * Stack is set up to allow sigcode stored
410  * in u. to call routine, followed by kcall
411  * to sigreturn routine below.  After sigreturn
412  * resets the signal mask, the stack, and the
413  * frame pointer, it returns to the user
414  * specified pc, psl.
415  */
416 static void
417 linux_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
418 {
419 	struct thread *td = curthread;
420 	struct proc *p = td->td_proc;
421 	struct sigacts *psp;
422 	struct trapframe *regs;
423 	struct l_sigframe *fp, frame;
424 	l_sigset_t lmask;
425 	int oonstack, i;
426 
427 	PROC_LOCK_ASSERT(p, MA_OWNED);
428 	psp = p->p_sigacts;
429 	mtx_assert(&psp->ps_mtx, MA_OWNED);
430 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
431 		/* Signal handler installed with SA_SIGINFO. */
432 		linux_rt_sendsig(catcher, sig, mask, code);
433 		return;
434 	}
435 
436 	regs = td->td_frame;
437 	oonstack = sigonstack(regs->tf_rsp);
438 
439 #ifdef DEBUG
440 	if (ldebug(sendsig))
441 		printf(ARGS(sendsig, "%p, %d, %p, %lu"),
442 		    catcher, sig, (void*)mask, code);
443 #endif
444 
445 	/*
446 	 * Allocate space for the signal handler context.
447 	 */
448 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
449 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
450 		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
451 		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
452 	} else
453 		fp = (struct l_sigframe *)regs->tf_rsp - 1;
454 	mtx_unlock(&psp->ps_mtx);
455 	PROC_UNLOCK(p);
456 
457 	/*
458 	 * Build the argument list for the signal handler.
459 	 */
460 	if (p->p_sysent->sv_sigtbl)
461 		if (sig <= p->p_sysent->sv_sigsize)
462 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
463 
464 	bzero(&frame, sizeof(frame));
465 
466 	frame.sf_handler = PTROUT(catcher);
467 	frame.sf_sig = sig;
468 
469 	bsd_to_linux_sigset(mask, &lmask);
470 
471 	/*
472 	 * Build the signal context to be used by sigreturn.
473 	 */
474 	frame.sf_sc.sc_mask   = lmask.__bits[0];
475         frame.sf_sc.sc_gs     = rgs();
476         frame.sf_sc.sc_fs     = rfs();
477         __asm __volatile("movl %%es,%0" : "=rm" (frame.sf_sc.sc_es));
478         __asm __volatile("movl %%ds,%0" : "=rm" (frame.sf_sc.sc_ds));
479 	frame.sf_sc.sc_edi    = regs->tf_rdi;
480 	frame.sf_sc.sc_esi    = regs->tf_rsi;
481 	frame.sf_sc.sc_ebp    = regs->tf_rbp;
482 	frame.sf_sc.sc_ebx    = regs->tf_rbx;
483 	frame.sf_sc.sc_edx    = regs->tf_rdx;
484 	frame.sf_sc.sc_ecx    = regs->tf_rcx;
485 	frame.sf_sc.sc_eax    = regs->tf_rax;
486 	frame.sf_sc.sc_eip    = regs->tf_rip;
487 	frame.sf_sc.sc_cs     = regs->tf_cs;
488 	frame.sf_sc.sc_eflags = regs->tf_rflags;
489 	frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
490 	frame.sf_sc.sc_ss     = regs->tf_ss;
491 	frame.sf_sc.sc_err    = regs->tf_err;
492 	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
493 
494 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
495 		frame.sf_extramask[i] = lmask.__bits[i+1];
496 
497 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
498 		/*
499 		 * Process has trashed its stack; give it an illegal
500 		 * instruction to halt it in its tracks.
501 		 */
502 		PROC_LOCK(p);
503 		sigexit(td, SIGILL);
504 	}
505 
506 	/*
507 	 * Build context to run handler in.
508 	 */
509 	regs->tf_rsp = PTROUT(fp);
510 	regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode);
511 	regs->tf_rflags &= ~PSL_T;
512 	regs->tf_cs = _ucode32sel;
513 	regs->tf_ss = _udatasel;
514 	load_ds(_udatasel);
515 	td->td_pcb->pcb_ds = _udatasel;
516 	load_es(_udatasel);
517 	td->td_pcb->pcb_es = _udatasel;
518 	PROC_LOCK(p);
519 	mtx_lock(&psp->ps_mtx);
520 }
521 
522 /*
523  * System call to cleanup state after a signal
524  * has been taken.  Reset signal mask and
525  * stack state from context left by sendsig (above).
526  * Return to previous pc and psl as specified by
527  * context left by sendsig. Check carefully to
528  * make sure that the user has not modified the
529  * psl to gain improper privileges or to cause
530  * a machine fault.
531  */
532 int
533 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
534 {
535 	struct proc *p = td->td_proc;
536 	struct l_sigframe frame;
537 	struct trapframe *regs;
538 	l_sigset_t lmask;
539 	int eflags, i;
540 
541 	regs = td->td_frame;
542 
543 #ifdef DEBUG
544 	if (ldebug(sigreturn))
545 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
546 #endif
547 	/*
548 	 * The trampoline code hands us the sigframe.
549 	 * It is unsafe to keep track of it ourselves, in the event that a
550 	 * program jumps out of a signal handler.
551 	 */
552 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
553 		return (EFAULT);
554 
555 	/*
556 	 * Check for security violations.
557 	 */
558 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
559 	eflags = frame.sf_sc.sc_eflags;
560 	/*
561 	 * XXX do allow users to change the privileged flag PSL_RF.  The
562 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
563 	 * sometimes set it there too.  tf_eflags is kept in the signal
564 	 * context during signal handling and there is no other place
565 	 * to remember it, so the PSL_RF bit may be corrupted by the
566 	 * signal handler without us knowing.  Corruption of the PSL_RF
567 	 * bit at worst causes one more or one less debugger trap, so
568 	 * allowing it is fairly harmless.
569 	 */
570 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
571 		return(EINVAL);
572 
573 	/*
574 	 * Don't allow users to load a valid privileged %cs.  Let the
575 	 * hardware check for invalid selectors, excess privilege in
576 	 * other selectors, invalid %eip's and invalid %esp's.
577 	 */
578 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
579 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
580 		trapsignal(td, SIGBUS, T_PROTFLT);
581 		return(EINVAL);
582 	}
583 
584 	lmask.__bits[0] = frame.sf_sc.sc_mask;
585 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
586 		lmask.__bits[i+1] = frame.sf_extramask[i];
587 	PROC_LOCK(p);
588 	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
589 	SIG_CANTMASK(td->td_sigmask);
590 	signotify(td);
591 	PROC_UNLOCK(p);
592 
593 	/*
594 	 * Restore signal context.
595 	 */
596 	/* Selectors were restored by the trampoline. */
597 	regs->tf_rdi    = frame.sf_sc.sc_edi;
598 	regs->tf_rsi    = frame.sf_sc.sc_esi;
599 	regs->tf_rbp    = frame.sf_sc.sc_ebp;
600 	regs->tf_rbx    = frame.sf_sc.sc_ebx;
601 	regs->tf_rdx    = frame.sf_sc.sc_edx;
602 	regs->tf_rcx    = frame.sf_sc.sc_ecx;
603 	regs->tf_rax    = frame.sf_sc.sc_eax;
604 	regs->tf_rip    = frame.sf_sc.sc_eip;
605 	regs->tf_cs     = frame.sf_sc.sc_cs;
606 	regs->tf_rflags = eflags;
607 	regs->tf_rsp    = frame.sf_sc.sc_esp_at_signal;
608 	regs->tf_ss     = frame.sf_sc.sc_ss;
609 
610 	return (EJUSTRETURN);
611 }
612 
613 /*
614  * System call to cleanup state after a signal
615  * has been taken.  Reset signal mask and
616  * stack state from context left by rt_sendsig (above).
617  * Return to previous pc and psl as specified by
618  * context left by sendsig. Check carefully to
619  * make sure that the user has not modified the
620  * psl to gain improper privileges or to cause
621  * a machine fault.
622  */
623 int
624 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
625 {
626 	struct proc *p = td->td_proc;
627 	struct l_ucontext uc;
628 	struct l_sigcontext *context;
629 	l_stack_t *lss;
630 	stack_t ss;
631 	struct trapframe *regs;
632 	int eflags;
633 
634 	regs = td->td_frame;
635 
636 #ifdef DEBUG
637 	if (ldebug(rt_sigreturn))
638 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
639 #endif
640 	/*
641 	 * The trampoline code hands us the ucontext.
642 	 * It is unsafe to keep track of it ourselves, in the event that a
643 	 * program jumps out of a signal handler.
644 	 */
645 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
646 		return (EFAULT);
647 
648 	context = &uc.uc_mcontext;
649 
650 	/*
651 	 * Check for security violations.
652 	 */
653 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
654 	eflags = context->sc_eflags;
655 	/*
656 	 * XXX do allow users to change the privileged flag PSL_RF.  The
657 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
658 	 * sometimes set it there too.  tf_eflags is kept in the signal
659 	 * context during signal handling and there is no other place
660 	 * to remember it, so the PSL_RF bit may be corrupted by the
661 	 * signal handler without us knowing.  Corruption of the PSL_RF
662 	 * bit at worst causes one more or one less debugger trap, so
663 	 * allowing it is fairly harmless.
664 	 */
665 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
666 		return(EINVAL);
667 
668 	/*
669 	 * Don't allow users to load a valid privileged %cs.  Let the
670 	 * hardware check for invalid selectors, excess privilege in
671 	 * other selectors, invalid %eip's and invalid %esp's.
672 	 */
673 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
674 	if (!CS_SECURE(context->sc_cs)) {
675 		trapsignal(td, SIGBUS, T_PROTFLT);
676 		return(EINVAL);
677 	}
678 
679 	PROC_LOCK(p);
680 	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
681 	SIG_CANTMASK(td->td_sigmask);
682 	signotify(td);
683 	PROC_UNLOCK(p);
684 
685 	/*
686 	 * Restore signal context
687 	 */
688 	/* Selectors were restored by the trampoline. */
689 	regs->tf_rdi    = context->sc_edi;
690 	regs->tf_rsi    = context->sc_esi;
691 	regs->tf_rbp    = context->sc_ebp;
692 	regs->tf_rbx    = context->sc_ebx;
693 	regs->tf_rdx    = context->sc_edx;
694 	regs->tf_rcx    = context->sc_ecx;
695 	regs->tf_rax    = context->sc_eax;
696 	regs->tf_rip    = context->sc_eip;
697 	regs->tf_cs     = context->sc_cs;
698 	regs->tf_rflags = eflags;
699 	regs->tf_rsp    = context->sc_esp_at_signal;
700 	regs->tf_ss     = context->sc_ss;
701 
702 	/*
703 	 * call sigaltstack & ignore results..
704 	 */
705 	lss = &uc.uc_stack;
706 	ss.ss_sp = PTRIN(lss->ss_sp);
707 	ss.ss_size = lss->ss_size;
708 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
709 
710 #ifdef DEBUG
711 	if (ldebug(rt_sigreturn))
712 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
713 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
714 #endif
715 	(void)kern_sigaltstack(td, &ss, NULL);
716 
717 	return (EJUSTRETURN);
718 }
719 
/*
 * Gather the system call arguments from the trapframe.
 *
 * The six arguments are taken, in order, from the saved rbx, rcx, rdx,
 * rsi, rdi and rbp registers and stored into args[0..5] (each value is
 * converted to int).  *params is set to NULL to signal that no copyin of
 * arguments is needed.  'code' is not touched.
 *
 * MPSAFE
 */
static void
linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
{
	args[0] = tf->tf_rbx;
	args[1] = tf->tf_rcx;
	args[2] = tf->tf_rdx;
	args[3] = tf->tf_rsi;
	args[4] = tf->tf_rdi;
	args[5] = tf->tf_rbp;	/* Unconfirmed */
	*params = NULL;		/* no copyin */
}
734 
735 /*
736  * If a linux binary is exec'ing something, try this image activator
737  * first.  We override standard shell script execution in order to
738  * be able to modify the interpreter path.  We only do this if a linux
739  * binary is doing the exec, so we do not create an EXEC module for it.
740  */
741 static int	exec_linux_imgact_try(struct image_params *iparams);
742 
743 static int
744 exec_linux_imgact_try(struct image_params *imgp)
745 {
746     const char *head = (const char *)imgp->image_header;
747     char *rpath;
748     int error = -1, len;
749 
750     /*
751      * The interpreter for shell scripts run from a linux binary needs
752      * to be located in /compat/linux if possible in order to recursively
753      * maintain linux path emulation.
754      */
755     if (((const short *)head)[0] == SHELLMAGIC) {
756 	    /*
757 	     * Run our normal shell image activator.  If it succeeds attempt
758 	     * to use the alternate path for the interpreter.  If an alternate
759 	     * path is found, use our stringspace to store it.
760 	     */
761 	    if ((error = exec_shell_imgact(imgp)) == 0) {
762 		    linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
763 			imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0);
764 		    if (rpath != NULL) {
765 			    len = strlen(rpath) + 1;
766 
767 			    if (len <= MAXSHELLCMDLEN) {
768 				    memcpy(imgp->interpreter_name, rpath, len);
769 			    }
770 			    free(rpath, M_TEMP);
771 		    }
772 	    }
773     }
774     return(error);
775 }
776 
777 /*
778  * Clear registers on exec
779  * XXX copied from ia32_signal.c.
780  */
781 static void
782 exec_linux_setregs(td, entry, stack, ps_strings)
783 	struct thread *td;
784 	u_long entry;
785 	u_long stack;
786 	u_long ps_strings;
787 {
788 	struct trapframe *regs = td->td_frame;
789 	struct pcb *pcb = td->td_pcb;
790 
791 	wrmsr(MSR_FSBASE, 0);
792 	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
793 	pcb->pcb_fsbase = 0;
794 	pcb->pcb_gsbase = 0;
795 	load_ds(_udatasel);
796 	load_es(_udatasel);
797 	load_fs(_udatasel);
798 	load_gs(0);
799 	pcb->pcb_ds = _udatasel;
800 	pcb->pcb_es = _udatasel;
801 	pcb->pcb_fs = _udatasel;
802 	pcb->pcb_gs = 0;
803 
804 	bzero((char *)regs, sizeof(struct trapframe));
805 	regs->tf_rip = entry;
806 	regs->tf_rsp = stack;
807 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
808 	regs->tf_ss = _udatasel;
809 	regs->tf_cs = _ucode32sel;
810 	regs->tf_rbx = ps_strings;
811 	load_cr0(rcr0() | CR0_MP | CR0_TS);
812 
813 	/* Return via doreti so that we can change to a different %cs */
814 	pcb->pcb_flags |= PCB_FULLCTX;
815 	td->td_retval[1] = 0;
816 }
817 
818 /*
819  * XXX copied from ia32_sysvec.c.
820  */
821 static register_t *
822 linux_copyout_strings(struct image_params *imgp)
823 {
824 	int argc, envc;
825 	u_int32_t *vectp;
826 	char *stringp, *destp;
827 	u_int32_t *stack_base;
828 	struct linux32_ps_strings *arginfo;
829 	int sigcodesz;
830 
831 	/*
832 	 * Calculate string base and vector table pointers.
833 	 * Also deal with signal trampoline code for this exec type.
834 	 */
835 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
836 	sigcodesz = *(imgp->proc->p_sysent->sv_szsigcode);
837 	destp =	(caddr_t)arginfo - sigcodesz - SPARE_USRSPACE -
838 		roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
839 
840 	/*
841 	 * install sigcode
842 	 */
843 	if (sigcodesz)
844 		copyout(imgp->proc->p_sysent->sv_sigcode,
845 			((caddr_t)arginfo - sigcodesz), szsigcode);
846 
847 	/*
848 	 * If we have a valid auxargs ptr, prepare some room
849 	 * on the stack.
850 	 */
851 	if (imgp->auxargs) {
852 		/*
853 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
854 		 * lower compatibility.
855 		 */
856 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size
857 			: (AT_COUNT * 2);
858 		/*
859 		 * The '+ 2' is for the null pointers at the end of each of
860 		 * the arg and env vector sets,and imgp->auxarg_size is room
861 		 * for argument of Runtime loader.
862 		 */
863 		vectp = (u_int32_t *) (destp - (imgp->args->argc + imgp->args->envc + 2 +
864 				       imgp->auxarg_size) * sizeof(u_int32_t));
865 
866 	} else
867 		/*
868 		 * The '+ 2' is for the null pointers at the end of each of
869 		 * the arg and env vector sets
870 		 */
871 		vectp = (u_int32_t *)
872 			(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(u_int32_t));
873 
874 	/*
875 	 * vectp also becomes our initial stack base
876 	 */
877 	stack_base = vectp;
878 
879 	stringp = imgp->args->begin_argv;
880 	argc = imgp->args->argc;
881 	envc = imgp->args->envc;
882 	/*
883 	 * Copy out strings - arguments and environment.
884 	 */
885 	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
886 
887 	/*
888 	 * Fill in "ps_strings" struct for ps, w, etc.
889 	 */
890 	suword32(&arginfo->ps_argvstr, (u_int32_t)(intptr_t)vectp);
891 	suword32(&arginfo->ps_nargvstr, argc);
892 
893 	/*
894 	 * Fill in argument portion of vector table.
895 	 */
896 	for (; argc > 0; --argc) {
897 		suword32(vectp++, (u_int32_t)(intptr_t)destp);
898 		while (*stringp++ != 0)
899 			destp++;
900 		destp++;
901 	}
902 
903 	/* a null vector table pointer separates the argp's from the envp's */
904 	suword32(vectp++, 0);
905 
906 	suword32(&arginfo->ps_envstr, (u_int32_t)(intptr_t)vectp);
907 	suword32(&arginfo->ps_nenvstr, envc);
908 
909 	/*
910 	 * Fill in environment portion of vector table.
911 	 */
912 	for (; envc > 0; --envc) {
913 		suword32(vectp++, (u_int32_t)(intptr_t)destp);
914 		while (*stringp++ != 0)
915 			destp++;
916 		destp++;
917 	}
918 
919 	/* end of vector table is a null pointer */
920 	suword32(vectp, 0);
921 
922 	return ((register_t *)stack_base);
923 }
924 
/* sysctl subtree compat.linux32 holding the tunables below. */
SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
    "32-bit Linux emulation");

/*
 * Read/write caps that linux32_fixlimits() applies to RLIMIT_DATA,
 * RLIMIT_STACK and RLIMIT_VMEM respectively.  A value of 0 leaves the
 * corresponding limit alone.
 */
static u_long	linux32_maxdsiz = LINUX32_MAXDSIZ;
SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
    &linux32_maxdsiz, 0, "");
static u_long	linux32_maxssiz = LINUX32_MAXSSIZ;
SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
    &linux32_maxssiz, 0, "");
static u_long	linux32_maxvmem = LINUX32_MAXVMEM;
SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
    &linux32_maxvmem, 0, "");
937 
938 /*
939  * XXX copied from ia32_sysvec.c.
940  */
941 static void
942 linux32_fixlimits(struct image_params *imgp)
943 {
944 	struct proc *p = imgp->proc;
945 	struct plimit *oldlim, *newlim;
946 
947 	if (linux32_maxdsiz == 0 && linux32_maxssiz == 0 &&
948 	    linux32_maxvmem == 0)
949 		return;
950 	newlim = lim_alloc();
951 	PROC_LOCK(p);
952 	oldlim = p->p_limit;
953 	lim_copy(newlim, oldlim);
954 	if (linux32_maxdsiz != 0) {
955 		if (newlim->pl_rlimit[RLIMIT_DATA].rlim_cur > linux32_maxdsiz)
956 		    newlim->pl_rlimit[RLIMIT_DATA].rlim_cur = linux32_maxdsiz;
957 		if (newlim->pl_rlimit[RLIMIT_DATA].rlim_max > linux32_maxdsiz)
958 		    newlim->pl_rlimit[RLIMIT_DATA].rlim_max = linux32_maxdsiz;
959 	}
960 	if (linux32_maxssiz != 0) {
961 		if (newlim->pl_rlimit[RLIMIT_STACK].rlim_cur > linux32_maxssiz)
962 		    newlim->pl_rlimit[RLIMIT_STACK].rlim_cur = linux32_maxssiz;
963 		if (newlim->pl_rlimit[RLIMIT_STACK].rlim_max > linux32_maxssiz)
964 		    newlim->pl_rlimit[RLIMIT_STACK].rlim_max = linux32_maxssiz;
965 	}
966 	if (linux32_maxvmem != 0) {
967 		if (newlim->pl_rlimit[RLIMIT_VMEM].rlim_cur > linux32_maxvmem)
968 		    newlim->pl_rlimit[RLIMIT_VMEM].rlim_cur = linux32_maxvmem;
969 		if (newlim->pl_rlimit[RLIMIT_VMEM].rlim_max > linux32_maxvmem)
970 		    newlim->pl_rlimit[RLIMIT_VMEM].rlim_max = linux32_maxvmem;
971 	}
972 	p->p_limit = newlim;
973 	PROC_UNLOCK(p);
974 	lim_free(oldlim);
975 }
976 
/*
 * System entry vector describing the 32-bit Linux ELF ABI; both brand
 * descriptors below point at it.
 *
 * The initializer is positional.  The per-slot notes describe the values
 * given here; which member each slot initializes is determined by struct
 * sysentvec, which is not visible in this file -- TODO confirm the order
 * against <sys/sysent.h> before relying on these notes.
 */
struct sysentvec elf_linux_sysvec = {
	LINUX_SYS_MAXSYSCALL,	/* number of slots in linux_sysent[] */
	linux_sysent,		/* system call table */
	0xff,			/* presumably a syscall number mask -- confirm */
	LINUX_SIGTBLSZ,		/* number of slots in the signal table */
	bsd_to_linux_signal,	/* FreeBSD -> Linux signal numbers */
	ELAST + 1,		/* number of slots in the errno table */
	bsd_to_linux_errno,	/* FreeBSD -> Linux errno values */
	translate_traps,	/* trap -> signal fix-ups */
	elf_linux_fixup,	/* initial stack fix-up at exec */
	linux_sendsig,		/* signal delivery */
	linux_sigcode,		/* signal trampoline code */
	&linux_szsigcode,	/* size of the signal trampoline */
	linux_prepsyscall,	/* syscall argument fetch */
	"Linux ELF32",		/* ABI name */
	elf32_coredump,		/* core dump routine */
	exec_linux_imgact_try,	/* first-chance image activator */
	LINUX_MINSIGSTKSZ,	/* presumably minimum signal stack size */
	PAGE_SIZE,		/* page size */
	VM_MIN_ADDRESS,		/* presumably lowest user address */
	LINUX32_USRSTACK,	/* presumably highest user address */
	LINUX32_USRSTACK,	/* presumably top of the user stack */
	LINUX32_PS_STRINGS,	/* address of the ps_strings record */
	VM_PROT_ALL,		/* presumably stack protection -- confirm */
	linux_copyout_strings,	/* argv/envp copy-out at exec */
	exec_linux_setregs,	/* register setup at exec */
	linux32_fixlimits	/* resource limit clamping at exec */
};
1005 
/*
 * ELF brand descriptors for Linux/i386 binaries.  The two entries are
 * identical except for the path string: "/lib/ld-linux.so.1" here and
 * "/lib/ld-linux.so.2" in the next one.
 * NOTE(review): the initializers are positional; the strings presumably
 * are the brand name, the emulation root and the runtime loader path --
 * confirm against the Elf32_Brandinfo definition.
 */
static Elf32_Brandinfo linux_brand = {
					ELFOSABI_LINUX,
					EM_386,
					"Linux",
					"/compat/linux",
					"/lib/ld-linux.so.1",
					&elf_linux_sysvec,
					NULL,
				 };

static Elf32_Brandinfo linux_glibc2brand = {
					ELFOSABI_LINUX,
					EM_386,
					"Linux",
					"/compat/linux",
					"/lib/ld-linux.so.2",
					&elf_linux_sysvec,
					NULL,
				 };

/*
 * NULL-terminated list of the brands above; linux_elf_modevent() walks it
 * to insert and remove them.
 */
Elf32_Brandinfo *linux_brandlist[] = {
					&linux_brand,
					&linux_glibc2brand,
					NULL
				};
1031 
1032 static int
1033 linux_elf_modevent(module_t mod, int type, void *data)
1034 {
1035 	Elf32_Brandinfo **brandinfo;
1036 	int error;
1037 	struct linux_ioctl_handler **lihp;
1038 
1039 	error = 0;
1040 
1041 	switch(type) {
1042 	case MOD_LOAD:
1043 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1044 		     ++brandinfo)
1045 			if (elf32_insert_brand_entry(*brandinfo) < 0)
1046 				error = EINVAL;
1047 		if (error == 0) {
1048 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1049 				linux_ioctl_register_handler(*lihp);
1050 			if (bootverbose)
1051 				printf("Linux ELF exec handler installed\n");
1052 		} else
1053 			printf("cannot insert Linux ELF brand handler\n");
1054 		break;
1055 	case MOD_UNLOAD:
1056 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1057 		     ++brandinfo)
1058 			if (elf32_brand_inuse(*brandinfo))
1059 				error = EBUSY;
1060 		if (error == 0) {
1061 			for (brandinfo = &linux_brandlist[0];
1062 			     *brandinfo != NULL; ++brandinfo)
1063 				if (elf32_remove_brand_entry(*brandinfo) < 0)
1064 					error = EINVAL;
1065 		}
1066 		if (error == 0) {
1067 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1068 				linux_ioctl_unregister_handler(*lihp);
1069 			if (bootverbose)
1070 				printf("Linux ELF exec handler removed\n");
1071 			linux_mib_destroy();
1072 		} else
1073 			printf("Could not deinstall ELF interpreter entry\n");
1074 		break;
1075 	default:
1076 		break;
1077 	}
1078 	return error;
1079 }
1080 
/*
 * Module descriptor: name "linuxelf", event handler linux_elf_modevent(),
 * and 0 as the third initializer.
 */
static moduledata_t linux_elf_mod = {
	"linuxelf",
	linux_elf_modevent,
	0
};

/* Register the module at SI_SUB_EXEC with order SI_ORDER_ANY. */
DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1088