xref: /freebsd/sys/amd64/linux32/linux32_sysvec.c (revision ca72f67d709dd56713e0b0161e15be055cbe2707)
1 /*-
2  * Copyright (c) 2004 Tim J. Robbins
3  * Copyright (c) 2003 Peter Wemm
4  * Copyright (c) 2002 Doug Rabson
5  * Copyright (c) 1998-1999 Andrew Gallatin
6  * Copyright (c) 1994-1996 Søren Schmidt
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer
14  *    in this position and unchanged.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. The name of the author may not be used to endorse or promote products
19  *    derived from this software without specific prior written permission
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 
36 /* XXX we use functions that might not exist. */
37 #include "opt_compat.h"
38 
39 #ifndef COMPAT_43
40 #error "Unable to compile Linux-emulator due to missing COMPAT_43 option!"
41 #endif
42 #ifndef COMPAT_IA32
43 #error "Unable to compile Linux-emulator due to missing COMPAT_IA32 option!"
44 #endif
45 
46 #define	__ELF_WORD_SIZE	32
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/exec.h>
51 #include <sys/imgact.h>
52 #include <sys/imgact_elf.h>
53 #include <sys/kernel.h>
54 #include <sys/lock.h>
55 #include <sys/malloc.h>
56 #include <sys/module.h>
57 #include <sys/mutex.h>
58 #include <sys/proc.h>
59 #include <sys/resourcevar.h>
60 #include <sys/signalvar.h>
61 #include <sys/sysctl.h>
62 #include <sys/syscallsubr.h>
63 #include <sys/sysent.h>
64 #include <sys/sysproto.h>
65 #include <sys/vnode.h>
66 
67 #include <vm/vm.h>
68 #include <vm/pmap.h>
69 #include <vm/vm_extern.h>
70 #include <vm/vm_map.h>
71 #include <vm/vm_object.h>
72 #include <vm/vm_page.h>
73 #include <vm/vm_param.h>
74 
75 #include <machine/cpu.h>
76 #include <machine/md_var.h>
77 #include <machine/pcb.h>
78 #include <machine/specialreg.h>
79 
80 #include <amd64/linux32/linux.h>
81 #include <amd64/linux32/linux32_proto.h>
82 #include <compat/linux/linux_mib.h>
83 #include <compat/linux/linux_signal.h>
84 #include <compat/linux/linux_util.h>
85 
86 MODULE_VERSION(linux, 1);
87 
88 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
89 
/*
 * Store one ELF auxiliary-vector entry as two consecutive 32-bit words
 * (the id, then the value) at the user address `pos', leaving `pos'
 * advanced past both.  `pos' must be a modifiable pointer lvalue.  The
 * results of suword32() are not checked.
 */
#define	AUXARGS_ENTRY_32(pos, id, val)		\
	do {					\
		suword32((pos), (id));		\
		(pos)++;			\
		suword32((pos), (val));		\
		(pos)++;			\
	} while (0)
95 
/*
 * The two bytes "#!" that start a script, as they appear when the first
 * 16 bits of the image header are read as a native-endian short.
 */
#if BYTE_ORDER != LITTLE_ENDIAN
#define	SHELLMAGIC	0x2321
#else
#define	SHELLMAGIC	0x2123	/* #! */
#endif
101 
102 /*
103  * Allow the sendsig functions to use the ldebug() facility
104  * even though they are not syscalls themselves. Map them
105  * to syscall 0. This is slightly less bogus than using
106  * ldebug(sigreturn).
107  */
108 #define	LINUX_SYS_linux_rt_sendsig	0
109 #define	LINUX_SYS_linux_sendsig		0
110 
/*
 * Signal trampoline code and its size.  Both are defined outside this
 * file and are published through elf_linux_sysvec below.
 */
extern char linux_sigcode[];
extern int linux_szsigcode;

/* System call table for this ABI; defined outside this file. */
extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];

/* Linker set of ioctl handlers (un)registered by linux_elf_modevent(). */
SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);

/* Forward declarations of the static callbacks placed in elf_linux_sysvec. */
static int	elf_linux_fixup(register_t **stack_base,
		    struct image_params *iparams);
static register_t *linux_copyout_strings(struct image_params *imgp);
static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
		    caddr_t *params);
static void     linux_sendsig(sig_t catcher, int sig, sigset_t *mask,
		    u_long code);
static void	exec_linux_setregs(struct thread *td, u_long entry,
				   u_long stack, u_long ps_strings);
static void	linux32_fixlimits(struct image_params *imgp);
128 
129 /*
130  * Linux syscalls return negative errno's, we do positive and map them
131  */
132 static int bsd_to_linux_errno[ELAST + 1] = {
133 	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
134 	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
135 	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
136 	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
137 	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
138 	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
139 	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
140 	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
141 	-6, -6, -43, -42, -75, -6, -84
142 };
143 
144 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
145 	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
146 	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
147 	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
148 	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
149 	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
150 	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
151 	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
152 	0, LINUX_SIGUSR1, LINUX_SIGUSR2
153 };
154 
155 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
156 	SIGHUP, SIGINT, SIGQUIT, SIGILL,
157 	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
158 	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
159 	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
160 	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
161 	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
162 	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
163 	SIGIO, SIGURG, SIGSYS
164 };
165 
/* Value reported for any native trap code without a listed translation. */
#define LINUX_T_UNKNOWN  255

/*
 * Native trap code -> trap number stored in the Linux signal context
 * (sc_trapno).  The array index is the native trap code; the name in each
 * comment is the native trap the slot belongs to.
 */
static int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/* 0 */
	6,			/* 1  T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/* 2 */
	3,			/* 3  T_BPTFLT */
	LINUX_T_UNKNOWN,	/* 4 */
	LINUX_T_UNKNOWN,	/* 5 */
	16,			/* 6  T_ARITHTRAP */
	254,			/* 7  T_ASTFLT */
	LINUX_T_UNKNOWN,	/* 8 */
	13,			/* 9  T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};

/*
 * Translate a native trap code.  Codes beyond the end of the table map to
 * LINUX_T_UNKNOWN.
 */
static __inline int
bsd_to_linux_trapcode(unsigned long code)
{
	const unsigned long nentries = sizeof(_bsd_to_linux_trapcode) /
	    sizeof(_bsd_to_linux_trapcode[0]);

	if (code >= nentries)
		return (LINUX_T_UNKNOWN);
	return (_bsd_to_linux_trapcode[code]);
}
204 
/*
 * 32-bit layout of the ps_strings area written at LINUX32_PS_STRINGS by
 * linux_copyout_strings().  The pointer members are held as 32-bit
 * integers and are filled in with suword32().
 */
struct linux32_ps_strings {
	u_int32_t ps_argvstr;	/* first of 0 or more argument strings */
	u_int ps_nargvstr;	/* the number of argument strings */
	u_int32_t ps_envstr;	/* first of 0 or more environment strings */
	u_int ps_nenvstr;	/* the number of environment strings */
};
211 
212 /*
213  * If FreeBSD & Linux have a difference of opinion about what a trap
214  * means, deal with it here.
215  *
216  * MPSAFE
217  */
218 static int
219 translate_traps(int signal, int trap_code)
220 {
221 	if (signal != SIGBUS)
222 		return signal;
223 	switch (trap_code) {
224 	case T_PROTFLT:
225 	case T_TSSFLT:
226 	case T_DOUBLEFLT:
227 	case T_PAGEFLT:
228 		return SIGSEGV;
229 	default:
230 		return signal;
231 	}
232 }
233 
234 static int
235 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
236 {
237 	Elf32_Auxargs *args;
238 	Elf32_Addr *base;
239 	Elf32_Addr *pos;
240 
241 	KASSERT(curthread->td_proc == imgp->proc &&
242 	    (curthread->td_proc->p_flag & P_SA) == 0,
243 	    ("unsafe elf_linux_fixup(), should be curproc"));
244 	base = (Elf32_Addr *)*stack_base;
245 	args = (Elf32_Auxargs *)imgp->auxargs;
246 	pos = base + (imgp->args->argc + imgp->args->envc + 2);
247 
248 	if (args->trace)
249 		AUXARGS_ENTRY_32(pos, AT_DEBUG, 1);
250 	if (args->execfd != -1)
251 		AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
252 	AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
253 	AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
254 	AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
255 	AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
256 	AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
257 	AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
258 	AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
259 	AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
260 	AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
261 	AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
262 	AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
263 	AUXARGS_ENTRY_32(pos, AT_NULL, 0);
264 
265 	free(imgp->auxargs, M_TEMP);
266 	imgp->auxargs = NULL;
267 
268 	base--;
269 	suword32(base, (uint32_t)imgp->args->argc);
270 	*stack_base = (register_t *)base;
271 	return 0;
272 }
273 
274 extern int _ucodesel, _ucode32sel, _udatasel;
275 extern unsigned long linux_sznonrtsigcode;
276 
277 static void
278 linux_rt_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
279 {
280 	struct thread *td = curthread;
281 	struct proc *p = td->td_proc;
282 	struct sigacts *psp;
283 	struct trapframe *regs;
284 	struct l_rt_sigframe *fp, frame;
285 	int oonstack;
286 
287 	PROC_LOCK_ASSERT(p, MA_OWNED);
288 	psp = p->p_sigacts;
289 	mtx_assert(&psp->ps_mtx, MA_OWNED);
290 	regs = td->td_frame;
291 	oonstack = sigonstack(regs->tf_rsp);
292 
293 #ifdef DEBUG
294 	if (ldebug(rt_sendsig))
295 		printf(ARGS(rt_sendsig, "%p, %d, %p, %lu"),
296 		    catcher, sig, (void*)mask, code);
297 #endif
298 	/*
299 	 * Allocate space for the signal handler context.
300 	 */
301 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
302 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
303 		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
304 		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
305 	} else
306 		fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
307 	mtx_unlock(&psp->ps_mtx);
308 
309 	/*
310 	 * Build the argument list for the signal handler.
311 	 */
312 	if (p->p_sysent->sv_sigtbl)
313 		if (sig <= p->p_sysent->sv_sigsize)
314 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
315 
316 	bzero(&frame, sizeof(frame));
317 
318 	frame.sf_handler = PTROUT(catcher);
319 	frame.sf_sig = sig;
320 	frame.sf_siginfo = PTROUT(&fp->sf_si);
321 	frame.sf_ucontext = PTROUT(&fp->sf_sc);
322 
323 	/* Fill in POSIX parts */
324 	frame.sf_si.lsi_signo = sig;
325 	frame.sf_si.lsi_code = code;
326 	frame.sf_si.lsi_addr = PTROUT(regs->tf_err);
327 
328 	/*
329 	 * Build the signal context to be used by sigreturn.
330 	 */
331 	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
332 	frame.sf_sc.uc_link = 0;		/* XXX ??? */
333 
334 	frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
335 	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
336 	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
337 	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
338 	PROC_UNLOCK(p);
339 
340 	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
341 
342 	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
343         frame.sf_sc.uc_mcontext.sc_gs     = rgs();
344         frame.sf_sc.uc_mcontext.sc_fs     = rfs();
345         __asm __volatile("movl %%es,%0" :
346 	    "=rm" (frame.sf_sc.uc_mcontext.sc_es));
347         __asm __volatile("movl %%ds,%0" :
348 	    "=rm" (frame.sf_sc.uc_mcontext.sc_ds));
349 	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_rdi;
350 	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_rsi;
351 	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_rbp;
352 	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_rbx;
353 	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_rdx;
354 	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_rcx;
355 	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_rax;
356 	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_rip;
357 	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
358 	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
359 	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
360 	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
361 	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
362 	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
363 
364 #ifdef DEBUG
365 	if (ldebug(rt_sendsig))
366 		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
367 		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
368 		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
369 #endif
370 
371 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
372 		/*
373 		 * Process has trashed its stack; give it an illegal
374 		 * instruction to halt it in its tracks.
375 		 */
376 #ifdef DEBUG
377 		if (ldebug(rt_sendsig))
378 			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
379 			    fp, oonstack);
380 #endif
381 		PROC_LOCK(p);
382 		sigexit(td, SIGILL);
383 	}
384 
385 	/*
386 	 * Build context to run handler in.
387 	 */
388 	regs->tf_rsp = PTROUT(fp);
389 	regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
390 	    linux_sznonrtsigcode;
391 	regs->tf_rflags &= ~PSL_T;
392 	regs->tf_cs = _ucode32sel;
393 	regs->tf_ss = _udatasel;
394 	load_ds(_udatasel);
395 	td->td_pcb->pcb_ds = _udatasel;
396 	load_es(_udatasel);
397 	td->td_pcb->pcb_es = _udatasel;
398 	PROC_LOCK(p);
399 	mtx_lock(&psp->ps_mtx);
400 }
401 
402 
403 /*
404  * Send an interrupt to process.
405  *
406  * Stack is set up to allow sigcode stored
407  * in u. to call routine, followed by kcall
408  * to sigreturn routine below.  After sigreturn
409  * resets the signal mask, the stack, and the
410  * frame pointer, it returns to the user
411  * specified pc, psl.
412  */
413 static void
414 linux_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
415 {
416 	struct thread *td = curthread;
417 	struct proc *p = td->td_proc;
418 	struct sigacts *psp;
419 	struct trapframe *regs;
420 	struct l_sigframe *fp, frame;
421 	l_sigset_t lmask;
422 	int oonstack, i;
423 
424 	PROC_LOCK_ASSERT(p, MA_OWNED);
425 	psp = p->p_sigacts;
426 	mtx_assert(&psp->ps_mtx, MA_OWNED);
427 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
428 		/* Signal handler installed with SA_SIGINFO. */
429 		linux_rt_sendsig(catcher, sig, mask, code);
430 		return;
431 	}
432 
433 	regs = td->td_frame;
434 	oonstack = sigonstack(regs->tf_rsp);
435 
436 #ifdef DEBUG
437 	if (ldebug(sendsig))
438 		printf(ARGS(sendsig, "%p, %d, %p, %lu"),
439 		    catcher, sig, (void*)mask, code);
440 #endif
441 
442 	/*
443 	 * Allocate space for the signal handler context.
444 	 */
445 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
446 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
447 		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
448 		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
449 	} else
450 		fp = (struct l_sigframe *)regs->tf_rsp - 1;
451 	mtx_unlock(&psp->ps_mtx);
452 	PROC_UNLOCK(p);
453 
454 	/*
455 	 * Build the argument list for the signal handler.
456 	 */
457 	if (p->p_sysent->sv_sigtbl)
458 		if (sig <= p->p_sysent->sv_sigsize)
459 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
460 
461 	bzero(&frame, sizeof(frame));
462 
463 	frame.sf_handler = PTROUT(catcher);
464 	frame.sf_sig = sig;
465 
466 	bsd_to_linux_sigset(mask, &lmask);
467 
468 	/*
469 	 * Build the signal context to be used by sigreturn.
470 	 */
471 	frame.sf_sc.sc_mask   = lmask.__bits[0];
472         frame.sf_sc.sc_gs     = rgs();
473         frame.sf_sc.sc_fs     = rfs();
474         __asm __volatile("movl %%es,%0" : "=rm" (frame.sf_sc.sc_es));
475         __asm __volatile("movl %%ds,%0" : "=rm" (frame.sf_sc.sc_ds));
476 	frame.sf_sc.sc_edi    = regs->tf_rdi;
477 	frame.sf_sc.sc_esi    = regs->tf_rsi;
478 	frame.sf_sc.sc_ebp    = regs->tf_rbp;
479 	frame.sf_sc.sc_ebx    = regs->tf_rbx;
480 	frame.sf_sc.sc_edx    = regs->tf_rdx;
481 	frame.sf_sc.sc_ecx    = regs->tf_rcx;
482 	frame.sf_sc.sc_eax    = regs->tf_rax;
483 	frame.sf_sc.sc_eip    = regs->tf_rip;
484 	frame.sf_sc.sc_cs     = regs->tf_cs;
485 	frame.sf_sc.sc_eflags = regs->tf_rflags;
486 	frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
487 	frame.sf_sc.sc_ss     = regs->tf_ss;
488 	frame.sf_sc.sc_err    = regs->tf_err;
489 	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
490 
491 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
492 		frame.sf_extramask[i] = lmask.__bits[i+1];
493 
494 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
495 		/*
496 		 * Process has trashed its stack; give it an illegal
497 		 * instruction to halt it in its tracks.
498 		 */
499 		PROC_LOCK(p);
500 		sigexit(td, SIGILL);
501 	}
502 
503 	/*
504 	 * Build context to run handler in.
505 	 */
506 	regs->tf_rsp = PTROUT(fp);
507 	regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode);
508 	regs->tf_rflags &= ~PSL_T;
509 	regs->tf_cs = _ucode32sel;
510 	regs->tf_ss = _udatasel;
511 	load_ds(_udatasel);
512 	td->td_pcb->pcb_ds = _udatasel;
513 	load_es(_udatasel);
514 	td->td_pcb->pcb_es = _udatasel;
515 	PROC_LOCK(p);
516 	mtx_lock(&psp->ps_mtx);
517 }
518 
519 /*
520  * System call to cleanup state after a signal
521  * has been taken.  Reset signal mask and
522  * stack state from context left by sendsig (above).
523  * Return to previous pc and psl as specified by
524  * context left by sendsig. Check carefully to
525  * make sure that the user has not modified the
526  * psl to gain improper privileges or to cause
527  * a machine fault.
528  */
529 int
530 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
531 {
532 	struct proc *p = td->td_proc;
533 	struct l_sigframe frame;
534 	struct trapframe *regs;
535 	l_sigset_t lmask;
536 	int eflags, i;
537 
538 	regs = td->td_frame;
539 
540 #ifdef DEBUG
541 	if (ldebug(sigreturn))
542 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
543 #endif
544 	/*
545 	 * The trampoline code hands us the sigframe.
546 	 * It is unsafe to keep track of it ourselves, in the event that a
547 	 * program jumps out of a signal handler.
548 	 */
549 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
550 		return (EFAULT);
551 
552 	/*
553 	 * Check for security violations.
554 	 */
555 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
556 	eflags = frame.sf_sc.sc_eflags;
557 	/*
558 	 * XXX do allow users to change the privileged flag PSL_RF.  The
559 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
560 	 * sometimes set it there too.  tf_eflags is kept in the signal
561 	 * context during signal handling and there is no other place
562 	 * to remember it, so the PSL_RF bit may be corrupted by the
563 	 * signal handler without us knowing.  Corruption of the PSL_RF
564 	 * bit at worst causes one more or one less debugger trap, so
565 	 * allowing it is fairly harmless.
566 	 */
567 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
568 		return(EINVAL);
569 
570 	/*
571 	 * Don't allow users to load a valid privileged %cs.  Let the
572 	 * hardware check for invalid selectors, excess privilege in
573 	 * other selectors, invalid %eip's and invalid %esp's.
574 	 */
575 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
576 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
577 		trapsignal(td, SIGBUS, T_PROTFLT);
578 		return(EINVAL);
579 	}
580 
581 	lmask.__bits[0] = frame.sf_sc.sc_mask;
582 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
583 		lmask.__bits[i+1] = frame.sf_extramask[i];
584 	PROC_LOCK(p);
585 	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
586 	SIG_CANTMASK(td->td_sigmask);
587 	signotify(td);
588 	PROC_UNLOCK(p);
589 
590 	/*
591 	 * Restore signal context.
592 	 */
593 	/* Selectors were restored by the trampoline. */
594 	regs->tf_rdi    = frame.sf_sc.sc_edi;
595 	regs->tf_rsi    = frame.sf_sc.sc_esi;
596 	regs->tf_rbp    = frame.sf_sc.sc_ebp;
597 	regs->tf_rbx    = frame.sf_sc.sc_ebx;
598 	regs->tf_rdx    = frame.sf_sc.sc_edx;
599 	regs->tf_rcx    = frame.sf_sc.sc_ecx;
600 	regs->tf_rax    = frame.sf_sc.sc_eax;
601 	regs->tf_rip    = frame.sf_sc.sc_eip;
602 	regs->tf_cs     = frame.sf_sc.sc_cs;
603 	regs->tf_rflags = eflags;
604 	regs->tf_rsp    = frame.sf_sc.sc_esp_at_signal;
605 	regs->tf_ss     = frame.sf_sc.sc_ss;
606 
607 	return (EJUSTRETURN);
608 }
609 
610 /*
611  * System call to cleanup state after a signal
612  * has been taken.  Reset signal mask and
613  * stack state from context left by rt_sendsig (above).
614  * Return to previous pc and psl as specified by
615  * context left by sendsig. Check carefully to
616  * make sure that the user has not modified the
617  * psl to gain improper privileges or to cause
618  * a machine fault.
619  */
620 int
621 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
622 {
623 	struct proc *p = td->td_proc;
624 	struct l_ucontext uc;
625 	struct l_sigcontext *context;
626 	l_stack_t *lss;
627 	stack_t ss;
628 	struct trapframe *regs;
629 	int eflags;
630 
631 	regs = td->td_frame;
632 
633 #ifdef DEBUG
634 	if (ldebug(rt_sigreturn))
635 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
636 #endif
637 	/*
638 	 * The trampoline code hands us the ucontext.
639 	 * It is unsafe to keep track of it ourselves, in the event that a
640 	 * program jumps out of a signal handler.
641 	 */
642 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
643 		return (EFAULT);
644 
645 	context = &uc.uc_mcontext;
646 
647 	/*
648 	 * Check for security violations.
649 	 */
650 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
651 	eflags = context->sc_eflags;
652 	/*
653 	 * XXX do allow users to change the privileged flag PSL_RF.  The
654 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
655 	 * sometimes set it there too.  tf_eflags is kept in the signal
656 	 * context during signal handling and there is no other place
657 	 * to remember it, so the PSL_RF bit may be corrupted by the
658 	 * signal handler without us knowing.  Corruption of the PSL_RF
659 	 * bit at worst causes one more or one less debugger trap, so
660 	 * allowing it is fairly harmless.
661 	 */
662 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
663 		return(EINVAL);
664 
665 	/*
666 	 * Don't allow users to load a valid privileged %cs.  Let the
667 	 * hardware check for invalid selectors, excess privilege in
668 	 * other selectors, invalid %eip's and invalid %esp's.
669 	 */
670 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
671 	if (!CS_SECURE(context->sc_cs)) {
672 		trapsignal(td, SIGBUS, T_PROTFLT);
673 		return(EINVAL);
674 	}
675 
676 	PROC_LOCK(p);
677 	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
678 	SIG_CANTMASK(td->td_sigmask);
679 	signotify(td);
680 	PROC_UNLOCK(p);
681 
682 	/*
683 	 * Restore signal context
684 	 */
685 	/* Selectors were restored by the trampoline. */
686 	regs->tf_rdi    = context->sc_edi;
687 	regs->tf_rsi    = context->sc_esi;
688 	regs->tf_rbp    = context->sc_ebp;
689 	regs->tf_rbx    = context->sc_ebx;
690 	regs->tf_rdx    = context->sc_edx;
691 	regs->tf_rcx    = context->sc_ecx;
692 	regs->tf_rax    = context->sc_eax;
693 	regs->tf_rip    = context->sc_eip;
694 	regs->tf_cs     = context->sc_cs;
695 	regs->tf_rflags = eflags;
696 	regs->tf_rsp    = context->sc_esp_at_signal;
697 	regs->tf_ss     = context->sc_ss;
698 
699 	/*
700 	 * call sigaltstack & ignore results..
701 	 */
702 	lss = &uc.uc_stack;
703 	ss.ss_sp = PTRIN(lss->ss_sp);
704 	ss.ss_size = lss->ss_size;
705 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
706 
707 #ifdef DEBUG
708 	if (ldebug(rt_sigreturn))
709 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
710 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
711 #endif
712 	(void)kern_sigaltstack(td, &ss, NULL);
713 
714 	return (EJUSTRETURN);
715 }
716 
717 /*
718  * MPSAFE
719  */
720 static void
721 linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
722 {
723 	args[0] = tf->tf_rbx;
724 	args[1] = tf->tf_rcx;
725 	args[2] = tf->tf_rdx;
726 	args[3] = tf->tf_rsi;
727 	args[4] = tf->tf_rdi;
728 	args[5] = tf->tf_rbp;	/* Unconfirmed */
729 	*params = NULL;		/* no copyin */
730 }
731 
732 /*
733  * If a linux binary is exec'ing something, try this image activator
734  * first.  We override standard shell script execution in order to
735  * be able to modify the interpreter path.  We only do this if a linux
736  * binary is doing the exec, so we do not create an EXEC module for it.
737  */
738 static int	exec_linux_imgact_try(struct image_params *iparams);
739 
740 static int
741 exec_linux_imgact_try(struct image_params *imgp)
742 {
743     const char *head = (const char *)imgp->image_header;
744     char *rpath;
745     int error = -1, len;
746 
747     /*
748      * The interpreter for shell scripts run from a linux binary needs
749      * to be located in /compat/linux if possible in order to recursively
750      * maintain linux path emulation.
751      */
752     if (((const short *)head)[0] == SHELLMAGIC) {
753 	    /*
754 	     * Run our normal shell image activator.  If it succeeds attempt
755 	     * to use the alternate path for the interpreter.  If an alternate
756 	     * path is found, use our stringspace to store it.
757 	     */
758 	    if ((error = exec_shell_imgact(imgp)) == 0) {
759 		    linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
760 			imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0);
761 		    if (rpath != NULL) {
762 			    len = strlen(rpath) + 1;
763 
764 			    if (len <= MAXSHELLCMDLEN) {
765 				    memcpy(imgp->interpreter_name, rpath, len);
766 			    }
767 			    free(rpath, M_TEMP);
768 		    }
769 	    }
770     }
771     return(error);
772 }
773 
774 /*
775  * Clear registers on exec
776  * XXX copied from ia32_signal.c.
777  */
778 static void
779 exec_linux_setregs(td, entry, stack, ps_strings)
780 	struct thread *td;
781 	u_long entry;
782 	u_long stack;
783 	u_long ps_strings;
784 {
785 	struct trapframe *regs = td->td_frame;
786 	struct pcb *pcb = td->td_pcb;
787 
788 	wrmsr(MSR_FSBASE, 0);
789 	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
790 	pcb->pcb_fsbase = 0;
791 	pcb->pcb_gsbase = 0;
792 	load_ds(_udatasel);
793 	load_es(_udatasel);
794 	load_fs(_udatasel);
795 	load_gs(0);
796 	pcb->pcb_ds = _udatasel;
797 	pcb->pcb_es = _udatasel;
798 	pcb->pcb_fs = _udatasel;
799 	pcb->pcb_gs = 0;
800 
801 	bzero((char *)regs, sizeof(struct trapframe));
802 	regs->tf_rip = entry;
803 	regs->tf_rsp = stack;
804 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
805 	regs->tf_ss = _udatasel;
806 	regs->tf_cs = _ucode32sel;
807 	regs->tf_rbx = ps_strings;
808 	load_cr0(rcr0() | CR0_MP | CR0_TS);
809 	fpstate_drop(td);
810 
811 	/* Return via doreti so that we can change to a different %cs */
812 	pcb->pcb_flags |= PCB_FULLCTX;
813 	td->td_retval[1] = 0;
814 }
815 
816 /*
817  * XXX copied from ia32_sysvec.c.
818  */
819 static register_t *
820 linux_copyout_strings(struct image_params *imgp)
821 {
822 	int argc, envc;
823 	u_int32_t *vectp;
824 	char *stringp, *destp;
825 	u_int32_t *stack_base;
826 	struct linux32_ps_strings *arginfo;
827 	int sigcodesz;
828 
829 	/*
830 	 * Calculate string base and vector table pointers.
831 	 * Also deal with signal trampoline code for this exec type.
832 	 */
833 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
834 	sigcodesz = *(imgp->proc->p_sysent->sv_szsigcode);
835 	destp =	(caddr_t)arginfo - sigcodesz - SPARE_USRSPACE -
836 		roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
837 
838 	/*
839 	 * install sigcode
840 	 */
841 	if (sigcodesz)
842 		copyout(imgp->proc->p_sysent->sv_sigcode,
843 			((caddr_t)arginfo - sigcodesz), szsigcode);
844 
845 	/*
846 	 * If we have a valid auxargs ptr, prepare some room
847 	 * on the stack.
848 	 */
849 	if (imgp->auxargs) {
850 		/*
851 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
852 		 * lower compatibility.
853 		 */
854 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size
855 			: (AT_COUNT * 2);
856 		/*
857 		 * The '+ 2' is for the null pointers at the end of each of
858 		 * the arg and env vector sets,and imgp->auxarg_size is room
859 		 * for argument of Runtime loader.
860 		 */
861 		vectp = (u_int32_t *) (destp - (imgp->args->argc + imgp->args->envc + 2 +
862 				       imgp->auxarg_size) * sizeof(u_int32_t));
863 
864 	} else
865 		/*
866 		 * The '+ 2' is for the null pointers at the end of each of
867 		 * the arg and env vector sets
868 		 */
869 		vectp = (u_int32_t *)
870 			(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(u_int32_t));
871 
872 	/*
873 	 * vectp also becomes our initial stack base
874 	 */
875 	stack_base = vectp;
876 
877 	stringp = imgp->args->begin_argv;
878 	argc = imgp->args->argc;
879 	envc = imgp->args->envc;
880 	/*
881 	 * Copy out strings - arguments and environment.
882 	 */
883 	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
884 
885 	/*
886 	 * Fill in "ps_strings" struct for ps, w, etc.
887 	 */
888 	suword32(&arginfo->ps_argvstr, (u_int32_t)(intptr_t)vectp);
889 	suword32(&arginfo->ps_nargvstr, argc);
890 
891 	/*
892 	 * Fill in argument portion of vector table.
893 	 */
894 	for (; argc > 0; --argc) {
895 		suword32(vectp++, (u_int32_t)(intptr_t)destp);
896 		while (*stringp++ != 0)
897 			destp++;
898 		destp++;
899 	}
900 
901 	/* a null vector table pointer separates the argp's from the envp's */
902 	suword32(vectp++, 0);
903 
904 	suword32(&arginfo->ps_envstr, (u_int32_t)(intptr_t)vectp);
905 	suword32(&arginfo->ps_nenvstr, envc);
906 
907 	/*
908 	 * Fill in environment portion of vector table.
909 	 */
910 	for (; envc > 0; --envc) {
911 		suword32(vectp++, (u_int32_t)(intptr_t)destp);
912 		while (*stringp++ != 0)
913 			destp++;
914 		destp++;
915 	}
916 
917 	/* end of vector table is a null pointer */
918 	suword32(vectp, 0);
919 
920 	return ((register_t *)stack_base);
921 }
922 
/*
 * compat.linux32 sysctl subtree.  The three knobs below are read by
 * linux32_fixlimits() as upper bounds for the data, stack and virtual
 * memory resource limits; a value of 0 leaves that limit alone.
 */
SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
    "32-bit Linux emulation");

/* Ceiling applied to RLIMIT_DATA. */
static u_long	linux32_maxdsiz = LINUX32_MAXDSIZ;
SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
    &linux32_maxdsiz, 0, "");
/* Ceiling applied to RLIMIT_STACK. */
static u_long	linux32_maxssiz = LINUX32_MAXSSIZ;
SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
    &linux32_maxssiz, 0, "");
/* Ceiling applied to RLIMIT_VMEM. */
static u_long	linux32_maxvmem = LINUX32_MAXVMEM;
SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
    &linux32_maxvmem, 0, "");
935 
936 /*
937  * XXX copied from ia32_sysvec.c.
938  */
939 static void
940 linux32_fixlimits(struct image_params *imgp)
941 {
942 	struct proc *p = imgp->proc;
943 	struct plimit *oldlim, *newlim;
944 
945 	if (linux32_maxdsiz == 0 && linux32_maxssiz == 0 &&
946 	    linux32_maxvmem == 0)
947 		return;
948 	newlim = lim_alloc();
949 	PROC_LOCK(p);
950 	oldlim = p->p_limit;
951 	lim_copy(newlim, oldlim);
952 	if (linux32_maxdsiz != 0) {
953 		if (newlim->pl_rlimit[RLIMIT_DATA].rlim_cur > linux32_maxdsiz)
954 		    newlim->pl_rlimit[RLIMIT_DATA].rlim_cur = linux32_maxdsiz;
955 		if (newlim->pl_rlimit[RLIMIT_DATA].rlim_max > linux32_maxdsiz)
956 		    newlim->pl_rlimit[RLIMIT_DATA].rlim_max = linux32_maxdsiz;
957 	}
958 	if (linux32_maxssiz != 0) {
959 		if (newlim->pl_rlimit[RLIMIT_STACK].rlim_cur > linux32_maxssiz)
960 		    newlim->pl_rlimit[RLIMIT_STACK].rlim_cur = linux32_maxssiz;
961 		if (newlim->pl_rlimit[RLIMIT_STACK].rlim_max > linux32_maxssiz)
962 		    newlim->pl_rlimit[RLIMIT_STACK].rlim_max = linux32_maxssiz;
963 	}
964 	if (linux32_maxvmem != 0) {
965 		if (newlim->pl_rlimit[RLIMIT_VMEM].rlim_cur > linux32_maxvmem)
966 		    newlim->pl_rlimit[RLIMIT_VMEM].rlim_cur = linux32_maxvmem;
967 		if (newlim->pl_rlimit[RLIMIT_VMEM].rlim_max > linux32_maxvmem)
968 		    newlim->pl_rlimit[RLIMIT_VMEM].rlim_max = linux32_maxvmem;
969 	}
970 	p->p_limit = newlim;
971 	PROC_UNLOCK(p);
972 	lim_free(oldlim);
973 }
974 
/*
 * System entry vector describing the 32-bit Linux ABI.  The initializer
 * is positional, so the order of the entries must follow the member
 * order of struct sysentvec.  The per-entry notes describe what each
 * value is in this file; the sv_* names are the ones this file itself
 * dereferences -- TODO confirm the remaining slots against sys/sysent.h.
 */
struct sysentvec elf_linux_sysvec = {
	LINUX_SYS_MAXSYSCALL,	/* size of linux_sysent[] */
	linux_sysent,		/* system call table */
	0xff,			/* NOTE(review): presumably syscall number mask */
	LINUX_SIGTBLSZ,		/* sv_sigsize: entries in the signal table */
	bsd_to_linux_signal,	/* sv_sigtbl: native -> Linux signal numbers */
	ELAST + 1,		/* entries in the errno table */
	bsd_to_linux_errno,	/* native -> Linux errno values */
	translate_traps,	/* trap -> signal adjustment */
	elf_linux_fixup,	/* writes auxargs and argc on the stack */
	linux_sendsig,		/* signal delivery */
	linux_sigcode,		/* sv_sigcode: signal trampoline */
	&linux_szsigcode,	/* sv_szsigcode: trampoline size */
	linux_prepsyscall,	/* fetches syscall arguments from registers */
	"Linux ELF32",		/* ABI name */
	elf32_coredump,		/* core dump routine */
	exec_linux_imgact_try,	/* image activator tried first ("#!" scripts) */
	LINUX_MINSIGSTKSZ,	/* minimum signal stack size */
	PAGE_SIZE,		/* page size */
	VM_MIN_ADDRESS,		/* lowest user address */
	LINUX32_USRSTACK,	/* NOTE(review): presumably highest user address */
	LINUX32_USRSTACK,	/* NOTE(review): presumably user stack top */
	LINUX32_PS_STRINGS,	/* address of the ps_strings area */
	VM_PROT_ALL,		/* NOTE(review): presumably stack protection */
	linux_copyout_strings,	/* copies argv/envp strings to the stack */
	exec_linux_setregs,	/* initial register state after exec */
	linux32_fixlimits	/* clamps resource limits at exec */
};
1003 
/*
 * ELF brand descriptors for 32-bit Linux executables.  The two entries
 * differ only in the dynamic linker path (ld-linux.so.1 versus
 * ld-linux.so.2); both refer to elf_linux_sysvec.  The initializers are
 * positional -- the exact member names are in the Elf32_Brandinfo
 * definition, which is not part of this file.
 */
static Elf32_Brandinfo linux_brand = {
					ELFOSABI_LINUX,
					EM_386,
					"Linux",
					"/compat/linux",
					"/lib/ld-linux.so.1",
					&elf_linux_sysvec,
					NULL,
				 };

static Elf32_Brandinfo linux_glibc2brand = {
					ELFOSABI_LINUX,
					EM_386,
					"Linux",
					"/compat/linux",
					"/lib/ld-linux.so.2",
					&elf_linux_sysvec,
					NULL,
				 };

/* NULL-terminated list walked by linux_elf_modevent(). */
Elf32_Brandinfo *linux_brandlist[] = {
					&linux_brand,
					&linux_glibc2brand,
					NULL
				};
1029 
1030 static int
1031 linux_elf_modevent(module_t mod, int type, void *data)
1032 {
1033 	Elf32_Brandinfo **brandinfo;
1034 	int error;
1035 	struct linux_ioctl_handler **lihp;
1036 
1037 	error = 0;
1038 
1039 	switch(type) {
1040 	case MOD_LOAD:
1041 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1042 		     ++brandinfo)
1043 			if (elf32_insert_brand_entry(*brandinfo) < 0)
1044 				error = EINVAL;
1045 		if (error == 0) {
1046 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1047 				linux_ioctl_register_handler(*lihp);
1048 			if (bootverbose)
1049 				printf("Linux ELF exec handler installed\n");
1050 		} else
1051 			printf("cannot insert Linux ELF brand handler\n");
1052 		break;
1053 	case MOD_UNLOAD:
1054 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1055 		     ++brandinfo)
1056 			if (elf32_brand_inuse(*brandinfo))
1057 				error = EBUSY;
1058 		if (error == 0) {
1059 			for (brandinfo = &linux_brandlist[0];
1060 			     *brandinfo != NULL; ++brandinfo)
1061 				if (elf32_remove_brand_entry(*brandinfo) < 0)
1062 					error = EINVAL;
1063 		}
1064 		if (error == 0) {
1065 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1066 				linux_ioctl_unregister_handler(*lihp);
1067 			if (bootverbose)
1068 				printf("Linux ELF exec handler removed\n");
1069 			linux_mib_destroy();
1070 		} else
1071 			printf("Could not deinstall ELF interpreter entry\n");
1072 		break;
1073 	default:
1074 		break;
1075 	}
1076 	return error;
1077 }
1078 
/*
 * Module descriptor: the module name, its event handler and an unused
 * (zero) third member.
 */
static moduledata_t linux_elf_mod = {
	"linuxelf",
	linux_elf_modevent,
	0
};

/* Register the module at the SI_SUB_EXEC stage of system initialisation. */
DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1086