xref: /freebsd/sys/amd64/linux32/linux32_sysvec.c (revision 884a2a699669ec61e2366e3e358342dbc94be24a)
1 /*-
2  * Copyright (c) 2004 Tim J. Robbins
3  * Copyright (c) 2003 Peter Wemm
4  * Copyright (c) 2002 Doug Rabson
5  * Copyright (c) 1998-1999 Andrew Gallatin
6  * Copyright (c) 1994-1996 Søren Schmidt
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer
14  *    in this position and unchanged.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. The name of the author may not be used to endorse or promote products
19  *    derived from this software without specific prior written permission
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 #include "opt_compat.h"
36 
37 #ifndef COMPAT_FREEBSD32
38 #error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!"
39 #endif
40 
41 #define	__ELF_WORD_SIZE	32
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/exec.h>
46 #include <sys/fcntl.h>
47 #include <sys/imgact.h>
48 #include <sys/imgact_elf.h>
49 #include <sys/kernel.h>
50 #include <sys/lock.h>
51 #include <sys/malloc.h>
52 #include <sys/module.h>
53 #include <sys/mutex.h>
54 #include <sys/proc.h>
55 #include <sys/resourcevar.h>
56 #include <sys/signalvar.h>
57 #include <sys/sysctl.h>
58 #include <sys/syscallsubr.h>
59 #include <sys/sysent.h>
60 #include <sys/sysproto.h>
61 #include <sys/vnode.h>
62 #include <sys/eventhandler.h>
63 
64 #include <vm/vm.h>
65 #include <vm/pmap.h>
66 #include <vm/vm_extern.h>
67 #include <vm/vm_map.h>
68 #include <vm/vm_object.h>
69 #include <vm/vm_page.h>
70 #include <vm/vm_param.h>
71 
72 #include <machine/cpu.h>
73 #include <machine/md_var.h>
74 #include <machine/pcb.h>
75 #include <machine/specialreg.h>
76 
77 #include <amd64/linux32/linux.h>
78 #include <amd64/linux32/linux32_proto.h>
79 #include <compat/linux/linux_emul.h>
80 #include <compat/linux/linux_futex.h>
81 #include <compat/linux/linux_mib.h>
82 #include <compat/linux/linux_misc.h>
83 #include <compat/linux/linux_signal.h>
84 #include <compat/linux/linux_util.h>
85 
86 MODULE_VERSION(linux, 1);
87 
88 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
89 
90 #define	AUXARGS_ENTRY_32(pos, id, val)	\
91 	do {				\
92 		suword32(pos++, id);	\
93 		suword32(pos++, val);	\
94 	} while (0)
95 
96 #if BYTE_ORDER == LITTLE_ENDIAN
97 #define SHELLMAGIC      0x2123 /* #! */
98 #else
99 #define SHELLMAGIC      0x2321
100 #endif
101 
102 /*
103  * Allow the sendsig functions to use the ldebug() facility
104  * even though they are not syscalls themselves. Map them
105  * to syscall 0. This is slightly less bogus than using
106  * ldebug(sigreturn).
107  */
108 #define	LINUX_SYS_linux_rt_sendsig	0
109 #define	LINUX_SYS_linux_sendsig		0
110 
111 const char *linux_platform = "i686";
112 static int linux_szplatform;
113 extern char linux_sigcode[];
114 extern int linux_szsigcode;
115 
116 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
117 
118 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
119 SET_DECLARE(linux_device_handler_set, struct linux_device_handler);
120 
121 static int	elf_linux_fixup(register_t **stack_base,
122 		    struct image_params *iparams);
123 static register_t *linux_copyout_strings(struct image_params *imgp);
124 static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
125 static void	exec_linux_setregs(struct thread *td,
126 				   struct image_params *imgp, u_long stack);
127 static void	linux32_fixlimit(struct rlimit *rl, int which);
128 static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel);
129 
130 static eventhandler_tag linux_exit_tag;
131 static eventhandler_tag linux_exec_tag;
132 
133 /*
134  * Linux syscalls return negative errno's, we do positive and map them
135  * Reference:
136  *   FreeBSD: src/sys/sys/errno.h
137  *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
138  *            linux-2.6.17.8/include/asm-generic/errno.h
139  */
140 static int bsd_to_linux_errno[ELAST + 1] = {
141 	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
142 	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
143 	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
144 	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
145 	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
146 	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
147 	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
148 	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
149 	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,
150 	 -72, -67, -71
151 };
152 
153 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
154 	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
155 	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
156 	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
157 	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
158 	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
159 	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
160 	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
161 	0, LINUX_SIGUSR1, LINUX_SIGUSR2
162 };
163 
164 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
165 	SIGHUP, SIGINT, SIGQUIT, SIGILL,
166 	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
167 	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
168 	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
169 	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
170 	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
171 	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
172 	SIGIO, SIGURG, SIGSYS
173 };
174 
/* Value reported when a FreeBSD trap number has no Linux counterpart. */
#define LINUX_T_UNKNOWN  255

/*
 * FreeBSD trap number -> Linux trap number.  The array index is the
 * FreeBSD trap number; the T_* name noted next to an entry is the
 * FreeBSD trap it stands for.
 */
static int _bsd_to_linux_trapcode[] = {
	[0]  = LINUX_T_UNKNOWN,
	[1]  = 6,			/* T_PRIVINFLT */
	[2]  = LINUX_T_UNKNOWN,
	[3]  = 3,			/* T_BPTFLT */
	[4]  = LINUX_T_UNKNOWN,
	[5]  = LINUX_T_UNKNOWN,
	[6]  = 16,			/* T_ARITHTRAP */
	[7]  = 254,			/* T_ASTFLT */
	[8]  = LINUX_T_UNKNOWN,
	[9]  = 13,			/* T_PROTFLT */
	[10] = 1,			/* T_TRCTRAP */
	[11] = LINUX_T_UNKNOWN,
	[12] = 14,			/* T_PAGEFLT */
	[13] = LINUX_T_UNKNOWN,
	[14] = 17,			/* T_ALIGNFLT */
	[15] = LINUX_T_UNKNOWN,
	[16] = LINUX_T_UNKNOWN,
	[17] = LINUX_T_UNKNOWN,
	[18] = 0,			/* T_DIVIDE */
	[19] = 2,			/* T_NMI */
	[20] = 4,			/* T_OFLOW */
	[21] = 5,			/* T_BOUND */
	[22] = 7,			/* T_DNA */
	[23] = 8,			/* T_DOUBLEFLT */
	[24] = 9,			/* T_FPOPFLT */
	[25] = 10,			/* T_TSSFLT */
	[26] = 11,			/* T_SEGNPFLT */
	[27] = 12,			/* T_STKFLT */
	[28] = 18,			/* T_MCHK */
	[29] = 19,			/* T_XMMFLT */
	[30] = 15			/* T_RESERVED */
};

/*
 * Bounds-checked lookup in the table above.  The comparison is done in
 * an unsigned type, so a negative `code' is out of range as well and
 * yields LINUX_T_UNKNOWN.
 */
#define bsd_to_linux_trapcode(code)					\
    ((code) < sizeof(_bsd_to_linux_trapcode) /				\
     sizeof(_bsd_to_linux_trapcode[0]) ?				\
     _bsd_to_linux_trapcode[(code)] :					\
     LINUX_T_UNKNOWN)
213 
214 struct linux32_ps_strings {
215 	u_int32_t ps_argvstr;	/* first of 0 or more argument strings */
216 	u_int ps_nargvstr;	/* the number of argument strings */
217 	u_int32_t ps_envstr;	/* first of 0 or more environment strings */
218 	u_int ps_nenvstr;	/* the number of environment strings */
219 };
220 
221 /*
222  * If FreeBSD & Linux have a difference of opinion about what a trap
223  * means, deal with it here.
224  *
225  * MPSAFE
226  */
227 static int
228 translate_traps(int signal, int trap_code)
229 {
230 	if (signal != SIGBUS)
231 		return signal;
232 	switch (trap_code) {
233 	case T_PROTFLT:
234 	case T_TSSFLT:
235 	case T_DOUBLEFLT:
236 	case T_PAGEFLT:
237 		return SIGSEGV;
238 	default:
239 		return signal;
240 	}
241 }
242 
243 static int
244 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
245 {
246 	Elf32_Auxargs *args;
247 	Elf32_Addr *base;
248 	Elf32_Addr *pos, *uplatform;
249 	struct linux32_ps_strings *arginfo;
250 
251 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
252 	uplatform = (Elf32_Addr *)((caddr_t)arginfo - linux_szplatform);
253 
254 	KASSERT(curthread->td_proc == imgp->proc,
255 	    ("unsafe elf_linux_fixup(), should be curproc"));
256 	base = (Elf32_Addr *)*stack_base;
257 	args = (Elf32_Auxargs *)imgp->auxargs;
258 	pos = base + (imgp->args->argc + imgp->args->envc + 2);
259 
260 	AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature);
261 
262 	/*
263 	 * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0,
264 	 * as it has appeared in the 2.4.0-rc7 first time.
265 	 * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK),
266 	 * glibc falls back to the hard-coded CLK_TCK value when aux entry
267 	 * is not present.
268 	 * Also see linux_times() implementation.
269 	 */
270 	if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
271 		AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz);
272 	AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
273 	AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
274 	AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
275 	AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
276 	AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
277 	AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
278 	AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
279 	AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, 0);
280 	AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
281 	AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
282 	AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
283 	AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
284 	AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(uplatform));
285 	if (args->execfd != -1)
286 		AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
287 	AUXARGS_ENTRY_32(pos, AT_NULL, 0);
288 
289 	free(imgp->auxargs, M_TEMP);
290 	imgp->auxargs = NULL;
291 
292 	base--;
293 	suword32(base, (uint32_t)imgp->args->argc);
294 	*stack_base = (register_t *)base;
295 	return 0;
296 }
297 
298 extern unsigned long linux_sznonrtsigcode;
299 
300 static void
301 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
302 {
303 	struct thread *td = curthread;
304 	struct proc *p = td->td_proc;
305 	struct sigacts *psp;
306 	struct trapframe *regs;
307 	struct l_rt_sigframe *fp, frame;
308 	int oonstack;
309 	int sig;
310 	int code;
311 
312 	sig = ksi->ksi_signo;
313 	code = ksi->ksi_code;
314 	PROC_LOCK_ASSERT(p, MA_OWNED);
315 	psp = p->p_sigacts;
316 	mtx_assert(&psp->ps_mtx, MA_OWNED);
317 	regs = td->td_frame;
318 	oonstack = sigonstack(regs->tf_rsp);
319 
320 #ifdef DEBUG
321 	if (ldebug(rt_sendsig))
322 		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
323 		    catcher, sig, (void*)mask, code);
324 #endif
325 	/*
326 	 * Allocate space for the signal handler context.
327 	 */
328 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
329 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
330 		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
331 		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
332 	} else
333 		fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
334 	mtx_unlock(&psp->ps_mtx);
335 
336 	/*
337 	 * Build the argument list for the signal handler.
338 	 */
339 	if (p->p_sysent->sv_sigtbl)
340 		if (sig <= p->p_sysent->sv_sigsize)
341 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
342 
343 	bzero(&frame, sizeof(frame));
344 
345 	frame.sf_handler = PTROUT(catcher);
346 	frame.sf_sig = sig;
347 	frame.sf_siginfo = PTROUT(&fp->sf_si);
348 	frame.sf_ucontext = PTROUT(&fp->sf_sc);
349 
350 	/* Fill in POSIX parts */
351 	ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);
352 
353 	/*
354 	 * Build the signal context to be used by sigreturn.
355 	 */
356 	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
357 	frame.sf_sc.uc_link = 0;		/* XXX ??? */
358 
359 	frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
360 	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
361 	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
362 	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
363 	PROC_UNLOCK(p);
364 
365 	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
366 
367 	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
368 	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_rdi;
369 	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_rsi;
370 	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_rbp;
371 	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_rbx;
372 	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_rdx;
373 	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_rcx;
374 	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_rax;
375 	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_rip;
376 	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
377 	frame.sf_sc.uc_mcontext.sc_gs     = regs->tf_gs;
378 	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
379 	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
380 	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
381 	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
382 	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
383 	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
384 	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
385 	frame.sf_sc.uc_mcontext.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
386 	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
387 
388 #ifdef DEBUG
389 	if (ldebug(rt_sendsig))
390 		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
391 		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
392 		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
393 #endif
394 
395 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
396 		/*
397 		 * Process has trashed its stack; give it an illegal
398 		 * instruction to halt it in its tracks.
399 		 */
400 #ifdef DEBUG
401 		if (ldebug(rt_sendsig))
402 			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
403 			    fp, oonstack);
404 #endif
405 		PROC_LOCK(p);
406 		sigexit(td, SIGILL);
407 	}
408 
409 	/*
410 	 * Build context to run handler in.
411 	 */
412 	regs->tf_rsp = PTROUT(fp);
413 	regs->tf_rip = p->p_sysent->sv_sigcode_base + linux_sznonrtsigcode;
414 	regs->tf_rflags &= ~(PSL_T | PSL_D);
415 	regs->tf_cs = _ucode32sel;
416 	regs->tf_ss = _udatasel;
417 	regs->tf_ds = _udatasel;
418 	regs->tf_es = _udatasel;
419 	regs->tf_fs = _ufssel;
420 	regs->tf_gs = _ugssel;
421 	regs->tf_flags = TF_HASSEGS;
422 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
423 	PROC_LOCK(p);
424 	mtx_lock(&psp->ps_mtx);
425 }
426 
427 
428 /*
429  * Send an interrupt to process.
430  *
431  * Stack is set up to allow sigcode stored
432  * in u. to call routine, followed by kcall
433  * to sigreturn routine below.  After sigreturn
434  * resets the signal mask, the stack, and the
435  * frame pointer, it returns to the user
436  * specified pc, psl.
437  */
438 static void
439 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
440 {
441 	struct thread *td = curthread;
442 	struct proc *p = td->td_proc;
443 	struct sigacts *psp;
444 	struct trapframe *regs;
445 	struct l_sigframe *fp, frame;
446 	l_sigset_t lmask;
447 	int oonstack, i;
448 	int sig, code;
449 
450 	sig = ksi->ksi_signo;
451 	code = ksi->ksi_code;
452 	PROC_LOCK_ASSERT(p, MA_OWNED);
453 	psp = p->p_sigacts;
454 	mtx_assert(&psp->ps_mtx, MA_OWNED);
455 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
456 		/* Signal handler installed with SA_SIGINFO. */
457 		linux_rt_sendsig(catcher, ksi, mask);
458 		return;
459 	}
460 
461 	regs = td->td_frame;
462 	oonstack = sigonstack(regs->tf_rsp);
463 
464 #ifdef DEBUG
465 	if (ldebug(sendsig))
466 		printf(ARGS(sendsig, "%p, %d, %p, %u"),
467 		    catcher, sig, (void*)mask, code);
468 #endif
469 
470 	/*
471 	 * Allocate space for the signal handler context.
472 	 */
473 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
474 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
475 		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
476 		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
477 	} else
478 		fp = (struct l_sigframe *)regs->tf_rsp - 1;
479 	mtx_unlock(&psp->ps_mtx);
480 	PROC_UNLOCK(p);
481 
482 	/*
483 	 * Build the argument list for the signal handler.
484 	 */
485 	if (p->p_sysent->sv_sigtbl)
486 		if (sig <= p->p_sysent->sv_sigsize)
487 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
488 
489 	bzero(&frame, sizeof(frame));
490 
491 	frame.sf_handler = PTROUT(catcher);
492 	frame.sf_sig = sig;
493 
494 	bsd_to_linux_sigset(mask, &lmask);
495 
496 	/*
497 	 * Build the signal context to be used by sigreturn.
498 	 */
499 	frame.sf_sc.sc_mask   = lmask.__bits[0];
500 	frame.sf_sc.sc_gs     = regs->tf_gs;
501 	frame.sf_sc.sc_fs     = regs->tf_fs;
502 	frame.sf_sc.sc_es     = regs->tf_es;
503 	frame.sf_sc.sc_ds     = regs->tf_ds;
504 	frame.sf_sc.sc_edi    = regs->tf_rdi;
505 	frame.sf_sc.sc_esi    = regs->tf_rsi;
506 	frame.sf_sc.sc_ebp    = regs->tf_rbp;
507 	frame.sf_sc.sc_ebx    = regs->tf_rbx;
508 	frame.sf_sc.sc_edx    = regs->tf_rdx;
509 	frame.sf_sc.sc_ecx    = regs->tf_rcx;
510 	frame.sf_sc.sc_eax    = regs->tf_rax;
511 	frame.sf_sc.sc_eip    = regs->tf_rip;
512 	frame.sf_sc.sc_cs     = regs->tf_cs;
513 	frame.sf_sc.sc_eflags = regs->tf_rflags;
514 	frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
515 	frame.sf_sc.sc_ss     = regs->tf_ss;
516 	frame.sf_sc.sc_err    = regs->tf_err;
517 	frame.sf_sc.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
518 	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
519 
520 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
521 		frame.sf_extramask[i] = lmask.__bits[i+1];
522 
523 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
524 		/*
525 		 * Process has trashed its stack; give it an illegal
526 		 * instruction to halt it in its tracks.
527 		 */
528 		PROC_LOCK(p);
529 		sigexit(td, SIGILL);
530 	}
531 
532 	/*
533 	 * Build context to run handler in.
534 	 */
535 	regs->tf_rsp = PTROUT(fp);
536 	regs->tf_rip = p->p_sysent->sv_sigcode_base;
537 	regs->tf_rflags &= ~(PSL_T | PSL_D);
538 	regs->tf_cs = _ucode32sel;
539 	regs->tf_ss = _udatasel;
540 	regs->tf_ds = _udatasel;
541 	regs->tf_es = _udatasel;
542 	regs->tf_fs = _ufssel;
543 	regs->tf_gs = _ugssel;
544 	regs->tf_flags = TF_HASSEGS;
545 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
546 	PROC_LOCK(p);
547 	mtx_lock(&psp->ps_mtx);
548 }
549 
550 /*
551  * System call to cleanup state after a signal
552  * has been taken.  Reset signal mask and
553  * stack state from context left by sendsig (above).
554  * Return to previous pc and psl as specified by
555  * context left by sendsig. Check carefully to
556  * make sure that the user has not modified the
557  * psl to gain improper privileges or to cause
558  * a machine fault.
559  */
560 int
561 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
562 {
563 	struct l_sigframe frame;
564 	struct trapframe *regs;
565 	sigset_t bmask;
566 	l_sigset_t lmask;
567 	int eflags, i;
568 	ksiginfo_t ksi;
569 
570 	regs = td->td_frame;
571 
572 #ifdef DEBUG
573 	if (ldebug(sigreturn))
574 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
575 #endif
576 	/*
577 	 * The trampoline code hands us the sigframe.
578 	 * It is unsafe to keep track of it ourselves, in the event that a
579 	 * program jumps out of a signal handler.
580 	 */
581 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
582 		return (EFAULT);
583 
584 	/*
585 	 * Check for security violations.
586 	 */
587 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
588 	eflags = frame.sf_sc.sc_eflags;
589 	/*
590 	 * XXX do allow users to change the privileged flag PSL_RF.  The
591 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
592 	 * sometimes set it there too.  tf_eflags is kept in the signal
593 	 * context during signal handling and there is no other place
594 	 * to remember it, so the PSL_RF bit may be corrupted by the
595 	 * signal handler without us knowing.  Corruption of the PSL_RF
596 	 * bit at worst causes one more or one less debugger trap, so
597 	 * allowing it is fairly harmless.
598 	 */
599 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
600 		return(EINVAL);
601 
602 	/*
603 	 * Don't allow users to load a valid privileged %cs.  Let the
604 	 * hardware check for invalid selectors, excess privilege in
605 	 * other selectors, invalid %eip's and invalid %esp's.
606 	 */
607 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
608 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
609 		ksiginfo_init_trap(&ksi);
610 		ksi.ksi_signo = SIGBUS;
611 		ksi.ksi_code = BUS_OBJERR;
612 		ksi.ksi_trapno = T_PROTFLT;
613 		ksi.ksi_addr = (void *)regs->tf_rip;
614 		trapsignal(td, &ksi);
615 		return(EINVAL);
616 	}
617 
618 	lmask.__bits[0] = frame.sf_sc.sc_mask;
619 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
620 		lmask.__bits[i+1] = frame.sf_extramask[i];
621 	linux_to_bsd_sigset(&lmask, &bmask);
622 	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
623 
624 	/*
625 	 * Restore signal context.
626 	 */
627 	regs->tf_rdi    = frame.sf_sc.sc_edi;
628 	regs->tf_rsi    = frame.sf_sc.sc_esi;
629 	regs->tf_rbp    = frame.sf_sc.sc_ebp;
630 	regs->tf_rbx    = frame.sf_sc.sc_ebx;
631 	regs->tf_rdx    = frame.sf_sc.sc_edx;
632 	regs->tf_rcx    = frame.sf_sc.sc_ecx;
633 	regs->tf_rax    = frame.sf_sc.sc_eax;
634 	regs->tf_rip    = frame.sf_sc.sc_eip;
635 	regs->tf_cs     = frame.sf_sc.sc_cs;
636 	regs->tf_ds     = frame.sf_sc.sc_ds;
637 	regs->tf_es     = frame.sf_sc.sc_es;
638 	regs->tf_fs     = frame.sf_sc.sc_fs;
639 	regs->tf_gs     = frame.sf_sc.sc_gs;
640 	regs->tf_rflags = eflags;
641 	regs->tf_rsp    = frame.sf_sc.sc_esp_at_signal;
642 	regs->tf_ss     = frame.sf_sc.sc_ss;
643 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
644 
645 	return (EJUSTRETURN);
646 }
647 
648 /*
649  * System call to cleanup state after a signal
650  * has been taken.  Reset signal mask and
651  * stack state from context left by rt_sendsig (above).
652  * Return to previous pc and psl as specified by
653  * context left by sendsig. Check carefully to
654  * make sure that the user has not modified the
655  * psl to gain improper privileges or to cause
656  * a machine fault.
657  */
658 int
659 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
660 {
661 	struct l_ucontext uc;
662 	struct l_sigcontext *context;
663 	sigset_t bmask;
664 	l_stack_t *lss;
665 	stack_t ss;
666 	struct trapframe *regs;
667 	int eflags;
668 	ksiginfo_t ksi;
669 
670 	regs = td->td_frame;
671 
672 #ifdef DEBUG
673 	if (ldebug(rt_sigreturn))
674 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
675 #endif
676 	/*
677 	 * The trampoline code hands us the ucontext.
678 	 * It is unsafe to keep track of it ourselves, in the event that a
679 	 * program jumps out of a signal handler.
680 	 */
681 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
682 		return (EFAULT);
683 
684 	context = &uc.uc_mcontext;
685 
686 	/*
687 	 * Check for security violations.
688 	 */
689 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
690 	eflags = context->sc_eflags;
691 	/*
692 	 * XXX do allow users to change the privileged flag PSL_RF.  The
693 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
694 	 * sometimes set it there too.  tf_eflags is kept in the signal
695 	 * context during signal handling and there is no other place
696 	 * to remember it, so the PSL_RF bit may be corrupted by the
697 	 * signal handler without us knowing.  Corruption of the PSL_RF
698 	 * bit at worst causes one more or one less debugger trap, so
699 	 * allowing it is fairly harmless.
700 	 */
701 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
702 		return(EINVAL);
703 
704 	/*
705 	 * Don't allow users to load a valid privileged %cs.  Let the
706 	 * hardware check for invalid selectors, excess privilege in
707 	 * other selectors, invalid %eip's and invalid %esp's.
708 	 */
709 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
710 	if (!CS_SECURE(context->sc_cs)) {
711 		ksiginfo_init_trap(&ksi);
712 		ksi.ksi_signo = SIGBUS;
713 		ksi.ksi_code = BUS_OBJERR;
714 		ksi.ksi_trapno = T_PROTFLT;
715 		ksi.ksi_addr = (void *)regs->tf_rip;
716 		trapsignal(td, &ksi);
717 		return(EINVAL);
718 	}
719 
720 	linux_to_bsd_sigset(&uc.uc_sigmask, &bmask);
721 	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
722 
723 	/*
724 	 * Restore signal context
725 	 */
726 	regs->tf_gs	= context->sc_gs;
727 	regs->tf_fs	= context->sc_fs;
728 	regs->tf_es	= context->sc_es;
729 	regs->tf_ds	= context->sc_ds;
730 	regs->tf_rdi    = context->sc_edi;
731 	regs->tf_rsi    = context->sc_esi;
732 	regs->tf_rbp    = context->sc_ebp;
733 	regs->tf_rbx    = context->sc_ebx;
734 	regs->tf_rdx    = context->sc_edx;
735 	regs->tf_rcx    = context->sc_ecx;
736 	regs->tf_rax    = context->sc_eax;
737 	regs->tf_rip    = context->sc_eip;
738 	regs->tf_cs     = context->sc_cs;
739 	regs->tf_rflags = eflags;
740 	regs->tf_rsp    = context->sc_esp_at_signal;
741 	regs->tf_ss     = context->sc_ss;
742 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
743 
744 	/*
745 	 * call sigaltstack & ignore results..
746 	 */
747 	lss = &uc.uc_stack;
748 	ss.ss_sp = PTRIN(lss->ss_sp);
749 	ss.ss_size = lss->ss_size;
750 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
751 
752 #ifdef DEBUG
753 	if (ldebug(rt_sigreturn))
754 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
755 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
756 #endif
757 	(void)kern_sigaltstack(td, &ss, NULL);
758 
759 	return (EJUSTRETURN);
760 }
761 
762 static int
763 linux32_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
764 {
765 	struct proc *p;
766 	struct trapframe *frame;
767 
768 	p = td->td_proc;
769 	frame = td->td_frame;
770 
771 	sa->args[0] = frame->tf_rbx;
772 	sa->args[1] = frame->tf_rcx;
773 	sa->args[2] = frame->tf_rdx;
774 	sa->args[3] = frame->tf_rsi;
775 	sa->args[4] = frame->tf_rdi;
776 	sa->args[5] = frame->tf_rbp;	/* Unconfirmed */
777 	sa->code = frame->tf_rax;
778 
779 	if (sa->code >= p->p_sysent->sv_size)
780 		sa->callp = &p->p_sysent->sv_table[0];
781 	else
782 		sa->callp = &p->p_sysent->sv_table[sa->code];
783 	sa->narg = sa->callp->sy_narg;
784 
785 	td->td_retval[0] = 0;
786 	td->td_retval[1] = frame->tf_rdx;
787 
788 	return (0);
789 }
790 
791 /*
792  * If a linux binary is exec'ing something, try this image activator
793  * first.  We override standard shell script execution in order to
794  * be able to modify the interpreter path.  We only do this if a linux
795  * binary is doing the exec, so we do not create an EXEC module for it.
796  */
797 static int	exec_linux_imgact_try(struct image_params *iparams);
798 
799 static int
800 exec_linux_imgact_try(struct image_params *imgp)
801 {
802 	const char *head = (const char *)imgp->image_header;
803 	char *rpath;
804 	int error = -1;
805 
806 	/*
807 	* The interpreter for shell scripts run from a linux binary needs
808 	* to be located in /compat/linux if possible in order to recursively
809 	* maintain linux path emulation.
810 	*/
811 	if (((const short *)head)[0] == SHELLMAGIC) {
812 		/*
813 		* Run our normal shell image activator.  If it succeeds attempt
814 		* to use the alternate path for the interpreter.  If an
815 		* alternate * path is found, use our stringspace to store it.
816 		*/
817 		if ((error = exec_shell_imgact(imgp)) == 0) {
818 			linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
819 			    imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0,
820 			    AT_FDCWD);
821 			if (rpath != NULL)
822 				imgp->args->fname_buf =
823 				    imgp->interpreter_name = rpath;
824 		}
825 	}
826 	return (error);
827 }
828 
829 /*
830  * Clear registers on exec
831  * XXX copied from ia32_signal.c.
832  */
833 static void
834 exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack)
835 {
836 	struct trapframe *regs = td->td_frame;
837 	struct pcb *pcb = td->td_pcb;
838 
839 	mtx_lock(&dt_lock);
840 	if (td->td_proc->p_md.md_ldt != NULL)
841 		user_ldt_free(td);
842 	else
843 		mtx_unlock(&dt_lock);
844 
845 	critical_enter();
846 	wrmsr(MSR_FSBASE, 0);
847 	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
848 	pcb->pcb_fsbase = 0;
849 	pcb->pcb_gsbase = 0;
850 	critical_exit();
851 	pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
852 
853 	bzero((char *)regs, sizeof(struct trapframe));
854 	regs->tf_rip = imgp->entry_addr;
855 	regs->tf_rsp = stack;
856 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
857 	regs->tf_gs = _ugssel;
858 	regs->tf_fs = _ufssel;
859 	regs->tf_es = _udatasel;
860 	regs->tf_ds = _udatasel;
861 	regs->tf_ss = _udatasel;
862 	regs->tf_flags = TF_HASSEGS;
863 	regs->tf_cs = _ucode32sel;
864 	regs->tf_rbx = imgp->ps_strings;
865 
866 	fpstate_drop(td);
867 
868 	/* Do full restore on return so that we can change to a different %cs */
869 	set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET);
870 	clear_pcb_flags(pcb, PCB_GS32BIT);
871 	td->td_retval[1] = 0;
872 }
873 
874 /*
875  * XXX copied from ia32_sysvec.c.
876  */
877 static register_t *
878 linux_copyout_strings(struct image_params *imgp)
879 {
880 	int argc, envc;
881 	u_int32_t *vectp;
882 	char *stringp, *destp;
883 	u_int32_t *stack_base;
884 	struct linux32_ps_strings *arginfo;
885 
886 	/*
887 	 * Calculate string base and vector table pointers.
888 	 * Also deal with signal trampoline code for this exec type.
889 	 */
890 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
891 	destp =	(caddr_t)arginfo - SPARE_USRSPACE - linux_szplatform -
892 	    roundup((ARG_MAX - imgp->args->stringspace),
893 	    sizeof(char *));
894 
895 	/*
896 	 * Install LINUX_PLATFORM
897 	 */
898 	copyout(linux_platform, ((caddr_t)arginfo - linux_szplatform),
899 	    linux_szplatform);
900 
901 	/*
902 	 * If we have a valid auxargs ptr, prepare some room
903 	 * on the stack.
904 	 */
905 	if (imgp->auxargs) {
906 		/*
907 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
908 		 * lower compatibility.
909 		 */
910 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
911 		    (LINUX_AT_COUNT * 2);
912 		/*
913 		 * The '+ 2' is for the null pointers at the end of each of
914 		 * the arg and env vector sets,and imgp->auxarg_size is room
915 		 * for argument of Runtime loader.
916 		 */
917 		vectp = (u_int32_t *) (destp - (imgp->args->argc +
918 		    imgp->args->envc + 2 + imgp->auxarg_size) *
919 		    sizeof(u_int32_t));
920 
921 	} else
922 		/*
923 		 * The '+ 2' is for the null pointers at the end of each of
924 		 * the arg and env vector sets
925 		 */
926 		vectp = (u_int32_t *)(destp - (imgp->args->argc +
927 		    imgp->args->envc + 2) * sizeof(u_int32_t));
928 
929 	/*
930 	 * vectp also becomes our initial stack base
931 	 */
932 	stack_base = vectp;
933 
934 	stringp = imgp->args->begin_argv;
935 	argc = imgp->args->argc;
936 	envc = imgp->args->envc;
937 	/*
938 	 * Copy out strings - arguments and environment.
939 	 */
940 	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
941 
942 	/*
943 	 * Fill in "ps_strings" struct for ps, w, etc.
944 	 */
945 	suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp);
946 	suword32(&arginfo->ps_nargvstr, argc);
947 
948 	/*
949 	 * Fill in argument portion of vector table.
950 	 */
951 	for (; argc > 0; --argc) {
952 		suword32(vectp++, (uint32_t)(intptr_t)destp);
953 		while (*stringp++ != 0)
954 			destp++;
955 		destp++;
956 	}
957 
958 	/* a null vector table pointer separates the argp's from the envp's */
959 	suword32(vectp++, 0);
960 
961 	suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp);
962 	suword32(&arginfo->ps_nenvstr, envc);
963 
964 	/*
965 	 * Fill in environment portion of vector table.
966 	 */
967 	for (; envc > 0; --envc) {
968 		suword32(vectp++, (uint32_t)(intptr_t)destp);
969 		while (*stringp++ != 0)
970 			destp++;
971 		destp++;
972 	}
973 
974 	/* end of vector table is a null pointer */
975 	suword32(vectp, 0);
976 
977 	return ((register_t *)stack_base);
978 }
979 
980 SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
981     "32-bit Linux emulation");
982 
983 static u_long	linux32_maxdsiz = LINUX32_MAXDSIZ;
984 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
985     &linux32_maxdsiz, 0, "");
986 static u_long	linux32_maxssiz = LINUX32_MAXSSIZ;
987 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
988     &linux32_maxssiz, 0, "");
989 static u_long	linux32_maxvmem = LINUX32_MAXVMEM;
990 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
991     &linux32_maxvmem, 0, "");
992 
993 static void
994 linux32_fixlimit(struct rlimit *rl, int which)
995 {
996 
997 	switch (which) {
998 	case RLIMIT_DATA:
999 		if (linux32_maxdsiz != 0) {
1000 			if (rl->rlim_cur > linux32_maxdsiz)
1001 				rl->rlim_cur = linux32_maxdsiz;
1002 			if (rl->rlim_max > linux32_maxdsiz)
1003 				rl->rlim_max = linux32_maxdsiz;
1004 		}
1005 		break;
1006 	case RLIMIT_STACK:
1007 		if (linux32_maxssiz != 0) {
1008 			if (rl->rlim_cur > linux32_maxssiz)
1009 				rl->rlim_cur = linux32_maxssiz;
1010 			if (rl->rlim_max > linux32_maxssiz)
1011 				rl->rlim_max = linux32_maxssiz;
1012 		}
1013 		break;
1014 	case RLIMIT_VMEM:
1015 		if (linux32_maxvmem != 0) {
1016 			if (rl->rlim_cur > linux32_maxvmem)
1017 				rl->rlim_cur = linux32_maxvmem;
1018 			if (rl->rlim_max > linux32_maxvmem)
1019 				rl->rlim_max = linux32_maxvmem;
1020 		}
1021 		break;
1022 	}
1023 }
1024 
1025 struct sysentvec elf_linux_sysvec = {
1026 	.sv_size	= LINUX_SYS_MAXSYSCALL,
1027 	.sv_table	= linux_sysent,
1028 	.sv_mask	= 0,
1029 	.sv_sigsize	= LINUX_SIGTBLSZ,
1030 	.sv_sigtbl	= bsd_to_linux_signal,
1031 	.sv_errsize	= ELAST + 1,
1032 	.sv_errtbl	= bsd_to_linux_errno,
1033 	.sv_transtrap	= translate_traps,
1034 	.sv_fixup	= elf_linux_fixup,
1035 	.sv_sendsig	= linux_sendsig,
1036 	.sv_sigcode	= linux_sigcode,
1037 	.sv_szsigcode	= &linux_szsigcode,
1038 	.sv_prepsyscall	= NULL,
1039 	.sv_name	= "Linux ELF32",
1040 	.sv_coredump	= elf32_coredump,
1041 	.sv_imgact_try	= exec_linux_imgact_try,
1042 	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
1043 	.sv_pagesize	= PAGE_SIZE,
1044 	.sv_minuser	= VM_MIN_ADDRESS,
1045 	.sv_maxuser	= LINUX32_MAXUSER,
1046 	.sv_usrstack	= LINUX32_USRSTACK,
1047 	.sv_psstrings	= LINUX32_PS_STRINGS,
1048 	.sv_stackprot	= VM_PROT_ALL,
1049 	.sv_copyout_strings = linux_copyout_strings,
1050 	.sv_setregs	= exec_linux_setregs,
1051 	.sv_fixlimit	= linux32_fixlimit,
1052 	.sv_maxssiz	= &linux32_maxssiz,
1053 	.sv_flags	= SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP,
1054 	.sv_set_syscall_retval = cpu_set_syscall_retval,
1055 	.sv_fetch_syscall_args = linux32_fetch_syscall_args,
1056 	.sv_syscallnames = NULL,
1057 	.sv_shared_page_base = LINUX32_SHAREDPAGE,
1058 	.sv_shared_page_len = PAGE_SIZE,
1059 	.sv_schedtail	= linux_schedtail,
1060 };
1061 INIT_SYSENTVEC(elf_sysvec, &elf_linux_sysvec);
1062 
1063 static char GNU_ABI_VENDOR[] = "GNU";
1064 static int GNULINUX_ABI_DESC = 0;
1065 
1066 static boolean_t
1067 linux32_trans_osrel(const Elf_Note *note, int32_t *osrel)
1068 {
1069 	const Elf32_Word *desc;
1070 	uintptr_t p;
1071 
1072 	p = (uintptr_t)(note + 1);
1073 	p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
1074 
1075 	desc = (const Elf32_Word *)p;
1076 	if (desc[0] != GNULINUX_ABI_DESC)
1077 		return (FALSE);
1078 
1079 	/*
1080 	 * For linux we encode osrel as follows (see linux_mib.c):
1081 	 * VVVMMMIII (version, major, minor), see linux_mib.c.
1082 	 */
1083 	*osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
1084 
1085 	return (TRUE);
1086 }
1087 
1088 static Elf_Brandnote linux32_brandnote = {
1089 	.hdr.n_namesz	= sizeof(GNU_ABI_VENDOR),
1090 	.hdr.n_descsz	= 16,	/* XXX at least 16 */
1091 	.hdr.n_type	= 1,
1092 	.vendor		= GNU_ABI_VENDOR,
1093 	.flags		= BN_TRANSLATE_OSREL,
1094 	.trans_osrel	= linux32_trans_osrel
1095 };
1096 
1097 static Elf32_Brandinfo linux_brand = {
1098 	.brand		= ELFOSABI_LINUX,
1099 	.machine	= EM_386,
1100 	.compat_3_brand	= "Linux",
1101 	.emul_path	= "/compat/linux",
1102 	.interp_path	= "/lib/ld-linux.so.1",
1103 	.sysvec		= &elf_linux_sysvec,
1104 	.interp_newpath	= NULL,
1105 	.brand_note	= &linux32_brandnote,
1106 	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1107 };
1108 
1109 static Elf32_Brandinfo linux_glibc2brand = {
1110 	.brand		= ELFOSABI_LINUX,
1111 	.machine	= EM_386,
1112 	.compat_3_brand	= "Linux",
1113 	.emul_path	= "/compat/linux",
1114 	.interp_path	= "/lib/ld-linux.so.2",
1115 	.sysvec		= &elf_linux_sysvec,
1116 	.interp_newpath	= NULL,
1117 	.brand_note	= &linux32_brandnote,
1118 	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1119 };
1120 
1121 Elf32_Brandinfo *linux_brandlist[] = {
1122 	&linux_brand,
1123 	&linux_glibc2brand,
1124 	NULL
1125 };
1126 
1127 static int
1128 linux_elf_modevent(module_t mod, int type, void *data)
1129 {
1130 	Elf32_Brandinfo **brandinfo;
1131 	int error;
1132 	struct linux_ioctl_handler **lihp;
1133 	struct linux_device_handler **ldhp;
1134 
1135 	error = 0;
1136 
1137 	switch(type) {
1138 	case MOD_LOAD:
1139 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1140 		     ++brandinfo)
1141 			if (elf32_insert_brand_entry(*brandinfo) < 0)
1142 				error = EINVAL;
1143 		if (error == 0) {
1144 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1145 				linux_ioctl_register_handler(*lihp);
1146 			SET_FOREACH(ldhp, linux_device_handler_set)
1147 				linux_device_register_handler(*ldhp);
1148 			mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF);
1149 			sx_init(&emul_shared_lock, "emuldata->shared lock");
1150 			LIST_INIT(&futex_list);
1151 			mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
1152 			linux_exit_tag = EVENTHANDLER_REGISTER(process_exit,
1153 			    linux_proc_exit, NULL, 1000);
1154 			linux_exec_tag = EVENTHANDLER_REGISTER(process_exec,
1155 			    linux_proc_exec, NULL, 1000);
1156 			linux_szplatform = roundup(strlen(linux_platform) + 1,
1157 			    sizeof(char *));
1158 			linux_osd_jail_register();
1159 			stclohz = (stathz ? stathz : hz);
1160 			if (bootverbose)
1161 				printf("Linux ELF exec handler installed\n");
1162 		} else
1163 			printf("cannot insert Linux ELF brand handler\n");
1164 		break;
1165 	case MOD_UNLOAD:
1166 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1167 		     ++brandinfo)
1168 			if (elf32_brand_inuse(*brandinfo))
1169 				error = EBUSY;
1170 		if (error == 0) {
1171 			for (brandinfo = &linux_brandlist[0];
1172 			     *brandinfo != NULL; ++brandinfo)
1173 				if (elf32_remove_brand_entry(*brandinfo) < 0)
1174 					error = EINVAL;
1175 		}
1176 		if (error == 0) {
1177 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1178 				linux_ioctl_unregister_handler(*lihp);
1179 			SET_FOREACH(ldhp, linux_device_handler_set)
1180 				linux_device_unregister_handler(*ldhp);
1181 			mtx_destroy(&emul_lock);
1182 			sx_destroy(&emul_shared_lock);
1183 			mtx_destroy(&futex_mtx);
1184 			EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
1185 			EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
1186 			linux_osd_jail_deregister();
1187 			if (bootverbose)
1188 				printf("Linux ELF exec handler removed\n");
1189 		} else
1190 			printf("Could not deinstall ELF interpreter entry\n");
1191 		break;
1192 	default:
1193 		return EOPNOTSUPP;
1194 	}
1195 	return error;
1196 }
1197 
1198 static moduledata_t linux_elf_mod = {
1199 	"linuxelf",
1200 	linux_elf_modevent,
1201 	0
1202 };
1203 
1204 DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1205