xref: /freebsd/sys/amd64/linux32/linux32_sysvec.c (revision e5d81ef1b54984374275ca5e414f80689b491f70)
1 /*-
2  * Copyright (c) 2004 Tim J. Robbins
3  * Copyright (c) 2003 Peter Wemm
4  * Copyright (c) 2002 Doug Rabson
5  * Copyright (c) 1998-1999 Andrew Gallatin
6  * Copyright (c) 1994-1996 Søren Schmidt
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer
14  *    in this position and unchanged.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. The name of the author may not be used to endorse or promote products
19  *    derived from this software without specific prior written permission
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 #include "opt_compat.h"
36 
37 #ifndef COMPAT_FREEBSD32
38 #error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!"
39 #endif
40 
41 #define	__ELF_WORD_SIZE	32
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/exec.h>
46 #include <sys/fcntl.h>
47 #include <sys/imgact.h>
48 #include <sys/imgact_elf.h>
49 #include <sys/kernel.h>
50 #include <sys/lock.h>
51 #include <sys/malloc.h>
52 #include <sys/module.h>
53 #include <sys/mutex.h>
54 #include <sys/proc.h>
55 #include <sys/resourcevar.h>
56 #include <sys/signalvar.h>
57 #include <sys/sysctl.h>
58 #include <sys/syscallsubr.h>
59 #include <sys/sysent.h>
60 #include <sys/sysproto.h>
61 #include <sys/vnode.h>
62 #include <sys/eventhandler.h>
63 
64 #include <vm/vm.h>
65 #include <vm/pmap.h>
66 #include <vm/vm_extern.h>
67 #include <vm/vm_map.h>
68 #include <vm/vm_object.h>
69 #include <vm/vm_page.h>
70 #include <vm/vm_param.h>
71 
72 #include <machine/cpu.h>
73 #include <machine/md_var.h>
74 #include <machine/pcb.h>
75 #include <machine/specialreg.h>
76 
77 #include <amd64/linux32/linux.h>
78 #include <amd64/linux32/linux32_proto.h>
79 #include <compat/linux/linux_emul.h>
80 #include <compat/linux/linux_futex.h>
81 #include <compat/linux/linux_mib.h>
82 #include <compat/linux/linux_misc.h>
83 #include <compat/linux/linux_signal.h>
84 #include <compat/linux/linux_util.h>
85 
/* Declare this kernel module's version: name "linux", version 1. */
MODULE_VERSION(linux, 1);

/*
 * Define the M_LINUX malloc type (short name "linux", description
 * "Linux mode structures").
 */
MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
89 
/*
 * Emit one 32-bit ELF auxiliary vector entry: store the word `id' and then
 * the word `val' through `pos' with suword32(), advancing `pos' by one
 * element after each store.
 *
 * `pos' must be a modifiable pointer lvalue; it is evaluated (and
 * incremented) twice.  All arguments are parenthesized in the expansion so
 * that compound expressions are safe to pass.  The return values of
 * suword32() are not checked.
 */
#define	AUXARGS_ENTRY_32(pos, id, val)	\
	do {				\
		suword32((pos)++, (id));	\
		suword32((pos)++, (val));	\
	} while (0)
95 
/*
 * The two characters "#!" that open an interpreter script, expressed as the
 * 16-bit value obtained when they are read in host byte order.
 * exec_linux_imgact_try() compares the first short of the image header
 * against this value.
 */
#if BYTE_ORDER == LITTLE_ENDIAN
#define SHELLMAGIC      0x2123 /* #! */
#else
#define SHELLMAGIC      0x2321
#endif
101 
/*
 * Allow the sendsig functions to use the ldebug() facility
 * even though they are not syscalls themselves.  Both names are
 * mapped to syscall number 0, which is slightly less bogus than
 * using ldebug(sigreturn).
 */
#define	LINUX_SYS_linux_rt_sendsig	0
#define	LINUX_SYS_linux_sendsig		0
110 
/*
 * Platform name.  A copy is placed on the user stack by
 * linux_copyout_strings() and its user address is exported through the
 * LINUX_AT_PLATFORM aux entry by elf_linux_fixup().
 */
const char *linux_platform = "i686";
/*
 * Number of bytes of linux_platform copied to the user stack.  It is not
 * assigned in this part of the file.
 */
static int linux_szplatform;
/* Declared here, defined elsewhere; presumably the signal trampoline. */
extern char linux_sigcode[];
/* Size in bytes of the area reserved for the sigcode below ps_strings. */
extern int linux_szsigcode;

/* System call table, defined elsewhere. */
extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];

/* Declarations of the ioctl and device handler sets. */
SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
SET_DECLARE(linux_device_handler_set, struct linux_device_handler);

/* Forward declarations of file-local routines. */
static int	elf_linux_fixup(register_t **stack_base,
		    struct image_params *iparams);
static register_t *linux_copyout_strings(struct image_params *imgp);
static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
static void	exec_linux_setregs(struct thread *td,
				   struct image_params *imgp, u_long stack);
static void	linux32_fixlimit(struct rlimit *rl, int which);
static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel);

/* Event handler tags; they are not assigned in this part of the file. */
static eventhandler_tag linux_exit_tag;
static eventhandler_tag linux_exec_tag;
132 
/*
 * Linux syscalls return negative errno's, we do positive and map them.
 * The table is indexed by the FreeBSD errno value (0 .. ELAST) and each
 * entry holds the corresponding Linux errno, already negated.
 *
 * Reference:
 *   FreeBSD: src/sys/sys/errno.h
 *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
 *            linux-2.6.17.8/include/asm-generic/errno.h
 */
static int bsd_to_linux_errno[ELAST + 1] = {
	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,	/*  0 -  9 */
	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,	/* 10 - 19 */
	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,	/* 20 - 29 */
	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,	/* 30 - 39 */
	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,	/* 40 - 49 */
	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,	/* 50 - 59 */
	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,	/* 60 - 69 */
	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,	/* 70 - 79 */
	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,	/* 80 - 89 */
	 -72, -67, -71						/* 90 - 92 */
};
152 
/*
 * FreeBSD -> Linux signal number translation, four entries per row.
 * Presumably indexed by the FreeBSD signal number minus one (the sendsig
 * routines look signals up with _SIG_IDX()) -- confirm against the
 * sysentvec that installs this table.
 * NOTE(review): the two 0 entries look like FreeBSD signals that have no
 * Linux counterpart here; confirm.
 */
int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
	0, LINUX_SIGUSR1, LINUX_SIGUSR2
};
163 
/*
 * Linux -> FreeBSD signal number translation, four entries per row; the
 * counterpart of bsd_to_linux_signal[].  Presumably indexed by the Linux
 * signal number minus one -- confirm against the users of this table.
 * Several Linux slots share one FreeBSD signal (SIGBUS and SIGURG each
 * appear twice).
 */
int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
	SIGHUP, SIGINT, SIGQUIT, SIGILL,
	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
	SIGIO, SIGURG, SIGSYS
};
174 
/* Value reported when a trap code has no entry in the table below. */
#define LINUX_T_UNKNOWN  255
/*
 * Translation table indexed by the FreeBSD T_* trap code named in each
 * entry's comment; the value is the number stored in the Linux signal
 * context's sc_trapno field by the sendsig routines.
 */
static int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/* 0 */
	6,			/* 1  T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/* 2 */
	3,			/* 3  T_BPTFLT */
	LINUX_T_UNKNOWN,	/* 4 */
	LINUX_T_UNKNOWN,	/* 5 */
	16,			/* 6  T_ARITHTRAP */
	254,			/* 7  T_ASTFLT */
	LINUX_T_UNKNOWN,	/* 8 */
	13,			/* 9  T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};
/*
 * Bounds-checked lookup in _bsd_to_linux_trapcode[]; yields
 * LINUX_T_UNKNOWN for any code outside the table.  The comparison is made
 * against an unsigned sizeof quotient, so a negative int `code' converts
 * to a large unsigned value and also takes the LINUX_T_UNKNOWN branch.
 * `code' is evaluated twice.
 */
#define bsd_to_linux_trapcode(code) \
    ((code)<sizeof(_bsd_to_linux_trapcode)/sizeof(*_bsd_to_linux_trapcode)? \
     _bsd_to_linux_trapcode[(code)]: \
     LINUX_T_UNKNOWN)
213 
/*
 * Layout of the ps_strings record kept at LINUX32_PS_STRINGS in the 32-bit
 * process; linux_copyout_strings() fills it in with suword32().  The
 * pointer members are 32-bit user addresses.
 */
struct linux32_ps_strings {
	u_int32_t ps_argvstr;	/* first of 0 or more argument strings */
	u_int ps_nargvstr;	/* the number of argument strings */
	u_int32_t ps_envstr;	/* first of 0 or more environment strings */
	u_int ps_nenvstr;	/* the number of environment strings */
};
220 
221 /*
222  * If FreeBSD & Linux have a difference of opinion about what a trap
223  * means, deal with it here.
224  *
225  * MPSAFE
226  */
227 static int
228 translate_traps(int signal, int trap_code)
229 {
230 	if (signal != SIGBUS)
231 		return signal;
232 	switch (trap_code) {
233 	case T_PROTFLT:
234 	case T_TSSFLT:
235 	case T_DOUBLEFLT:
236 	case T_PAGEFLT:
237 		return SIGSEGV;
238 	default:
239 		return signal;
240 	}
241 }
242 
243 static int
244 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
245 {
246 	Elf32_Auxargs *args;
247 	Elf32_Addr *base;
248 	Elf32_Addr *pos, *uplatform;
249 	struct linux32_ps_strings *arginfo;
250 
251 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
252 	uplatform = (Elf32_Addr *)((caddr_t)arginfo - linux_szsigcode -
253 	    linux_szplatform);
254 
255 	KASSERT(curthread->td_proc == imgp->proc,
256 	    ("unsafe elf_linux_fixup(), should be curproc"));
257 	base = (Elf32_Addr *)*stack_base;
258 	args = (Elf32_Auxargs *)imgp->auxargs;
259 	pos = base + (imgp->args->argc + imgp->args->envc + 2);
260 
261 	AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature);
262 
263 	/*
264 	 * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0,
265 	 * as it has appeared in the 2.4.0-rc7 first time.
266 	 * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK),
267 	 * glibc falls back to the hard-coded CLK_TCK value when aux entry
268 	 * is not present.
269 	 * Also see linux_times() implementation.
270 	 */
271 	if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
272 		AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz);
273 	AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
274 	AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
275 	AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
276 	AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
277 	AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
278 	AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
279 	AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
280 	AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, 0);
281 	AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
282 	AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
283 	AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
284 	AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
285 	AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(uplatform));
286 	if (args->execfd != -1)
287 		AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
288 	AUXARGS_ENTRY_32(pos, AT_NULL, 0);
289 
290 	free(imgp->auxargs, M_TEMP);
291 	imgp->auxargs = NULL;
292 
293 	base--;
294 	suword32(base, (uint32_t)imgp->args->argc);
295 	*stack_base = (register_t *)base;
296 	return 0;
297 }
298 
/*
 * Defined elsewhere.  Used below as the offset, from the start of the
 * sigcode, of the entry point for SA_SIGINFO handlers.
 */
extern unsigned long linux_sznonrtsigcode;

/*
 * Deliver a signal to a handler installed with SA_SIGINFO.
 *
 * Builds a struct l_rt_sigframe (handler, signal number, siginfo and a
 * ucontext holding the interrupted register state and signal mask) in a
 * kernel buffer, copies it to the user stack or the alternate signal
 * stack, and redirects the thread's trapframe into the signal trampoline.
 *
 * Locking: must be entered with the process lock and the sigacts mutex
 * held (both are asserted).  Both are dropped while the frame is built
 * and copied out, and both are re-acquired before returning.  If the
 * frame cannot be copied out the process is terminated via sigexit().
 *
 *   catcher  user handler address
 *   ksi      kernel signal information for the signal being delivered
 *   mask     signal mask to record in the frame for sigreturn
 */
static void
linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct sigacts *psp;
	struct trapframe *regs;
	struct l_rt_sigframe *fp, frame;
	int oonstack;
	int sig;
	int code;

	sig = ksi->ksi_signo;
	code = ksi->ksi_code;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	/* Remember whether we were already on the alternate stack. */
	oonstack = sigonstack(regs->tf_rsp);

#ifdef DEBUG
	if (ldebug(rt_sendsig))
		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
		    catcher, sig, (void*)mask, code);
#endif
	/*
	 * Allocate space for the signal handler context: at the top of the
	 * alternate stack when one is enabled, not already in use and
	 * requested for this signal; otherwise just below the current
	 * user stack pointer.
	 */
	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
	} else
		fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
	mtx_unlock(&psp->ps_mtx);

	/*
	 * Build the argument list for the signal handler.
	 * The signal number is translated through the ABI's signal table
	 * when one is present and the number falls within it.
	 */
	if (p->p_sysent->sv_sigtbl)
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];

	bzero(&frame, sizeof(frame));

	/* The pointers stored in the frame are user addresses inside *fp. */
	frame.sf_handler = PTROUT(catcher);
	frame.sf_sig = sig;
	frame.sf_siginfo = PTROUT(&fp->sf_si);
	frame.sf_ucontext = PTROUT(&fp->sf_sc);

	/* Fill in POSIX parts */
	ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);

	/*
	 * Build the signal context to be used by sigreturn.
	 */
	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
	frame.sf_sc.uc_link = 0;		/* XXX ??? */

	frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
	/* The process lock is dropped here, before the copyout() below. */
	PROC_UNLOCK(p);

	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);

	/* Save the interrupted register state into the Linux mcontext. */
	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_rdi;
	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_rsi;
	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_rbp;
	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_rbx;
	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_rdx;
	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_rcx;
	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_rax;
	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_rip;
	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
	frame.sf_sc.uc_mcontext.sc_gs     = regs->tf_gs;
	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
	frame.sf_sc.uc_mcontext.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
	/*
	 * NOTE(review): `code' holds ksi->ksi_code, while the translation
	 * table is annotated with T_* trap numbers; confirm whether
	 * ksi_trapno was intended here.
	 */
	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);

#ifdef DEBUG
	if (ldebug(rt_sendsig))
		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
#endif

	if (copyout(&frame, fp, sizeof(frame)) != 0) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
#ifdef DEBUG
		if (ldebug(rt_sendsig))
			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
			    fp, oonstack);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	/*
	 * Build context to run handler in: the stack pointer addresses the
	 * new frame and execution resumes in the sigcode, which
	 * linux_copyout_strings() placed immediately below ps_strings, at
	 * offset linux_sznonrtsigcode from its start.
	 */
	regs->tf_rsp = PTROUT(fp);
	regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
	    linux_sznonrtsigcode;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucode32sel;
	regs->tf_ss = _udatasel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	/* Re-acquire the locks the caller expects to be held on return. */
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
428 
429 
/*
 * Send a signal to the process.
 *
 * The user stack is set up so that the sigcode can call the handler and
 * then invoke the sigreturn routine below.  After sigreturn has reset
 * the signal mask, the stack and the saved registers, it returns to the
 * pc and psl that were interrupted.
 *
 * Handlers installed with SA_SIGINFO are handed to linux_rt_sendsig();
 * all others receive a struct l_sigframe built here.
 *
 * Locking: must be entered with the process lock and the sigacts mutex
 * held (both are asserted).  Both are dropped while the frame is built
 * and copied out, and both are re-acquired before returning.  If the
 * frame cannot be copied out the process is terminated via sigexit().
 *
 *   catcher  user handler address
 *   ksi      kernel signal information for the signal being delivered
 *   mask     signal mask to record in the frame for sigreturn
 */
static void
linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct sigacts *psp;
	struct trapframe *regs;
	struct l_sigframe *fp, frame;
	l_sigset_t lmask;
	int oonstack, i;
	int sig, code;

	sig = ksi->ksi_signo;
	code = ksi->ksi_code;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		linux_rt_sendsig(catcher, ksi, mask);
		return;
	}

	regs = td->td_frame;
	/* Remember whether we were already on the alternate stack. */
	oonstack = sigonstack(regs->tf_rsp);

#ifdef DEBUG
	if (ldebug(sendsig))
		printf(ARGS(sendsig, "%p, %d, %p, %u"),
		    catcher, sig, (void*)mask, code);
#endif

	/*
	 * Allocate space for the signal handler context: at the top of the
	 * alternate stack when one is enabled, not already in use and
	 * requested for this signal; otherwise just below the current
	 * user stack pointer.
	 */
	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
	} else
		fp = (struct l_sigframe *)regs->tf_rsp - 1;
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Build the argument list for the signal handler.
	 * The signal number is translated through the ABI's signal table
	 * when one is present and the number falls within it.
	 */
	if (p->p_sysent->sv_sigtbl)
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];

	bzero(&frame, sizeof(frame));

	frame.sf_handler = PTROUT(catcher);
	frame.sf_sig = sig;

	bsd_to_linux_sigset(mask, &lmask);

	/*
	 * Build the signal context to be used by sigreturn.
	 */
	frame.sf_sc.sc_mask   = lmask.__bits[0];
	frame.sf_sc.sc_gs     = regs->tf_gs;
	frame.sf_sc.sc_fs     = regs->tf_fs;
	frame.sf_sc.sc_es     = regs->tf_es;
	frame.sf_sc.sc_ds     = regs->tf_ds;
	frame.sf_sc.sc_edi    = regs->tf_rdi;
	frame.sf_sc.sc_esi    = regs->tf_rsi;
	frame.sf_sc.sc_ebp    = regs->tf_rbp;
	frame.sf_sc.sc_ebx    = regs->tf_rbx;
	frame.sf_sc.sc_edx    = regs->tf_rdx;
	frame.sf_sc.sc_ecx    = regs->tf_rcx;
	frame.sf_sc.sc_eax    = regs->tf_rax;
	frame.sf_sc.sc_eip    = regs->tf_rip;
	frame.sf_sc.sc_cs     = regs->tf_cs;
	frame.sf_sc.sc_eflags = regs->tf_rflags;
	frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
	frame.sf_sc.sc_ss     = regs->tf_ss;
	frame.sf_sc.sc_err    = regs->tf_err;
	frame.sf_sc.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
	/*
	 * NOTE(review): `code' holds ksi->ksi_code, while the translation
	 * table is annotated with T_* trap numbers; confirm whether
	 * ksi_trapno was intended here.
	 */
	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);

	/* Mask words beyond the first go into the frame's extra mask. */
	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
		frame.sf_extramask[i] = lmask.__bits[i+1];

	if (copyout(&frame, fp, sizeof(frame)) != 0) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	/*
	 * Build context to run handler in: the stack pointer addresses the
	 * new frame and execution resumes at the start of the sigcode,
	 * which linux_copyout_strings() placed immediately below
	 * ps_strings.
	 */
	regs->tf_rsp = PTROUT(fp);
	regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode);
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucode32sel;
	regs->tf_ss = _udatasel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	/* Re-acquire the locks the caller expects to be held on return. */
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
551 
552 /*
553  * System call to cleanup state after a signal
554  * has been taken.  Reset signal mask and
555  * stack state from context left by sendsig (above).
556  * Return to previous pc and psl as specified by
557  * context left by sendsig. Check carefully to
558  * make sure that the user has not modified the
559  * psl to gain improper privileges or to cause
560  * a machine fault.
561  */
562 int
563 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
564 {
565 	struct l_sigframe frame;
566 	struct trapframe *regs;
567 	sigset_t bmask;
568 	l_sigset_t lmask;
569 	int eflags, i;
570 	ksiginfo_t ksi;
571 
572 	regs = td->td_frame;
573 
574 #ifdef DEBUG
575 	if (ldebug(sigreturn))
576 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
577 #endif
578 	/*
579 	 * The trampoline code hands us the sigframe.
580 	 * It is unsafe to keep track of it ourselves, in the event that a
581 	 * program jumps out of a signal handler.
582 	 */
583 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
584 		return (EFAULT);
585 
586 	/*
587 	 * Check for security violations.
588 	 */
589 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
590 	eflags = frame.sf_sc.sc_eflags;
591 	/*
592 	 * XXX do allow users to change the privileged flag PSL_RF.  The
593 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
594 	 * sometimes set it there too.  tf_eflags is kept in the signal
595 	 * context during signal handling and there is no other place
596 	 * to remember it, so the PSL_RF bit may be corrupted by the
597 	 * signal handler without us knowing.  Corruption of the PSL_RF
598 	 * bit at worst causes one more or one less debugger trap, so
599 	 * allowing it is fairly harmless.
600 	 */
601 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
602 		return(EINVAL);
603 
604 	/*
605 	 * Don't allow users to load a valid privileged %cs.  Let the
606 	 * hardware check for invalid selectors, excess privilege in
607 	 * other selectors, invalid %eip's and invalid %esp's.
608 	 */
609 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
610 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
611 		ksiginfo_init_trap(&ksi);
612 		ksi.ksi_signo = SIGBUS;
613 		ksi.ksi_code = BUS_OBJERR;
614 		ksi.ksi_trapno = T_PROTFLT;
615 		ksi.ksi_addr = (void *)regs->tf_rip;
616 		trapsignal(td, &ksi);
617 		return(EINVAL);
618 	}
619 
620 	lmask.__bits[0] = frame.sf_sc.sc_mask;
621 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
622 		lmask.__bits[i+1] = frame.sf_extramask[i];
623 	linux_to_bsd_sigset(&lmask, &bmask);
624 	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
625 
626 	/*
627 	 * Restore signal context.
628 	 */
629 	regs->tf_rdi    = frame.sf_sc.sc_edi;
630 	regs->tf_rsi    = frame.sf_sc.sc_esi;
631 	regs->tf_rbp    = frame.sf_sc.sc_ebp;
632 	regs->tf_rbx    = frame.sf_sc.sc_ebx;
633 	regs->tf_rdx    = frame.sf_sc.sc_edx;
634 	regs->tf_rcx    = frame.sf_sc.sc_ecx;
635 	regs->tf_rax    = frame.sf_sc.sc_eax;
636 	regs->tf_rip    = frame.sf_sc.sc_eip;
637 	regs->tf_cs     = frame.sf_sc.sc_cs;
638 	regs->tf_ds     = frame.sf_sc.sc_ds;
639 	regs->tf_es     = frame.sf_sc.sc_es;
640 	regs->tf_fs     = frame.sf_sc.sc_fs;
641 	regs->tf_gs     = frame.sf_sc.sc_gs;
642 	regs->tf_rflags = eflags;
643 	regs->tf_rsp    = frame.sf_sc.sc_esp_at_signal;
644 	regs->tf_ss     = frame.sf_sc.sc_ss;
645 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
646 
647 	return (EJUSTRETURN);
648 }
649 
650 /*
651  * System call to cleanup state after a signal
652  * has been taken.  Reset signal mask and
653  * stack state from context left by rt_sendsig (above).
654  * Return to previous pc and psl as specified by
655  * context left by sendsig. Check carefully to
656  * make sure that the user has not modified the
657  * psl to gain improper privileges or to cause
658  * a machine fault.
659  */
660 int
661 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
662 {
663 	struct l_ucontext uc;
664 	struct l_sigcontext *context;
665 	sigset_t bmask;
666 	l_stack_t *lss;
667 	stack_t ss;
668 	struct trapframe *regs;
669 	int eflags;
670 	ksiginfo_t ksi;
671 
672 	regs = td->td_frame;
673 
674 #ifdef DEBUG
675 	if (ldebug(rt_sigreturn))
676 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
677 #endif
678 	/*
679 	 * The trampoline code hands us the ucontext.
680 	 * It is unsafe to keep track of it ourselves, in the event that a
681 	 * program jumps out of a signal handler.
682 	 */
683 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
684 		return (EFAULT);
685 
686 	context = &uc.uc_mcontext;
687 
688 	/*
689 	 * Check for security violations.
690 	 */
691 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
692 	eflags = context->sc_eflags;
693 	/*
694 	 * XXX do allow users to change the privileged flag PSL_RF.  The
695 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
696 	 * sometimes set it there too.  tf_eflags is kept in the signal
697 	 * context during signal handling and there is no other place
698 	 * to remember it, so the PSL_RF bit may be corrupted by the
699 	 * signal handler without us knowing.  Corruption of the PSL_RF
700 	 * bit at worst causes one more or one less debugger trap, so
701 	 * allowing it is fairly harmless.
702 	 */
703 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
704 		return(EINVAL);
705 
706 	/*
707 	 * Don't allow users to load a valid privileged %cs.  Let the
708 	 * hardware check for invalid selectors, excess privilege in
709 	 * other selectors, invalid %eip's and invalid %esp's.
710 	 */
711 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
712 	if (!CS_SECURE(context->sc_cs)) {
713 		ksiginfo_init_trap(&ksi);
714 		ksi.ksi_signo = SIGBUS;
715 		ksi.ksi_code = BUS_OBJERR;
716 		ksi.ksi_trapno = T_PROTFLT;
717 		ksi.ksi_addr = (void *)regs->tf_rip;
718 		trapsignal(td, &ksi);
719 		return(EINVAL);
720 	}
721 
722 	linux_to_bsd_sigset(&uc.uc_sigmask, &bmask);
723 	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
724 
725 	/*
726 	 * Restore signal context
727 	 */
728 	regs->tf_gs	= context->sc_gs;
729 	regs->tf_fs	= context->sc_fs;
730 	regs->tf_es	= context->sc_es;
731 	regs->tf_ds	= context->sc_ds;
732 	regs->tf_rdi    = context->sc_edi;
733 	regs->tf_rsi    = context->sc_esi;
734 	regs->tf_rbp    = context->sc_ebp;
735 	regs->tf_rbx    = context->sc_ebx;
736 	regs->tf_rdx    = context->sc_edx;
737 	regs->tf_rcx    = context->sc_ecx;
738 	regs->tf_rax    = context->sc_eax;
739 	regs->tf_rip    = context->sc_eip;
740 	regs->tf_cs     = context->sc_cs;
741 	regs->tf_rflags = eflags;
742 	regs->tf_rsp    = context->sc_esp_at_signal;
743 	regs->tf_ss     = context->sc_ss;
744 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
745 
746 	/*
747 	 * call sigaltstack & ignore results..
748 	 */
749 	lss = &uc.uc_stack;
750 	ss.ss_sp = PTRIN(lss->ss_sp);
751 	ss.ss_size = lss->ss_size;
752 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
753 
754 #ifdef DEBUG
755 	if (ldebug(rt_sigreturn))
756 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
757 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
758 #endif
759 	(void)kern_sigaltstack(td, &ss, NULL);
760 
761 	return (EJUSTRETURN);
762 }
763 
764 static int
765 linux32_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
766 {
767 	struct proc *p;
768 	struct trapframe *frame;
769 
770 	p = td->td_proc;
771 	frame = td->td_frame;
772 
773 	sa->args[0] = frame->tf_rbx;
774 	sa->args[1] = frame->tf_rcx;
775 	sa->args[2] = frame->tf_rdx;
776 	sa->args[3] = frame->tf_rsi;
777 	sa->args[4] = frame->tf_rdi;
778 	sa->args[5] = frame->tf_rbp;	/* Unconfirmed */
779 	sa->code = frame->tf_rax;
780 
781 	if (sa->code >= p->p_sysent->sv_size)
782 		sa->callp = &p->p_sysent->sv_table[0];
783 	else
784 		sa->callp = &p->p_sysent->sv_table[sa->code];
785 	sa->narg = sa->callp->sy_narg;
786 
787 	td->td_retval[0] = 0;
788 	td->td_retval[1] = frame->tf_rdx;
789 
790 	return (0);
791 }
792 
793 /*
794  * If a linux binary is exec'ing something, try this image activator
795  * first.  We override standard shell script execution in order to
796  * be able to modify the interpreter path.  We only do this if a linux
797  * binary is doing the exec, so we do not create an EXEC module for it.
798  */
799 static int	exec_linux_imgact_try(struct image_params *iparams);
800 
801 static int
802 exec_linux_imgact_try(struct image_params *imgp)
803 {
804 	const char *head = (const char *)imgp->image_header;
805 	char *rpath;
806 	int error = -1;
807 
808 	/*
809 	* The interpreter for shell scripts run from a linux binary needs
810 	* to be located in /compat/linux if possible in order to recursively
811 	* maintain linux path emulation.
812 	*/
813 	if (((const short *)head)[0] == SHELLMAGIC) {
814 		/*
815 		* Run our normal shell image activator.  If it succeeds attempt
816 		* to use the alternate path for the interpreter.  If an
817 		* alternate * path is found, use our stringspace to store it.
818 		*/
819 		if ((error = exec_shell_imgact(imgp)) == 0) {
820 			linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
821 			    imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0,
822 			    AT_FDCWD);
823 			if (rpath != NULL)
824 				imgp->args->fname_buf =
825 				    imgp->interpreter_name = rpath;
826 		}
827 	}
828 	return (error);
829 }
830 
831 /*
832  * Clear registers on exec
833  * XXX copied from ia32_signal.c.
834  */
835 static void
836 exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack)
837 {
838 	struct trapframe *regs = td->td_frame;
839 	struct pcb *pcb = td->td_pcb;
840 
841 	mtx_lock(&dt_lock);
842 	if (td->td_proc->p_md.md_ldt != NULL)
843 		user_ldt_free(td);
844 	else
845 		mtx_unlock(&dt_lock);
846 
847 	critical_enter();
848 	wrmsr(MSR_FSBASE, 0);
849 	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
850 	pcb->pcb_fsbase = 0;
851 	pcb->pcb_gsbase = 0;
852 	critical_exit();
853 	pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
854 
855 	bzero((char *)regs, sizeof(struct trapframe));
856 	regs->tf_rip = imgp->entry_addr;
857 	regs->tf_rsp = stack;
858 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
859 	regs->tf_gs = _ugssel;
860 	regs->tf_fs = _ufssel;
861 	regs->tf_es = _udatasel;
862 	regs->tf_ds = _udatasel;
863 	regs->tf_ss = _udatasel;
864 	regs->tf_flags = TF_HASSEGS;
865 	regs->tf_cs = _ucode32sel;
866 	regs->tf_rbx = imgp->ps_strings;
867 
868 	fpstate_drop(td);
869 
870 	/* Do full restore on return so that we can change to a different %cs */
871 	set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET);
872 	clear_pcb_flags(pcb, PCB_GS32BIT);
873 	td->td_retval[1] = 0;
874 }
875 
876 /*
877  * XXX copied from ia32_sysvec.c.
878  */
879 static register_t *
880 linux_copyout_strings(struct image_params *imgp)
881 {
882 	int argc, envc;
883 	u_int32_t *vectp;
884 	char *stringp, *destp;
885 	u_int32_t *stack_base;
886 	struct linux32_ps_strings *arginfo;
887 
888 	/*
889 	 * Calculate string base and vector table pointers.
890 	 * Also deal with signal trampoline code for this exec type.
891 	 */
892 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
893 	destp =	(caddr_t)arginfo - linux_szsigcode - SPARE_USRSPACE -
894 	    linux_szplatform - roundup((ARG_MAX - imgp->args->stringspace),
895 	    sizeof(char *));
896 
897 	/*
898 	 * install sigcode
899 	 */
900 	copyout(imgp->proc->p_sysent->sv_sigcode,
901 	    ((caddr_t)arginfo - linux_szsigcode), linux_szsigcode);
902 
903 	/*
904 	 * Install LINUX_PLATFORM
905 	 */
906 	copyout(linux_platform, ((caddr_t)arginfo - linux_szsigcode -
907 	    linux_szplatform), linux_szplatform);
908 
909 	/*
910 	 * If we have a valid auxargs ptr, prepare some room
911 	 * on the stack.
912 	 */
913 	if (imgp->auxargs) {
914 		/*
915 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
916 		 * lower compatibility.
917 		 */
918 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
919 		    (LINUX_AT_COUNT * 2);
920 		/*
921 		 * The '+ 2' is for the null pointers at the end of each of
922 		 * the arg and env vector sets,and imgp->auxarg_size is room
923 		 * for argument of Runtime loader.
924 		 */
925 		vectp = (u_int32_t *) (destp - (imgp->args->argc +
926 		    imgp->args->envc + 2 + imgp->auxarg_size) *
927 		    sizeof(u_int32_t));
928 
929 	} else
930 		/*
931 		 * The '+ 2' is for the null pointers at the end of each of
932 		 * the arg and env vector sets
933 		 */
934 		vectp = (u_int32_t *)(destp - (imgp->args->argc +
935 		    imgp->args->envc + 2) * sizeof(u_int32_t));
936 
937 	/*
938 	 * vectp also becomes our initial stack base
939 	 */
940 	stack_base = vectp;
941 
942 	stringp = imgp->args->begin_argv;
943 	argc = imgp->args->argc;
944 	envc = imgp->args->envc;
945 	/*
946 	 * Copy out strings - arguments and environment.
947 	 */
948 	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
949 
950 	/*
951 	 * Fill in "ps_strings" struct for ps, w, etc.
952 	 */
953 	suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp);
954 	suword32(&arginfo->ps_nargvstr, argc);
955 
956 	/*
957 	 * Fill in argument portion of vector table.
958 	 */
959 	for (; argc > 0; --argc) {
960 		suword32(vectp++, (uint32_t)(intptr_t)destp);
961 		while (*stringp++ != 0)
962 			destp++;
963 		destp++;
964 	}
965 
966 	/* a null vector table pointer separates the argp's from the envp's */
967 	suword32(vectp++, 0);
968 
969 	suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp);
970 	suword32(&arginfo->ps_nenvstr, envc);
971 
972 	/*
973 	 * Fill in environment portion of vector table.
974 	 */
975 	for (; envc > 0; --envc) {
976 		suword32(vectp++, (uint32_t)(intptr_t)destp);
977 		while (*stringp++ != 0)
978 			destp++;
979 		destp++;
980 	}
981 
982 	/* end of vector table is a null pointer */
983 	suword32(vectp, 0);
984 
985 	return ((register_t *)stack_base);
986 }
987 
988 SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
989     "32-bit Linux emulation");
990 
991 static u_long	linux32_maxdsiz = LINUX32_MAXDSIZ;
992 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
993     &linux32_maxdsiz, 0, "");
994 static u_long	linux32_maxssiz = LINUX32_MAXSSIZ;
995 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
996     &linux32_maxssiz, 0, "");
997 static u_long	linux32_maxvmem = LINUX32_MAXVMEM;
998 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
999     &linux32_maxvmem, 0, "");
1000 
1001 static void
1002 linux32_fixlimit(struct rlimit *rl, int which)
1003 {
1004 
1005 	switch (which) {
1006 	case RLIMIT_DATA:
1007 		if (linux32_maxdsiz != 0) {
1008 			if (rl->rlim_cur > linux32_maxdsiz)
1009 				rl->rlim_cur = linux32_maxdsiz;
1010 			if (rl->rlim_max > linux32_maxdsiz)
1011 				rl->rlim_max = linux32_maxdsiz;
1012 		}
1013 		break;
1014 	case RLIMIT_STACK:
1015 		if (linux32_maxssiz != 0) {
1016 			if (rl->rlim_cur > linux32_maxssiz)
1017 				rl->rlim_cur = linux32_maxssiz;
1018 			if (rl->rlim_max > linux32_maxssiz)
1019 				rl->rlim_max = linux32_maxssiz;
1020 		}
1021 		break;
1022 	case RLIMIT_VMEM:
1023 		if (linux32_maxvmem != 0) {
1024 			if (rl->rlim_cur > linux32_maxvmem)
1025 				rl->rlim_cur = linux32_maxvmem;
1026 			if (rl->rlim_max > linux32_maxvmem)
1027 				rl->rlim_max = linux32_maxvmem;
1028 		}
1029 		break;
1030 	}
1031 }
1032 
1033 struct sysentvec elf_linux_sysvec = {
1034 	.sv_size	= LINUX_SYS_MAXSYSCALL,
1035 	.sv_table	= linux_sysent,
1036 	.sv_mask	= 0,
1037 	.sv_sigsize	= LINUX_SIGTBLSZ,
1038 	.sv_sigtbl	= bsd_to_linux_signal,
1039 	.sv_errsize	= ELAST + 1,
1040 	.sv_errtbl	= bsd_to_linux_errno,
1041 	.sv_transtrap	= translate_traps,
1042 	.sv_fixup	= elf_linux_fixup,
1043 	.sv_sendsig	= linux_sendsig,
1044 	.sv_sigcode	= linux_sigcode,
1045 	.sv_szsigcode	= &linux_szsigcode,
1046 	.sv_prepsyscall	= NULL,
1047 	.sv_name	= "Linux ELF32",
1048 	.sv_coredump	= elf32_coredump,
1049 	.sv_imgact_try	= exec_linux_imgact_try,
1050 	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
1051 	.sv_pagesize	= PAGE_SIZE,
1052 	.sv_minuser	= VM_MIN_ADDRESS,
1053 	.sv_maxuser	= LINUX32_USRSTACK,
1054 	.sv_usrstack	= LINUX32_USRSTACK,
1055 	.sv_psstrings	= LINUX32_PS_STRINGS,
1056 	.sv_stackprot	= VM_PROT_ALL,
1057 	.sv_copyout_strings = linux_copyout_strings,
1058 	.sv_setregs	= exec_linux_setregs,
1059 	.sv_fixlimit	= linux32_fixlimit,
1060 	.sv_maxssiz	= &linux32_maxssiz,
1061 	.sv_flags	= SV_ABI_LINUX | SV_ILP32 | SV_IA32,
1062 	.sv_set_syscall_retval = cpu_set_syscall_retval,
1063 	.sv_fetch_syscall_args = linux32_fetch_syscall_args,
1064 	.sv_syscallnames = NULL,
1065 	.sv_schedtail	= linux_schedtail,
1066 };
1067 
1068 static char GNU_ABI_VENDOR[] = "GNU";
1069 static int GNULINUX_ABI_DESC = 0;
1070 
1071 static boolean_t
1072 linux32_trans_osrel(const Elf_Note *note, int32_t *osrel)
1073 {
1074 	const Elf32_Word *desc;
1075 	uintptr_t p;
1076 
1077 	p = (uintptr_t)(note + 1);
1078 	p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
1079 
1080 	desc = (const Elf32_Word *)p;
1081 	if (desc[0] != GNULINUX_ABI_DESC)
1082 		return (FALSE);
1083 
1084 	/*
1085 	 * For linux we encode osrel as follows (see linux_mib.c):
1086 	 * VVVMMMIII (version, major, minor), see linux_mib.c.
1087 	 */
1088 	*osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
1089 
1090 	return (TRUE);
1091 }
1092 
1093 static Elf_Brandnote linux32_brandnote = {
1094 	.hdr.n_namesz	= sizeof(GNU_ABI_VENDOR),
1095 	.hdr.n_descsz	= 16,	/* XXX at least 16 */
1096 	.hdr.n_type	= 1,
1097 	.vendor		= GNU_ABI_VENDOR,
1098 	.flags		= BN_TRANSLATE_OSREL,
1099 	.trans_osrel	= linux32_trans_osrel
1100 };
1101 
1102 static Elf32_Brandinfo linux_brand = {
1103 	.brand		= ELFOSABI_LINUX,
1104 	.machine	= EM_386,
1105 	.compat_3_brand	= "Linux",
1106 	.emul_path	= "/compat/linux",
1107 	.interp_path	= "/lib/ld-linux.so.1",
1108 	.sysvec		= &elf_linux_sysvec,
1109 	.interp_newpath	= NULL,
1110 	.brand_note	= &linux32_brandnote,
1111 	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1112 };
1113 
1114 static Elf32_Brandinfo linux_glibc2brand = {
1115 	.brand		= ELFOSABI_LINUX,
1116 	.machine	= EM_386,
1117 	.compat_3_brand	= "Linux",
1118 	.emul_path	= "/compat/linux",
1119 	.interp_path	= "/lib/ld-linux.so.2",
1120 	.sysvec		= &elf_linux_sysvec,
1121 	.interp_newpath	= NULL,
1122 	.brand_note	= &linux32_brandnote,
1123 	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1124 };
1125 
1126 Elf32_Brandinfo *linux_brandlist[] = {
1127 	&linux_brand,
1128 	&linux_glibc2brand,
1129 	NULL
1130 };
1131 
1132 static int
1133 linux_elf_modevent(module_t mod, int type, void *data)
1134 {
1135 	Elf32_Brandinfo **brandinfo;
1136 	int error;
1137 	struct linux_ioctl_handler **lihp;
1138 	struct linux_device_handler **ldhp;
1139 
1140 	error = 0;
1141 
1142 	switch(type) {
1143 	case MOD_LOAD:
1144 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1145 		     ++brandinfo)
1146 			if (elf32_insert_brand_entry(*brandinfo) < 0)
1147 				error = EINVAL;
1148 		if (error == 0) {
1149 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1150 				linux_ioctl_register_handler(*lihp);
1151 			SET_FOREACH(ldhp, linux_device_handler_set)
1152 				linux_device_register_handler(*ldhp);
1153 			mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF);
1154 			sx_init(&emul_shared_lock, "emuldata->shared lock");
1155 			LIST_INIT(&futex_list);
1156 			mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
1157 			linux_exit_tag = EVENTHANDLER_REGISTER(process_exit,
1158 			    linux_proc_exit, NULL, 1000);
1159 			linux_exec_tag = EVENTHANDLER_REGISTER(process_exec,
1160 			    linux_proc_exec, NULL, 1000);
1161 			linux_szplatform = roundup(strlen(linux_platform) + 1,
1162 			    sizeof(char *));
1163 			linux_osd_jail_register();
1164 			stclohz = (stathz ? stathz : hz);
1165 			if (bootverbose)
1166 				printf("Linux ELF exec handler installed\n");
1167 		} else
1168 			printf("cannot insert Linux ELF brand handler\n");
1169 		break;
1170 	case MOD_UNLOAD:
1171 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1172 		     ++brandinfo)
1173 			if (elf32_brand_inuse(*brandinfo))
1174 				error = EBUSY;
1175 		if (error == 0) {
1176 			for (brandinfo = &linux_brandlist[0];
1177 			     *brandinfo != NULL; ++brandinfo)
1178 				if (elf32_remove_brand_entry(*brandinfo) < 0)
1179 					error = EINVAL;
1180 		}
1181 		if (error == 0) {
1182 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1183 				linux_ioctl_unregister_handler(*lihp);
1184 			SET_FOREACH(ldhp, linux_device_handler_set)
1185 				linux_device_unregister_handler(*ldhp);
1186 			mtx_destroy(&emul_lock);
1187 			sx_destroy(&emul_shared_lock);
1188 			mtx_destroy(&futex_mtx);
1189 			EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
1190 			EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
1191 			linux_osd_jail_deregister();
1192 			if (bootverbose)
1193 				printf("Linux ELF exec handler removed\n");
1194 		} else
1195 			printf("Could not deinstall ELF interpreter entry\n");
1196 		break;
1197 	default:
1198 		return EOPNOTSUPP;
1199 	}
1200 	return error;
1201 }
1202 
1203 static moduledata_t linux_elf_mod = {
1204 	"linuxelf",
1205 	linux_elf_modevent,
1206 	0
1207 };
1208 
1209 DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1210