xref: /freebsd/sys/amd64/linux32/linux32_sysvec.c (revision a3cf0ef5a295c885c895fabfd56470c0d1db322d)
1 /*-
2  * Copyright (c) 2004 Tim J. Robbins
3  * Copyright (c) 2003 Peter Wemm
4  * Copyright (c) 2002 Doug Rabson
5  * Copyright (c) 1998-1999 Andrew Gallatin
6  * Copyright (c) 1994-1996 Søren Schmidt
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer
14  *    in this position and unchanged.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. The name of the author may not be used to endorse or promote products
19  *    derived from this software without specific prior written permission
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 #include "opt_compat.h"
36 
37 #ifndef COMPAT_FREEBSD32
38 #error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!"
39 #endif
40 
41 #define	__ELF_WORD_SIZE	32
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/exec.h>
46 #include <sys/fcntl.h>
47 #include <sys/imgact.h>
48 #include <sys/imgact_elf.h>
49 #include <sys/kernel.h>
50 #include <sys/lock.h>
51 #include <sys/malloc.h>
52 #include <sys/module.h>
53 #include <sys/mutex.h>
54 #include <sys/proc.h>
55 #include <sys/resourcevar.h>
56 #include <sys/signalvar.h>
57 #include <sys/sysctl.h>
58 #include <sys/syscallsubr.h>
59 #include <sys/sysent.h>
60 #include <sys/sysproto.h>
61 #include <sys/vnode.h>
62 #include <sys/eventhandler.h>
63 
64 #include <vm/vm.h>
65 #include <vm/pmap.h>
66 #include <vm/vm_extern.h>
67 #include <vm/vm_map.h>
68 #include <vm/vm_object.h>
69 #include <vm/vm_page.h>
70 #include <vm/vm_param.h>
71 
72 #include <machine/cpu.h>
73 #include <machine/md_var.h>
74 #include <machine/pcb.h>
75 #include <machine/specialreg.h>
76 
77 #include <amd64/linux32/linux.h>
78 #include <amd64/linux32/linux32_proto.h>
79 #include <compat/linux/linux_futex.h>
80 #include <compat/linux/linux_emul.h>
81 #include <compat/linux/linux_mib.h>
82 #include <compat/linux/linux_misc.h>
83 #include <compat/linux/linux_signal.h>
84 #include <compat/linux/linux_util.h>
85 
86 MODULE_VERSION(linux, 1);
87 
88 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
89 
90 #define	AUXARGS_ENTRY_32(pos, id, val)	\
91 	do {				\
92 		suword32(pos++, id);	\
93 		suword32(pos++, val);	\
94 	} while (0)
95 
96 #if BYTE_ORDER == LITTLE_ENDIAN
97 #define SHELLMAGIC      0x2123 /* #! */
98 #else
99 #define SHELLMAGIC      0x2321
100 #endif
101 
102 /*
103  * Allow the sendsig functions to use the ldebug() facility
104  * even though they are not syscalls themselves. Map them
105  * to syscall 0. This is slightly less bogus than using
106  * ldebug(sigreturn).
107  */
108 #define	LINUX_SYS_linux_rt_sendsig	0
109 #define	LINUX_SYS_linux_sendsig		0
110 
111 const char *linux_platform = "i686";
112 static int linux_szplatform;
113 extern char linux_sigcode[];
114 extern int linux_szsigcode;
115 
116 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
117 
118 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
119 SET_DECLARE(linux_device_handler_set, struct linux_device_handler);
120 
121 static int	elf_linux_fixup(register_t **stack_base,
122 		    struct image_params *iparams);
123 static register_t *linux_copyout_strings(struct image_params *imgp);
124 static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
125 static void	exec_linux_setregs(struct thread *td,
126 				   struct image_params *imgp, u_long stack);
127 static void	linux32_fixlimit(struct rlimit *rl, int which);
128 static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel);
129 
130 static eventhandler_tag linux_exit_tag;
131 static eventhandler_tag linux_schedtail_tag;
132 static eventhandler_tag linux_exec_tag;
133 
134 /*
135  * Linux syscalls return negative errno's, we do positive and map them
136  * Reference:
137  *   FreeBSD: src/sys/sys/errno.h
138  *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
139  *            linux-2.6.17.8/include/asm-generic/errno.h
140  */
141 static int bsd_to_linux_errno[ELAST + 1] = {
142 	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
143 	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
144 	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
145 	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
146 	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
147 	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
148 	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
149 	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
150 	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,
151 	 -72, -67, -71
152 };
153 
154 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
155 	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
156 	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
157 	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
158 	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
159 	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
160 	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
161 	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
162 	0, LINUX_SIGUSR1, LINUX_SIGUSR2
163 };
164 
165 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
166 	SIGHUP, SIGINT, SIGQUIT, SIGILL,
167 	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
168 	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
169 	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
170 	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
171 	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
172 	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
173 	SIGIO, SIGURG, SIGSYS
174 };
175 
/* Linux trap number reported when a FreeBSD trap code has no equivalent. */
#define LINUX_T_UNKNOWN  255

/*
 * FreeBSD trap code -> Linux trap number.  The table is indexed by the
 * FreeBSD T_* value written beside each entry; it is read-only, hence
 * const.
 */
static const int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/* 0 */
	6,			/* 1  T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/* 2 */
	3,			/* 3  T_BPTFLT */
	LINUX_T_UNKNOWN,	/* 4 */
	LINUX_T_UNKNOWN,	/* 5 */
	16,			/* 6  T_ARITHTRAP */
	254,			/* 7  T_ASTFLT */
	LINUX_T_UNKNOWN,	/* 8 */
	13,			/* 9  T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};

/*
 * Translate a FreeBSD trap code into the Linux trap number.
 *
 * code - FreeBSD trap code; any int is accepted.
 *
 * Returns the table entry for 'code', or LINUX_T_UNKNOWN when 'code' is
 * negative or beyond the end of the table.  This is a function rather
 * than a macro so that the argument is evaluated exactly once and the
 * range check does not depend on an implicit signed-to-unsigned
 * conversion.
 */
static inline int
bsd_to_linux_trapcode(int code)
{
	const size_t ntraps = sizeof(_bsd_to_linux_trapcode) /
	    sizeof(_bsd_to_linux_trapcode[0]);

	if (code < 0 || (size_t)code >= ntraps)
		return (LINUX_T_UNKNOWN);
	return (_bsd_to_linux_trapcode[code]);
}
214 
/*
 * 32-bit layout of the ps_strings block that this file places at
 * LINUX32_PS_STRINGS.  The two vector addresses are stored as 32-bit
 * values rather than native pointers.
 */
struct linux32_ps_strings {
	/* first of 0 or more argument strings */
	uint32_t	ps_argvstr;
	/* the number of argument strings */
	unsigned int	ps_nargvstr;
	/* first of 0 or more environment strings */
	uint32_t	ps_envstr;
	/* the number of environment strings */
	unsigned int	ps_nenvstr;
};
221 
222 /*
223  * If FreeBSD & Linux have a difference of opinion about what a trap
224  * means, deal with it here.
225  *
226  * MPSAFE
227  */
228 static int
229 translate_traps(int signal, int trap_code)
230 {
231 	if (signal != SIGBUS)
232 		return signal;
233 	switch (trap_code) {
234 	case T_PROTFLT:
235 	case T_TSSFLT:
236 	case T_DOUBLEFLT:
237 	case T_PAGEFLT:
238 		return SIGSEGV;
239 	default:
240 		return signal;
241 	}
242 }
243 
244 static int
245 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
246 {
247 	Elf32_Auxargs *args;
248 	Elf32_Addr *base;
249 	Elf32_Addr *pos, *uplatform;
250 	struct linux32_ps_strings *arginfo;
251 
252 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
253 	uplatform = (Elf32_Addr *)((caddr_t)arginfo - linux_szsigcode -
254 	    linux_szplatform);
255 
256 	KASSERT(curthread->td_proc == imgp->proc,
257 	    ("unsafe elf_linux_fixup(), should be curproc"));
258 	base = (Elf32_Addr *)*stack_base;
259 	args = (Elf32_Auxargs *)imgp->auxargs;
260 	pos = base + (imgp->args->argc + imgp->args->envc + 2);
261 
262 	AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature);
263 
264 	/*
265 	 * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0,
266 	 * as it has appeared in the 2.4.0-rc7 first time.
267 	 * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK),
268 	 * glibc falls back to the hard-coded CLK_TCK value when aux entry
269 	 * is not present.
270 	 * Also see linux_times() implementation.
271 	 */
272 	if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
273 		AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz);
274 	AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
275 	AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
276 	AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
277 	AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
278 	AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
279 	AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
280 	AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
281 	AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, 0);
282 	AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
283 	AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
284 	AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
285 	AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
286 	AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(uplatform));
287 	if (args->execfd != -1)
288 		AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
289 	AUXARGS_ENTRY_32(pos, AT_NULL, 0);
290 
291 	free(imgp->auxargs, M_TEMP);
292 	imgp->auxargs = NULL;
293 
294 	base--;
295 	suword32(base, (uint32_t)imgp->args->argc);
296 	*stack_base = (register_t *)base;
297 	return 0;
298 }
299 
300 extern unsigned long linux_sznonrtsigcode;
301 
302 static void
303 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
304 {
305 	struct thread *td = curthread;
306 	struct proc *p = td->td_proc;
307 	struct sigacts *psp;
308 	struct trapframe *regs;
309 	struct l_rt_sigframe *fp, frame;
310 	int oonstack;
311 	int sig;
312 	int code;
313 
314 	sig = ksi->ksi_signo;
315 	code = ksi->ksi_code;
316 	PROC_LOCK_ASSERT(p, MA_OWNED);
317 	psp = p->p_sigacts;
318 	mtx_assert(&psp->ps_mtx, MA_OWNED);
319 	regs = td->td_frame;
320 	oonstack = sigonstack(regs->tf_rsp);
321 
322 #ifdef DEBUG
323 	if (ldebug(rt_sendsig))
324 		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
325 		    catcher, sig, (void*)mask, code);
326 #endif
327 	/*
328 	 * Allocate space for the signal handler context.
329 	 */
330 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
331 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
332 		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
333 		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
334 	} else
335 		fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
336 	mtx_unlock(&psp->ps_mtx);
337 
338 	/*
339 	 * Build the argument list for the signal handler.
340 	 */
341 	if (p->p_sysent->sv_sigtbl)
342 		if (sig <= p->p_sysent->sv_sigsize)
343 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
344 
345 	bzero(&frame, sizeof(frame));
346 
347 	frame.sf_handler = PTROUT(catcher);
348 	frame.sf_sig = sig;
349 	frame.sf_siginfo = PTROUT(&fp->sf_si);
350 	frame.sf_ucontext = PTROUT(&fp->sf_sc);
351 
352 	/* Fill in POSIX parts */
353 	ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);
354 
355 	/*
356 	 * Build the signal context to be used by sigreturn.
357 	 */
358 	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
359 	frame.sf_sc.uc_link = 0;		/* XXX ??? */
360 
361 	frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
362 	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
363 	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
364 	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
365 	PROC_UNLOCK(p);
366 
367 	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
368 
369 	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
370 	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_rdi;
371 	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_rsi;
372 	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_rbp;
373 	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_rbx;
374 	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_rdx;
375 	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_rcx;
376 	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_rax;
377 	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_rip;
378 	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
379 	frame.sf_sc.uc_mcontext.sc_gs     = regs->tf_gs;
380 	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
381 	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
382 	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
383 	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
384 	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
385 	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
386 	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
387 	frame.sf_sc.uc_mcontext.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
388 	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
389 
390 #ifdef DEBUG
391 	if (ldebug(rt_sendsig))
392 		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
393 		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
394 		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
395 #endif
396 
397 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
398 		/*
399 		 * Process has trashed its stack; give it an illegal
400 		 * instruction to halt it in its tracks.
401 		 */
402 #ifdef DEBUG
403 		if (ldebug(rt_sendsig))
404 			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
405 			    fp, oonstack);
406 #endif
407 		PROC_LOCK(p);
408 		sigexit(td, SIGILL);
409 	}
410 
411 	/*
412 	 * Build context to run handler in.
413 	 */
414 	regs->tf_rsp = PTROUT(fp);
415 	regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
416 	    linux_sznonrtsigcode;
417 	regs->tf_rflags &= ~(PSL_T | PSL_D);
418 	regs->tf_cs = _ucode32sel;
419 	regs->tf_ss = _udatasel;
420 	regs->tf_ds = _udatasel;
421 	regs->tf_es = _udatasel;
422 	regs->tf_fs = _ufssel;
423 	regs->tf_gs = _ugssel;
424 	regs->tf_flags = TF_HASSEGS;
425 	td->td_pcb->pcb_full_iret = 1;
426 	PROC_LOCK(p);
427 	mtx_lock(&psp->ps_mtx);
428 }
429 
430 
431 /*
432  * Send an interrupt to process.
433  *
434  * Stack is set up to allow sigcode stored
435  * in u. to call routine, followed by kcall
436  * to sigreturn routine below.  After sigreturn
437  * resets the signal mask, the stack, and the
438  * frame pointer, it returns to the user
439  * specified pc, psl.
440  */
441 static void
442 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
443 {
444 	struct thread *td = curthread;
445 	struct proc *p = td->td_proc;
446 	struct sigacts *psp;
447 	struct trapframe *regs;
448 	struct l_sigframe *fp, frame;
449 	l_sigset_t lmask;
450 	int oonstack, i;
451 	int sig, code;
452 
453 	sig = ksi->ksi_signo;
454 	code = ksi->ksi_code;
455 	PROC_LOCK_ASSERT(p, MA_OWNED);
456 	psp = p->p_sigacts;
457 	mtx_assert(&psp->ps_mtx, MA_OWNED);
458 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
459 		/* Signal handler installed with SA_SIGINFO. */
460 		linux_rt_sendsig(catcher, ksi, mask);
461 		return;
462 	}
463 
464 	regs = td->td_frame;
465 	oonstack = sigonstack(regs->tf_rsp);
466 
467 #ifdef DEBUG
468 	if (ldebug(sendsig))
469 		printf(ARGS(sendsig, "%p, %d, %p, %u"),
470 		    catcher, sig, (void*)mask, code);
471 #endif
472 
473 	/*
474 	 * Allocate space for the signal handler context.
475 	 */
476 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
477 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
478 		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
479 		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
480 	} else
481 		fp = (struct l_sigframe *)regs->tf_rsp - 1;
482 	mtx_unlock(&psp->ps_mtx);
483 	PROC_UNLOCK(p);
484 
485 	/*
486 	 * Build the argument list for the signal handler.
487 	 */
488 	if (p->p_sysent->sv_sigtbl)
489 		if (sig <= p->p_sysent->sv_sigsize)
490 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
491 
492 	bzero(&frame, sizeof(frame));
493 
494 	frame.sf_handler = PTROUT(catcher);
495 	frame.sf_sig = sig;
496 
497 	bsd_to_linux_sigset(mask, &lmask);
498 
499 	/*
500 	 * Build the signal context to be used by sigreturn.
501 	 */
502 	frame.sf_sc.sc_mask   = lmask.__bits[0];
503 	frame.sf_sc.sc_gs     = regs->tf_gs;
504 	frame.sf_sc.sc_fs     = regs->tf_fs;
505 	frame.sf_sc.sc_es     = regs->tf_es;
506 	frame.sf_sc.sc_ds     = regs->tf_ds;
507 	frame.sf_sc.sc_edi    = regs->tf_rdi;
508 	frame.sf_sc.sc_esi    = regs->tf_rsi;
509 	frame.sf_sc.sc_ebp    = regs->tf_rbp;
510 	frame.sf_sc.sc_ebx    = regs->tf_rbx;
511 	frame.sf_sc.sc_edx    = regs->tf_rdx;
512 	frame.sf_sc.sc_ecx    = regs->tf_rcx;
513 	frame.sf_sc.sc_eax    = regs->tf_rax;
514 	frame.sf_sc.sc_eip    = regs->tf_rip;
515 	frame.sf_sc.sc_cs     = regs->tf_cs;
516 	frame.sf_sc.sc_eflags = regs->tf_rflags;
517 	frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
518 	frame.sf_sc.sc_ss     = regs->tf_ss;
519 	frame.sf_sc.sc_err    = regs->tf_err;
520 	frame.sf_sc.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
521 	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
522 
523 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
524 		frame.sf_extramask[i] = lmask.__bits[i+1];
525 
526 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
527 		/*
528 		 * Process has trashed its stack; give it an illegal
529 		 * instruction to halt it in its tracks.
530 		 */
531 		PROC_LOCK(p);
532 		sigexit(td, SIGILL);
533 	}
534 
535 	/*
536 	 * Build context to run handler in.
537 	 */
538 	regs->tf_rsp = PTROUT(fp);
539 	regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode);
540 	regs->tf_rflags &= ~(PSL_T | PSL_D);
541 	regs->tf_cs = _ucode32sel;
542 	regs->tf_ss = _udatasel;
543 	regs->tf_ds = _udatasel;
544 	regs->tf_es = _udatasel;
545 	regs->tf_fs = _ufssel;
546 	regs->tf_gs = _ugssel;
547 	regs->tf_flags = TF_HASSEGS;
548 	td->td_pcb->pcb_full_iret = 1;
549 	PROC_LOCK(p);
550 	mtx_lock(&psp->ps_mtx);
551 }
552 
553 /*
554  * System call to cleanup state after a signal
555  * has been taken.  Reset signal mask and
556  * stack state from context left by sendsig (above).
557  * Return to previous pc and psl as specified by
558  * context left by sendsig. Check carefully to
559  * make sure that the user has not modified the
560  * psl to gain improper privileges or to cause
561  * a machine fault.
562  */
563 int
564 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
565 {
566 	struct l_sigframe frame;
567 	struct trapframe *regs;
568 	sigset_t bmask;
569 	l_sigset_t lmask;
570 	int eflags, i;
571 	ksiginfo_t ksi;
572 
573 	regs = td->td_frame;
574 
575 #ifdef DEBUG
576 	if (ldebug(sigreturn))
577 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
578 #endif
579 	/*
580 	 * The trampoline code hands us the sigframe.
581 	 * It is unsafe to keep track of it ourselves, in the event that a
582 	 * program jumps out of a signal handler.
583 	 */
584 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
585 		return (EFAULT);
586 
587 	/*
588 	 * Check for security violations.
589 	 */
590 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
591 	eflags = frame.sf_sc.sc_eflags;
592 	/*
593 	 * XXX do allow users to change the privileged flag PSL_RF.  The
594 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
595 	 * sometimes set it there too.  tf_eflags is kept in the signal
596 	 * context during signal handling and there is no other place
597 	 * to remember it, so the PSL_RF bit may be corrupted by the
598 	 * signal handler without us knowing.  Corruption of the PSL_RF
599 	 * bit at worst causes one more or one less debugger trap, so
600 	 * allowing it is fairly harmless.
601 	 */
602 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
603 		return(EINVAL);
604 
605 	/*
606 	 * Don't allow users to load a valid privileged %cs.  Let the
607 	 * hardware check for invalid selectors, excess privilege in
608 	 * other selectors, invalid %eip's and invalid %esp's.
609 	 */
610 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
611 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
612 		ksiginfo_init_trap(&ksi);
613 		ksi.ksi_signo = SIGBUS;
614 		ksi.ksi_code = BUS_OBJERR;
615 		ksi.ksi_trapno = T_PROTFLT;
616 		ksi.ksi_addr = (void *)regs->tf_rip;
617 		trapsignal(td, &ksi);
618 		return(EINVAL);
619 	}
620 
621 	lmask.__bits[0] = frame.sf_sc.sc_mask;
622 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
623 		lmask.__bits[i+1] = frame.sf_extramask[i];
624 	linux_to_bsd_sigset(&lmask, &bmask);
625 	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
626 
627 	/*
628 	 * Restore signal context.
629 	 */
630 	regs->tf_rdi    = frame.sf_sc.sc_edi;
631 	regs->tf_rsi    = frame.sf_sc.sc_esi;
632 	regs->tf_rbp    = frame.sf_sc.sc_ebp;
633 	regs->tf_rbx    = frame.sf_sc.sc_ebx;
634 	regs->tf_rdx    = frame.sf_sc.sc_edx;
635 	regs->tf_rcx    = frame.sf_sc.sc_ecx;
636 	regs->tf_rax    = frame.sf_sc.sc_eax;
637 	regs->tf_rip    = frame.sf_sc.sc_eip;
638 	regs->tf_cs     = frame.sf_sc.sc_cs;
639 	regs->tf_ds     = frame.sf_sc.sc_ds;
640 	regs->tf_es     = frame.sf_sc.sc_es;
641 	regs->tf_fs     = frame.sf_sc.sc_fs;
642 	regs->tf_gs     = frame.sf_sc.sc_gs;
643 	regs->tf_rflags = eflags;
644 	regs->tf_rsp    = frame.sf_sc.sc_esp_at_signal;
645 	regs->tf_ss     = frame.sf_sc.sc_ss;
646 	td->td_pcb->pcb_full_iret = 1;
647 
648 	return (EJUSTRETURN);
649 }
650 
651 /*
652  * System call to cleanup state after a signal
653  * has been taken.  Reset signal mask and
654  * stack state from context left by rt_sendsig (above).
655  * Return to previous pc and psl as specified by
656  * context left by sendsig. Check carefully to
657  * make sure that the user has not modified the
658  * psl to gain improper privileges or to cause
659  * a machine fault.
660  */
661 int
662 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
663 {
664 	struct l_ucontext uc;
665 	struct l_sigcontext *context;
666 	sigset_t bmask;
667 	l_stack_t *lss;
668 	stack_t ss;
669 	struct trapframe *regs;
670 	int eflags;
671 	ksiginfo_t ksi;
672 
673 	regs = td->td_frame;
674 
675 #ifdef DEBUG
676 	if (ldebug(rt_sigreturn))
677 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
678 #endif
679 	/*
680 	 * The trampoline code hands us the ucontext.
681 	 * It is unsafe to keep track of it ourselves, in the event that a
682 	 * program jumps out of a signal handler.
683 	 */
684 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
685 		return (EFAULT);
686 
687 	context = &uc.uc_mcontext;
688 
689 	/*
690 	 * Check for security violations.
691 	 */
692 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
693 	eflags = context->sc_eflags;
694 	/*
695 	 * XXX do allow users to change the privileged flag PSL_RF.  The
696 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
697 	 * sometimes set it there too.  tf_eflags is kept in the signal
698 	 * context during signal handling and there is no other place
699 	 * to remember it, so the PSL_RF bit may be corrupted by the
700 	 * signal handler without us knowing.  Corruption of the PSL_RF
701 	 * bit at worst causes one more or one less debugger trap, so
702 	 * allowing it is fairly harmless.
703 	 */
704 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
705 		return(EINVAL);
706 
707 	/*
708 	 * Don't allow users to load a valid privileged %cs.  Let the
709 	 * hardware check for invalid selectors, excess privilege in
710 	 * other selectors, invalid %eip's and invalid %esp's.
711 	 */
712 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
713 	if (!CS_SECURE(context->sc_cs)) {
714 		ksiginfo_init_trap(&ksi);
715 		ksi.ksi_signo = SIGBUS;
716 		ksi.ksi_code = BUS_OBJERR;
717 		ksi.ksi_trapno = T_PROTFLT;
718 		ksi.ksi_addr = (void *)regs->tf_rip;
719 		trapsignal(td, &ksi);
720 		return(EINVAL);
721 	}
722 
723 	linux_to_bsd_sigset(&uc.uc_sigmask, &bmask);
724 	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
725 
726 	/*
727 	 * Restore signal context
728 	 */
729 	regs->tf_gs	= context->sc_gs;
730 	regs->tf_fs	= context->sc_fs;
731 	regs->tf_es	= context->sc_es;
732 	regs->tf_ds	= context->sc_ds;
733 	regs->tf_rdi    = context->sc_edi;
734 	regs->tf_rsi    = context->sc_esi;
735 	regs->tf_rbp    = context->sc_ebp;
736 	regs->tf_rbx    = context->sc_ebx;
737 	regs->tf_rdx    = context->sc_edx;
738 	regs->tf_rcx    = context->sc_ecx;
739 	regs->tf_rax    = context->sc_eax;
740 	regs->tf_rip    = context->sc_eip;
741 	regs->tf_cs     = context->sc_cs;
742 	regs->tf_rflags = eflags;
743 	regs->tf_rsp    = context->sc_esp_at_signal;
744 	regs->tf_ss     = context->sc_ss;
745 	td->td_pcb->pcb_full_iret = 1;
746 
747 	/*
748 	 * call sigaltstack & ignore results..
749 	 */
750 	lss = &uc.uc_stack;
751 	ss.ss_sp = PTRIN(lss->ss_sp);
752 	ss.ss_size = lss->ss_size;
753 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
754 
755 #ifdef DEBUG
756 	if (ldebug(rt_sigreturn))
757 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
758 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
759 #endif
760 	(void)kern_sigaltstack(td, &ss, NULL);
761 
762 	return (EJUSTRETURN);
763 }
764 
765 static int
766 linux32_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
767 {
768 	struct proc *p;
769 	struct trapframe *frame;
770 
771 	p = td->td_proc;
772 	frame = td->td_frame;
773 
774 	sa->args[0] = frame->tf_rbx;
775 	sa->args[1] = frame->tf_rcx;
776 	sa->args[2] = frame->tf_rdx;
777 	sa->args[3] = frame->tf_rsi;
778 	sa->args[4] = frame->tf_rdi;
779 	sa->args[5] = frame->tf_rbp;	/* Unconfirmed */
780 	sa->code = frame->tf_rax;
781 
782 	if (sa->code >= p->p_sysent->sv_size)
783 		sa->callp = &p->p_sysent->sv_table[0];
784 	else
785 		sa->callp = &p->p_sysent->sv_table[sa->code];
786 	sa->narg = sa->callp->sy_narg;
787 
788 	td->td_retval[0] = 0;
789 	td->td_retval[1] = frame->tf_rdx;
790 
791 	return (0);
792 }
793 
794 /*
795  * If a linux binary is exec'ing something, try this image activator
796  * first.  We override standard shell script execution in order to
797  * be able to modify the interpreter path.  We only do this if a linux
798  * binary is doing the exec, so we do not create an EXEC module for it.
799  */
800 static int	exec_linux_imgact_try(struct image_params *iparams);
801 
802 static int
803 exec_linux_imgact_try(struct image_params *imgp)
804 {
805 	const char *head = (const char *)imgp->image_header;
806 	char *rpath;
807 	int error = -1;
808 
809 	/*
810 	* The interpreter for shell scripts run from a linux binary needs
811 	* to be located in /compat/linux if possible in order to recursively
812 	* maintain linux path emulation.
813 	*/
814 	if (((const short *)head)[0] == SHELLMAGIC) {
815 		/*
816 		* Run our normal shell image activator.  If it succeeds attempt
817 		* to use the alternate path for the interpreter.  If an
818 		* alternate * path is found, use our stringspace to store it.
819 		*/
820 		if ((error = exec_shell_imgact(imgp)) == 0) {
821 			linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
822 			    imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0,
823 			    AT_FDCWD);
824 			if (rpath != NULL)
825 				imgp->args->fname_buf =
826 				    imgp->interpreter_name = rpath;
827 		}
828 	}
829 	return (error);
830 }
831 
832 /*
833  * Clear registers on exec
834  * XXX copied from ia32_signal.c.
835  */
836 static void
837 exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack)
838 {
839 	struct trapframe *regs = td->td_frame;
840 	struct pcb *pcb = td->td_pcb;
841 
842 	mtx_lock(&dt_lock);
843 	if (td->td_proc->p_md.md_ldt != NULL)
844 		user_ldt_free(td);
845 	else
846 		mtx_unlock(&dt_lock);
847 
848 	critical_enter();
849 	wrmsr(MSR_FSBASE, 0);
850 	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
851 	pcb->pcb_fsbase = 0;
852 	pcb->pcb_gsbase = 0;
853 	critical_exit();
854 	pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
855 
856 	bzero((char *)regs, sizeof(struct trapframe));
857 	regs->tf_rip = imgp->entry_addr;
858 	regs->tf_rsp = stack;
859 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
860 	regs->tf_gs = _ugssel;
861 	regs->tf_fs = _ufssel;
862 	regs->tf_es = _udatasel;
863 	regs->tf_ds = _udatasel;
864 	regs->tf_ss = _udatasel;
865 	regs->tf_flags = TF_HASSEGS;
866 	regs->tf_cs = _ucode32sel;
867 	regs->tf_rbx = imgp->ps_strings;
868 	td->td_pcb->pcb_full_iret = 1;
869 	load_cr0(rcr0() | CR0_MP | CR0_TS);
870 	fpstate_drop(td);
871 
872 	/* Return via doreti so that we can change to a different %cs */
873 	pcb->pcb_flags |= PCB_FULLCTX | PCB_32BIT;
874 	pcb->pcb_flags &= ~PCB_GS32BIT;
875 	td->td_retval[1] = 0;
876 }
877 
878 /*
879  * XXX copied from ia32_sysvec.c.
880  */
881 static register_t *
882 linux_copyout_strings(struct image_params *imgp)
883 {
884 	int argc, envc;
885 	u_int32_t *vectp;
886 	char *stringp, *destp;
887 	u_int32_t *stack_base;
888 	struct linux32_ps_strings *arginfo;
889 
890 	/*
891 	 * Calculate string base and vector table pointers.
892 	 * Also deal with signal trampoline code for this exec type.
893 	 */
894 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
895 	destp =	(caddr_t)arginfo - linux_szsigcode - SPARE_USRSPACE -
896 	    linux_szplatform - roundup((ARG_MAX - imgp->args->stringspace),
897 	    sizeof(char *));
898 
899 	/*
900 	 * install sigcode
901 	 */
902 	copyout(imgp->proc->p_sysent->sv_sigcode,
903 	    ((caddr_t)arginfo - linux_szsigcode), linux_szsigcode);
904 
905 	/*
906 	 * Install LINUX_PLATFORM
907 	 */
908 	copyout(linux_platform, ((caddr_t)arginfo - linux_szsigcode -
909 	    linux_szplatform), linux_szplatform);
910 
911 	/*
912 	 * If we have a valid auxargs ptr, prepare some room
913 	 * on the stack.
914 	 */
915 	if (imgp->auxargs) {
916 		/*
917 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
918 		 * lower compatibility.
919 		 */
920 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
921 		    (LINUX_AT_COUNT * 2);
922 		/*
923 		 * The '+ 2' is for the null pointers at the end of each of
924 		 * the arg and env vector sets,and imgp->auxarg_size is room
925 		 * for argument of Runtime loader.
926 		 */
927 		vectp = (u_int32_t *) (destp - (imgp->args->argc +
928 		    imgp->args->envc + 2 + imgp->auxarg_size) *
929 		    sizeof(u_int32_t));
930 
931 	} else
932 		/*
933 		 * The '+ 2' is for the null pointers at the end of each of
934 		 * the arg and env vector sets
935 		 */
936 		vectp = (u_int32_t *)(destp - (imgp->args->argc +
937 		    imgp->args->envc + 2) * sizeof(u_int32_t));
938 
939 	/*
940 	 * vectp also becomes our initial stack base
941 	 */
942 	stack_base = vectp;
943 
944 	stringp = imgp->args->begin_argv;
945 	argc = imgp->args->argc;
946 	envc = imgp->args->envc;
947 	/*
948 	 * Copy out strings - arguments and environment.
949 	 */
950 	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
951 
952 	/*
953 	 * Fill in "ps_strings" struct for ps, w, etc.
954 	 */
955 	suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp);
956 	suword32(&arginfo->ps_nargvstr, argc);
957 
958 	/*
959 	 * Fill in argument portion of vector table.
960 	 */
961 	for (; argc > 0; --argc) {
962 		suword32(vectp++, (uint32_t)(intptr_t)destp);
963 		while (*stringp++ != 0)
964 			destp++;
965 		destp++;
966 	}
967 
968 	/* a null vector table pointer separates the argp's from the envp's */
969 	suword32(vectp++, 0);
970 
971 	suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp);
972 	suword32(&arginfo->ps_nenvstr, envc);
973 
974 	/*
975 	 * Fill in environment portion of vector table.
976 	 */
977 	for (; envc > 0; --envc) {
978 		suword32(vectp++, (uint32_t)(intptr_t)destp);
979 		while (*stringp++ != 0)
980 			destp++;
981 		destp++;
982 	}
983 
984 	/* end of vector table is a null pointer */
985 	suword32(vectp, 0);
986 
987 	return ((register_t *)stack_base);
988 }
989 
990 SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
991     "32-bit Linux emulation");
992 
993 static u_long	linux32_maxdsiz = LINUX32_MAXDSIZ;
994 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
995     &linux32_maxdsiz, 0, "");
996 static u_long	linux32_maxssiz = LINUX32_MAXSSIZ;
997 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
998     &linux32_maxssiz, 0, "");
999 static u_long	linux32_maxvmem = LINUX32_MAXVMEM;
1000 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
1001     &linux32_maxvmem, 0, "");
1002 
1003 static void
1004 linux32_fixlimit(struct rlimit *rl, int which)
1005 {
1006 
1007 	switch (which) {
1008 	case RLIMIT_DATA:
1009 		if (linux32_maxdsiz != 0) {
1010 			if (rl->rlim_cur > linux32_maxdsiz)
1011 				rl->rlim_cur = linux32_maxdsiz;
1012 			if (rl->rlim_max > linux32_maxdsiz)
1013 				rl->rlim_max = linux32_maxdsiz;
1014 		}
1015 		break;
1016 	case RLIMIT_STACK:
1017 		if (linux32_maxssiz != 0) {
1018 			if (rl->rlim_cur > linux32_maxssiz)
1019 				rl->rlim_cur = linux32_maxssiz;
1020 			if (rl->rlim_max > linux32_maxssiz)
1021 				rl->rlim_max = linux32_maxssiz;
1022 		}
1023 		break;
1024 	case RLIMIT_VMEM:
1025 		if (linux32_maxvmem != 0) {
1026 			if (rl->rlim_cur > linux32_maxvmem)
1027 				rl->rlim_cur = linux32_maxvmem;
1028 			if (rl->rlim_max > linux32_maxvmem)
1029 				rl->rlim_max = linux32_maxvmem;
1030 		}
1031 		break;
1032 	}
1033 }
1034 
1035 struct sysentvec elf_linux_sysvec = {
1036 	.sv_size	= LINUX_SYS_MAXSYSCALL,
1037 	.sv_table	= linux_sysent,
1038 	.sv_mask	= 0,
1039 	.sv_sigsize	= LINUX_SIGTBLSZ,
1040 	.sv_sigtbl	= bsd_to_linux_signal,
1041 	.sv_errsize	= ELAST + 1,
1042 	.sv_errtbl	= bsd_to_linux_errno,
1043 	.sv_transtrap	= translate_traps,
1044 	.sv_fixup	= elf_linux_fixup,
1045 	.sv_sendsig	= linux_sendsig,
1046 	.sv_sigcode	= linux_sigcode,
1047 	.sv_szsigcode	= &linux_szsigcode,
1048 	.sv_prepsyscall	= NULL,
1049 	.sv_name	= "Linux ELF32",
1050 	.sv_coredump	= elf32_coredump,
1051 	.sv_imgact_try	= exec_linux_imgact_try,
1052 	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
1053 	.sv_pagesize	= PAGE_SIZE,
1054 	.sv_minuser	= VM_MIN_ADDRESS,
1055 	.sv_maxuser	= LINUX32_USRSTACK,
1056 	.sv_usrstack	= LINUX32_USRSTACK,
1057 	.sv_psstrings	= LINUX32_PS_STRINGS,
1058 	.sv_stackprot	= VM_PROT_ALL,
1059 	.sv_copyout_strings = linux_copyout_strings,
1060 	.sv_setregs	= exec_linux_setregs,
1061 	.sv_fixlimit	= linux32_fixlimit,
1062 	.sv_maxssiz	= &linux32_maxssiz,
1063 	.sv_flags	= SV_ABI_LINUX | SV_ILP32 | SV_IA32,
1064 	.sv_set_syscall_retval = cpu_set_syscall_retval,
1065 	.sv_fetch_syscall_args = linux32_fetch_syscall_args,
1066 	.sv_syscallnames = NULL,
1067 };
1068 
/*
 * Values matched against the ABI note of a candidate binary: the note's
 * vendor name and the first descriptor word.  Neither is ever written, so
 * both are const.
 */
static const char GNU_ABI_VENDOR[] = "GNU";
static const int GNULINUX_ABI_DESC = 0;
1071 
1072 static boolean_t
1073 linux32_trans_osrel(const Elf_Note *note, int32_t *osrel)
1074 {
1075 	const Elf32_Word *desc;
1076 	uintptr_t p;
1077 
1078 	p = (uintptr_t)(note + 1);
1079 	p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
1080 
1081 	desc = (const Elf32_Word *)p;
1082 	if (desc[0] != GNULINUX_ABI_DESC)
1083 		return (FALSE);
1084 
1085 	/*
1086 	 * For linux we encode osrel as follows (see linux_mib.c):
1087 	 * VVVMMMIII (version, major, minor), see linux_mib.c.
1088 	 */
1089 	*osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
1090 
1091 	return (TRUE);
1092 }
1093 
1094 static Elf_Brandnote linux32_brandnote = {
1095 	.hdr.n_namesz	= sizeof(GNU_ABI_VENDOR),
1096 	.hdr.n_descsz	= 16,	/* XXX at least 16 */
1097 	.hdr.n_type	= 1,
1098 	.vendor		= GNU_ABI_VENDOR,
1099 	.flags		= BN_TRANSLATE_OSREL,
1100 	.trans_osrel	= linux32_trans_osrel
1101 };
1102 
1103 static Elf32_Brandinfo linux_brand = {
1104 	.brand		= ELFOSABI_LINUX,
1105 	.machine	= EM_386,
1106 	.compat_3_brand	= "Linux",
1107 	.emul_path	= "/compat/linux",
1108 	.interp_path	= "/lib/ld-linux.so.1",
1109 	.sysvec		= &elf_linux_sysvec,
1110 	.interp_newpath	= NULL,
1111 	.brand_note	= &linux32_brandnote,
1112 	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1113 };
1114 
1115 static Elf32_Brandinfo linux_glibc2brand = {
1116 	.brand		= ELFOSABI_LINUX,
1117 	.machine	= EM_386,
1118 	.compat_3_brand	= "Linux",
1119 	.emul_path	= "/compat/linux",
1120 	.interp_path	= "/lib/ld-linux.so.2",
1121 	.sysvec		= &elf_linux_sysvec,
1122 	.interp_newpath	= NULL,
1123 	.brand_note	= &linux32_brandnote,
1124 	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1125 };
1126 
1127 Elf32_Brandinfo *linux_brandlist[] = {
1128 	&linux_brand,
1129 	&linux_glibc2brand,
1130 	NULL
1131 };
1132 
1133 static int
1134 linux_elf_modevent(module_t mod, int type, void *data)
1135 {
1136 	Elf32_Brandinfo **brandinfo;
1137 	int error;
1138 	struct linux_ioctl_handler **lihp;
1139 	struct linux_device_handler **ldhp;
1140 
1141 	error = 0;
1142 
1143 	switch(type) {
1144 	case MOD_LOAD:
1145 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1146 		     ++brandinfo)
1147 			if (elf32_insert_brand_entry(*brandinfo) < 0)
1148 				error = EINVAL;
1149 		if (error == 0) {
1150 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1151 				linux_ioctl_register_handler(*lihp);
1152 			SET_FOREACH(ldhp, linux_device_handler_set)
1153 				linux_device_register_handler(*ldhp);
1154 			mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF);
1155 			sx_init(&emul_shared_lock, "emuldata->shared lock");
1156 			LIST_INIT(&futex_list);
1157 			mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
1158 			linux_exit_tag = EVENTHANDLER_REGISTER(process_exit,
1159 			    linux_proc_exit, NULL, 1000);
1160 			linux_schedtail_tag = EVENTHANDLER_REGISTER(schedtail,
1161 			    linux_schedtail, NULL, 1000);
1162 			linux_exec_tag = EVENTHANDLER_REGISTER(process_exec,
1163 			    linux_proc_exec, NULL, 1000);
1164 			linux_szplatform = roundup(strlen(linux_platform) + 1,
1165 			    sizeof(char *));
1166 			linux_osd_jail_register();
1167 			stclohz = (stathz ? stathz : hz);
1168 			if (bootverbose)
1169 				printf("Linux ELF exec handler installed\n");
1170 		} else
1171 			printf("cannot insert Linux ELF brand handler\n");
1172 		break;
1173 	case MOD_UNLOAD:
1174 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1175 		     ++brandinfo)
1176 			if (elf32_brand_inuse(*brandinfo))
1177 				error = EBUSY;
1178 		if (error == 0) {
1179 			for (brandinfo = &linux_brandlist[0];
1180 			     *brandinfo != NULL; ++brandinfo)
1181 				if (elf32_remove_brand_entry(*brandinfo) < 0)
1182 					error = EINVAL;
1183 		}
1184 		if (error == 0) {
1185 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1186 				linux_ioctl_unregister_handler(*lihp);
1187 			SET_FOREACH(ldhp, linux_device_handler_set)
1188 				linux_device_unregister_handler(*ldhp);
1189 			mtx_destroy(&emul_lock);
1190 			sx_destroy(&emul_shared_lock);
1191 			mtx_destroy(&futex_mtx);
1192 			EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
1193 			EVENTHANDLER_DEREGISTER(schedtail, linux_schedtail_tag);
1194 			EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
1195 			linux_osd_jail_deregister();
1196 			if (bootverbose)
1197 				printf("Linux ELF exec handler removed\n");
1198 		} else
1199 			printf("Could not deinstall ELF interpreter entry\n");
1200 		break;
1201 	default:
1202 		return EOPNOTSUPP;
1203 	}
1204 	return error;
1205 }
1206 
1207 static moduledata_t linux_elf_mod = {
1208 	"linuxelf",
1209 	linux_elf_modevent,
1210 	0
1211 };
1212 
1213 DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1214