xref: /freebsd/sys/amd64/linux32/linux32_sysvec.c (revision aa64588d28258aef88cc33b8043112e8856948d0)
1 /*-
2  * Copyright (c) 2004 Tim J. Robbins
3  * Copyright (c) 2003 Peter Wemm
4  * Copyright (c) 2002 Doug Rabson
5  * Copyright (c) 1998-1999 Andrew Gallatin
6  * Copyright (c) 1994-1996 Søren Schmidt
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer
14  *    in this position and unchanged.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. The name of the author may not be used to endorse or promote products
19  *    derived from this software without specific prior written permission
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 #include "opt_compat.h"
36 
37 #ifndef COMPAT_FREEBSD32
38 #error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!"
39 #endif
40 
41 #define	__ELF_WORD_SIZE	32
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/exec.h>
46 #include <sys/fcntl.h>
47 #include <sys/imgact.h>
48 #include <sys/imgact_elf.h>
49 #include <sys/kernel.h>
50 #include <sys/lock.h>
51 #include <sys/malloc.h>
52 #include <sys/module.h>
53 #include <sys/mutex.h>
54 #include <sys/proc.h>
55 #include <sys/resourcevar.h>
56 #include <sys/signalvar.h>
57 #include <sys/sysctl.h>
58 #include <sys/syscallsubr.h>
59 #include <sys/sysent.h>
60 #include <sys/sysproto.h>
61 #include <sys/vnode.h>
62 #include <sys/eventhandler.h>
63 
64 #include <vm/vm.h>
65 #include <vm/pmap.h>
66 #include <vm/vm_extern.h>
67 #include <vm/vm_map.h>
68 #include <vm/vm_object.h>
69 #include <vm/vm_page.h>
70 #include <vm/vm_param.h>
71 
72 #include <machine/cpu.h>
73 #include <machine/md_var.h>
74 #include <machine/pcb.h>
75 #include <machine/specialreg.h>
76 
77 #include <amd64/linux32/linux.h>
78 #include <amd64/linux32/linux32_proto.h>
79 #include <compat/linux/linux_futex.h>
80 #include <compat/linux/linux_emul.h>
81 #include <compat/linux/linux_mib.h>
82 #include <compat/linux/linux_misc.h>
83 #include <compat/linux/linux_signal.h>
84 #include <compat/linux/linux_util.h>
85 
/* Declares version 1 of the module named "linux". */
86 MODULE_VERSION(linux, 1);
87 
/*
 * Defines the malloc type M_LINUX (short name "linux").
 * NOTE(review): no allocation in the visible part of this file uses it;
 * the users are presumably in other Linux-emulation sources.
 */
88 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
89 
/*
 * Store one 32-bit ELF auxiliary-vector entry through 'pos' with
 * suword32(): the id word first, then the value word.  'pos' is advanced
 * by two 32-bit slots.
 *
 * 'pos' must be a modifiable lvalue of a pointer type that suword32()
 * accepts.  All arguments are parenthesized so that expression arguments
 * (for example `*cursor`) expand with the intended precedence.  'pos' is
 * expanded twice, so it must not have side effects of its own.  The
 * suword32() return values are ignored; a failed store is not reported.
 */
#define	AUXARGS_ENTRY_32(pos, id, val)		\
	do {					\
		suword32((pos)++, (id));	\
		suword32((pos)++, (val));	\
	} while (0)
95 
/*
 * The bytes '#' (0x23) and '!' (0x21) as they read back when the first two
 * bytes of an image header are loaded as one 16-bit value in host byte
 * order.  exec_linux_imgact_try() compares the header against this value.
 */
96 #if BYTE_ORDER == LITTLE_ENDIAN
97 #define SHELLMAGIC      0x2123 /* #! */
98 #else
99 #define SHELLMAGIC      0x2321
100 #endif
101 
102 /*
103  * Allow the sendsig functions to use the ldebug() facility
104  * even though they are not syscalls themselves. Map them
105  * to syscall 0. This is slightly less bogus than using
106  * ldebug(sigreturn).
107  */
108 #define	LINUX_SYS_linux_rt_sendsig	0
109 #define	LINUX_SYS_linux_sendsig		0
110 
/*
 * Platform string copied onto the user stack by linux_copyout_strings();
 * elf_linux_fixup() passes its user address in the LINUX_AT_PLATFORM entry.
 */
111 const char *linux_platform = "i686";
/*
 * Number of bytes of linux_platform that are copied out.
 * NOTE(review): never assigned in the visible part of this file --
 * presumably initialised during module setup; confirm.
 */
112 static int linux_szplatform;
/* Signal trampoline code and its size; both are defined outside this file. */
113 extern char linux_sigcode[];
114 extern int linux_szsigcode;
115 
/* System call table, defined outside this file. */
116 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
117 
/* Linker sets of ioctl and device handlers (not referenced in the visible part of this file). */
118 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
119 SET_DECLARE(linux_device_handler_set, struct linux_device_handler);
120 
/* Forward declarations of file-local functions. */
121 static int	elf_linux_fixup(register_t **stack_base,
122 		    struct image_params *iparams);
123 static register_t *linux_copyout_strings(struct image_params *imgp);
124 static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
125 static void	exec_linux_setregs(struct thread *td,
126 				   struct image_params *imgp, u_long stack);
127 static void	linux32_fixlimit(struct rlimit *rl, int which);
128 static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel);
129 
/*
 * Event handler registration tags.
 * NOTE(review): the register/deregister calls are not in the visible part
 * of this file.
 */
130 static eventhandler_tag linux_exit_tag;
131 static eventhandler_tag linux_schedtail_tag;
132 static eventhandler_tag linux_exec_tag;
133 
134 /*
135  * Linux syscalls return negative errno's, we do positive and map them
136  * Reference:
137  *   FreeBSD: src/sys/sys/errno.h
138  *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
139  *            linux-2.6.17.8/include/asm-generic/errno.h
140  */
/*
 * Indexed by FreeBSD errno value; each entry is the negated Linux errno.
 * 93 entries (indices 0-92) are listed.  If ELAST + 1 is larger than 93,
 * the remaining entries are zero-initialised by the compiler.
 */
141 static int bsd_to_linux_errno[ELAST + 1] = {
142 	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,	/* 0-9 */
143 	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,	/* 10-19 */
144 	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,	/* 20-29 */
145 	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,	/* 30-39 */
146 	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,	/* 40-49 */
147 	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,	/* 50-59 */
148 	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,	/* 60-69 */
149 	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,	/* 70-79 */
150 	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,	/* 80-89 */
151 	 -72, -67, -71						/* 90-92 */
152 };
153 
/*
 * Linux signal number for each FreeBSD signal.
 * NOTE(review): presumably indexed with _SIG_IDX(sig), the way
 * linux_rt_sendsig() and linux_sendsig() index sv_sigtbl -- confirm that
 * sv_sigtbl points at this table.
 * Indices 6 and 28 hold 0: no Linux signal is listed for those slots.
 */
154 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
155 	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,	/* [0]-[3] */
156 	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,			/* [4]-[7] */
157 	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,	/* [8]-[11] */
158 	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,	/* [12]-[15] */
159 	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,	/* [16]-[19] */
160 	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,	/* [20]-[23] */
161 	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,	/* [24]-[27] */
162 	0, LINUX_SIGUSR1, LINUX_SIGUSR2					/* [28]-[30] */
163 };
164 
/*
 * FreeBSD signal number for each Linux signal; the reverse direction of
 * bsd_to_linux_signal.
 * NOTE(review): presumably indexed by Linux signal number minus one --
 * the lookups are not in the visible part of this file; confirm.
 * The mapping is not one-to-one: SIGBUS appears at indices 6 and 15, and
 * SIGURG at indices 22 and 29.
 */
165 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
166 	SIGHUP, SIGINT, SIGQUIT, SIGILL,		/* [0]-[3] */
167 	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,		/* [4]-[7] */
168 	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,		/* [8]-[11] */
169 	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,		/* [12]-[15] */
170 	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,		/* [16]-[19] */
171 	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,		/* [20]-[23] */
172 	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,		/* [24]-[27] */
173 	SIGIO, SIGURG, SIGSYS				/* [28]-[30] */
174 };
175 
/* Value reported when a FreeBSD trap code has no Linux counterpart here. */
#define LINUX_T_UNKNOWN  255

/*
 * Linux trap number for each FreeBSD trap code, indexed by the FreeBSD
 * T_* value noted beside each entry.  Slots with no listed equivalent hold
 * LINUX_T_UNKNOWN.  The table is only read, so it is const.
 */
static const int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/* 0 */
	6,			/* 1  T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/* 2 */
	3,			/* 3  T_BPTFLT */
	LINUX_T_UNKNOWN,	/* 4 */
	LINUX_T_UNKNOWN,	/* 5 */
	16,			/* 6  T_ARITHTRAP */
	254,			/* 7  T_ASTFLT */
	LINUX_T_UNKNOWN,	/* 8 */
	13,			/* 9  T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};

/*
 * Translate a FreeBSD trap code to its Linux number.  Any code outside the
 * table yields LINUX_T_UNKNOWN.  The cast to size_t makes the rejection of
 * negative codes explicit instead of relying on an implicit
 * signed-to-unsigned conversion.  'code' is expanded twice, so it must not
 * have side effects.
 */
#define bsd_to_linux_trapcode(code) \
    ((size_t)(code) < sizeof(_bsd_to_linux_trapcode) / \
	sizeof(_bsd_to_linux_trapcode[0]) ? \
     _bsd_to_linux_trapcode[(code)] : \
     LINUX_T_UNKNOWN)
214 
/*
 * Record that linux_copyout_strings() fills in at the user address
 * LINUX32_PS_STRINGS.  The two address fields are 32 bits wide.
 * linux_copyout_strings() stores the user address of the argv pointer
 * vector in ps_argvstr and of the envp pointer vector in ps_envstr.
 */
215 struct linux32_ps_strings {
216 	u_int32_t ps_argvstr;	/* first of 0 or more argument strings */
217 	u_int ps_nargvstr;	/* the number of argument strings */
218 	u_int32_t ps_envstr;	/* first of 0 or more environment strings */
219 	u_int ps_nenvstr;	/* the number of environment strings */
220 };
221 
222 /*
223  * If FreeBSD & Linux have a difference of opinion about what a trap
224  * means, deal with it here.
225  *
226  * MPSAFE
227  */
228 static int
229 translate_traps(int signal, int trap_code)
230 {
231 	if (signal != SIGBUS)
232 		return signal;
233 	switch (trap_code) {
234 	case T_PROTFLT:
235 	case T_TSSFLT:
236 	case T_DOUBLEFLT:
237 	case T_PAGEFLT:
238 		return SIGSEGV;
239 	default:
240 		return signal;
241 	}
242 }
243 
244 static int
245 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
246 {
247 	Elf32_Auxargs *args;
248 	Elf32_Addr *base;
249 	Elf32_Addr *pos, *uplatform;
250 	struct linux32_ps_strings *arginfo;
251 
252 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
253 	uplatform = (Elf32_Addr *)((caddr_t)arginfo - linux_szsigcode -
254 	    linux_szplatform);
255 
256 	KASSERT(curthread->td_proc == imgp->proc,
257 	    ("unsafe elf_linux_fixup(), should be curproc"));
258 	base = (Elf32_Addr *)*stack_base;
259 	args = (Elf32_Auxargs *)imgp->auxargs;
260 	pos = base + (imgp->args->argc + imgp->args->envc + 2);
261 
262 	AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature);
263 
264 	/*
265 	 * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0,
266 	 * as it has appeared in the 2.4.0-rc7 first time.
267 	 * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK),
268 	 * glibc falls back to the hard-coded CLK_TCK value when aux entry
269 	 * is not present.
270 	 * Also see linux_times() implementation.
271 	 */
272 	if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
273 		AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz);
274 	AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
275 	AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
276 	AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
277 	AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
278 	AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
279 	AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
280 	AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
281 	AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, 0);
282 	AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
283 	AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
284 	AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
285 	AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
286 	AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(uplatform));
287 	if (args->execfd != -1)
288 		AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
289 	AUXARGS_ENTRY_32(pos, AT_NULL, 0);
290 
291 	free(imgp->auxargs, M_TEMP);
292 	imgp->auxargs = NULL;
293 
294 	base--;
295 	suword32(base, (uint32_t)imgp->args->argc);
296 	*stack_base = (register_t *)base;
297 	return 0;
298 }
299 
300 extern unsigned long linux_sznonrtsigcode;
301 
302 static void
303 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
304 {
305 	struct thread *td = curthread;
306 	struct proc *p = td->td_proc;
307 	struct sigacts *psp;
308 	struct trapframe *regs;
309 	struct l_rt_sigframe *fp, frame;
310 	int oonstack;
311 	int sig;
312 	int code;
313 
314 	sig = ksi->ksi_signo;
315 	code = ksi->ksi_code;
316 	PROC_LOCK_ASSERT(p, MA_OWNED);
317 	psp = p->p_sigacts;
318 	mtx_assert(&psp->ps_mtx, MA_OWNED);
319 	regs = td->td_frame;
320 	oonstack = sigonstack(regs->tf_rsp);
321 
322 #ifdef DEBUG
323 	if (ldebug(rt_sendsig))
324 		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
325 		    catcher, sig, (void*)mask, code);
326 #endif
327 	/*
328 	 * Allocate space for the signal handler context.
329 	 */
330 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
331 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
332 		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
333 		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
334 	} else
335 		fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
336 	mtx_unlock(&psp->ps_mtx);
337 
338 	/*
339 	 * Build the argument list for the signal handler.
340 	 */
341 	if (p->p_sysent->sv_sigtbl)
342 		if (sig <= p->p_sysent->sv_sigsize)
343 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
344 
345 	bzero(&frame, sizeof(frame));
346 
347 	frame.sf_handler = PTROUT(catcher);
348 	frame.sf_sig = sig;
349 	frame.sf_siginfo = PTROUT(&fp->sf_si);
350 	frame.sf_ucontext = PTROUT(&fp->sf_sc);
351 
352 	/* Fill in POSIX parts */
353 	ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);
354 
355 	/*
356 	 * Build the signal context to be used by sigreturn.
357 	 */
358 	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
359 	frame.sf_sc.uc_link = 0;		/* XXX ??? */
360 
361 	frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
362 	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
363 	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
364 	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
365 	PROC_UNLOCK(p);
366 
367 	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
368 
369 	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
370 	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_rdi;
371 	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_rsi;
372 	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_rbp;
373 	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_rbx;
374 	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_rdx;
375 	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_rcx;
376 	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_rax;
377 	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_rip;
378 	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
379 	frame.sf_sc.uc_mcontext.sc_gs     = regs->tf_gs;
380 	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
381 	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
382 	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
383 	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
384 	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
385 	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
386 	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
387 	frame.sf_sc.uc_mcontext.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
388 	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
389 
390 #ifdef DEBUG
391 	if (ldebug(rt_sendsig))
392 		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
393 		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
394 		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
395 #endif
396 
397 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
398 		/*
399 		 * Process has trashed its stack; give it an illegal
400 		 * instruction to halt it in its tracks.
401 		 */
402 #ifdef DEBUG
403 		if (ldebug(rt_sendsig))
404 			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
405 			    fp, oonstack);
406 #endif
407 		PROC_LOCK(p);
408 		sigexit(td, SIGILL);
409 	}
410 
411 	/*
412 	 * Build context to run handler in.
413 	 */
414 	regs->tf_rsp = PTROUT(fp);
415 	regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
416 	    linux_sznonrtsigcode;
417 	regs->tf_rflags &= ~(PSL_T | PSL_D);
418 	regs->tf_cs = _ucode32sel;
419 	regs->tf_ss = _udatasel;
420 	regs->tf_ds = _udatasel;
421 	regs->tf_es = _udatasel;
422 	regs->tf_fs = _ufssel;
423 	regs->tf_gs = _ugssel;
424 	regs->tf_flags = TF_HASSEGS;
425 	td->td_pcb->pcb_full_iret = 1;
426 	PROC_LOCK(p);
427 	mtx_lock(&psp->ps_mtx);
428 }
429 
430 
431 /*
432  * Send an interrupt to process.
433  *
434  * Stack is set up to allow sigcode stored
435  * in u. to call routine, followed by kcall
436  * to sigreturn routine below.  After sigreturn
437  * resets the signal mask, the stack, and the
438  * frame pointer, it returns to the user
439  * specified pc, psl.
440  */
441 static void
442 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
443 {
444 	struct thread *td = curthread;
445 	struct proc *p = td->td_proc;
446 	struct sigacts *psp;
447 	struct trapframe *regs;
448 	struct l_sigframe *fp, frame;
449 	l_sigset_t lmask;
450 	int oonstack, i;
451 	int sig, code;
452 
453 	sig = ksi->ksi_signo;
454 	code = ksi->ksi_code;
455 	PROC_LOCK_ASSERT(p, MA_OWNED);
456 	psp = p->p_sigacts;
457 	mtx_assert(&psp->ps_mtx, MA_OWNED);
458 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
459 		/* Signal handler installed with SA_SIGINFO. */
460 		linux_rt_sendsig(catcher, ksi, mask);
461 		return;
462 	}
463 
464 	regs = td->td_frame;
465 	oonstack = sigonstack(regs->tf_rsp);
466 
467 #ifdef DEBUG
468 	if (ldebug(sendsig))
469 		printf(ARGS(sendsig, "%p, %d, %p, %u"),
470 		    catcher, sig, (void*)mask, code);
471 #endif
472 
473 	/*
474 	 * Allocate space for the signal handler context.
475 	 */
476 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
477 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
478 		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
479 		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
480 	} else
481 		fp = (struct l_sigframe *)regs->tf_rsp - 1;
482 	mtx_unlock(&psp->ps_mtx);
483 	PROC_UNLOCK(p);
484 
485 	/*
486 	 * Build the argument list for the signal handler.
487 	 */
488 	if (p->p_sysent->sv_sigtbl)
489 		if (sig <= p->p_sysent->sv_sigsize)
490 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
491 
492 	bzero(&frame, sizeof(frame));
493 
494 	frame.sf_handler = PTROUT(catcher);
495 	frame.sf_sig = sig;
496 
497 	bsd_to_linux_sigset(mask, &lmask);
498 
499 	/*
500 	 * Build the signal context to be used by sigreturn.
501 	 */
502 	frame.sf_sc.sc_mask   = lmask.__bits[0];
503 	frame.sf_sc.sc_gs     = regs->tf_gs;
504 	frame.sf_sc.sc_fs     = regs->tf_fs;
505 	frame.sf_sc.sc_es     = regs->tf_es;
506 	frame.sf_sc.sc_ds     = regs->tf_ds;
507 	frame.sf_sc.sc_edi    = regs->tf_rdi;
508 	frame.sf_sc.sc_esi    = regs->tf_rsi;
509 	frame.sf_sc.sc_ebp    = regs->tf_rbp;
510 	frame.sf_sc.sc_ebx    = regs->tf_rbx;
511 	frame.sf_sc.sc_edx    = regs->tf_rdx;
512 	frame.sf_sc.sc_ecx    = regs->tf_rcx;
513 	frame.sf_sc.sc_eax    = regs->tf_rax;
514 	frame.sf_sc.sc_eip    = regs->tf_rip;
515 	frame.sf_sc.sc_cs     = regs->tf_cs;
516 	frame.sf_sc.sc_eflags = regs->tf_rflags;
517 	frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
518 	frame.sf_sc.sc_ss     = regs->tf_ss;
519 	frame.sf_sc.sc_err    = regs->tf_err;
520 	frame.sf_sc.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
521 	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
522 
523 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
524 		frame.sf_extramask[i] = lmask.__bits[i+1];
525 
526 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
527 		/*
528 		 * Process has trashed its stack; give it an illegal
529 		 * instruction to halt it in its tracks.
530 		 */
531 		PROC_LOCK(p);
532 		sigexit(td, SIGILL);
533 	}
534 
535 	/*
536 	 * Build context to run handler in.
537 	 */
538 	regs->tf_rsp = PTROUT(fp);
539 	regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode);
540 	regs->tf_rflags &= ~(PSL_T | PSL_D);
541 	regs->tf_cs = _ucode32sel;
542 	regs->tf_ss = _udatasel;
543 	regs->tf_ds = _udatasel;
544 	regs->tf_es = _udatasel;
545 	regs->tf_fs = _ufssel;
546 	regs->tf_gs = _ugssel;
547 	regs->tf_flags = TF_HASSEGS;
548 	td->td_pcb->pcb_full_iret = 1;
549 	PROC_LOCK(p);
550 	mtx_lock(&psp->ps_mtx);
551 }
552 
553 /*
554  * System call to cleanup state after a signal
555  * has been taken.  Reset signal mask and
556  * stack state from context left by sendsig (above).
557  * Return to previous pc and psl as specified by
558  * context left by sendsig. Check carefully to
559  * make sure that the user has not modified the
560  * psl to gain improper privileges or to cause
561  * a machine fault.
562  */
563 int
564 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
565 {
566 	struct l_sigframe frame;
567 	struct trapframe *regs;
568 	sigset_t bmask;
569 	l_sigset_t lmask;
570 	int eflags, i;
571 	ksiginfo_t ksi;
572 
573 	regs = td->td_frame;
574 
575 #ifdef DEBUG
576 	if (ldebug(sigreturn))
577 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
578 #endif
579 	/*
580 	 * The trampoline code hands us the sigframe.
581 	 * It is unsafe to keep track of it ourselves, in the event that a
582 	 * program jumps out of a signal handler.
583 	 */
584 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
585 		return (EFAULT);
586 
587 	/*
588 	 * Check for security violations.
589 	 */
590 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
591 	eflags = frame.sf_sc.sc_eflags;
592 	/*
593 	 * XXX do allow users to change the privileged flag PSL_RF.  The
594 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
595 	 * sometimes set it there too.  tf_eflags is kept in the signal
596 	 * context during signal handling and there is no other place
597 	 * to remember it, so the PSL_RF bit may be corrupted by the
598 	 * signal handler without us knowing.  Corruption of the PSL_RF
599 	 * bit at worst causes one more or one less debugger trap, so
600 	 * allowing it is fairly harmless.
601 	 */
602 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
603 		return(EINVAL);
604 
605 	/*
606 	 * Don't allow users to load a valid privileged %cs.  Let the
607 	 * hardware check for invalid selectors, excess privilege in
608 	 * other selectors, invalid %eip's and invalid %esp's.
609 	 */
610 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
611 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
612 		ksiginfo_init_trap(&ksi);
613 		ksi.ksi_signo = SIGBUS;
614 		ksi.ksi_code = BUS_OBJERR;
615 		ksi.ksi_trapno = T_PROTFLT;
616 		ksi.ksi_addr = (void *)regs->tf_rip;
617 		trapsignal(td, &ksi);
618 		return(EINVAL);
619 	}
620 
621 	lmask.__bits[0] = frame.sf_sc.sc_mask;
622 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
623 		lmask.__bits[i+1] = frame.sf_extramask[i];
624 	linux_to_bsd_sigset(&lmask, &bmask);
625 	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
626 
627 	/*
628 	 * Restore signal context.
629 	 */
630 	regs->tf_rdi    = frame.sf_sc.sc_edi;
631 	regs->tf_rsi    = frame.sf_sc.sc_esi;
632 	regs->tf_rbp    = frame.sf_sc.sc_ebp;
633 	regs->tf_rbx    = frame.sf_sc.sc_ebx;
634 	regs->tf_rdx    = frame.sf_sc.sc_edx;
635 	regs->tf_rcx    = frame.sf_sc.sc_ecx;
636 	regs->tf_rax    = frame.sf_sc.sc_eax;
637 	regs->tf_rip    = frame.sf_sc.sc_eip;
638 	regs->tf_cs     = frame.sf_sc.sc_cs;
639 	regs->tf_ds     = frame.sf_sc.sc_ds;
640 	regs->tf_es     = frame.sf_sc.sc_es;
641 	regs->tf_fs     = frame.sf_sc.sc_fs;
642 	regs->tf_gs     = frame.sf_sc.sc_gs;
643 	regs->tf_rflags = eflags;
644 	regs->tf_rsp    = frame.sf_sc.sc_esp_at_signal;
645 	regs->tf_ss     = frame.sf_sc.sc_ss;
646 	td->td_pcb->pcb_full_iret = 1;
647 
648 	return (EJUSTRETURN);
649 }
650 
651 /*
652  * System call to cleanup state after a signal
653  * has been taken.  Reset signal mask and
654  * stack state from context left by rt_sendsig (above).
655  * Return to previous pc and psl as specified by
656  * context left by sendsig. Check carefully to
657  * make sure that the user has not modified the
658  * psl to gain improper privileges or to cause
659  * a machine fault.
660  */
661 int
662 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
663 {
664 	struct l_ucontext uc;
665 	struct l_sigcontext *context;
666 	sigset_t bmask;
667 	l_stack_t *lss;
668 	stack_t ss;
669 	struct trapframe *regs;
670 	int eflags;
671 	ksiginfo_t ksi;
672 
673 	regs = td->td_frame;
674 
675 #ifdef DEBUG
676 	if (ldebug(rt_sigreturn))
677 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
678 #endif
679 	/*
680 	 * The trampoline code hands us the ucontext.
681 	 * It is unsafe to keep track of it ourselves, in the event that a
682 	 * program jumps out of a signal handler.
683 	 */
684 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
685 		return (EFAULT);
686 
687 	context = &uc.uc_mcontext;
688 
689 	/*
690 	 * Check for security violations.
691 	 */
692 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
693 	eflags = context->sc_eflags;
694 	/*
695 	 * XXX do allow users to change the privileged flag PSL_RF.  The
696 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
697 	 * sometimes set it there too.  tf_eflags is kept in the signal
698 	 * context during signal handling and there is no other place
699 	 * to remember it, so the PSL_RF bit may be corrupted by the
700 	 * signal handler without us knowing.  Corruption of the PSL_RF
701 	 * bit at worst causes one more or one less debugger trap, so
702 	 * allowing it is fairly harmless.
703 	 */
704 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
705 		return(EINVAL);
706 
707 	/*
708 	 * Don't allow users to load a valid privileged %cs.  Let the
709 	 * hardware check for invalid selectors, excess privilege in
710 	 * other selectors, invalid %eip's and invalid %esp's.
711 	 */
712 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
713 	if (!CS_SECURE(context->sc_cs)) {
714 		ksiginfo_init_trap(&ksi);
715 		ksi.ksi_signo = SIGBUS;
716 		ksi.ksi_code = BUS_OBJERR;
717 		ksi.ksi_trapno = T_PROTFLT;
718 		ksi.ksi_addr = (void *)regs->tf_rip;
719 		trapsignal(td, &ksi);
720 		return(EINVAL);
721 	}
722 
723 	linux_to_bsd_sigset(&uc.uc_sigmask, &bmask);
724 	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
725 
726 	/*
727 	 * Restore signal context
728 	 */
729 	regs->tf_gs	= context->sc_gs;
730 	regs->tf_fs	= context->sc_fs;
731 	regs->tf_es	= context->sc_es;
732 	regs->tf_ds	= context->sc_ds;
733 	regs->tf_rdi    = context->sc_edi;
734 	regs->tf_rsi    = context->sc_esi;
735 	regs->tf_rbp    = context->sc_ebp;
736 	regs->tf_rbx    = context->sc_ebx;
737 	regs->tf_rdx    = context->sc_edx;
738 	regs->tf_rcx    = context->sc_ecx;
739 	regs->tf_rax    = context->sc_eax;
740 	regs->tf_rip    = context->sc_eip;
741 	regs->tf_cs     = context->sc_cs;
742 	regs->tf_rflags = eflags;
743 	regs->tf_rsp    = context->sc_esp_at_signal;
744 	regs->tf_ss     = context->sc_ss;
745 	td->td_pcb->pcb_full_iret = 1;
746 
747 	/*
748 	 * call sigaltstack & ignore results..
749 	 */
750 	lss = &uc.uc_stack;
751 	ss.ss_sp = PTRIN(lss->ss_sp);
752 	ss.ss_size = lss->ss_size;
753 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
754 
755 #ifdef DEBUG
756 	if (ldebug(rt_sigreturn))
757 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
758 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
759 #endif
760 	(void)kern_sigaltstack(td, &ss, NULL);
761 
762 	return (EJUSTRETURN);
763 }
764 
765 static int
766 linux32_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
767 {
768 	struct proc *p;
769 	struct trapframe *frame;
770 
771 	p = td->td_proc;
772 	frame = td->td_frame;
773 
774 	sa->args[0] = frame->tf_rbx;
775 	sa->args[1] = frame->tf_rcx;
776 	sa->args[2] = frame->tf_rdx;
777 	sa->args[3] = frame->tf_rsi;
778 	sa->args[4] = frame->tf_rdi;
779 	sa->args[5] = frame->tf_rbp;	/* Unconfirmed */
780 	sa->code = frame->tf_rax;
781 
782 	if (sa->code >= p->p_sysent->sv_size)
783 		sa->callp = &p->p_sysent->sv_table[0];
784 	else
785 		sa->callp = &p->p_sysent->sv_table[sa->code];
786 	sa->narg = sa->callp->sy_narg;
787 
788 	td->td_retval[0] = 0;
789 	td->td_retval[1] = frame->tf_rdx;
790 
791 	return (0);
792 }
793 
794 /*
795  * If a linux binary is exec'ing something, try this image activator
796  * first.  We override standard shell script execution in order to
797  * be able to modify the interpreter path.  We only do this if a linux
798  * binary is doing the exec, so we do not create an EXEC module for it.
799  */
800 static int	exec_linux_imgact_try(struct image_params *iparams);
801 
802 static int
803 exec_linux_imgact_try(struct image_params *imgp)
804 {
805 	const char *head = (const char *)imgp->image_header;
806 	char *rpath;
807 	int error = -1, len;
808 
809 	/*
810 	* The interpreter for shell scripts run from a linux binary needs
811 	* to be located in /compat/linux if possible in order to recursively
812 	* maintain linux path emulation.
813 	*/
814 	if (((const short *)head)[0] == SHELLMAGIC) {
815 		/*
816 		* Run our normal shell image activator.  If it succeeds attempt
817 		* to use the alternate path for the interpreter.  If an
818 		* alternate * path is found, use our stringspace to store it.
819 		*/
820 		if ((error = exec_shell_imgact(imgp)) == 0) {
821 			linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
822 			    imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0,
823 			    AT_FDCWD);
824 			if (rpath != NULL) {
825 				len = strlen(rpath) + 1;
826 
827 				if (len <= MAXSHELLCMDLEN) {
828 					memcpy(imgp->interpreter_name, rpath,
829 					    len);
830 				}
831 				free(rpath, M_TEMP);
832 			}
833 		}
834 	}
835 	return(error);
836 }
837 
838 /*
839  * Clear registers on exec
840  * XXX copied from ia32_signal.c.
841  */
842 static void
843 exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack)
844 {
845 	struct trapframe *regs = td->td_frame;
846 	struct pcb *pcb = td->td_pcb;
847 
848 	mtx_lock(&dt_lock);
849 	if (td->td_proc->p_md.md_ldt != NULL)
850 		user_ldt_free(td);
851 	else
852 		mtx_unlock(&dt_lock);
853 
854 	critical_enter();
855 	wrmsr(MSR_FSBASE, 0);
856 	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
857 	pcb->pcb_fsbase = 0;
858 	pcb->pcb_gsbase = 0;
859 	critical_exit();
860 	pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
861 
862 	bzero((char *)regs, sizeof(struct trapframe));
863 	regs->tf_rip = imgp->entry_addr;
864 	regs->tf_rsp = stack;
865 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
866 	regs->tf_gs = _ugssel;
867 	regs->tf_fs = _ufssel;
868 	regs->tf_es = _udatasel;
869 	regs->tf_ds = _udatasel;
870 	regs->tf_ss = _udatasel;
871 	regs->tf_flags = TF_HASSEGS;
872 	regs->tf_cs = _ucode32sel;
873 	regs->tf_rbx = imgp->ps_strings;
874 	td->td_pcb->pcb_full_iret = 1;
875 	load_cr0(rcr0() | CR0_MP | CR0_TS);
876 	fpstate_drop(td);
877 
878 	/* Return via doreti so that we can change to a different %cs */
879 	pcb->pcb_flags |= PCB_FULLCTX | PCB_32BIT;
880 	pcb->pcb_flags &= ~PCB_GS32BIT;
881 	td->td_retval[1] = 0;
882 }
883 
884 /*
885  * XXX copied from ia32_sysvec.c.
886  */
887 static register_t *
888 linux_copyout_strings(struct image_params *imgp)
889 {
890 	int argc, envc;
891 	u_int32_t *vectp;
892 	char *stringp, *destp;
893 	u_int32_t *stack_base;
894 	struct linux32_ps_strings *arginfo;
895 
896 	/*
897 	 * Calculate string base and vector table pointers.
898 	 * Also deal with signal trampoline code for this exec type.
899 	 */
900 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
901 	destp =	(caddr_t)arginfo - linux_szsigcode - SPARE_USRSPACE -
902 	    linux_szplatform - roundup((ARG_MAX - imgp->args->stringspace),
903 	    sizeof(char *));
904 
905 	/*
906 	 * install sigcode
907 	 */
908 	copyout(imgp->proc->p_sysent->sv_sigcode,
909 	    ((caddr_t)arginfo - linux_szsigcode), linux_szsigcode);
910 
911 	/*
912 	 * Install LINUX_PLATFORM
913 	 */
914 	copyout(linux_platform, ((caddr_t)arginfo - linux_szsigcode -
915 	    linux_szplatform), linux_szplatform);
916 
917 	/*
918 	 * If we have a valid auxargs ptr, prepare some room
919 	 * on the stack.
920 	 */
921 	if (imgp->auxargs) {
922 		/*
923 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
924 		 * lower compatibility.
925 		 */
926 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
927 		    (LINUX_AT_COUNT * 2);
928 		/*
929 		 * The '+ 2' is for the null pointers at the end of each of
930 		 * the arg and env vector sets,and imgp->auxarg_size is room
931 		 * for argument of Runtime loader.
932 		 */
933 		vectp = (u_int32_t *) (destp - (imgp->args->argc +
934 		    imgp->args->envc + 2 + imgp->auxarg_size) *
935 		    sizeof(u_int32_t));
936 
937 	} else
938 		/*
939 		 * The '+ 2' is for the null pointers at the end of each of
940 		 * the arg and env vector sets
941 		 */
942 		vectp = (u_int32_t *)(destp - (imgp->args->argc +
943 		    imgp->args->envc + 2) * sizeof(u_int32_t));
944 
945 	/*
946 	 * vectp also becomes our initial stack base
947 	 */
948 	stack_base = vectp;
949 
950 	stringp = imgp->args->begin_argv;
951 	argc = imgp->args->argc;
952 	envc = imgp->args->envc;
953 	/*
954 	 * Copy out strings - arguments and environment.
955 	 */
956 	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
957 
958 	/*
959 	 * Fill in "ps_strings" struct for ps, w, etc.
960 	 */
961 	suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp);
962 	suword32(&arginfo->ps_nargvstr, argc);
963 
964 	/*
965 	 * Fill in argument portion of vector table.
966 	 */
967 	for (; argc > 0; --argc) {
968 		suword32(vectp++, (uint32_t)(intptr_t)destp);
969 		while (*stringp++ != 0)
970 			destp++;
971 		destp++;
972 	}
973 
974 	/* a null vector table pointer separates the argp's from the envp's */
975 	suword32(vectp++, 0);
976 
977 	suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp);
978 	suword32(&arginfo->ps_nenvstr, envc);
979 
980 	/*
981 	 * Fill in environment portion of vector table.
982 	 */
983 	for (; envc > 0; --envc) {
984 		suword32(vectp++, (uint32_t)(intptr_t)destp);
985 		while (*stringp++ != 0)
986 			destp++;
987 		destp++;
988 	}
989 
990 	/* end of vector table is a null pointer */
991 	suword32(vectp, 0);
992 
993 	return ((register_t *)stack_base);
994 }
995 
996 SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
997     "32-bit Linux emulation");
998 
999 static u_long	linux32_maxdsiz = LINUX32_MAXDSIZ;
1000 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
1001     &linux32_maxdsiz, 0, "");
1002 static u_long	linux32_maxssiz = LINUX32_MAXSSIZ;
1003 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
1004     &linux32_maxssiz, 0, "");
1005 static u_long	linux32_maxvmem = LINUX32_MAXVMEM;
1006 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
1007     &linux32_maxvmem, 0, "");
1008 
1009 static void
1010 linux32_fixlimit(struct rlimit *rl, int which)
1011 {
1012 
1013 	switch (which) {
1014 	case RLIMIT_DATA:
1015 		if (linux32_maxdsiz != 0) {
1016 			if (rl->rlim_cur > linux32_maxdsiz)
1017 				rl->rlim_cur = linux32_maxdsiz;
1018 			if (rl->rlim_max > linux32_maxdsiz)
1019 				rl->rlim_max = linux32_maxdsiz;
1020 		}
1021 		break;
1022 	case RLIMIT_STACK:
1023 		if (linux32_maxssiz != 0) {
1024 			if (rl->rlim_cur > linux32_maxssiz)
1025 				rl->rlim_cur = linux32_maxssiz;
1026 			if (rl->rlim_max > linux32_maxssiz)
1027 				rl->rlim_max = linux32_maxssiz;
1028 		}
1029 		break;
1030 	case RLIMIT_VMEM:
1031 		if (linux32_maxvmem != 0) {
1032 			if (rl->rlim_cur > linux32_maxvmem)
1033 				rl->rlim_cur = linux32_maxvmem;
1034 			if (rl->rlim_max > linux32_maxvmem)
1035 				rl->rlim_max = linux32_maxvmem;
1036 		}
1037 		break;
1038 	}
1039 }
1040 
1041 struct sysentvec elf_linux_sysvec = {
1042 	.sv_size	= LINUX_SYS_MAXSYSCALL,
1043 	.sv_table	= linux_sysent,
1044 	.sv_mask	= 0,
1045 	.sv_sigsize	= LINUX_SIGTBLSZ,
1046 	.sv_sigtbl	= bsd_to_linux_signal,
1047 	.sv_errsize	= ELAST + 1,
1048 	.sv_errtbl	= bsd_to_linux_errno,
1049 	.sv_transtrap	= translate_traps,
1050 	.sv_fixup	= elf_linux_fixup,
1051 	.sv_sendsig	= linux_sendsig,
1052 	.sv_sigcode	= linux_sigcode,
1053 	.sv_szsigcode	= &linux_szsigcode,
1054 	.sv_prepsyscall	= NULL,
1055 	.sv_name	= "Linux ELF32",
1056 	.sv_coredump	= elf32_coredump,
1057 	.sv_imgact_try	= exec_linux_imgact_try,
1058 	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
1059 	.sv_pagesize	= PAGE_SIZE,
1060 	.sv_minuser	= VM_MIN_ADDRESS,
1061 	.sv_maxuser	= LINUX32_USRSTACK,
1062 	.sv_usrstack	= LINUX32_USRSTACK,
1063 	.sv_psstrings	= LINUX32_PS_STRINGS,
1064 	.sv_stackprot	= VM_PROT_ALL,
1065 	.sv_copyout_strings = linux_copyout_strings,
1066 	.sv_setregs	= exec_linux_setregs,
1067 	.sv_fixlimit	= linux32_fixlimit,
1068 	.sv_maxssiz	= &linux32_maxssiz,
1069 	.sv_flags	= SV_ABI_LINUX | SV_ILP32 | SV_IA32,
1070 	.sv_set_syscall_retval = cpu_set_syscall_retval,
1071 	.sv_fetch_syscall_args = linux32_fetch_syscall_args,
1072 	.sv_syscallnames = NULL,
1073 };
1074 
1075 static char GNU_ABI_VENDOR[] = "GNU";
1076 static int GNULINUX_ABI_DESC = 0;
1077 
1078 static boolean_t
1079 linux32_trans_osrel(const Elf_Note *note, int32_t *osrel)
1080 {
1081 	const Elf32_Word *desc;
1082 	uintptr_t p;
1083 
1084 	p = (uintptr_t)(note + 1);
1085 	p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
1086 
1087 	desc = (const Elf32_Word *)p;
1088 	if (desc[0] != GNULINUX_ABI_DESC)
1089 		return (FALSE);
1090 
1091 	/*
1092 	 * For linux we encode osrel as follows (see linux_mib.c):
1093 	 * VVVMMMIII (version, major, minor), see linux_mib.c.
1094 	 */
1095 	*osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
1096 
1097 	return (TRUE);
1098 }
1099 
1100 static Elf_Brandnote linux32_brandnote = {
1101 	.hdr.n_namesz	= sizeof(GNU_ABI_VENDOR),
1102 	.hdr.n_descsz	= 16,	/* XXX at least 16 */
1103 	.hdr.n_type	= 1,
1104 	.vendor		= GNU_ABI_VENDOR,
1105 	.flags		= BN_TRANSLATE_OSREL,
1106 	.trans_osrel	= linux32_trans_osrel
1107 };
1108 
1109 static Elf32_Brandinfo linux_brand = {
1110 	.brand		= ELFOSABI_LINUX,
1111 	.machine	= EM_386,
1112 	.compat_3_brand	= "Linux",
1113 	.emul_path	= "/compat/linux",
1114 	.interp_path	= "/lib/ld-linux.so.1",
1115 	.sysvec		= &elf_linux_sysvec,
1116 	.interp_newpath	= NULL,
1117 	.brand_note	= &linux32_brandnote,
1118 	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1119 };
1120 
1121 static Elf32_Brandinfo linux_glibc2brand = {
1122 	.brand		= ELFOSABI_LINUX,
1123 	.machine	= EM_386,
1124 	.compat_3_brand	= "Linux",
1125 	.emul_path	= "/compat/linux",
1126 	.interp_path	= "/lib/ld-linux.so.2",
1127 	.sysvec		= &elf_linux_sysvec,
1128 	.interp_newpath	= NULL,
1129 	.brand_note	= &linux32_brandnote,
1130 	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1131 };
1132 
1133 Elf32_Brandinfo *linux_brandlist[] = {
1134 	&linux_brand,
1135 	&linux_glibc2brand,
1136 	NULL
1137 };
1138 
1139 static int
1140 linux_elf_modevent(module_t mod, int type, void *data)
1141 {
1142 	Elf32_Brandinfo **brandinfo;
1143 	int error;
1144 	struct linux_ioctl_handler **lihp;
1145 	struct linux_device_handler **ldhp;
1146 
1147 	error = 0;
1148 
1149 	switch(type) {
1150 	case MOD_LOAD:
1151 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1152 		     ++brandinfo)
1153 			if (elf32_insert_brand_entry(*brandinfo) < 0)
1154 				error = EINVAL;
1155 		if (error == 0) {
1156 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1157 				linux_ioctl_register_handler(*lihp);
1158 			SET_FOREACH(ldhp, linux_device_handler_set)
1159 				linux_device_register_handler(*ldhp);
1160 			mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF);
1161 			sx_init(&emul_shared_lock, "emuldata->shared lock");
1162 			LIST_INIT(&futex_list);
1163 			mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
1164 			linux_exit_tag = EVENTHANDLER_REGISTER(process_exit,
1165 			    linux_proc_exit, NULL, 1000);
1166 			linux_schedtail_tag = EVENTHANDLER_REGISTER(schedtail,
1167 			    linux_schedtail, NULL, 1000);
1168 			linux_exec_tag = EVENTHANDLER_REGISTER(process_exec,
1169 			    linux_proc_exec, NULL, 1000);
1170 			linux_szplatform = roundup(strlen(linux_platform) + 1,
1171 			    sizeof(char *));
1172 			linux_osd_jail_register();
1173 			stclohz = (stathz ? stathz : hz);
1174 			if (bootverbose)
1175 				printf("Linux ELF exec handler installed\n");
1176 		} else
1177 			printf("cannot insert Linux ELF brand handler\n");
1178 		break;
1179 	case MOD_UNLOAD:
1180 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1181 		     ++brandinfo)
1182 			if (elf32_brand_inuse(*brandinfo))
1183 				error = EBUSY;
1184 		if (error == 0) {
1185 			for (brandinfo = &linux_brandlist[0];
1186 			     *brandinfo != NULL; ++brandinfo)
1187 				if (elf32_remove_brand_entry(*brandinfo) < 0)
1188 					error = EINVAL;
1189 		}
1190 		if (error == 0) {
1191 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1192 				linux_ioctl_unregister_handler(*lihp);
1193 			SET_FOREACH(ldhp, linux_device_handler_set)
1194 				linux_device_unregister_handler(*ldhp);
1195 			mtx_destroy(&emul_lock);
1196 			sx_destroy(&emul_shared_lock);
1197 			mtx_destroy(&futex_mtx);
1198 			EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
1199 			EVENTHANDLER_DEREGISTER(schedtail, linux_schedtail_tag);
1200 			EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
1201 			linux_osd_jail_deregister();
1202 			if (bootverbose)
1203 				printf("Linux ELF exec handler removed\n");
1204 		} else
1205 			printf("Could not deinstall ELF interpreter entry\n");
1206 		break;
1207 	default:
1208 		return EOPNOTSUPP;
1209 	}
1210 	return error;
1211 }
1212 
1213 static moduledata_t linux_elf_mod = {
1214 	"linuxelf",
1215 	linux_elf_modevent,
1216 	0
1217 };
1218 
1219 DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1220