xref: /freebsd/sys/amd64/linux32/linux32_sysvec.c (revision f45c063aa733372110aea20ec8185bb8b5c4438e)
1 /*-
2  * Copyright (c) 2004 Tim J. Robbins
3  * Copyright (c) 2003 Peter Wemm
4  * Copyright (c) 2002 Doug Rabson
5  * Copyright (c) 1998-1999 Andrew Gallatin
6  * Copyright (c) 1994-1996 Søren Schmidt
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer
14  *    in this position and unchanged.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. The name of the author may not be used to endorse or promote products
19  *    derived from this software without specific prior written permission
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 #include "opt_compat.h"
36 
37 #ifndef COMPAT_IA32
38 #error "Unable to compile Linux-emulator due to missing COMPAT_IA32 option!"
39 #endif
40 
41 #define	__ELF_WORD_SIZE	32
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/exec.h>
46 #include <sys/fcntl.h>
47 #include <sys/imgact.h>
48 #include <sys/imgact_elf.h>
49 #include <sys/kernel.h>
50 #include <sys/lock.h>
51 #include <sys/malloc.h>
52 #include <sys/module.h>
53 #include <sys/mutex.h>
54 #include <sys/proc.h>
55 #include <sys/resourcevar.h>
56 #include <sys/signalvar.h>
57 #include <sys/sysctl.h>
58 #include <sys/syscallsubr.h>
59 #include <sys/sysent.h>
60 #include <sys/sysproto.h>
61 #include <sys/vnode.h>
62 #include <sys/eventhandler.h>
63 
64 #include <vm/vm.h>
65 #include <vm/pmap.h>
66 #include <vm/vm_extern.h>
67 #include <vm/vm_map.h>
68 #include <vm/vm_object.h>
69 #include <vm/vm_page.h>
70 #include <vm/vm_param.h>
71 
72 #include <machine/cpu.h>
73 #include <machine/md_var.h>
74 #include <machine/pcb.h>
75 #include <machine/specialreg.h>
76 
77 #include <amd64/linux32/linux.h>
78 #include <amd64/linux32/linux32_proto.h>
79 #include <compat/linux/linux_futex.h>
80 #include <compat/linux/linux_emul.h>
81 #include <compat/linux/linux_mib.h>
82 #include <compat/linux/linux_misc.h>
83 #include <compat/linux/linux_signal.h>
84 #include <compat/linux/linux_util.h>
85 
86 MODULE_VERSION(linux, 1);
87 
88 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
89 
/*
 * Store one auxiliary-vector entry as two consecutive 32-bit words
 * (first `id', then `val') through suword32().
 *
 * `pos' is post-incremented twice, so it must be a modifiable pointer
 * lvalue and ends up pointing just past the entry.  The suword32()
 * results are not examined here.
 */
#define	AUXARGS_ENTRY_32(pos, id, val)	\
	do {				\
		suword32(pos++, id);	\
		suword32(pos++, val);	\
	} while (0)
95 
/*
 * The two characters "#!" read as one 16-bit quantity; the value depends
 * on the host byte order.  exec_linux_imgact_try() compares it with the
 * first short of the image header to recognise scripts.
 */
#if BYTE_ORDER == LITTLE_ENDIAN
#define SHELLMAGIC      0x2123 /* #! */
#else
#define SHELLMAGIC      0x2321
#endif
101 
102 /*
103  * Allow the sendsig functions to use the ldebug() facility
104  * even though they are not syscalls themselves. Map them
105  * to syscall 0. This is slightly less bogus than using
106  * ldebug(sigreturn).
107  */
108 #define	LINUX_SYS_linux_rt_sendsig	0
109 #define	LINUX_SYS_linux_sendsig		0
110 
111 const char *linux_platform = "i686";
112 static int linux_szplatform;
113 extern char linux_sigcode[];
114 extern int linux_szsigcode;
115 
116 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
117 
118 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
119 SET_DECLARE(linux_device_handler_set, struct linux_device_handler);
120 
121 static int	elf_linux_fixup(register_t **stack_base,
122 		    struct image_params *iparams);
123 static register_t *linux_copyout_strings(struct image_params *imgp);
124 static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
125 		    caddr_t *params);
126 static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
127 static void	exec_linux_setregs(struct thread *td, u_long entry,
128 				   u_long stack, u_long ps_strings);
129 static void	linux32_fixlimit(struct rlimit *rl, int which);
130 static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel);
131 
132 static eventhandler_tag linux_exit_tag;
133 static eventhandler_tag linux_schedtail_tag;
134 static eventhandler_tag linux_exec_tag;
135 
/*
 * Linux syscalls return negative errno's, we do positive and map them
 * Reference:
 *   FreeBSD: src/sys/sys/errno.h
 *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
 *            linux-2.6.17.8/include/asm-generic/errno.h
 *
 * The table is indexed by the FreeBSD errno value (0 through ELAST); each
 * entry is the matching Linux errno, already negated.  The comment at the
 * end of each row gives the range of FreeBSD errno values that row covers.
 * The table must grow whenever ELAST does.
 */
static int bsd_to_linux_errno[ELAST + 1] = {
	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,	/*  0 -  9 */
	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,	/* 10 - 19 */
	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,	/* 20 - 29 */
	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,	/* 30 - 39 */
	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,	/* 40 - 49 */
	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,	/* 50 - 59 */
	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,	/* 60 - 69 */
	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,	/* 70 - 79 */
	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,	/* 80 - 89 */
	 -72, -67, -71						/* 90 - 92 */
};
155 
/*
 * FreeBSD -> Linux signal number translation.
 *
 * The table holds LINUX_SIGTBLSZ entries.  The sendsig routines in this
 * file index the process' sv_sigtbl with _SIG_IDX(sig); presumably that
 * pointer refers to this table, so entry N describes FreeBSD signal
 * N + 1 -- confirm against the sysentvec definition.  The two entries
 * that are 0 presumably mark FreeBSD signals without a Linux equivalent.
 */
int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
	0, LINUX_SIGUSR1, LINUX_SIGUSR2
};
166 
/*
 * Linux -> FreeBSD signal number translation, the counterpart of
 * bsd_to_linux_signal[].  It holds LINUX_SIGTBLSZ entries, presumably
 * indexed by the Linux signal number minus one -- confirm against the
 * users of this table, which are outside this file section.  Note that
 * SIGBUS and SIGURG each appear twice, so the mapping is not one-to-one.
 */
int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
	SIGHUP, SIGINT, SIGQUIT, SIGILL,
	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
	SIGIO, SIGURG, SIGSYS
};
177 
/*
 * FreeBSD trap code -> Linux trap number translation.
 *
 * _bsd_to_linux_trapcode[] is indexed by the FreeBSD trap code; the
 * comment on each row names the FreeBSD trap that index stands for.
 * Codes that have no row-specific value map to LINUX_T_UNKNOWN.
 */
#define LINUX_T_UNKNOWN  255
static int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/* 0 */
	6,			/* 1  T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/* 2 */
	3,			/* 3  T_BPTFLT */
	LINUX_T_UNKNOWN,	/* 4 */
	LINUX_T_UNKNOWN,	/* 5 */
	16,			/* 6  T_ARITHTRAP */
	254,			/* 7  T_ASTFLT */
	LINUX_T_UNKNOWN,	/* 8 */
	13,			/* 9  T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};
/*
 * Bounds-checked lookup: any code at or beyond the end of the table
 * yields LINUX_T_UNKNOWN.  The comparison is made against a sizeof
 * expression and is therefore unsigned, so a negative int argument is
 * also treated as out of range.  `code' is evaluated more than once.
 */
#define bsd_to_linux_trapcode(code) \
    ((code)<sizeof(_bsd_to_linux_trapcode)/sizeof(*_bsd_to_linux_trapcode)? \
     _bsd_to_linux_trapcode[(code)]: \
     LINUX_T_UNKNOWN)
216 
/*
 * 32-bit layout of the ps_strings block for Linux processes.  The code
 * in this file places it at LINUX32_PS_STRINGS, and
 * linux_copyout_strings() fills in each member with suword32().  The
 * pointer members are kept as 32-bit integers rather than native
 * pointers.
 */
struct linux32_ps_strings {
	u_int32_t ps_argvstr;	/* first of 0 or more argument strings */
	u_int ps_nargvstr;	/* the number of argument strings */
	u_int32_t ps_envstr;	/* first of 0 or more environment strings */
	u_int ps_nenvstr;	/* the number of environment strings */
};
223 
224 /*
225  * If FreeBSD & Linux have a difference of opinion about what a trap
226  * means, deal with it here.
227  *
228  * MPSAFE
229  */
230 static int
231 translate_traps(int signal, int trap_code)
232 {
233 	if (signal != SIGBUS)
234 		return signal;
235 	switch (trap_code) {
236 	case T_PROTFLT:
237 	case T_TSSFLT:
238 	case T_DOUBLEFLT:
239 	case T_PAGEFLT:
240 		return SIGSEGV;
241 	default:
242 		return signal;
243 	}
244 }
245 
246 static int
247 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
248 {
249 	Elf32_Auxargs *args;
250 	Elf32_Addr *base;
251 	Elf32_Addr *pos, *uplatform;
252 	struct linux32_ps_strings *arginfo;
253 
254 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
255 	uplatform = (Elf32_Addr *)((caddr_t)arginfo - linux_szsigcode -
256 	    linux_szplatform);
257 
258 	KASSERT(curthread->td_proc == imgp->proc,
259 	    ("unsafe elf_linux_fixup(), should be curproc"));
260 	base = (Elf32_Addr *)*stack_base;
261 	args = (Elf32_Auxargs *)imgp->auxargs;
262 	pos = base + (imgp->args->argc + imgp->args->envc + 2);
263 
264 	AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature);
265 
266 	/*
267 	 * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0,
268 	 * as it has appeared in the 2.4.0-rc7 first time.
269 	 * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK),
270 	 * glibc falls back to the hard-coded CLK_TCK value when aux entry
271 	 * is not present.
272 	 * Also see linux_times() implementation.
273 	 */
274 	if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
275 		AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz);
276 	AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
277 	AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
278 	AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
279 	AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
280 	AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
281 	AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
282 	AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
283 	AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, 0);
284 	AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
285 	AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
286 	AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
287 	AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
288 	AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(uplatform));
289 	if (args->execfd != -1)
290 		AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
291 	AUXARGS_ENTRY_32(pos, AT_NULL, 0);
292 
293 	free(imgp->auxargs, M_TEMP);
294 	imgp->auxargs = NULL;
295 
296 	base--;
297 	suword32(base, (uint32_t)imgp->args->argc);
298 	*stack_base = (register_t *)base;
299 	return 0;
300 }
301 
302 extern unsigned long linux_sznonrtsigcode;
303 
/*
 * Deliver a signal to a handler that wants the three-argument
 * (signal, siginfo, ucontext) form; linux_sendsig() hands off to this
 * routine for handlers installed with SA_SIGINFO.
 *
 * A struct l_rt_sigframe is assembled in the kernel, copied out to the
 * user stack (or to the alternate signal stack when one is enabled, not
 * already in use, and requested for this signal), and the trap frame is
 * then redirected to the signal trampoline.
 *
 * Locking: entered with both the process lock and p_sigacts->ps_mtx held
 * (asserted below).  Both are dropped while the frame is built and copied
 * out, and both are re-acquired before returning.
 *
 * If the frame cannot be copied out the process is terminated through
 * sigexit(td, SIGILL).
 */
static void
linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct sigacts *psp;
	struct trapframe *regs;
	struct l_rt_sigframe *fp, frame;
	int oonstack;
	int sig;
	int code;

	sig = ksi->ksi_signo;
	code = ksi->ksi_code;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	/* Remember whether we were already running on the alternate stack. */
	oonstack = sigonstack(regs->tf_rsp);

#ifdef DEBUG
	if (ldebug(rt_sendsig))
		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
		    catcher, sig, (void*)mask, code);
#endif
	/*
	 * Allocate space for the signal handler context.
	 */
	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		/* Top of the alternate stack, minus room for one frame. */
		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
	} else
		/* One frame below the current user stack pointer. */
		fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
	mtx_unlock(&psp->ps_mtx);

	/*
	 * Build the argument list for the signal handler.
	 */
	/* Translate the signal number through the ABI's table, if any. */
	if (p->p_sysent->sv_sigtbl)
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];

	bzero(&frame, sizeof(frame));

	frame.sf_handler = PTROUT(catcher);
	frame.sf_sig = sig;
	/* These point into the user-space copy of the frame, not `frame'. */
	frame.sf_siginfo = PTROUT(&fp->sf_si);
	frame.sf_ucontext = PTROUT(&fp->sf_sc);

	/* Fill in POSIX parts */
	ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);

	/*
	 * Build the signal context to be used by sigreturn.
	 */
	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
	frame.sf_sc.uc_link = 0;		/* XXX ??? */

	frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
	PROC_UNLOCK(p);

	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);

	/* Save the interrupted register state, narrowed to 32-bit fields. */
	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_rdi;
	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_rsi;
	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_rbp;
	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_rbx;
	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_rdx;
	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_rcx;
	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_rax;
	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_rip;
	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
	frame.sf_sc.uc_mcontext.sc_gs     = regs->tf_gs;
	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
	frame.sf_sc.uc_mcontext.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);

#ifdef DEBUG
	if (ldebug(rt_sendsig))
		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
#endif

	if (copyout(&frame, fp, sizeof(frame)) != 0) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
#ifdef DEBUG
		if (ldebug(rt_sendsig))
			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
			    fp, oonstack);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	/*
	 * Build context to run handler in.
	 */
	regs->tf_rsp = PTROUT(fp);
	/*
	 * Entry point inside the sigcode area below ps_strings, offset by
	 * linux_sznonrtsigcode -- presumably the rt trampoline follows the
	 * non-rt one in that area; confirm against the sigcode source.
	 */
	regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
	    linux_sznonrtsigcode;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucode32sel;
	regs->tf_ss = _udatasel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	td->td_pcb->pcb_full_iret = 1;
	/* Re-take the locks the caller expects to still hold. */
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
431 
432 
/*
 * Send an interrupt to process.
 *
 * A struct l_sigframe holding the handler address, the (translated)
 * signal number and the interrupted register state is copied out to the
 * user stack, or to the alternate signal stack when one is enabled, not
 * already in use, and requested for this signal.  The trap frame is then
 * pointed at the signal trampoline located just below LINUX32_PS_STRINGS,
 * so that the handler runs next; the saved context is what
 * linux_sigreturn() later restores.
 *
 * Handlers installed with SA_SIGINFO are passed on to linux_rt_sendsig().
 *
 * Locking: entered with both the process lock and p_sigacts->ps_mtx held
 * (asserted below).  Both are dropped while the frame is built and copied
 * out, and both are re-acquired before returning.
 *
 * If the frame cannot be copied out the process is terminated through
 * sigexit(td, SIGILL).
 */
static void
linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct sigacts *psp;
	struct trapframe *regs;
	struct l_sigframe *fp, frame;
	l_sigset_t lmask;
	int oonstack, i;
	int sig, code;

	sig = ksi->ksi_signo;
	code = ksi->ksi_code;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		linux_rt_sendsig(catcher, ksi, mask);
		return;
	}

	regs = td->td_frame;
	/* Remember whether we were already running on the alternate stack. */
	oonstack = sigonstack(regs->tf_rsp);

#ifdef DEBUG
	if (ldebug(sendsig))
		printf(ARGS(sendsig, "%p, %d, %p, %u"),
		    catcher, sig, (void*)mask, code);
#endif

	/*
	 * Allocate space for the signal handler context.
	 */
	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		/* Top of the alternate stack, minus room for one frame. */
		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
	} else
		/* One frame below the current user stack pointer. */
		fp = (struct l_sigframe *)regs->tf_rsp - 1;
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Build the argument list for the signal handler.
	 */
	/* Translate the signal number through the ABI's table, if any. */
	if (p->p_sysent->sv_sigtbl)
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];

	bzero(&frame, sizeof(frame));

	frame.sf_handler = PTROUT(catcher);
	frame.sf_sig = sig;

	bsd_to_linux_sigset(mask, &lmask);

	/*
	 * Build the signal context to be used by sigreturn.
	 */
	frame.sf_sc.sc_mask   = lmask.__bits[0];
	frame.sf_sc.sc_gs     = regs->tf_gs;
	frame.sf_sc.sc_fs     = regs->tf_fs;
	frame.sf_sc.sc_es     = regs->tf_es;
	frame.sf_sc.sc_ds     = regs->tf_ds;
	frame.sf_sc.sc_edi    = regs->tf_rdi;
	frame.sf_sc.sc_esi    = regs->tf_rsi;
	frame.sf_sc.sc_ebp    = regs->tf_rbp;
	frame.sf_sc.sc_ebx    = regs->tf_rbx;
	frame.sf_sc.sc_edx    = regs->tf_rdx;
	frame.sf_sc.sc_ecx    = regs->tf_rcx;
	frame.sf_sc.sc_eax    = regs->tf_rax;
	frame.sf_sc.sc_eip    = regs->tf_rip;
	frame.sf_sc.sc_cs     = regs->tf_cs;
	frame.sf_sc.sc_eflags = regs->tf_rflags;
	frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
	frame.sf_sc.sc_ss     = regs->tf_ss;
	frame.sf_sc.sc_err    = regs->tf_err;
	frame.sf_sc.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);

	/* The first mask word lives in sc_mask; the rest go in sf_extramask. */
	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
		frame.sf_extramask[i] = lmask.__bits[i+1];

	if (copyout(&frame, fp, sizeof(frame)) != 0) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	/*
	 * Build context to run handler in.
	 */
	regs->tf_rsp = PTROUT(fp);
	/* Start of the sigcode area that sits just below ps_strings. */
	regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode);
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucode32sel;
	regs->tf_ss = _udatasel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	td->td_pcb->pcb_full_iret = 1;
	/* Re-take the locks the caller expects to still hold. */
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
554 
/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig. Check carefully to
 * make sure that the user has not modified the
 * psl to gain improper privileges or to cause
 * a machine fault.
 *
 * args->sfp is the user address of the struct l_sigframe to restore from.
 *
 * Returns EJUSTRETURN once the trap frame has been reloaded, EFAULT if
 * the frame cannot be copied in, and EINVAL if the saved eflags or %cs
 * fail the security checks (a bad %cs additionally posts SIGBUS to the
 * thread).
 */
int
linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
{
	struct proc *p = td->td_proc;
	struct l_sigframe frame;
	struct trapframe *regs;
	l_sigset_t lmask;
	int eflags, i;
	ksiginfo_t ksi;

	regs = td->td_frame;

#ifdef DEBUG
	if (ldebug(sigreturn))
		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
#endif
	/*
	 * The trampoline code hands us the sigframe.
	 * It is unsafe to keep track of it ourselves, in the event that a
	 * program jumps out of a signal handler.
	 */
	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
		return (EFAULT);

	/*
	 * Check for security violations.
	 */
	/* True when `ef' and `oef' differ only in user-changeable bits. */
#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
	eflags = frame.sf_sc.sc_eflags;
	/*
	 * XXX do allow users to change the privileged flag PSL_RF.  The
	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
	 * sometimes set it there too.  tf_eflags is kept in the signal
	 * context during signal handling and there is no other place
	 * to remember it, so the PSL_RF bit may be corrupted by the
	 * signal handler without us knowing.  Corruption of the PSL_RF
	 * bit at worst causes one more or one less debugger trap, so
	 * allowing it is fairly harmless.
	 */
	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
		return(EINVAL);

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_rip;
		trapsignal(td, &ksi);
		return(EINVAL);
	}

	/* Reassemble the Linux mask: word 0 from sc_mask, rest from extramask. */
	lmask.__bits[0] = frame.sf_sc.sc_mask;
	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
		lmask.__bits[i+1] = frame.sf_extramask[i];
	PROC_LOCK(p);
	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
	SIG_CANTMASK(td->td_sigmask);
	signotify(td);
	PROC_UNLOCK(p);

	/*
	 * Restore signal context.
	 */
	regs->tf_rdi    = frame.sf_sc.sc_edi;
	regs->tf_rsi    = frame.sf_sc.sc_esi;
	regs->tf_rbp    = frame.sf_sc.sc_ebp;
	regs->tf_rbx    = frame.sf_sc.sc_ebx;
	regs->tf_rdx    = frame.sf_sc.sc_edx;
	regs->tf_rcx    = frame.sf_sc.sc_ecx;
	regs->tf_rax    = frame.sf_sc.sc_eax;
	regs->tf_rip    = frame.sf_sc.sc_eip;
	regs->tf_cs     = frame.sf_sc.sc_cs;
	regs->tf_ds     = frame.sf_sc.sc_ds;
	regs->tf_es     = frame.sf_sc.sc_es;
	regs->tf_fs     = frame.sf_sc.sc_fs;
	regs->tf_gs     = frame.sf_sc.sc_gs;
	regs->tf_rflags = eflags;
	regs->tf_rsp    = frame.sf_sc.sc_esp_at_signal;
	regs->tf_ss     = frame.sf_sc.sc_ss;
	td->td_pcb->pcb_full_iret = 1;

	/* The trap frame now holds the complete return state. */
	return (EJUSTRETURN);
}
655 
/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by rt_sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig. Check carefully to
 * make sure that the user has not modified the
 * psl to gain improper privileges or to cause
 * a machine fault.
 *
 * args->ucp is the user address of the struct l_ucontext to restore from.
 * Besides the signal mask and registers, the alternate signal stack
 * settings recorded in the ucontext are re-applied through
 * kern_sigaltstack(), whose result is deliberately ignored.
 *
 * Returns EJUSTRETURN once the trap frame has been reloaded, EFAULT if
 * the ucontext cannot be copied in, and EINVAL if the saved eflags or %cs
 * fail the security checks (a bad %cs additionally posts SIGBUS to the
 * thread).
 */
int
linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
{
	struct proc *p = td->td_proc;
	struct l_ucontext uc;
	struct l_sigcontext *context;
	l_stack_t *lss;
	stack_t ss;
	struct trapframe *regs;
	int eflags;
	ksiginfo_t ksi;

	regs = td->td_frame;

#ifdef DEBUG
	if (ldebug(rt_sigreturn))
		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
#endif
	/*
	 * The trampoline code hands us the ucontext.
	 * It is unsafe to keep track of it ourselves, in the event that a
	 * program jumps out of a signal handler.
	 */
	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
		return (EFAULT);

	context = &uc.uc_mcontext;

	/*
	 * Check for security violations.
	 */
	/* True when `ef' and `oef' differ only in user-changeable bits. */
#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
	eflags = context->sc_eflags;
	/*
	 * XXX do allow users to change the privileged flag PSL_RF.  The
	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
	 * sometimes set it there too.  tf_eflags is kept in the signal
	 * context during signal handling and there is no other place
	 * to remember it, so the PSL_RF bit may be corrupted by the
	 * signal handler without us knowing.  Corruption of the PSL_RF
	 * bit at worst causes one more or one less debugger trap, so
	 * allowing it is fairly harmless.
	 */
	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
		return(EINVAL);

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
	if (!CS_SECURE(context->sc_cs)) {
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_rip;
		trapsignal(td, &ksi);
		return(EINVAL);
	}

	/* Reinstate the signal mask that was saved in the ucontext. */
	PROC_LOCK(p);
	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
	SIG_CANTMASK(td->td_sigmask);
	signotify(td);
	PROC_UNLOCK(p);

	/*
	 * Restore signal context
	 */
	regs->tf_gs	= context->sc_gs;
	regs->tf_fs	= context->sc_fs;
	regs->tf_es	= context->sc_es;
	regs->tf_ds	= context->sc_ds;
	regs->tf_rdi    = context->sc_edi;
	regs->tf_rsi    = context->sc_esi;
	regs->tf_rbp    = context->sc_ebp;
	regs->tf_rbx    = context->sc_ebx;
	regs->tf_rdx    = context->sc_edx;
	regs->tf_rcx    = context->sc_ecx;
	regs->tf_rax    = context->sc_eax;
	regs->tf_rip    = context->sc_eip;
	regs->tf_cs     = context->sc_cs;
	regs->tf_rflags = eflags;
	regs->tf_rsp    = context->sc_esp_at_signal;
	regs->tf_ss     = context->sc_ss;
	td->td_pcb->pcb_full_iret = 1;

	/*
	 * call sigaltstack & ignore results..
	 */
	lss = &uc.uc_stack;
	ss.ss_sp = PTRIN(lss->ss_sp);
	ss.ss_size = lss->ss_size;
	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);

#ifdef DEBUG
	if (ldebug(rt_sigreturn))
		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
#endif
	(void)kern_sigaltstack(td, &ss, NULL);

	/* The trap frame now holds the complete return state. */
	return (EJUSTRETURN);
}
772 
773 /*
774  * MPSAFE
775  */
776 static void
777 linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
778 {
779 	args[0] = tf->tf_rbx;
780 	args[1] = tf->tf_rcx;
781 	args[2] = tf->tf_rdx;
782 	args[3] = tf->tf_rsi;
783 	args[4] = tf->tf_rdi;
784 	args[5] = tf->tf_rbp;	/* Unconfirmed */
785 	*params = NULL;		/* no copyin */
786 }
787 
788 /*
789  * If a linux binary is exec'ing something, try this image activator
790  * first.  We override standard shell script execution in order to
791  * be able to modify the interpreter path.  We only do this if a linux
792  * binary is doing the exec, so we do not create an EXEC module for it.
793  */
794 static int	exec_linux_imgact_try(struct image_params *iparams);
795 
796 static int
797 exec_linux_imgact_try(struct image_params *imgp)
798 {
799 	const char *head = (const char *)imgp->image_header;
800 	char *rpath;
801 	int error = -1, len;
802 
803 	/*
804 	* The interpreter for shell scripts run from a linux binary needs
805 	* to be located in /compat/linux if possible in order to recursively
806 	* maintain linux path emulation.
807 	*/
808 	if (((const short *)head)[0] == SHELLMAGIC) {
809 		/*
810 		* Run our normal shell image activator.  If it succeeds attempt
811 		* to use the alternate path for the interpreter.  If an
812 		* alternate * path is found, use our stringspace to store it.
813 		*/
814 		if ((error = exec_shell_imgact(imgp)) == 0) {
815 			linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
816 			    imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0,
817 			    AT_FDCWD);
818 			if (rpath != NULL) {
819 				len = strlen(rpath) + 1;
820 
821 				if (len <= MAXSHELLCMDLEN) {
822 					memcpy(imgp->interpreter_name, rpath,
823 					    len);
824 				}
825 				free(rpath, M_TEMP);
826 			}
827 		}
828 	}
829 	return(error);
830 }
831 
832 /*
833  * Clear registers on exec
834  * XXX copied from ia32_signal.c.
835  */
836 static void
837 exec_linux_setregs(td, entry, stack, ps_strings)
838 	struct thread *td;
839 	u_long entry;
840 	u_long stack;
841 	u_long ps_strings;
842 {
843 	struct trapframe *regs = td->td_frame;
844 	struct pcb *pcb = td->td_pcb;
845 
846 	mtx_lock(&dt_lock);
847 	if (td->td_proc->p_md.md_ldt != NULL)
848 		user_ldt_free(td);
849 	else
850 		mtx_unlock(&dt_lock);
851 
852 	critical_enter();
853 	wrmsr(MSR_FSBASE, 0);
854 	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
855 	pcb->pcb_fsbase = 0;
856 	pcb->pcb_gsbase = 0;
857 	critical_exit();
858 	pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
859 
860 	bzero((char *)regs, sizeof(struct trapframe));
861 	regs->tf_rip = entry;
862 	regs->tf_rsp = stack;
863 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
864 	regs->tf_gs = _ugssel;
865 	regs->tf_fs = _ufssel;
866 	regs->tf_es = _udatasel;
867 	regs->tf_ds = _udatasel;
868 	regs->tf_ss = _udatasel;
869 	regs->tf_flags = TF_HASSEGS;
870 	regs->tf_cs = _ucode32sel;
871 	regs->tf_rbx = ps_strings;
872 	td->td_pcb->pcb_full_iret = 1;
873 	load_cr0(rcr0() | CR0_MP | CR0_TS);
874 	fpstate_drop(td);
875 
876 	/* Return via doreti so that we can change to a different %cs */
877 	pcb->pcb_flags |= PCB_FULLCTX | PCB_32BIT;
878 	pcb->pcb_flags &= ~PCB_GS32BIT;
879 	td->td_retval[1] = 0;
880 }
881 
/*
 * XXX copied from ia32_sysvec.c.
 *
 * Lay out the top of the new image's user stack and return the initial
 * stack base (the address of the argv vector).
 *
 * From LINUX32_PS_STRINGS downwards the layout written here is:
 *
 *	ps_strings block (struct linux32_ps_strings)
 *	signal trampoline (linux_szsigcode bytes)
 *	platform string (linux_szplatform bytes)
 *	SPARE_USRSPACE bytes of slack
 *	argument and environment strings
 *	room for the ELF auxiliary vector (only if imgp->auxargs is set)
 *	envp[] pointers plus a null terminator
 *	argv[] pointers plus a null terminator	<- returned stack base
 *
 * All pointers are stored as 32-bit values.
 *
 * NOTE(review): the results of copyout() and suword32() are not checked
 * anywhere in this function, so a fault while writing the stack goes
 * unnoticed here.
 */
static register_t *
linux_copyout_strings(struct image_params *imgp)
{
	int argc, envc;
	u_int32_t *vectp;
	char *stringp, *destp;
	u_int32_t *stack_base;
	struct linux32_ps_strings *arginfo;

	/*
	 * Calculate string base and vector table pointers.
	 * Also deal with signal trampoline code for this exec type.
	 */
	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
	/* ARG_MAX - stringspace is the number of string bytes actually used. */
	destp =	(caddr_t)arginfo - linux_szsigcode - SPARE_USRSPACE -
	    linux_szplatform - roundup((ARG_MAX - imgp->args->stringspace),
	    sizeof(char *));

	/*
	 * install sigcode
	 */
	copyout(imgp->proc->p_sysent->sv_sigcode,
	    ((caddr_t)arginfo - linux_szsigcode), linux_szsigcode);

	/*
	 * Install LINUX_PLATFORM
	 */
	copyout(linux_platform, ((caddr_t)arginfo - linux_szsigcode -
	    linux_szplatform), linux_szplatform);

	/*
	 * If we have a valid auxargs ptr, prepare some room
	 * on the stack.
	 */
	if (imgp->auxargs) {
		/*
		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
		 * lower compatibility.
		 */
		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
		    (LINUX_AT_COUNT * 2);
		/*
		 * The '+ 2' is for the null pointers at the end of each of
		 * the arg and env vector sets,and imgp->auxarg_size is room
		 * for argument of Runtime loader.
		 */
		vectp = (u_int32_t *) (destp - (imgp->args->argc +
		    imgp->args->envc + 2 + imgp->auxarg_size) *
		    sizeof(u_int32_t));

	} else
		/*
		 * The '+ 2' is for the null pointers at the end of each of
		 * the arg and env vector sets
		 */
		vectp = (u_int32_t *)(destp - (imgp->args->argc +
		    imgp->args->envc + 2) * sizeof(u_int32_t));

	/*
	 * vectp also becomes our initial stack base
	 */
	stack_base = vectp;

	stringp = imgp->args->begin_argv;
	argc = imgp->args->argc;
	envc = imgp->args->envc;
	/*
	 * Copy out strings - arguments and environment.
	 */
	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);

	/*
	 * Fill in "ps_strings" struct for ps, w, etc.
	 */
	suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp);
	suword32(&arginfo->ps_nargvstr, argc);

	/*
	 * Fill in argument portion of vector table.
	 */
	for (; argc > 0; --argc) {
		suword32(vectp++, (uint32_t)(intptr_t)destp);
		/*
		 * Walk the kernel copy of the string to find its length and
		 * advance the user-space address by the same amount,
		 * terminating NUL included.
		 */
		while (*stringp++ != 0)
			destp++;
		destp++;
	}

	/* a null vector table pointer separates the argp's from the envp's */
	suword32(vectp++, 0);

	suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp);
	suword32(&arginfo->ps_nenvstr, envc);

	/*
	 * Fill in environment portion of vector table.
	 */
	for (; envc > 0; --envc) {
		suword32(vectp++, (uint32_t)(intptr_t)destp);
		/* Same walk as for the argument strings above. */
		while (*stringp++ != 0)
			destp++;
		destp++;
	}

	/* end of vector table is a null pointer */
	suword32(vectp, 0);

	return ((register_t *)stack_base);
}
993 
/* Declare the "linux32" sysctl node beneath the _compat parent. */
SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
    "32-bit Linux emulation");

/*
 * Ceilings that linux32_fixlimit() applies to the data, stack and
 * virtual-memory resource limits respectively.  A value of 0 leaves the
 * corresponding limit unclamped.  Each one is exported read-write
 * (CTLFLAG_RW) under the linux32 node declared above.
 */
static u_long	linux32_maxdsiz = LINUX32_MAXDSIZ;
SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
    &linux32_maxdsiz, 0, "");
static u_long	linux32_maxssiz = LINUX32_MAXSSIZ;
SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
    &linux32_maxssiz, 0, "");
static u_long	linux32_maxvmem = LINUX32_MAXVMEM;
SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
    &linux32_maxvmem, 0, "");
1006 
1007 static void
1008 linux32_fixlimit(struct rlimit *rl, int which)
1009 {
1010 
1011 	switch (which) {
1012 	case RLIMIT_DATA:
1013 		if (linux32_maxdsiz != 0) {
1014 			if (rl->rlim_cur > linux32_maxdsiz)
1015 				rl->rlim_cur = linux32_maxdsiz;
1016 			if (rl->rlim_max > linux32_maxdsiz)
1017 				rl->rlim_max = linux32_maxdsiz;
1018 		}
1019 		break;
1020 	case RLIMIT_STACK:
1021 		if (linux32_maxssiz != 0) {
1022 			if (rl->rlim_cur > linux32_maxssiz)
1023 				rl->rlim_cur = linux32_maxssiz;
1024 			if (rl->rlim_max > linux32_maxssiz)
1025 				rl->rlim_max = linux32_maxssiz;
1026 		}
1027 		break;
1028 	case RLIMIT_VMEM:
1029 		if (linux32_maxvmem != 0) {
1030 			if (rl->rlim_cur > linux32_maxvmem)
1031 				rl->rlim_cur = linux32_maxvmem;
1032 			if (rl->rlim_max > linux32_maxvmem)
1033 				rl->rlim_max = linux32_maxvmem;
1034 		}
1035 		break;
1036 	}
1037 }
1038 
/*
 * System entry vector for 32-bit Linux ELF binaries ("Linux ELF32").
 * It collects the Linux syscall table, the signal and errno translation
 * tables, the signal trampoline, the address-space layout constants and
 * the hooks (fixup, copyout_strings, setregs, fixlimit, ...) that this
 * ABI supplies.
 */
struct sysentvec elf_linux_sysvec = {
	.sv_size	= LINUX_SYS_MAXSYSCALL,
	.sv_table	= linux_sysent,
	.sv_mask	= 0,
	.sv_sigsize	= LINUX_SIGTBLSZ,
	.sv_sigtbl	= bsd_to_linux_signal,
	.sv_errsize	= ELAST + 1,
	.sv_errtbl	= bsd_to_linux_errno,
	.sv_transtrap	= translate_traps,
	.sv_fixup	= elf_linux_fixup,
	.sv_sendsig	= linux_sendsig,
	.sv_sigcode	= linux_sigcode,
	.sv_szsigcode	= &linux_szsigcode,
	.sv_prepsyscall	= linux_prepsyscall,
	.sv_name	= "Linux ELF32",
	.sv_coredump	= elf32_coredump,
	.sv_imgact_try	= exec_linux_imgact_try,
	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
	.sv_pagesize	= PAGE_SIZE,
	.sv_minuser	= VM_MIN_ADDRESS,
	/* The top of user space and the stack top are the same constant. */
	.sv_maxuser	= LINUX32_USRSTACK,
	.sv_usrstack	= LINUX32_USRSTACK,
	.sv_psstrings	= LINUX32_PS_STRINGS,
	.sv_stackprot	= VM_PROT_ALL,
	.sv_copyout_strings = linux_copyout_strings,
	.sv_setregs	= exec_linux_setregs,
	.sv_fixlimit	= linux32_fixlimit,
	/* Shares the tunable that linux32_fixlimit() uses for RLIMIT_STACK. */
	.sv_maxssiz	= &linux32_maxssiz,
	.sv_flags	= SV_ABI_LINUX | SV_ILP32 | SV_IA32
};
1069 
/*
 * Values used to recognize the Linux ABI note: GNU_ABI_VENDOR is the vendor
 * name placed in linux32_brandnote, and GNULINUX_ABI_DESC is the value the
 * first descriptor word must have for linux32_trans_osrel() to accept it.
 */
static char GNU_ABI_VENDOR[] = "GNU";
static int GNULINUX_ABI_DESC = 0;
1072 
1073 static boolean_t
1074 linux32_trans_osrel(const Elf_Note *note, int32_t *osrel)
1075 {
1076 	const Elf32_Word *desc;
1077 	uintptr_t p;
1078 
1079 	p = (uintptr_t)(note + 1);
1080 	p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
1081 
1082 	desc = (const Elf32_Word *)p;
1083 	if (desc[0] != GNULINUX_ABI_DESC)
1084 		return (FALSE);
1085 
1086 	/*
1087 	 * For linux we encode osrel as follows (see linux_mib.c):
1088 	 * VVVMMMIII (version, major, minor), see linux_mib.c.
1089 	 */
1090 	*osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
1091 
1092 	return (TRUE);
1093 }
1094 
/*
 * Brand note shared by both Linux brand entries below.  The header carries
 * the size of the "GNU" vendor string (terminating NUL included), a
 * descriptor size of 16 and note type 1.  BN_TRANSLATE_OSREL is set and
 * linux32_trans_osrel() is supplied as the translation callback.
 */
static Elf_Brandnote linux32_brandnote = {
	.hdr.n_namesz	= sizeof(GNU_ABI_VENDOR),
	.hdr.n_descsz	= 16,	/* XXX at least 16 */
	.hdr.n_type	= 1,
	.vendor		= GNU_ABI_VENDOR,
	.flags		= BN_TRANSLATE_OSREL,
	.trans_osrel	= linux32_trans_osrel
};
1103 
/*
 * Brand entry for EM_386 Linux binaries whose interpreter is
 * /lib/ld-linux.so.1.  It uses /compat/linux as the emulation path and
 * refers to elf_linux_sysvec and the shared linux32_brandnote.
 */
static Elf32_Brandinfo linux_brand = {
	.brand		= ELFOSABI_LINUX,
	.machine	= EM_386,
	.compat_3_brand	= "Linux",
	.emul_path	= "/compat/linux",
	.interp_path	= "/lib/ld-linux.so.1",
	.sysvec		= &elf_linux_sysvec,
	.interp_newpath	= NULL,
	.brand_note	= &linux32_brandnote,
	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
1115 
/*
 * Brand entry identical to linux_brand except for the interpreter path,
 * which here is /lib/ld-linux.so.2.
 */
static Elf32_Brandinfo linux_glibc2brand = {
	.brand		= ELFOSABI_LINUX,
	.machine	= EM_386,
	.compat_3_brand	= "Linux",
	.emul_path	= "/compat/linux",
	.interp_path	= "/lib/ld-linux.so.2",
	.sysvec		= &elf_linux_sysvec,
	.interp_newpath	= NULL,
	.brand_note	= &linux32_brandnote,
	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
1127 
/*
 * NULL-terminated list of the brand entries that linux_elf_modevent()
 * inserts on load and removes on unload.
 */
Elf32_Brandinfo *linux_brandlist[] = {
	&linux_brand,
	&linux_glibc2brand,
	NULL
};
1133 
1134 static int
1135 linux_elf_modevent(module_t mod, int type, void *data)
1136 {
1137 	Elf32_Brandinfo **brandinfo;
1138 	int error;
1139 	struct linux_ioctl_handler **lihp;
1140 	struct linux_device_handler **ldhp;
1141 
1142 	error = 0;
1143 
1144 	switch(type) {
1145 	case MOD_LOAD:
1146 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1147 		     ++brandinfo)
1148 			if (elf32_insert_brand_entry(*brandinfo) < 0)
1149 				error = EINVAL;
1150 		if (error == 0) {
1151 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1152 				linux_ioctl_register_handler(*lihp);
1153 			SET_FOREACH(ldhp, linux_device_handler_set)
1154 				linux_device_register_handler(*ldhp);
1155 			mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF);
1156 			sx_init(&emul_shared_lock, "emuldata->shared lock");
1157 			LIST_INIT(&futex_list);
1158 			mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
1159 			linux_exit_tag = EVENTHANDLER_REGISTER(process_exit,
1160 			    linux_proc_exit, NULL, 1000);
1161 			linux_schedtail_tag = EVENTHANDLER_REGISTER(schedtail,
1162 			    linux_schedtail, NULL, 1000);
1163 			linux_exec_tag = EVENTHANDLER_REGISTER(process_exec,
1164 			    linux_proc_exec, NULL, 1000);
1165 			linux_szplatform = roundup(strlen(linux_platform) + 1,
1166 			    sizeof(char *));
1167 			linux_osd_jail_register();
1168 			stclohz = (stathz ? stathz : hz);
1169 			if (bootverbose)
1170 				printf("Linux ELF exec handler installed\n");
1171 		} else
1172 			printf("cannot insert Linux ELF brand handler\n");
1173 		break;
1174 	case MOD_UNLOAD:
1175 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1176 		     ++brandinfo)
1177 			if (elf32_brand_inuse(*brandinfo))
1178 				error = EBUSY;
1179 		if (error == 0) {
1180 			for (brandinfo = &linux_brandlist[0];
1181 			     *brandinfo != NULL; ++brandinfo)
1182 				if (elf32_remove_brand_entry(*brandinfo) < 0)
1183 					error = EINVAL;
1184 		}
1185 		if (error == 0) {
1186 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1187 				linux_ioctl_unregister_handler(*lihp);
1188 			SET_FOREACH(ldhp, linux_device_handler_set)
1189 				linux_device_unregister_handler(*ldhp);
1190 			mtx_destroy(&emul_lock);
1191 			sx_destroy(&emul_shared_lock);
1192 			mtx_destroy(&futex_mtx);
1193 			EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
1194 			EVENTHANDLER_DEREGISTER(schedtail, linux_schedtail_tag);
1195 			EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
1196 			linux_osd_jail_deregister();
1197 			if (bootverbose)
1198 				printf("Linux ELF exec handler removed\n");
1199 		} else
1200 			printf("Could not deinstall ELF interpreter entry\n");
1201 		break;
1202 	default:
1203 		return EOPNOTSUPP;
1204 	}
1205 	return error;
1206 }
1207 
/*
 * Module descriptor: the module is named "linuxelf" and its events are
 * handled by linux_elf_modevent(); the last field is left as 0.
 */
static moduledata_t linux_elf_mod = {
	"linuxelf",
	linux_elf_modevent,
	0
};
1213 
/* Declare the linuxelf module with SI_SUB_EXEC and SI_ORDER_ANY. */
DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1215