xref: /freebsd/sys/amd64/linux32/linux32_sysvec.c (revision 67d39748499e85cff626c202aa2cb6e9f180283e)
1 /*-
2  * Copyright (c) 2004 Tim J. Robbins
3  * Copyright (c) 2003 Peter Wemm
4  * Copyright (c) 2002 Doug Rabson
5  * Copyright (c) 1998-1999 Andrew Gallatin
6  * Copyright (c) 1994-1996 Søren Schmidt
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer
14  *    in this position and unchanged.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. The name of the author may not be used to endorse or promote products
19  *    derived from this software without specific prior written permission
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 #include "opt_compat.h"
36 
37 #ifndef COMPAT_FREEBSD32
38 #error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!"
39 #endif
40 
41 #define	__ELF_WORD_SIZE	32
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/exec.h>
46 #include <sys/fcntl.h>
47 #include <sys/imgact.h>
48 #include <sys/imgact_elf.h>
49 #include <sys/kernel.h>
50 #include <sys/lock.h>
51 #include <sys/malloc.h>
52 #include <sys/module.h>
53 #include <sys/mutex.h>
54 #include <sys/proc.h>
55 #include <sys/resourcevar.h>
56 #include <sys/signalvar.h>
57 #include <sys/sysctl.h>
58 #include <sys/syscallsubr.h>
59 #include <sys/sysent.h>
60 #include <sys/sysproto.h>
61 #include <sys/vnode.h>
62 #include <sys/eventhandler.h>
63 
64 #include <vm/vm.h>
65 #include <vm/pmap.h>
66 #include <vm/vm_extern.h>
67 #include <vm/vm_map.h>
68 #include <vm/vm_object.h>
69 #include <vm/vm_page.h>
70 #include <vm/vm_param.h>
71 
72 #include <machine/cpu.h>
73 #include <machine/md_var.h>
74 #include <machine/pcb.h>
75 #include <machine/specialreg.h>
76 
77 #include <amd64/linux32/linux.h>
78 #include <amd64/linux32/linux32_proto.h>
79 #include <compat/linux/linux_emul.h>
80 #include <compat/linux/linux_futex.h>
81 #include <compat/linux/linux_ioctl.h>
82 #include <compat/linux/linux_mib.h>
83 #include <compat/linux/linux_misc.h>
84 #include <compat/linux/linux_signal.h>
85 #include <compat/linux/linux_util.h>
86 #include <compat/linux/linux_vdso.h>
87 
88 MODULE_VERSION(linux, 1);
89 
/*
 * Emit one auxiliary-vector entry: store 'id' and then 'val' through 'pos'
 * with suword32(), post-incrementing 'pos' after each store.  'pos' must be
 * a modifiable lvalue; the suword32() results are not examined.
 */
#define	AUXARGS_ENTRY_32(pos, id, val)					\
	do {								\
		suword32(pos++, id);					\
		suword32(pos++, val);					\
	} while (0)
95 
/*
 * The "#!" script marker as it appears when the first two bytes of an
 * image header are read as one host-order 16-bit value.
 */
#if BYTE_ORDER != LITTLE_ENDIAN
#define SHELLMAGIC      0x2321
#else
#define SHELLMAGIC      0x2123 /* #! */
#endif
101 
102 /*
103  * Allow the sendsig functions to use the ldebug() facility
104  * even though they are not syscalls themselves. Map them
105  * to syscall 0. This is slightly less bogus than using
106  * ldebug(sigreturn).
107  */
#define	LINUX_SYS_linux_sendsig		0
#define	LINUX_SYS_linux_rt_sendsig	0
110 
111 const char *linux_kplatform;
112 static int linux_szsigcode;
113 static vm_object_t linux_shared_page_obj;
114 static char *linux_shared_page_mapping;
115 extern char _binary_linux32_locore_o_start;
116 extern char _binary_linux32_locore_o_end;
117 
118 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
119 
120 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
121 
122 static int	elf_linux_fixup(register_t **stack_base,
123 		    struct image_params *iparams);
124 static register_t *linux_copyout_strings(struct image_params *imgp);
125 static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
126 static void	exec_linux_setregs(struct thread *td,
127 				   struct image_params *imgp, u_long stack);
128 static void	linux32_fixlimit(struct rlimit *rl, int which);
129 static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel);
130 static void	linux_vdso_install(void *param);
131 static void	linux_vdso_deinstall(void *param);
132 
133 static eventhandler_tag linux_exit_tag;
134 static eventhandler_tag linux_exec_tag;
135 static eventhandler_tag linux_thread_dtor_tag;
136 
137 /*
138  * Linux syscalls return negative errno's, we do positive and map them
139  * Reference:
140  *   FreeBSD: src/sys/sys/errno.h
141  *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
142  *            linux-2.6.17.8/include/asm-generic/errno.h
143  */
144 static int bsd_to_linux_errno[ELAST + 1] = {
145 	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
146 	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
147 	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
148 	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
149 	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
150 	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
151 	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
152 	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
153 	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,
154 	 -72, -67, -71
155 };
156 
157 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
158 	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
159 	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
160 	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
161 	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
162 	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
163 	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
164 	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
165 	0, LINUX_SIGUSR1, LINUX_SIGUSR2
166 };
167 
168 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
169 	SIGHUP, SIGINT, SIGQUIT, SIGILL,
170 	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
171 	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
172 	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
173 	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
174 	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
175 	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
176 	SIGIO, SIGURG, SIGSYS
177 };
178 
/* Value reported for any FreeBSD trap code that has no table entry. */
#define LINUX_T_UNKNOWN  255

/* Indexed by FreeBSD trap code; yields the trap number reported to Linux. */
static int _bsd_to_linux_trapcode[] = {
	/*  0              */ LINUX_T_UNKNOWN,
	/*  1 T_PRIVINFLT  */ 6,
	/*  2              */ LINUX_T_UNKNOWN,
	/*  3 T_BPTFLT     */ 3,
	/*  4              */ LINUX_T_UNKNOWN,
	/*  5              */ LINUX_T_UNKNOWN,
	/*  6 T_ARITHTRAP  */ 16,
	/*  7 T_ASTFLT     */ 254,
	/*  8              */ LINUX_T_UNKNOWN,
	/*  9 T_PROTFLT    */ 13,
	/* 10 T_TRCTRAP    */ 1,
	/* 11              */ LINUX_T_UNKNOWN,
	/* 12 T_PAGEFLT    */ 14,
	/* 13              */ LINUX_T_UNKNOWN,
	/* 14 T_ALIGNFLT   */ 17,
	/* 15              */ LINUX_T_UNKNOWN,
	/* 16              */ LINUX_T_UNKNOWN,
	/* 17              */ LINUX_T_UNKNOWN,
	/* 18 T_DIVIDE     */ 0,
	/* 19 T_NMI        */ 2,
	/* 20 T_OFLOW      */ 4,
	/* 21 T_BOUND      */ 5,
	/* 22 T_DNA        */ 7,
	/* 23 T_DOUBLEFLT  */ 8,
	/* 24 T_FPOPFLT    */ 9,
	/* 25 T_TSSFLT     */ 10,
	/* 26 T_SEGNPFLT   */ 11,
	/* 27 T_STKFLT     */ 12,
	/* 28 T_MCHK       */ 18,
	/* 29 T_XMMFLT     */ 19,
	/* 30 T_RESERVED   */ 15
};

/*
 * Look up 'code' in the table above.  The comparison is made against an
 * unsigned element count, so out-of-range (including negative) codes
 * produce LINUX_T_UNKNOWN instead of indexing outside the table.
 */
#define bsd_to_linux_trapcode(code)					\
    ((code) < sizeof(_bsd_to_linux_trapcode) /				\
	sizeof(_bsd_to_linux_trapcode[0]) ?				\
	_bsd_to_linux_trapcode[(code)] : LINUX_T_UNKNOWN)
217 
218 struct linux32_ps_strings {
219 	u_int32_t ps_argvstr;	/* first of 0 or more argument strings */
220 	u_int ps_nargvstr;	/* the number of argument strings */
221 	u_int32_t ps_envstr;	/* first of 0 or more environment strings */
222 	u_int ps_nenvstr;	/* the number of environment strings */
223 };
224 
/*
 * Symbols declared through the LINUX_VDSO_SYM_* macros.  In this file they
 * are used as: the platform string exported via LINUX_AT_PLATFORM, the
 * vsyscall value exported via LINUX_AT_SYSINFO, and the two trampoline
 * addresses loaded into %rip when a signal handler is entered.
 */
LINUX_VDSO_SYM_CHAR(linux_platform);
LINUX_VDSO_SYM_INTPTR(linux32_vsyscall);
LINUX_VDSO_SYM_INTPTR(linux32_sigcode);
LINUX_VDSO_SYM_INTPTR(linux32_rt_sigcode);
229 
230 /*
231  * If FreeBSD & Linux have a difference of opinion about what a trap
232  * means, deal with it here.
233  *
234  * MPSAFE
235  */
236 static int
237 translate_traps(int signal, int trap_code)
238 {
239 	if (signal != SIGBUS)
240 		return signal;
241 	switch (trap_code) {
242 	case T_PROTFLT:
243 	case T_TSSFLT:
244 	case T_DOUBLEFLT:
245 	case T_PAGEFLT:
246 		return SIGSEGV;
247 	default:
248 		return signal;
249 	}
250 }
251 
252 static int
253 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
254 {
255 	Elf32_Auxargs *args;
256 	Elf32_Addr *base;
257 	Elf32_Addr *pos;
258 	struct linux32_ps_strings *arginfo;
259 
260 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
261 
262 	KASSERT(curthread->td_proc == imgp->proc,
263 	    ("unsafe elf_linux_fixup(), should be curproc"));
264 	base = (Elf32_Addr *)*stack_base;
265 	args = (Elf32_Auxargs *)imgp->auxargs;
266 	pos = base + (imgp->args->argc + imgp->args->envc + 2);
267 
268 	AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO_EHDR,
269 	    imgp->proc->p_sysent->sv_shared_page_base);
270 	AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO, linux32_vsyscall);
271 	AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature);
272 
273 	/*
274 	 * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0,
275 	 * as it has appeared in the 2.4.0-rc7 first time.
276 	 * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK),
277 	 * glibc falls back to the hard-coded CLK_TCK value when aux entry
278 	 * is not present.
279 	 * Also see linux_times() implementation.
280 	 */
281 	if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
282 		AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz);
283 	AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
284 	AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
285 	AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
286 	AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
287 	AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
288 	AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
289 	AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
290 	AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, 0);
291 	AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
292 	AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
293 	AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
294 	AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
295 	AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform));
296 	if (args->execfd != -1)
297 		AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
298 	AUXARGS_ENTRY_32(pos, AT_NULL, 0);
299 
300 	free(imgp->auxargs, M_TEMP);
301 	imgp->auxargs = NULL;
302 
303 	base--;
304 	suword32(base, (uint32_t)imgp->args->argc);
305 	*stack_base = (register_t *)base;
306 	return (0);
307 }
308 
309 static void
310 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
311 {
312 	struct thread *td = curthread;
313 	struct proc *p = td->td_proc;
314 	struct sigacts *psp;
315 	struct trapframe *regs;
316 	struct l_rt_sigframe *fp, frame;
317 	int oonstack;
318 	int sig;
319 	int code;
320 
321 	sig = ksi->ksi_signo;
322 	code = ksi->ksi_code;
323 	PROC_LOCK_ASSERT(p, MA_OWNED);
324 	psp = p->p_sigacts;
325 	mtx_assert(&psp->ps_mtx, MA_OWNED);
326 	regs = td->td_frame;
327 	oonstack = sigonstack(regs->tf_rsp);
328 
329 #ifdef DEBUG
330 	if (ldebug(rt_sendsig))
331 		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
332 		    catcher, sig, (void*)mask, code);
333 #endif
334 	/*
335 	 * Allocate space for the signal handler context.
336 	 */
337 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
338 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
339 		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
340 		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
341 	} else
342 		fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
343 	mtx_unlock(&psp->ps_mtx);
344 
345 	/*
346 	 * Build the argument list for the signal handler.
347 	 */
348 	if (p->p_sysent->sv_sigtbl)
349 		if (sig <= p->p_sysent->sv_sigsize)
350 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
351 
352 	bzero(&frame, sizeof(frame));
353 
354 	frame.sf_handler = PTROUT(catcher);
355 	frame.sf_sig = sig;
356 	frame.sf_siginfo = PTROUT(&fp->sf_si);
357 	frame.sf_ucontext = PTROUT(&fp->sf_sc);
358 
359 	/* Fill in POSIX parts */
360 	ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);
361 
362 	/*
363 	 * Build the signal context to be used by sigreturn
364 	 * and libgcc unwind.
365 	 */
366 	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
367 	frame.sf_sc.uc_link = 0;		/* XXX ??? */
368 
369 	frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
370 	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
371 	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
372 	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
373 	PROC_UNLOCK(p);
374 
375 	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
376 
377 	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
378 	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_rdi;
379 	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_rsi;
380 	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_rbp;
381 	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_rbx;
382 	frame.sf_sc.uc_mcontext.sc_esp    = regs->tf_rsp;
383 	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_rdx;
384 	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_rcx;
385 	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_rax;
386 	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_rip;
387 	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
388 	frame.sf_sc.uc_mcontext.sc_gs     = regs->tf_gs;
389 	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
390 	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
391 	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
392 	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
393 	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
394 	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
395 	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
396 	frame.sf_sc.uc_mcontext.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
397 	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
398 
399 #ifdef DEBUG
400 	if (ldebug(rt_sendsig))
401 		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
402 		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
403 		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
404 #endif
405 
406 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
407 		/*
408 		 * Process has trashed its stack; give it an illegal
409 		 * instruction to halt it in its tracks.
410 		 */
411 #ifdef DEBUG
412 		if (ldebug(rt_sendsig))
413 			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
414 			    fp, oonstack);
415 #endif
416 		PROC_LOCK(p);
417 		sigexit(td, SIGILL);
418 	}
419 
420 	/*
421 	 * Build context to run handler in.
422 	 */
423 	regs->tf_rsp = PTROUT(fp);
424 	regs->tf_rip = linux32_rt_sigcode;
425 	regs->tf_rflags &= ~(PSL_T | PSL_D);
426 	regs->tf_cs = _ucode32sel;
427 	regs->tf_ss = _udatasel;
428 	regs->tf_ds = _udatasel;
429 	regs->tf_es = _udatasel;
430 	regs->tf_fs = _ufssel;
431 	regs->tf_gs = _ugssel;
432 	regs->tf_flags = TF_HASSEGS;
433 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
434 	PROC_LOCK(p);
435 	mtx_lock(&psp->ps_mtx);
436 }
437 
438 
439 /*
440  * Send an interrupt to process.
441  *
442  * Stack is set up to allow sigcode stored
443  * in u. to call routine, followed by kcall
444  * to sigreturn routine below.  After sigreturn
445  * resets the signal mask, the stack, and the
446  * frame pointer, it returns to the user
447  * specified pc, psl.
448  */
449 static void
450 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
451 {
452 	struct thread *td = curthread;
453 	struct proc *p = td->td_proc;
454 	struct sigacts *psp;
455 	struct trapframe *regs;
456 	struct l_sigframe *fp, frame;
457 	l_sigset_t lmask;
458 	int oonstack, i;
459 	int sig, code;
460 
461 	sig = ksi->ksi_signo;
462 	code = ksi->ksi_code;
463 	PROC_LOCK_ASSERT(p, MA_OWNED);
464 	psp = p->p_sigacts;
465 	mtx_assert(&psp->ps_mtx, MA_OWNED);
466 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
467 		/* Signal handler installed with SA_SIGINFO. */
468 		linux_rt_sendsig(catcher, ksi, mask);
469 		return;
470 	}
471 
472 	regs = td->td_frame;
473 	oonstack = sigonstack(regs->tf_rsp);
474 
475 #ifdef DEBUG
476 	if (ldebug(sendsig))
477 		printf(ARGS(sendsig, "%p, %d, %p, %u"),
478 		    catcher, sig, (void*)mask, code);
479 #endif
480 
481 	/*
482 	 * Allocate space for the signal handler context.
483 	 */
484 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
485 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
486 		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
487 		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
488 	} else
489 		fp = (struct l_sigframe *)regs->tf_rsp - 1;
490 	mtx_unlock(&psp->ps_mtx);
491 	PROC_UNLOCK(p);
492 
493 	/*
494 	 * Build the argument list for the signal handler.
495 	 */
496 	if (p->p_sysent->sv_sigtbl)
497 		if (sig <= p->p_sysent->sv_sigsize)
498 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
499 
500 	bzero(&frame, sizeof(frame));
501 
502 	frame.sf_handler = PTROUT(catcher);
503 	frame.sf_sig = sig;
504 
505 	bsd_to_linux_sigset(mask, &lmask);
506 
507 	/*
508 	 * Build the signal context to be used by sigreturn.
509 	 */
510 	frame.sf_sc.sc_mask   = lmask.__bits[0];
511 	frame.sf_sc.sc_gs     = regs->tf_gs;
512 	frame.sf_sc.sc_fs     = regs->tf_fs;
513 	frame.sf_sc.sc_es     = regs->tf_es;
514 	frame.sf_sc.sc_ds     = regs->tf_ds;
515 	frame.sf_sc.sc_edi    = regs->tf_rdi;
516 	frame.sf_sc.sc_esi    = regs->tf_rsi;
517 	frame.sf_sc.sc_ebp    = regs->tf_rbp;
518 	frame.sf_sc.sc_ebx    = regs->tf_rbx;
519 	frame.sf_sc.sc_esp    = regs->tf_rsp;
520 	frame.sf_sc.sc_edx    = regs->tf_rdx;
521 	frame.sf_sc.sc_ecx    = regs->tf_rcx;
522 	frame.sf_sc.sc_eax    = regs->tf_rax;
523 	frame.sf_sc.sc_eip    = regs->tf_rip;
524 	frame.sf_sc.sc_cs     = regs->tf_cs;
525 	frame.sf_sc.sc_eflags = regs->tf_rflags;
526 	frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
527 	frame.sf_sc.sc_ss     = regs->tf_ss;
528 	frame.sf_sc.sc_err    = regs->tf_err;
529 	frame.sf_sc.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
530 	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
531 
532 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
533 		frame.sf_extramask[i] = lmask.__bits[i+1];
534 
535 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
536 		/*
537 		 * Process has trashed its stack; give it an illegal
538 		 * instruction to halt it in its tracks.
539 		 */
540 		PROC_LOCK(p);
541 		sigexit(td, SIGILL);
542 	}
543 
544 	/*
545 	 * Build context to run handler in.
546 	 */
547 	regs->tf_rsp = PTROUT(fp);
548 	regs->tf_rip = linux32_sigcode;
549 	regs->tf_rflags &= ~(PSL_T | PSL_D);
550 	regs->tf_cs = _ucode32sel;
551 	regs->tf_ss = _udatasel;
552 	regs->tf_ds = _udatasel;
553 	regs->tf_es = _udatasel;
554 	regs->tf_fs = _ufssel;
555 	regs->tf_gs = _ugssel;
556 	regs->tf_flags = TF_HASSEGS;
557 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
558 	PROC_LOCK(p);
559 	mtx_lock(&psp->ps_mtx);
560 }
561 
562 /*
563  * System call to cleanup state after a signal
564  * has been taken.  Reset signal mask and
565  * stack state from context left by sendsig (above).
566  * Return to previous pc and psl as specified by
567  * context left by sendsig. Check carefully to
568  * make sure that the user has not modified the
569  * psl to gain improper privileges or to cause
570  * a machine fault.
571  */
572 int
573 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
574 {
575 	struct l_sigframe frame;
576 	struct trapframe *regs;
577 	sigset_t bmask;
578 	l_sigset_t lmask;
579 	int eflags, i;
580 	ksiginfo_t ksi;
581 
582 	regs = td->td_frame;
583 
584 #ifdef DEBUG
585 	if (ldebug(sigreturn))
586 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
587 #endif
588 	/*
589 	 * The trampoline code hands us the sigframe.
590 	 * It is unsafe to keep track of it ourselves, in the event that a
591 	 * program jumps out of a signal handler.
592 	 */
593 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
594 		return (EFAULT);
595 
596 	/*
597 	 * Check for security violations.
598 	 */
599 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
600 	eflags = frame.sf_sc.sc_eflags;
601 	if (!EFLAGS_SECURE(eflags, regs->tf_rflags))
602 		return(EINVAL);
603 
604 	/*
605 	 * Don't allow users to load a valid privileged %cs.  Let the
606 	 * hardware check for invalid selectors, excess privilege in
607 	 * other selectors, invalid %eip's and invalid %esp's.
608 	 */
609 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
610 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
611 		ksiginfo_init_trap(&ksi);
612 		ksi.ksi_signo = SIGBUS;
613 		ksi.ksi_code = BUS_OBJERR;
614 		ksi.ksi_trapno = T_PROTFLT;
615 		ksi.ksi_addr = (void *)regs->tf_rip;
616 		trapsignal(td, &ksi);
617 		return(EINVAL);
618 	}
619 
620 	lmask.__bits[0] = frame.sf_sc.sc_mask;
621 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
622 		lmask.__bits[i+1] = frame.sf_extramask[i];
623 	linux_to_bsd_sigset(&lmask, &bmask);
624 	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
625 
626 	/*
627 	 * Restore signal context.
628 	 */
629 	regs->tf_rdi    = frame.sf_sc.sc_edi;
630 	regs->tf_rsi    = frame.sf_sc.sc_esi;
631 	regs->tf_rbp    = frame.sf_sc.sc_ebp;
632 	regs->tf_rbx    = frame.sf_sc.sc_ebx;
633 	regs->tf_rdx    = frame.sf_sc.sc_edx;
634 	regs->tf_rcx    = frame.sf_sc.sc_ecx;
635 	regs->tf_rax    = frame.sf_sc.sc_eax;
636 	regs->tf_rip    = frame.sf_sc.sc_eip;
637 	regs->tf_cs     = frame.sf_sc.sc_cs;
638 	regs->tf_ds     = frame.sf_sc.sc_ds;
639 	regs->tf_es     = frame.sf_sc.sc_es;
640 	regs->tf_fs     = frame.sf_sc.sc_fs;
641 	regs->tf_gs     = frame.sf_sc.sc_gs;
642 	regs->tf_rflags = eflags;
643 	regs->tf_rsp    = frame.sf_sc.sc_esp_at_signal;
644 	regs->tf_ss     = frame.sf_sc.sc_ss;
645 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
646 
647 	return (EJUSTRETURN);
648 }
649 
650 /*
651  * System call to cleanup state after a signal
652  * has been taken.  Reset signal mask and
653  * stack state from context left by rt_sendsig (above).
654  * Return to previous pc and psl as specified by
655  * context left by sendsig. Check carefully to
656  * make sure that the user has not modified the
657  * psl to gain improper privileges or to cause
658  * a machine fault.
659  */
660 int
661 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
662 {
663 	struct l_ucontext uc;
664 	struct l_sigcontext *context;
665 	sigset_t bmask;
666 	l_stack_t *lss;
667 	stack_t ss;
668 	struct trapframe *regs;
669 	int eflags;
670 	ksiginfo_t ksi;
671 
672 	regs = td->td_frame;
673 
674 #ifdef DEBUG
675 	if (ldebug(rt_sigreturn))
676 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
677 #endif
678 	/*
679 	 * The trampoline code hands us the ucontext.
680 	 * It is unsafe to keep track of it ourselves, in the event that a
681 	 * program jumps out of a signal handler.
682 	 */
683 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
684 		return (EFAULT);
685 
686 	context = &uc.uc_mcontext;
687 
688 	/*
689 	 * Check for security violations.
690 	 */
691 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
692 	eflags = context->sc_eflags;
693 	if (!EFLAGS_SECURE(eflags, regs->tf_rflags))
694 		return(EINVAL);
695 
696 	/*
697 	 * Don't allow users to load a valid privileged %cs.  Let the
698 	 * hardware check for invalid selectors, excess privilege in
699 	 * other selectors, invalid %eip's and invalid %esp's.
700 	 */
701 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
702 	if (!CS_SECURE(context->sc_cs)) {
703 		ksiginfo_init_trap(&ksi);
704 		ksi.ksi_signo = SIGBUS;
705 		ksi.ksi_code = BUS_OBJERR;
706 		ksi.ksi_trapno = T_PROTFLT;
707 		ksi.ksi_addr = (void *)regs->tf_rip;
708 		trapsignal(td, &ksi);
709 		return(EINVAL);
710 	}
711 
712 	linux_to_bsd_sigset(&uc.uc_sigmask, &bmask);
713 	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
714 
715 	/*
716 	 * Restore signal context
717 	 */
718 	regs->tf_gs	= context->sc_gs;
719 	regs->tf_fs	= context->sc_fs;
720 	regs->tf_es	= context->sc_es;
721 	regs->tf_ds	= context->sc_ds;
722 	regs->tf_rdi    = context->sc_edi;
723 	regs->tf_rsi    = context->sc_esi;
724 	regs->tf_rbp    = context->sc_ebp;
725 	regs->tf_rbx    = context->sc_ebx;
726 	regs->tf_rdx    = context->sc_edx;
727 	regs->tf_rcx    = context->sc_ecx;
728 	regs->tf_rax    = context->sc_eax;
729 	regs->tf_rip    = context->sc_eip;
730 	regs->tf_cs     = context->sc_cs;
731 	regs->tf_rflags = eflags;
732 	regs->tf_rsp    = context->sc_esp_at_signal;
733 	regs->tf_ss     = context->sc_ss;
734 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
735 
736 	/*
737 	 * call sigaltstack & ignore results..
738 	 */
739 	lss = &uc.uc_stack;
740 	ss.ss_sp = PTRIN(lss->ss_sp);
741 	ss.ss_size = lss->ss_size;
742 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
743 
744 #ifdef DEBUG
745 	if (ldebug(rt_sigreturn))
746 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
747 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
748 #endif
749 	(void)kern_sigaltstack(td, &ss, NULL);
750 
751 	return (EJUSTRETURN);
752 }
753 
754 static int
755 linux32_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
756 {
757 	struct proc *p;
758 	struct trapframe *frame;
759 
760 	p = td->td_proc;
761 	frame = td->td_frame;
762 
763 	sa->args[0] = frame->tf_rbx;
764 	sa->args[1] = frame->tf_rcx;
765 	sa->args[2] = frame->tf_rdx;
766 	sa->args[3] = frame->tf_rsi;
767 	sa->args[4] = frame->tf_rdi;
768 	sa->args[5] = frame->tf_rbp;	/* Unconfirmed */
769 	sa->code = frame->tf_rax;
770 
771 	if (sa->code >= p->p_sysent->sv_size)
772 		sa->callp = &p->p_sysent->sv_table[0];
773 	else
774 		sa->callp = &p->p_sysent->sv_table[sa->code];
775 	sa->narg = sa->callp->sy_narg;
776 
777 	td->td_retval[0] = 0;
778 	td->td_retval[1] = frame->tf_rdx;
779 
780 	return (0);
781 }
782 
783 /*
784  * If a linux binary is exec'ing something, try this image activator
785  * first.  We override standard shell script execution in order to
786  * be able to modify the interpreter path.  We only do this if a linux
787  * binary is doing the exec, so we do not create an EXEC module for it.
788  */
789 static int	exec_linux_imgact_try(struct image_params *iparams);
790 
791 static int
792 exec_linux_imgact_try(struct image_params *imgp)
793 {
794 	const char *head = (const char *)imgp->image_header;
795 	char *rpath;
796 	int error = -1;
797 
798 	/*
799 	* The interpreter for shell scripts run from a linux binary needs
800 	* to be located in /compat/linux if possible in order to recursively
801 	* maintain linux path emulation.
802 	*/
803 	if (((const short *)head)[0] == SHELLMAGIC) {
804 		/*
805 		* Run our normal shell image activator.  If it succeeds attempt
806 		* to use the alternate path for the interpreter.  If an
807 		* alternate * path is found, use our stringspace to store it.
808 		*/
809 		if ((error = exec_shell_imgact(imgp)) == 0) {
810 			linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
811 			    imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0,
812 			    AT_FDCWD);
813 			if (rpath != NULL)
814 				imgp->args->fname_buf =
815 				    imgp->interpreter_name = rpath;
816 		}
817 	}
818 	return (error);
819 }
820 
821 /*
822  * Clear registers on exec
823  * XXX copied from ia32_signal.c.
824  */
825 static void
826 exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack)
827 {
828 	struct trapframe *regs = td->td_frame;
829 	struct pcb *pcb = td->td_pcb;
830 
831 	mtx_lock(&dt_lock);
832 	if (td->td_proc->p_md.md_ldt != NULL)
833 		user_ldt_free(td);
834 	else
835 		mtx_unlock(&dt_lock);
836 
837 	critical_enter();
838 	wrmsr(MSR_FSBASE, 0);
839 	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
840 	pcb->pcb_fsbase = 0;
841 	pcb->pcb_gsbase = 0;
842 	critical_exit();
843 	pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
844 
845 	bzero((char *)regs, sizeof(struct trapframe));
846 	regs->tf_rip = imgp->entry_addr;
847 	regs->tf_rsp = stack;
848 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
849 	regs->tf_gs = _ugssel;
850 	regs->tf_fs = _ufssel;
851 	regs->tf_es = _udatasel;
852 	regs->tf_ds = _udatasel;
853 	regs->tf_ss = _udatasel;
854 	regs->tf_flags = TF_HASSEGS;
855 	regs->tf_cs = _ucode32sel;
856 	regs->tf_rbx = imgp->ps_strings;
857 
858 	fpstate_drop(td);
859 
860 	/* Do full restore on return so that we can change to a different %cs */
861 	set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET);
862 	td->td_retval[1] = 0;
863 }
864 
865 /*
866  * XXX copied from ia32_sysvec.c.
867  */
868 static register_t *
869 linux_copyout_strings(struct image_params *imgp)
870 {
871 	int argc, envc;
872 	u_int32_t *vectp;
873 	char *stringp, *destp;
874 	u_int32_t *stack_base;
875 	struct linux32_ps_strings *arginfo;
876 
877 	/*
878 	 * Calculate string base and vector table pointers.
879 	 */
880 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
881 	destp =	(caddr_t)arginfo - SPARE_USRSPACE -
882 	    roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
883 
884 	/*
885 	 * If we have a valid auxargs ptr, prepare some room
886 	 * on the stack.
887 	 */
888 	if (imgp->auxargs) {
889 		/*
890 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
891 		 * lower compatibility.
892 		 */
893 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
894 		    (LINUX_AT_COUNT * 2);
895 		/*
896 		 * The '+ 2' is for the null pointers at the end of each of
897 		 * the arg and env vector sets,and imgp->auxarg_size is room
898 		 * for argument of Runtime loader.
899 		 */
900 		vectp = (u_int32_t *) (destp - (imgp->args->argc +
901 		    imgp->args->envc + 2 + imgp->auxarg_size) *
902 		    sizeof(u_int32_t));
903 
904 	} else
905 		/*
906 		 * The '+ 2' is for the null pointers at the end of each of
907 		 * the arg and env vector sets
908 		 */
909 		vectp = (u_int32_t *)(destp - (imgp->args->argc +
910 		    imgp->args->envc + 2) * sizeof(u_int32_t));
911 
912 	/*
913 	 * vectp also becomes our initial stack base
914 	 */
915 	stack_base = vectp;
916 
917 	stringp = imgp->args->begin_argv;
918 	argc = imgp->args->argc;
919 	envc = imgp->args->envc;
920 	/*
921 	 * Copy out strings - arguments and environment.
922 	 */
923 	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
924 
925 	/*
926 	 * Fill in "ps_strings" struct for ps, w, etc.
927 	 */
928 	suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp);
929 	suword32(&arginfo->ps_nargvstr, argc);
930 
931 	/*
932 	 * Fill in argument portion of vector table.
933 	 */
934 	for (; argc > 0; --argc) {
935 		suword32(vectp++, (uint32_t)(intptr_t)destp);
936 		while (*stringp++ != 0)
937 			destp++;
938 		destp++;
939 	}
940 
941 	/* a null vector table pointer separates the argp's from the envp's */
942 	suword32(vectp++, 0);
943 
944 	suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp);
945 	suword32(&arginfo->ps_nenvstr, envc);
946 
947 	/*
948 	 * Fill in environment portion of vector table.
949 	 */
950 	for (; envc > 0; --envc) {
951 		suword32(vectp++, (uint32_t)(intptr_t)destp);
952 		while (*stringp++ != 0)
953 			destp++;
954 		destp++;
955 	}
956 
957 	/* end of vector table is a null pointer */
958 	suword32(vectp, 0);
959 
960 	return ((register_t *)stack_base);
961 }
962 
963 static SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
964     "32-bit Linux emulation");
965 
966 static u_long	linux32_maxdsiz = LINUX32_MAXDSIZ;
967 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
968     &linux32_maxdsiz, 0, "");
969 static u_long	linux32_maxssiz = LINUX32_MAXSSIZ;
970 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
971     &linux32_maxssiz, 0, "");
972 static u_long	linux32_maxvmem = LINUX32_MAXVMEM;
973 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
974     &linux32_maxvmem, 0, "");
975 
976 #if defined(DEBUG)
977 SYSCTL_PROC(_compat_linux32, OID_AUTO, debug,
978             CTLTYPE_STRING | CTLFLAG_RW,
979             0, 0, linux_sysctl_debug, "A",
980             "Linux debugging control");
981 #endif
982 
983 static void
984 linux32_fixlimit(struct rlimit *rl, int which)
985 {
986 
987 	switch (which) {
988 	case RLIMIT_DATA:
989 		if (linux32_maxdsiz != 0) {
990 			if (rl->rlim_cur > linux32_maxdsiz)
991 				rl->rlim_cur = linux32_maxdsiz;
992 			if (rl->rlim_max > linux32_maxdsiz)
993 				rl->rlim_max = linux32_maxdsiz;
994 		}
995 		break;
996 	case RLIMIT_STACK:
997 		if (linux32_maxssiz != 0) {
998 			if (rl->rlim_cur > linux32_maxssiz)
999 				rl->rlim_cur = linux32_maxssiz;
1000 			if (rl->rlim_max > linux32_maxssiz)
1001 				rl->rlim_max = linux32_maxssiz;
1002 		}
1003 		break;
1004 	case RLIMIT_VMEM:
1005 		if (linux32_maxvmem != 0) {
1006 			if (rl->rlim_cur > linux32_maxvmem)
1007 				rl->rlim_cur = linux32_maxvmem;
1008 			if (rl->rlim_max > linux32_maxvmem)
1009 				rl->rlim_max = linux32_maxvmem;
1010 		}
1011 		break;
1012 	}
1013 }
1014 
1015 struct sysentvec elf_linux_sysvec = {
1016 	.sv_size	= LINUX_SYS_MAXSYSCALL,
1017 	.sv_table	= linux_sysent,
1018 	.sv_mask	= 0,
1019 	.sv_sigsize	= LINUX_SIGTBLSZ,
1020 	.sv_sigtbl	= bsd_to_linux_signal,
1021 	.sv_errsize	= ELAST + 1,
1022 	.sv_errtbl	= bsd_to_linux_errno,
1023 	.sv_transtrap	= translate_traps,
1024 	.sv_fixup	= elf_linux_fixup,
1025 	.sv_sendsig	= linux_sendsig,
1026 	.sv_sigcode	= &_binary_linux32_locore_o_start,
1027 	.sv_szsigcode	= &linux_szsigcode,
1028 	.sv_prepsyscall	= NULL,
1029 	.sv_name	= "Linux ELF32",
1030 	.sv_coredump	= elf32_coredump,
1031 	.sv_imgact_try	= exec_linux_imgact_try,
1032 	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
1033 	.sv_pagesize	= PAGE_SIZE,
1034 	.sv_minuser	= VM_MIN_ADDRESS,
1035 	.sv_maxuser	= LINUX32_MAXUSER,
1036 	.sv_usrstack	= LINUX32_USRSTACK,
1037 	.sv_psstrings	= LINUX32_PS_STRINGS,
1038 	.sv_stackprot	= VM_PROT_ALL,
1039 	.sv_copyout_strings = linux_copyout_strings,
1040 	.sv_setregs	= exec_linux_setregs,
1041 	.sv_fixlimit	= linux32_fixlimit,
1042 	.sv_maxssiz	= &linux32_maxssiz,
1043 	.sv_flags	= SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP,
1044 	.sv_set_syscall_retval = cpu_set_syscall_retval,
1045 	.sv_fetch_syscall_args = linux32_fetch_syscall_args,
1046 	.sv_syscallnames = NULL,
1047 	.sv_shared_page_base = LINUX32_SHAREDPAGE,
1048 	.sv_shared_page_len = PAGE_SIZE,
1049 	.sv_schedtail	= linux_schedtail,
1050 	.sv_thread_detach = linux_thread_detach,
1051 };
1052 
1053 static void
1054 linux_vdso_install(void *param)
1055 {
1056 
1057 	linux_szsigcode = (&_binary_linux32_locore_o_end -
1058 	    &_binary_linux32_locore_o_start);
1059 
1060 	if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len)
1061 		panic("Linux invalid vdso size\n");
1062 
1063 	__elfN(linux_vdso_fixup)(&elf_linux_sysvec);
1064 
1065 	linux_shared_page_obj = __elfN(linux_shared_page_init)
1066 	    (&linux_shared_page_mapping);
1067 
1068 	__elfN(linux_vdso_reloc)(&elf_linux_sysvec, LINUX32_SHAREDPAGE);
1069 
1070 	bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping,
1071 	    linux_szsigcode);
1072 	elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj;
1073 
1074 	linux_kplatform = linux_shared_page_mapping +
1075 	    (linux_platform - (caddr_t)LINUX32_SHAREDPAGE);
1076 }
1077 SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY,
1078     (sysinit_cfunc_t)linux_vdso_install, NULL);
1079 
1080 static void
1081 linux_vdso_deinstall(void *param)
1082 {
1083 
1084 	__elfN(linux_shared_page_fini)(linux_shared_page_obj);
1085 };
1086 SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST,
1087     (sysinit_cfunc_t)linux_vdso_deinstall, NULL);
1088 
1089 static char GNU_ABI_VENDOR[] = "GNU";
1090 static int GNULINUX_ABI_DESC = 0;
1091 
1092 static boolean_t
1093 linux32_trans_osrel(const Elf_Note *note, int32_t *osrel)
1094 {
1095 	const Elf32_Word *desc;
1096 	uintptr_t p;
1097 
1098 	p = (uintptr_t)(note + 1);
1099 	p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
1100 
1101 	desc = (const Elf32_Word *)p;
1102 	if (desc[0] != GNULINUX_ABI_DESC)
1103 		return (FALSE);
1104 
1105 	/*
1106 	 * For linux we encode osrel as follows (see linux_mib.c):
1107 	 * VVVMMMIII (version, major, minor), see linux_mib.c.
1108 	 */
1109 	*osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
1110 
1111 	return (TRUE);
1112 }
1113 
1114 static Elf_Brandnote linux32_brandnote = {
1115 	.hdr.n_namesz	= sizeof(GNU_ABI_VENDOR),
1116 	.hdr.n_descsz	= 16,	/* XXX at least 16 */
1117 	.hdr.n_type	= 1,
1118 	.vendor		= GNU_ABI_VENDOR,
1119 	.flags		= BN_TRANSLATE_OSREL,
1120 	.trans_osrel	= linux32_trans_osrel
1121 };
1122 
1123 static Elf32_Brandinfo linux_brand = {
1124 	.brand		= ELFOSABI_LINUX,
1125 	.machine	= EM_386,
1126 	.compat_3_brand	= "Linux",
1127 	.emul_path	= "/compat/linux",
1128 	.interp_path	= "/lib/ld-linux.so.1",
1129 	.sysvec		= &elf_linux_sysvec,
1130 	.interp_newpath	= NULL,
1131 	.brand_note	= &linux32_brandnote,
1132 	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1133 };
1134 
1135 static Elf32_Brandinfo linux_glibc2brand = {
1136 	.brand		= ELFOSABI_LINUX,
1137 	.machine	= EM_386,
1138 	.compat_3_brand	= "Linux",
1139 	.emul_path	= "/compat/linux",
1140 	.interp_path	= "/lib/ld-linux.so.2",
1141 	.sysvec		= &elf_linux_sysvec,
1142 	.interp_newpath	= NULL,
1143 	.brand_note	= &linux32_brandnote,
1144 	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1145 };
1146 
1147 Elf32_Brandinfo *linux_brandlist[] = {
1148 	&linux_brand,
1149 	&linux_glibc2brand,
1150 	NULL
1151 };
1152 
1153 static int
1154 linux_elf_modevent(module_t mod, int type, void *data)
1155 {
1156 	Elf32_Brandinfo **brandinfo;
1157 	int error;
1158 	struct linux_ioctl_handler **lihp;
1159 
1160 	error = 0;
1161 
1162 	switch(type) {
1163 	case MOD_LOAD:
1164 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1165 		     ++brandinfo)
1166 			if (elf32_insert_brand_entry(*brandinfo) < 0)
1167 				error = EINVAL;
1168 		if (error == 0) {
1169 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1170 				linux_ioctl_register_handler(*lihp);
1171 			LIST_INIT(&futex_list);
1172 			mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
1173 			linux_exit_tag = EVENTHANDLER_REGISTER(process_exit,
1174 			    linux_proc_exit, NULL, 1000);
1175 			linux_exec_tag = EVENTHANDLER_REGISTER(process_exec,
1176 			    linux_proc_exec, NULL, 1000);
1177 			linux_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor,
1178 			    linux_thread_dtor, NULL, EVENTHANDLER_PRI_ANY);
1179 			stclohz = (stathz ? stathz : hz);
1180 			if (bootverbose)
1181 				printf("Linux ELF exec handler installed\n");
1182 		} else
1183 			printf("cannot insert Linux ELF brand handler\n");
1184 		break;
1185 	case MOD_UNLOAD:
1186 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1187 		     ++brandinfo)
1188 			if (elf32_brand_inuse(*brandinfo))
1189 				error = EBUSY;
1190 		if (error == 0) {
1191 			for (brandinfo = &linux_brandlist[0];
1192 			     *brandinfo != NULL; ++brandinfo)
1193 				if (elf32_remove_brand_entry(*brandinfo) < 0)
1194 					error = EINVAL;
1195 		}
1196 		if (error == 0) {
1197 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1198 				linux_ioctl_unregister_handler(*lihp);
1199 			mtx_destroy(&futex_mtx);
1200 			EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
1201 			EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
1202 			EVENTHANDLER_DEREGISTER(thread_dtor, linux_thread_dtor_tag);
1203 			if (bootverbose)
1204 				printf("Linux ELF exec handler removed\n");
1205 		} else
1206 			printf("Could not deinstall ELF interpreter entry\n");
1207 		break;
1208 	default:
1209 		return (EOPNOTSUPP);
1210 	}
1211 	return (error);
1212 }
1213 
1214 static moduledata_t linux_elf_mod = {
1215 	"linuxelf",
1216 	linux_elf_modevent,
1217 	0
1218 };
1219 
1220 DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1221 MODULE_DEPEND(linuxelf, linux_common, 1, 1, 1);
1222