xref: /freebsd/sys/amd64/linux32/linux32_sysvec.c (revision bdc379344aee7b07ea84d4da61a4f228b72f8079)
1 /*-
2  * Copyright (c) 2004 Tim J. Robbins
3  * Copyright (c) 2003 Peter Wemm
4  * Copyright (c) 2002 Doug Rabson
5  * Copyright (c) 1998-1999 Andrew Gallatin
6  * Copyright (c) 1994-1996 Søren Schmidt
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer
14  *    in this position and unchanged.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. The name of the author may not be used to endorse or promote products
19  *    derived from this software without specific prior written permission
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 #include "opt_compat.h"
36 
37 #ifndef COMPAT_FREEBSD32
38 #error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!"
39 #endif
40 
41 #define	__ELF_WORD_SIZE	32
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/exec.h>
46 #include <sys/fcntl.h>
47 #include <sys/imgact.h>
48 #include <sys/imgact_elf.h>
49 #include <sys/kernel.h>
50 #include <sys/lock.h>
51 #include <sys/malloc.h>
52 #include <sys/module.h>
53 #include <sys/mutex.h>
54 #include <sys/proc.h>
55 #include <sys/resourcevar.h>
56 #include <sys/signalvar.h>
57 #include <sys/sysctl.h>
58 #include <sys/syscallsubr.h>
59 #include <sys/sysent.h>
60 #include <sys/sysproto.h>
61 #include <sys/vnode.h>
62 #include <sys/eventhandler.h>
63 
64 #include <vm/vm.h>
65 #include <vm/pmap.h>
66 #include <vm/vm_extern.h>
67 #include <vm/vm_map.h>
68 #include <vm/vm_object.h>
69 #include <vm/vm_page.h>
70 #include <vm/vm_param.h>
71 
72 #include <machine/cpu.h>
73 #include <machine/md_var.h>
74 #include <machine/pcb.h>
75 #include <machine/specialreg.h>
76 
77 #include <amd64/linux32/linux.h>
78 #include <amd64/linux32/linux32_proto.h>
79 #include <compat/linux/linux_emul.h>
80 #include <compat/linux/linux_futex.h>
81 #include <compat/linux/linux_ioctl.h>
82 #include <compat/linux/linux_mib.h>
83 #include <compat/linux/linux_misc.h>
84 #include <compat/linux/linux_signal.h>
85 #include <compat/linux/linux_util.h>
86 #include <compat/linux/linux_vdso.h>
87 
88 MODULE_VERSION(linux, 1);
89 
90 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
91 
/*
 * Append one 32-bit ELF auxiliary-vector entry to user memory.
 *
 * Stores 'id' at *pos and 'val' in the following 32-bit slot with
 * suword32(), leaving 'pos' advanced by two elements.  'pos' must be a
 * modifiable pointer lvalue: it is deliberately updated in place.  The
 * status returned by suword32() is not examined.
 *
 * Every argument is parenthesized so that arbitrary expressions can be
 * passed for 'id' and 'val' without precedence surprises.
 */
#define	AUXARGS_ENTRY_32(pos, id, val)		\
	do {					\
		suword32((pos)++, (id));	\
		suword32((pos)++, (val));	\
	} while (0)
97 
/*
 * The two characters "#!" as they appear when the start of an image
 * header is read as a single 16-bit quantity; the value therefore
 * depends on the byte order of the machine.
 */
#if BYTE_ORDER != LITTLE_ENDIAN
#define	SHELLMAGIC	0x2321
#else
#define	SHELLMAGIC	0x2123	/* #! */
#endif
103 
/*
 * The sendsig functions are not system calls, yet they want to use the
 * ldebug() facility.  Give both of them the pseudo syscall number 0 so
 * that ldebug() can be used on them; this is slightly less bogus than
 * borrowing ldebug(sigreturn).
 */
#define	LINUX_SYS_linux_sendsig		0
#define	LINUX_SYS_linux_rt_sendsig	0
112 
113 const char *linux_platform = "i686";
114 static int linux_szplatform;
115 static int linux_szsigcode;
116 static vm_object_t linux_shared_page_obj;
117 static char *linux_shared_page_mapping;
118 extern char _binary_linux32_locore_o_start;
119 extern char _binary_linux32_locore_o_end;
120 
121 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
122 
123 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
124 SET_DECLARE(linux_device_handler_set, struct linux_device_handler);
125 
126 static int	elf_linux_fixup(register_t **stack_base,
127 		    struct image_params *iparams);
128 static register_t *linux_copyout_strings(struct image_params *imgp);
129 static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
130 static void	exec_linux_setregs(struct thread *td,
131 				   struct image_params *imgp, u_long stack);
132 static void	linux32_fixlimit(struct rlimit *rl, int which);
133 static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel);
134 static void	linux_vdso_install(void *param);
135 static void	linux_vdso_deinstall(void *param);
136 
137 static eventhandler_tag linux_exit_tag;
138 static eventhandler_tag linux_exec_tag;
139 static eventhandler_tag linux_thread_dtor_tag;
140 
141 /*
142  * Linux syscalls return negative errno's, we do positive and map them
143  * Reference:
144  *   FreeBSD: src/sys/sys/errno.h
145  *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
146  *            linux-2.6.17.8/include/asm-generic/errno.h
147  */
148 static int bsd_to_linux_errno[ELAST + 1] = {
149 	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
150 	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
151 	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
152 	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
153 	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
154 	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
155 	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
156 	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
157 	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,
158 	 -72, -67, -71
159 };
160 
161 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
162 	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
163 	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
164 	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
165 	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
166 	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
167 	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
168 	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
169 	0, LINUX_SIGUSR1, LINUX_SIGUSR2
170 };
171 
172 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
173 	SIGHUP, SIGINT, SIGQUIT, SIGILL,
174 	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
175 	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
176 	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
177 	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
178 	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
179 	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
180 	SIGIO, SIGURG, SIGSYS
181 };
182 
/*
 * Translation of FreeBSD trap codes into the trap numbers stored in the
 * Linux signal context.  The array index is the FreeBSD trap code; codes
 * without a counterpart yield LINUX_T_UNKNOWN.
 */
#define	LINUX_T_UNKNOWN	255
static int _bsd_to_linux_trapcode[] = {
	[0]  = LINUX_T_UNKNOWN,
	[1]  = 6,			/* T_PRIVINFLT */
	[2]  = LINUX_T_UNKNOWN,
	[3]  = 3,			/* T_BPTFLT */
	[4]  = LINUX_T_UNKNOWN,
	[5]  = LINUX_T_UNKNOWN,
	[6]  = 16,			/* T_ARITHTRAP */
	[7]  = 254,			/* T_ASTFLT */
	[8]  = LINUX_T_UNKNOWN,
	[9]  = 13,			/* T_PROTFLT */
	[10] = 1,			/* T_TRCTRAP */
	[11] = LINUX_T_UNKNOWN,
	[12] = 14,			/* T_PAGEFLT */
	[13] = LINUX_T_UNKNOWN,
	[14] = 17,			/* T_ALIGNFLT */
	[15] = LINUX_T_UNKNOWN,
	[16] = LINUX_T_UNKNOWN,
	[17] = LINUX_T_UNKNOWN,
	[18] = 0,			/* T_DIVIDE */
	[19] = 2,			/* T_NMI */
	[20] = 4,			/* T_OFLOW */
	[21] = 5,			/* T_BOUND */
	[22] = 7,			/* T_DNA */
	[23] = 8,			/* T_DOUBLEFLT */
	[24] = 9,			/* T_FPOPFLT */
	[25] = 10,			/* T_TSSFLT */
	[26] = 11,			/* T_SEGNPFLT */
	[27] = 12,			/* T_STKFLT */
	[28] = 18,			/* T_MCHK */
	[29] = 19,			/* T_XMMFLT */
	[30] = 15			/* T_RESERVED */
};

/*
 * Bounds-checked lookup in the table above.  The comparison is done
 * against an unsigned element count, so a negative 'code' is treated as
 * out of range and yields LINUX_T_UNKNOWN as well.
 */
#define	bsd_to_linux_trapcode(code)					\
    ((code) < sizeof(_bsd_to_linux_trapcode) /				\
	sizeof(_bsd_to_linux_trapcode[0]) ?				\
	_bsd_to_linux_trapcode[(code)] : LINUX_T_UNKNOWN)
221 
222 struct linux32_ps_strings {
223 	u_int32_t ps_argvstr;	/* first of 0 or more argument strings */
224 	u_int ps_nargvstr;	/* the number of argument strings */
225 	u_int32_t ps_envstr;	/* first of 0 or more environment strings */
226 	u_int ps_nenvstr;	/* the number of environment strings */
227 };
228 
229 LINUX_VDSO_SYM_INTPTR(linux32_sigcode);
230 LINUX_VDSO_SYM_INTPTR(linux32_rt_sigcode);
231 LINUX_VDSO_SYM_INTPTR(linux32_vsyscall);
232 
233 /*
234  * If FreeBSD & Linux have a difference of opinion about what a trap
235  * means, deal with it here.
236  *
237  * MPSAFE
238  */
239 static int
240 translate_traps(int signal, int trap_code)
241 {
242 	if (signal != SIGBUS)
243 		return signal;
244 	switch (trap_code) {
245 	case T_PROTFLT:
246 	case T_TSSFLT:
247 	case T_DOUBLEFLT:
248 	case T_PAGEFLT:
249 		return SIGSEGV;
250 	default:
251 		return signal;
252 	}
253 }
254 
255 static int
256 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
257 {
258 	Elf32_Auxargs *args;
259 	Elf32_Addr *base;
260 	Elf32_Addr *pos, *uplatform;
261 	struct linux32_ps_strings *arginfo;
262 
263 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
264 	uplatform = (Elf32_Addr *)((caddr_t)arginfo - linux_szplatform);
265 
266 	KASSERT(curthread->td_proc == imgp->proc,
267 	    ("unsafe elf_linux_fixup(), should be curproc"));
268 	base = (Elf32_Addr *)*stack_base;
269 	args = (Elf32_Auxargs *)imgp->auxargs;
270 	pos = base + (imgp->args->argc + imgp->args->envc + 2);
271 
272 	AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO_EHDR,
273 	    imgp->proc->p_sysent->sv_shared_page_base);
274 	AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO, linux32_vsyscall);
275 	AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature);
276 
277 	/*
278 	 * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0,
279 	 * as it has appeared in the 2.4.0-rc7 first time.
280 	 * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK),
281 	 * glibc falls back to the hard-coded CLK_TCK value when aux entry
282 	 * is not present.
283 	 * Also see linux_times() implementation.
284 	 */
285 	if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
286 		AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz);
287 	AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
288 	AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
289 	AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
290 	AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
291 	AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
292 	AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
293 	AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
294 	AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, 0);
295 	AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
296 	AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
297 	AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
298 	AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
299 	AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(uplatform));
300 	if (args->execfd != -1)
301 		AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
302 	AUXARGS_ENTRY_32(pos, AT_NULL, 0);
303 
304 	free(imgp->auxargs, M_TEMP);
305 	imgp->auxargs = NULL;
306 
307 	base--;
308 	suword32(base, (uint32_t)imgp->args->argc);
309 	*stack_base = (register_t *)base;
310 	return (0);
311 }
312 
313 static void
314 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
315 {
316 	struct thread *td = curthread;
317 	struct proc *p = td->td_proc;
318 	struct sigacts *psp;
319 	struct trapframe *regs;
320 	struct l_rt_sigframe *fp, frame;
321 	int oonstack;
322 	int sig;
323 	int code;
324 
325 	sig = ksi->ksi_signo;
326 	code = ksi->ksi_code;
327 	PROC_LOCK_ASSERT(p, MA_OWNED);
328 	psp = p->p_sigacts;
329 	mtx_assert(&psp->ps_mtx, MA_OWNED);
330 	regs = td->td_frame;
331 	oonstack = sigonstack(regs->tf_rsp);
332 
333 #ifdef DEBUG
334 	if (ldebug(rt_sendsig))
335 		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
336 		    catcher, sig, (void*)mask, code);
337 #endif
338 	/*
339 	 * Allocate space for the signal handler context.
340 	 */
341 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
342 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
343 		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
344 		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
345 	} else
346 		fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
347 	mtx_unlock(&psp->ps_mtx);
348 
349 	/*
350 	 * Build the argument list for the signal handler.
351 	 */
352 	if (p->p_sysent->sv_sigtbl)
353 		if (sig <= p->p_sysent->sv_sigsize)
354 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
355 
356 	bzero(&frame, sizeof(frame));
357 
358 	frame.sf_handler = PTROUT(catcher);
359 	frame.sf_sig = sig;
360 	frame.sf_siginfo = PTROUT(&fp->sf_si);
361 	frame.sf_ucontext = PTROUT(&fp->sf_sc);
362 
363 	/* Fill in POSIX parts */
364 	ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);
365 
366 	/*
367 	 * Build the signal context to be used by sigreturn
368 	 * and libgcc unwind.
369 	 */
370 	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
371 	frame.sf_sc.uc_link = 0;		/* XXX ??? */
372 
373 	frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
374 	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
375 	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
376 	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
377 	PROC_UNLOCK(p);
378 
379 	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
380 
381 	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
382 	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_rdi;
383 	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_rsi;
384 	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_rbp;
385 	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_rbx;
386 	frame.sf_sc.uc_mcontext.sc_esp    = regs->tf_rsp;
387 	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_rdx;
388 	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_rcx;
389 	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_rax;
390 	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_rip;
391 	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
392 	frame.sf_sc.uc_mcontext.sc_gs     = regs->tf_gs;
393 	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
394 	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
395 	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
396 	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
397 	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
398 	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
399 	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
400 	frame.sf_sc.uc_mcontext.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
401 	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
402 
403 #ifdef DEBUG
404 	if (ldebug(rt_sendsig))
405 		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
406 		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
407 		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
408 #endif
409 
410 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
411 		/*
412 		 * Process has trashed its stack; give it an illegal
413 		 * instruction to halt it in its tracks.
414 		 */
415 #ifdef DEBUG
416 		if (ldebug(rt_sendsig))
417 			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
418 			    fp, oonstack);
419 #endif
420 		PROC_LOCK(p);
421 		sigexit(td, SIGILL);
422 	}
423 
424 	/*
425 	 * Build context to run handler in.
426 	 */
427 	regs->tf_rsp = PTROUT(fp);
428 	regs->tf_rip = linux32_rt_sigcode;
429 	regs->tf_rflags &= ~(PSL_T | PSL_D);
430 	regs->tf_cs = _ucode32sel;
431 	regs->tf_ss = _udatasel;
432 	regs->tf_ds = _udatasel;
433 	regs->tf_es = _udatasel;
434 	regs->tf_fs = _ufssel;
435 	regs->tf_gs = _ugssel;
436 	regs->tf_flags = TF_HASSEGS;
437 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
438 	PROC_LOCK(p);
439 	mtx_lock(&psp->ps_mtx);
440 }
441 
442 
443 /*
444  * Send an interrupt to process.
445  *
446  * Stack is set up to allow sigcode stored
447  * in u. to call routine, followed by kcall
448  * to sigreturn routine below.  After sigreturn
449  * resets the signal mask, the stack, and the
450  * frame pointer, it returns to the user
451  * specified pc, psl.
452  */
453 static void
454 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
455 {
456 	struct thread *td = curthread;
457 	struct proc *p = td->td_proc;
458 	struct sigacts *psp;
459 	struct trapframe *regs;
460 	struct l_sigframe *fp, frame;
461 	l_sigset_t lmask;
462 	int oonstack, i;
463 	int sig, code;
464 
465 	sig = ksi->ksi_signo;
466 	code = ksi->ksi_code;
467 	PROC_LOCK_ASSERT(p, MA_OWNED);
468 	psp = p->p_sigacts;
469 	mtx_assert(&psp->ps_mtx, MA_OWNED);
470 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
471 		/* Signal handler installed with SA_SIGINFO. */
472 		linux_rt_sendsig(catcher, ksi, mask);
473 		return;
474 	}
475 
476 	regs = td->td_frame;
477 	oonstack = sigonstack(regs->tf_rsp);
478 
479 #ifdef DEBUG
480 	if (ldebug(sendsig))
481 		printf(ARGS(sendsig, "%p, %d, %p, %u"),
482 		    catcher, sig, (void*)mask, code);
483 #endif
484 
485 	/*
486 	 * Allocate space for the signal handler context.
487 	 */
488 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
489 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
490 		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
491 		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
492 	} else
493 		fp = (struct l_sigframe *)regs->tf_rsp - 1;
494 	mtx_unlock(&psp->ps_mtx);
495 	PROC_UNLOCK(p);
496 
497 	/*
498 	 * Build the argument list for the signal handler.
499 	 */
500 	if (p->p_sysent->sv_sigtbl)
501 		if (sig <= p->p_sysent->sv_sigsize)
502 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
503 
504 	bzero(&frame, sizeof(frame));
505 
506 	frame.sf_handler = PTROUT(catcher);
507 	frame.sf_sig = sig;
508 
509 	bsd_to_linux_sigset(mask, &lmask);
510 
511 	/*
512 	 * Build the signal context to be used by sigreturn.
513 	 */
514 	frame.sf_sc.sc_mask   = lmask.__bits[0];
515 	frame.sf_sc.sc_gs     = regs->tf_gs;
516 	frame.sf_sc.sc_fs     = regs->tf_fs;
517 	frame.sf_sc.sc_es     = regs->tf_es;
518 	frame.sf_sc.sc_ds     = regs->tf_ds;
519 	frame.sf_sc.sc_edi    = regs->tf_rdi;
520 	frame.sf_sc.sc_esi    = regs->tf_rsi;
521 	frame.sf_sc.sc_ebp    = regs->tf_rbp;
522 	frame.sf_sc.sc_ebx    = regs->tf_rbx;
523 	frame.sf_sc.sc_esp    = regs->tf_rsp;
524 	frame.sf_sc.sc_edx    = regs->tf_rdx;
525 	frame.sf_sc.sc_ecx    = regs->tf_rcx;
526 	frame.sf_sc.sc_eax    = regs->tf_rax;
527 	frame.sf_sc.sc_eip    = regs->tf_rip;
528 	frame.sf_sc.sc_cs     = regs->tf_cs;
529 	frame.sf_sc.sc_eflags = regs->tf_rflags;
530 	frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
531 	frame.sf_sc.sc_ss     = regs->tf_ss;
532 	frame.sf_sc.sc_err    = regs->tf_err;
533 	frame.sf_sc.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
534 	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
535 
536 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
537 		frame.sf_extramask[i] = lmask.__bits[i+1];
538 
539 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
540 		/*
541 		 * Process has trashed its stack; give it an illegal
542 		 * instruction to halt it in its tracks.
543 		 */
544 		PROC_LOCK(p);
545 		sigexit(td, SIGILL);
546 	}
547 
548 	/*
549 	 * Build context to run handler in.
550 	 */
551 	regs->tf_rsp = PTROUT(fp);
552 	regs->tf_rip = linux32_sigcode;
553 	regs->tf_rflags &= ~(PSL_T | PSL_D);
554 	regs->tf_cs = _ucode32sel;
555 	regs->tf_ss = _udatasel;
556 	regs->tf_ds = _udatasel;
557 	regs->tf_es = _udatasel;
558 	regs->tf_fs = _ufssel;
559 	regs->tf_gs = _ugssel;
560 	regs->tf_flags = TF_HASSEGS;
561 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
562 	PROC_LOCK(p);
563 	mtx_lock(&psp->ps_mtx);
564 }
565 
566 /*
567  * System call to cleanup state after a signal
568  * has been taken.  Reset signal mask and
569  * stack state from context left by sendsig (above).
570  * Return to previous pc and psl as specified by
571  * context left by sendsig. Check carefully to
572  * make sure that the user has not modified the
573  * psl to gain improper privileges or to cause
574  * a machine fault.
575  */
576 int
577 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
578 {
579 	struct l_sigframe frame;
580 	struct trapframe *regs;
581 	sigset_t bmask;
582 	l_sigset_t lmask;
583 	int eflags, i;
584 	ksiginfo_t ksi;
585 
586 	regs = td->td_frame;
587 
588 #ifdef DEBUG
589 	if (ldebug(sigreturn))
590 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
591 #endif
592 	/*
593 	 * The trampoline code hands us the sigframe.
594 	 * It is unsafe to keep track of it ourselves, in the event that a
595 	 * program jumps out of a signal handler.
596 	 */
597 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
598 		return (EFAULT);
599 
600 	/*
601 	 * Check for security violations.
602 	 */
603 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
604 	eflags = frame.sf_sc.sc_eflags;
605 	if (!EFLAGS_SECURE(eflags, regs->tf_rflags))
606 		return(EINVAL);
607 
608 	/*
609 	 * Don't allow users to load a valid privileged %cs.  Let the
610 	 * hardware check for invalid selectors, excess privilege in
611 	 * other selectors, invalid %eip's and invalid %esp's.
612 	 */
613 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
614 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
615 		ksiginfo_init_trap(&ksi);
616 		ksi.ksi_signo = SIGBUS;
617 		ksi.ksi_code = BUS_OBJERR;
618 		ksi.ksi_trapno = T_PROTFLT;
619 		ksi.ksi_addr = (void *)regs->tf_rip;
620 		trapsignal(td, &ksi);
621 		return(EINVAL);
622 	}
623 
624 	lmask.__bits[0] = frame.sf_sc.sc_mask;
625 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
626 		lmask.__bits[i+1] = frame.sf_extramask[i];
627 	linux_to_bsd_sigset(&lmask, &bmask);
628 	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
629 
630 	/*
631 	 * Restore signal context.
632 	 */
633 	regs->tf_rdi    = frame.sf_sc.sc_edi;
634 	regs->tf_rsi    = frame.sf_sc.sc_esi;
635 	regs->tf_rbp    = frame.sf_sc.sc_ebp;
636 	regs->tf_rbx    = frame.sf_sc.sc_ebx;
637 	regs->tf_rdx    = frame.sf_sc.sc_edx;
638 	regs->tf_rcx    = frame.sf_sc.sc_ecx;
639 	regs->tf_rax    = frame.sf_sc.sc_eax;
640 	regs->tf_rip    = frame.sf_sc.sc_eip;
641 	regs->tf_cs     = frame.sf_sc.sc_cs;
642 	regs->tf_ds     = frame.sf_sc.sc_ds;
643 	regs->tf_es     = frame.sf_sc.sc_es;
644 	regs->tf_fs     = frame.sf_sc.sc_fs;
645 	regs->tf_gs     = frame.sf_sc.sc_gs;
646 	regs->tf_rflags = eflags;
647 	regs->tf_rsp    = frame.sf_sc.sc_esp_at_signal;
648 	regs->tf_ss     = frame.sf_sc.sc_ss;
649 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
650 
651 	return (EJUSTRETURN);
652 }
653 
654 /*
655  * System call to cleanup state after a signal
656  * has been taken.  Reset signal mask and
657  * stack state from context left by rt_sendsig (above).
658  * Return to previous pc and psl as specified by
659  * context left by sendsig. Check carefully to
660  * make sure that the user has not modified the
661  * psl to gain improper privileges or to cause
662  * a machine fault.
663  */
664 int
665 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
666 {
667 	struct l_ucontext uc;
668 	struct l_sigcontext *context;
669 	sigset_t bmask;
670 	l_stack_t *lss;
671 	stack_t ss;
672 	struct trapframe *regs;
673 	int eflags;
674 	ksiginfo_t ksi;
675 
676 	regs = td->td_frame;
677 
678 #ifdef DEBUG
679 	if (ldebug(rt_sigreturn))
680 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
681 #endif
682 	/*
683 	 * The trampoline code hands us the ucontext.
684 	 * It is unsafe to keep track of it ourselves, in the event that a
685 	 * program jumps out of a signal handler.
686 	 */
687 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
688 		return (EFAULT);
689 
690 	context = &uc.uc_mcontext;
691 
692 	/*
693 	 * Check for security violations.
694 	 */
695 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
696 	eflags = context->sc_eflags;
697 	if (!EFLAGS_SECURE(eflags, regs->tf_rflags))
698 		return(EINVAL);
699 
700 	/*
701 	 * Don't allow users to load a valid privileged %cs.  Let the
702 	 * hardware check for invalid selectors, excess privilege in
703 	 * other selectors, invalid %eip's and invalid %esp's.
704 	 */
705 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
706 	if (!CS_SECURE(context->sc_cs)) {
707 		ksiginfo_init_trap(&ksi);
708 		ksi.ksi_signo = SIGBUS;
709 		ksi.ksi_code = BUS_OBJERR;
710 		ksi.ksi_trapno = T_PROTFLT;
711 		ksi.ksi_addr = (void *)regs->tf_rip;
712 		trapsignal(td, &ksi);
713 		return(EINVAL);
714 	}
715 
716 	linux_to_bsd_sigset(&uc.uc_sigmask, &bmask);
717 	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
718 
719 	/*
720 	 * Restore signal context
721 	 */
722 	regs->tf_gs	= context->sc_gs;
723 	regs->tf_fs	= context->sc_fs;
724 	regs->tf_es	= context->sc_es;
725 	regs->tf_ds	= context->sc_ds;
726 	regs->tf_rdi    = context->sc_edi;
727 	regs->tf_rsi    = context->sc_esi;
728 	regs->tf_rbp    = context->sc_ebp;
729 	regs->tf_rbx    = context->sc_ebx;
730 	regs->tf_rdx    = context->sc_edx;
731 	regs->tf_rcx    = context->sc_ecx;
732 	regs->tf_rax    = context->sc_eax;
733 	regs->tf_rip    = context->sc_eip;
734 	regs->tf_cs     = context->sc_cs;
735 	regs->tf_rflags = eflags;
736 	regs->tf_rsp    = context->sc_esp_at_signal;
737 	regs->tf_ss     = context->sc_ss;
738 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
739 
740 	/*
741 	 * call sigaltstack & ignore results..
742 	 */
743 	lss = &uc.uc_stack;
744 	ss.ss_sp = PTRIN(lss->ss_sp);
745 	ss.ss_size = lss->ss_size;
746 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
747 
748 #ifdef DEBUG
749 	if (ldebug(rt_sigreturn))
750 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
751 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
752 #endif
753 	(void)kern_sigaltstack(td, &ss, NULL);
754 
755 	return (EJUSTRETURN);
756 }
757 
758 static int
759 linux32_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
760 {
761 	struct proc *p;
762 	struct trapframe *frame;
763 
764 	p = td->td_proc;
765 	frame = td->td_frame;
766 
767 	sa->args[0] = frame->tf_rbx;
768 	sa->args[1] = frame->tf_rcx;
769 	sa->args[2] = frame->tf_rdx;
770 	sa->args[3] = frame->tf_rsi;
771 	sa->args[4] = frame->tf_rdi;
772 	sa->args[5] = frame->tf_rbp;	/* Unconfirmed */
773 	sa->code = frame->tf_rax;
774 
775 	if (sa->code >= p->p_sysent->sv_size)
776 		sa->callp = &p->p_sysent->sv_table[0];
777 	else
778 		sa->callp = &p->p_sysent->sv_table[sa->code];
779 	sa->narg = sa->callp->sy_narg;
780 
781 	td->td_retval[0] = 0;
782 	td->td_retval[1] = frame->tf_rdx;
783 
784 	return (0);
785 }
786 
787 /*
788  * If a linux binary is exec'ing something, try this image activator
789  * first.  We override standard shell script execution in order to
790  * be able to modify the interpreter path.  We only do this if a linux
791  * binary is doing the exec, so we do not create an EXEC module for it.
792  */
793 static int	exec_linux_imgact_try(struct image_params *iparams);
794 
795 static int
796 exec_linux_imgact_try(struct image_params *imgp)
797 {
798 	const char *head = (const char *)imgp->image_header;
799 	char *rpath;
800 	int error = -1;
801 
802 	/*
803 	* The interpreter for shell scripts run from a linux binary needs
804 	* to be located in /compat/linux if possible in order to recursively
805 	* maintain linux path emulation.
806 	*/
807 	if (((const short *)head)[0] == SHELLMAGIC) {
808 		/*
809 		* Run our normal shell image activator.  If it succeeds attempt
810 		* to use the alternate path for the interpreter.  If an
811 		* alternate * path is found, use our stringspace to store it.
812 		*/
813 		if ((error = exec_shell_imgact(imgp)) == 0) {
814 			linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
815 			    imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0,
816 			    AT_FDCWD);
817 			if (rpath != NULL)
818 				imgp->args->fname_buf =
819 				    imgp->interpreter_name = rpath;
820 		}
821 	}
822 	return (error);
823 }
824 
825 /*
826  * Clear registers on exec
827  * XXX copied from ia32_signal.c.
828  */
829 static void
830 exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack)
831 {
832 	struct trapframe *regs = td->td_frame;
833 	struct pcb *pcb = td->td_pcb;
834 
835 	mtx_lock(&dt_lock);
836 	if (td->td_proc->p_md.md_ldt != NULL)
837 		user_ldt_free(td);
838 	else
839 		mtx_unlock(&dt_lock);
840 
841 	critical_enter();
842 	wrmsr(MSR_FSBASE, 0);
843 	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
844 	pcb->pcb_fsbase = 0;
845 	pcb->pcb_gsbase = 0;
846 	critical_exit();
847 	pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
848 
849 	bzero((char *)regs, sizeof(struct trapframe));
850 	regs->tf_rip = imgp->entry_addr;
851 	regs->tf_rsp = stack;
852 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
853 	regs->tf_gs = _ugssel;
854 	regs->tf_fs = _ufssel;
855 	regs->tf_es = _udatasel;
856 	regs->tf_ds = _udatasel;
857 	regs->tf_ss = _udatasel;
858 	regs->tf_flags = TF_HASSEGS;
859 	regs->tf_cs = _ucode32sel;
860 	regs->tf_rbx = imgp->ps_strings;
861 
862 	fpstate_drop(td);
863 
864 	/* Do full restore on return so that we can change to a different %cs */
865 	set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET);
866 	td->td_retval[1] = 0;
867 }
868 
869 /*
870  * XXX copied from ia32_sysvec.c.
871  */
872 static register_t *
873 linux_copyout_strings(struct image_params *imgp)
874 {
875 	int argc, envc;
876 	u_int32_t *vectp;
877 	char *stringp, *destp;
878 	u_int32_t *stack_base;
879 	struct linux32_ps_strings *arginfo;
880 
881 	/*
882 	 * Calculate string base and vector table pointers.
883 	 * Also deal with signal trampoline code for this exec type.
884 	 */
885 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
886 	destp =	(caddr_t)arginfo - SPARE_USRSPACE - linux_szplatform -
887 	    roundup((ARG_MAX - imgp->args->stringspace),
888 	    sizeof(char *));
889 
890 	/*
891 	 * Install LINUX_PLATFORM
892 	 */
893 	copyout(linux_platform, ((caddr_t)arginfo - linux_szplatform),
894 	    linux_szplatform);
895 
896 	/*
897 	 * If we have a valid auxargs ptr, prepare some room
898 	 * on the stack.
899 	 */
900 	if (imgp->auxargs) {
901 		/*
902 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
903 		 * lower compatibility.
904 		 */
905 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
906 		    (LINUX_AT_COUNT * 2);
907 		/*
908 		 * The '+ 2' is for the null pointers at the end of each of
909 		 * the arg and env vector sets,and imgp->auxarg_size is room
910 		 * for argument of Runtime loader.
911 		 */
912 		vectp = (u_int32_t *) (destp - (imgp->args->argc +
913 		    imgp->args->envc + 2 + imgp->auxarg_size) *
914 		    sizeof(u_int32_t));
915 
916 	} else
917 		/*
918 		 * The '+ 2' is for the null pointers at the end of each of
919 		 * the arg and env vector sets
920 		 */
921 		vectp = (u_int32_t *)(destp - (imgp->args->argc +
922 		    imgp->args->envc + 2) * sizeof(u_int32_t));
923 
924 	/*
925 	 * vectp also becomes our initial stack base
926 	 */
927 	stack_base = vectp;
928 
929 	stringp = imgp->args->begin_argv;
930 	argc = imgp->args->argc;
931 	envc = imgp->args->envc;
932 	/*
933 	 * Copy out strings - arguments and environment.
934 	 */
935 	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
936 
937 	/*
938 	 * Fill in "ps_strings" struct for ps, w, etc.
939 	 */
940 	suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp);
941 	suword32(&arginfo->ps_nargvstr, argc);
942 
943 	/*
944 	 * Fill in argument portion of vector table.
945 	 */
946 	for (; argc > 0; --argc) {
947 		suword32(vectp++, (uint32_t)(intptr_t)destp);
948 		while (*stringp++ != 0)
949 			destp++;
950 		destp++;
951 	}
952 
953 	/* a null vector table pointer separates the argp's from the envp's */
954 	suword32(vectp++, 0);
955 
956 	suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp);
957 	suword32(&arginfo->ps_nenvstr, envc);
958 
959 	/*
960 	 * Fill in environment portion of vector table.
961 	 */
962 	for (; envc > 0; --envc) {
963 		suword32(vectp++, (uint32_t)(intptr_t)destp);
964 		while (*stringp++ != 0)
965 			destp++;
966 		destp++;
967 	}
968 
969 	/* end of vector table is a null pointer */
970 	suword32(vectp, 0);
971 
972 	return ((register_t *)stack_base);
973 }
974 
975 static SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
976     "32-bit Linux emulation");
977 
978 static u_long	linux32_maxdsiz = LINUX32_MAXDSIZ;
979 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
980     &linux32_maxdsiz, 0, "");
981 static u_long	linux32_maxssiz = LINUX32_MAXSSIZ;
982 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
983     &linux32_maxssiz, 0, "");
984 static u_long	linux32_maxvmem = LINUX32_MAXVMEM;
985 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
986     &linux32_maxvmem, 0, "");
987 
988 static void
989 linux32_fixlimit(struct rlimit *rl, int which)
990 {
991 
992 	switch (which) {
993 	case RLIMIT_DATA:
994 		if (linux32_maxdsiz != 0) {
995 			if (rl->rlim_cur > linux32_maxdsiz)
996 				rl->rlim_cur = linux32_maxdsiz;
997 			if (rl->rlim_max > linux32_maxdsiz)
998 				rl->rlim_max = linux32_maxdsiz;
999 		}
1000 		break;
1001 	case RLIMIT_STACK:
1002 		if (linux32_maxssiz != 0) {
1003 			if (rl->rlim_cur > linux32_maxssiz)
1004 				rl->rlim_cur = linux32_maxssiz;
1005 			if (rl->rlim_max > linux32_maxssiz)
1006 				rl->rlim_max = linux32_maxssiz;
1007 		}
1008 		break;
1009 	case RLIMIT_VMEM:
1010 		if (linux32_maxvmem != 0) {
1011 			if (rl->rlim_cur > linux32_maxvmem)
1012 				rl->rlim_cur = linux32_maxvmem;
1013 			if (rl->rlim_max > linux32_maxvmem)
1014 				rl->rlim_max = linux32_maxvmem;
1015 		}
1016 		break;
1017 	}
1018 }
1019 
/*
 * System entry vector for the 32-bit Linux ELF ABI ("Linux ELF32").
 *
 * Both brand entries in this file (linux_brand and linux_glibc2brand) point
 * at this vector.  It is not fully constant: linux_vdso_install() passes it
 * to the vdso fixup/relocation helpers and stores the shared page object in
 * sv_shared_page_obj, which is why that member is absent from this
 * initializer.
 */
struct sysentvec elf_linux_sysvec = {
	/* Syscall table plus the bsd_to_linux signal and errno tables. */
	.sv_size	= LINUX_SYS_MAXSYSCALL,
	.sv_table	= linux_sysent,
	.sv_mask	= 0,
	.sv_sigsize	= LINUX_SIGTBLSZ,
	.sv_sigtbl	= bsd_to_linux_signal,
	.sv_errsize	= ELAST + 1,
	.sv_errtbl	= bsd_to_linux_errno,
	.sv_transtrap	= translate_traps,
	.sv_fixup	= elf_linux_fixup,
	.sv_sendsig	= linux_sendsig,
	/*
	 * Signal code starts at the locore start symbol; linux_szsigcode is
	 * computed from the start/end symbols in linux_vdso_install().
	 */
	.sv_sigcode	= &_binary_linux32_locore_o_start,
	.sv_szsigcode	= &linux_szsigcode,
	.sv_prepsyscall	= NULL,
	.sv_name	= "Linux ELF32",
	.sv_coredump	= elf32_coredump,
	.sv_imgact_try	= exec_linux_imgact_try,
	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
	/* Address-space bounds, user stack and ps_strings locations. */
	.sv_pagesize	= PAGE_SIZE,
	.sv_minuser	= VM_MIN_ADDRESS,
	.sv_maxuser	= LINUX32_MAXUSER,
	.sv_usrstack	= LINUX32_USRSTACK,
	.sv_psstrings	= LINUX32_PS_STRINGS,
	.sv_stackprot	= VM_PROT_ALL,
	.sv_copyout_strings = linux_copyout_strings,
	.sv_setregs	= exec_linux_setregs,
	/* Resource limits are capped by the compat.linux32.* tunables. */
	.sv_fixlimit	= linux32_fixlimit,
	.sv_maxssiz	= &linux32_maxssiz,
	.sv_flags	= SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP,
	.sv_set_syscall_retval = cpu_set_syscall_retval,
	.sv_fetch_syscall_args = linux32_fetch_syscall_args,
	.sv_syscallnames = NULL,
	/*
	 * One page at LINUX32_SHAREDPAGE; linux_vdso_install() panics if the
	 * signal code does not fit in sv_shared_page_len.
	 */
	.sv_shared_page_base = LINUX32_SHAREDPAGE,
	.sv_shared_page_len = PAGE_SIZE,
	.sv_schedtail	= linux_schedtail,
	.sv_thread_detach = linux_thread_detach,
};
1057 
1058 static void
1059 linux_vdso_install(void *param)
1060 {
1061 
1062 	linux_szsigcode = (&_binary_linux32_locore_o_end -
1063 	    &_binary_linux32_locore_o_start);
1064 
1065 	if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len)
1066 		panic("Linux invalid vdso size\n");
1067 
1068 	__elfN(linux_vdso_fixup)(&elf_linux_sysvec);
1069 
1070 	linux_shared_page_obj = __elfN(linux_shared_page_init)
1071 	    (&linux_shared_page_mapping);
1072 
1073 	__elfN(linux_vdso_reloc)(&elf_linux_sysvec, LINUX32_SHAREDPAGE);
1074 
1075 	bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping,
1076 	    linux_szsigcode);
1077 	elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj;
1078 }
1079 SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY,
1080     (sysinit_cfunc_t)linux_vdso_install, NULL);
1081 
1082 static void
1083 linux_vdso_deinstall(void *param)
1084 {
1085 
1086 	__elfN(linux_shared_page_fini)(linux_shared_page_obj);
1087 };
1088 SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST,
1089     (sysinit_cfunc_t)linux_vdso_deinstall, NULL);
1090 
1091 static char GNU_ABI_VENDOR[] = "GNU";
1092 static int GNULINUX_ABI_DESC = 0;
1093 
1094 static boolean_t
1095 linux32_trans_osrel(const Elf_Note *note, int32_t *osrel)
1096 {
1097 	const Elf32_Word *desc;
1098 	uintptr_t p;
1099 
1100 	p = (uintptr_t)(note + 1);
1101 	p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
1102 
1103 	desc = (const Elf32_Word *)p;
1104 	if (desc[0] != GNULINUX_ABI_DESC)
1105 		return (FALSE);
1106 
1107 	/*
1108 	 * For linux we encode osrel as follows (see linux_mib.c):
1109 	 * VVVMMMIII (version, major, minor), see linux_mib.c.
1110 	 */
1111 	*osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
1112 
1113 	return (TRUE);
1114 }
1115 
/*
 * Brand note shared by both Linux brand entries in this file.
 *
 * The header template describes a note named "GNU" (n_namesz counts the
 * terminating NUL, since it is sizeof the array), of type 1, with a 16-byte
 * descriptor.  BN_TRANSLATE_OSREL is set together with trans_osrel so that
 * linux32_trans_osrel() turns the descriptor into an osrel value.
 */
static Elf_Brandnote linux32_brandnote = {
	.hdr.n_namesz	= sizeof(GNU_ABI_VENDOR),
	.hdr.n_descsz	= 16,	/* XXX at least 16 */
	.hdr.n_type	= 1,
	.vendor		= GNU_ABI_VENDOR,
	.flags		= BN_TRANSLATE_OSREL,
	.trans_osrel	= linux32_trans_osrel
};
1124 
/*
 * ELF brand entry for i386 (EM_386) Linux binaries whose interpreter is
 * /lib/ld-linux.so.1.  Identical to linux_glibc2brand except for
 * interp_path.  Binaries are run with elf_linux_sysvec and resolved under
 * the /compat/linux emulation path; linux32_brandnote is attached for
 * note-based matching (BI_BRAND_NOTE).
 */
static Elf32_Brandinfo linux_brand = {
	.brand		= ELFOSABI_LINUX,
	.machine	= EM_386,
	.compat_3_brand	= "Linux",
	.emul_path	= "/compat/linux",
	.interp_path	= "/lib/ld-linux.so.1",
	.sysvec		= &elf_linux_sysvec,
	.interp_newpath	= NULL,
	.brand_note	= &linux32_brandnote,
	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
1136 
/*
 * ELF brand entry for i386 (EM_386) Linux binaries whose interpreter is
 * /lib/ld-linux.so.2.  Identical to linux_brand except for interp_path.
 * Binaries are run with elf_linux_sysvec and resolved under the
 * /compat/linux emulation path; linux32_brandnote is attached for
 * note-based matching (BI_BRAND_NOTE).
 */
static Elf32_Brandinfo linux_glibc2brand = {
	.brand		= ELFOSABI_LINUX,
	.machine	= EM_386,
	.compat_3_brand	= "Linux",
	.emul_path	= "/compat/linux",
	.interp_path	= "/lib/ld-linux.so.2",
	.sysvec		= &elf_linux_sysvec,
	.interp_newpath	= NULL,
	.brand_note	= &linux32_brandnote,
	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
1148 
/*
 * NULL-terminated list of the brand entries defined in this file.
 * linux_elf_modevent() walks it to insert the brands on MOD_LOAD and to
 * check/remove them on MOD_UNLOAD.  Not static, so it has external linkage.
 */
Elf32_Brandinfo *linux_brandlist[] = {
	&linux_brand,
	&linux_glibc2brand,
	NULL
};
1154 
1155 static int
1156 linux_elf_modevent(module_t mod, int type, void *data)
1157 {
1158 	Elf32_Brandinfo **brandinfo;
1159 	int error;
1160 	struct linux_ioctl_handler **lihp;
1161 	struct linux_device_handler **ldhp;
1162 
1163 	error = 0;
1164 
1165 	switch(type) {
1166 	case MOD_LOAD:
1167 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1168 		     ++brandinfo)
1169 			if (elf32_insert_brand_entry(*brandinfo) < 0)
1170 				error = EINVAL;
1171 		if (error == 0) {
1172 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1173 				linux_ioctl_register_handler(*lihp);
1174 			SET_FOREACH(ldhp, linux_device_handler_set)
1175 				linux_device_register_handler(*ldhp);
1176 			LIST_INIT(&futex_list);
1177 			mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
1178 			linux_exit_tag = EVENTHANDLER_REGISTER(process_exit,
1179 			    linux_proc_exit, NULL, 1000);
1180 			linux_exec_tag = EVENTHANDLER_REGISTER(process_exec,
1181 			    linux_proc_exec, NULL, 1000);
1182 			linux_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor,
1183 			    linux_thread_dtor, NULL, EVENTHANDLER_PRI_ANY);
1184 			linux_szplatform = roundup(strlen(linux_platform) + 1,
1185 			    sizeof(char *));
1186 			linux_osd_jail_register();
1187 			stclohz = (stathz ? stathz : hz);
1188 			if (bootverbose)
1189 				printf("Linux ELF exec handler installed\n");
1190 		} else
1191 			printf("cannot insert Linux ELF brand handler\n");
1192 		break;
1193 	case MOD_UNLOAD:
1194 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1195 		     ++brandinfo)
1196 			if (elf32_brand_inuse(*brandinfo))
1197 				error = EBUSY;
1198 		if (error == 0) {
1199 			for (brandinfo = &linux_brandlist[0];
1200 			     *brandinfo != NULL; ++brandinfo)
1201 				if (elf32_remove_brand_entry(*brandinfo) < 0)
1202 					error = EINVAL;
1203 		}
1204 		if (error == 0) {
1205 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1206 				linux_ioctl_unregister_handler(*lihp);
1207 			SET_FOREACH(ldhp, linux_device_handler_set)
1208 				linux_device_unregister_handler(*ldhp);
1209 			mtx_destroy(&futex_mtx);
1210 			EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
1211 			EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
1212 			EVENTHANDLER_DEREGISTER(thread_dtor, linux_thread_dtor_tag);
1213 			linux_osd_jail_deregister();
1214 			if (bootverbose)
1215 				printf("Linux ELF exec handler removed\n");
1216 		} else
1217 			printf("Could not deinstall ELF interpreter entry\n");
1218 		break;
1219 	default:
1220 		return (EOPNOTSUPP);
1221 	}
1222 	return (error);
1223 }
1224 
1225 static moduledata_t linux_elf_mod = {
1226 	"linuxelf",
1227 	linux_elf_modevent,
1228 	0
1229 };
1230 
1231 DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1232