xref: /freebsd/sys/amd64/linux32/linux32_sysvec.c (revision 26cf41d6cabd645bce6e77afefc7c6a24aa6d47a)
1 /*-
2  * Copyright (c) 2004 Tim J. Robbins
3  * Copyright (c) 2003 Peter Wemm
4  * Copyright (c) 2002 Doug Rabson
5  * Copyright (c) 1998-1999 Andrew Gallatin
6  * Copyright (c) 1994-1996 Søren Schmidt
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer
14  *    in this position and unchanged.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. The name of the author may not be used to endorse or promote products
19  *    derived from this software without specific prior written permission
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 #include "opt_compat.h"
36 
37 #ifndef COMPAT_FREEBSD32
38 #error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!"
39 #endif
40 
41 #define	__ELF_WORD_SIZE	32
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/exec.h>
46 #include <sys/fcntl.h>
47 #include <sys/imgact.h>
48 #include <sys/imgact_elf.h>
49 #include <sys/kernel.h>
50 #include <sys/lock.h>
51 #include <sys/malloc.h>
52 #include <sys/module.h>
53 #include <sys/mutex.h>
54 #include <sys/proc.h>
55 #include <sys/resourcevar.h>
56 #include <sys/signalvar.h>
57 #include <sys/sysctl.h>
58 #include <sys/syscallsubr.h>
59 #include <sys/sysent.h>
60 #include <sys/sysproto.h>
61 #include <sys/vnode.h>
62 #include <sys/eventhandler.h>
63 
64 #include <vm/vm.h>
65 #include <vm/pmap.h>
66 #include <vm/vm_extern.h>
67 #include <vm/vm_map.h>
68 #include <vm/vm_object.h>
69 #include <vm/vm_page.h>
70 #include <vm/vm_param.h>
71 
72 #include <machine/cpu.h>
73 #include <machine/md_var.h>
74 #include <machine/pcb.h>
75 #include <machine/specialreg.h>
76 
77 #include <amd64/linux32/linux.h>
78 #include <amd64/linux32/linux32_proto.h>
79 #include <compat/linux/linux_emul.h>
80 #include <compat/linux/linux_futex.h>
81 #include <compat/linux/linux_ioctl.h>
82 #include <compat/linux/linux_mib.h>
83 #include <compat/linux/linux_misc.h>
84 #include <compat/linux/linux_signal.h>
85 #include <compat/linux/linux_util.h>
86 #include <compat/linux/linux_vdso.h>
87 
/* Declare version 1 of the module named "linux". */
MODULE_VERSION(linux, 1);

/*
 * Define the malloc type M_LINUX (short name "linux", description
 * "Linux mode structures").
 */
MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
91 
/*
 * Store one ELF auxiliary vector entry as two consecutive 32-bit words:
 * first the entry type `id', then its value `val'.
 *
 * `pos' must be a modifiable pointer lvalue; it is post-incremented once
 * per stored word, so it ends up just past the entry.  The return values
 * of suword32() are not checked.
 *
 * Every argument is parenthesized so that expression arguments expand
 * with the intended precedence.
 */
#define	AUXARGS_ENTRY_32(pos, id, val)	\
	do {				\
		suword32((pos)++, (id));	\
		suword32((pos)++, (val));	\
	} while (0)
97 
/*
 * The two characters "#!" (0x23, 0x21) that open an interpreter script,
 * expressed as the 16-bit value they form when the first two bytes of an
 * image header are read as a short in host byte order.
 */
#if BYTE_ORDER == LITTLE_ENDIAN
#define SHELLMAGIC      0x2123 /* #! */
#else
#define SHELLMAGIC      0x2321
#endif
103 
/*
 * Allow the sendsig functions to use the ldebug() facility even
 * though they are not system calls themselves, by mapping them to
 * syscall number 0.  This is slightly less bogus than using
 * ldebug(sigreturn).
 */
#define	LINUX_SYS_linux_rt_sendsig	0
#define	LINUX_SYS_linux_sendsig		0
112 
/* Platform name string; not referenced in the visible part of this file. */
const char *linux_kplatform;
/* Size of the signal trampoline code; its address is used as sv_szsigcode. */
static int linux_szsigcode;
/*
 * Shared page object and its mapping.
 * NOTE(review): presumably set up by linux_vdso_install() -- confirm.
 */
static vm_object_t linux_shared_page_obj;
static char *linux_shared_page_mapping;
/* Bounds of the embedded linux32_locore.o image; the start is used as sv_sigcode. */
extern char _binary_linux32_locore_o_start;
extern char _binary_linux32_locore_o_end;

/* Linux system call table, installed as sv_table of elf_linux_sysvec. */
extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];

/* Linker sets of ioctl and device handlers. */
SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
SET_DECLARE(linux_device_handler_set, struct linux_device_handler);
124 
/* Forward declarations of static routines of this file. */
static int	elf_linux_fixup(register_t **stack_base,
		    struct image_params *iparams);
static register_t *linux_copyout_strings(struct image_params *imgp);
static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
static void	exec_linux_setregs(struct thread *td,
				   struct image_params *imgp, u_long stack);
static void	linux32_fixlimit(struct rlimit *rl, int which);
static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel);
static void	linux_vdso_install(void *param);
static void	linux_vdso_deinstall(void *param);

/*
 * Event handler registration tags.
 * NOTE(review): presumably filled in at module load and used to
 * deregister at unload; that code is not in the visible part of the file.
 */
static eventhandler_tag linux_exit_tag;
static eventhandler_tag linux_exec_tag;
static eventhandler_tag linux_thread_dtor_tag;
139 
/*
 * Linux system calls report errors as negative errno values, while
 * FreeBSD uses positive ones.  This table is indexed by the FreeBSD
 * errno and holds the corresponding negated Linux errno.
 *
 * Reference:
 *   FreeBSD: src/sys/sys/errno.h
 *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
 *            linux-2.6.17.8/include/asm-generic/errno.h
 *
 * Only the first 93 slots (FreeBSD errno 0..92) have explicit
 * initializers; any remaining slots up to ELAST are implicitly zero.
 * NOTE(review): if ELAST > 92, those errnos translate to 0 -- confirm
 * against the ELAST value in sys/errno.h.
 */
static int bsd_to_linux_errno[ELAST + 1] = {
	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,
	 -72, -67, -71
};
159 
/*
 * FreeBSD-to-Linux signal number translation.  Installed as sv_sigtbl of
 * elf_linux_sysvec and looked up with _SIG_IDX(sig) by the sendsig
 * routines.  Two slots hold 0.
 * NOTE(review): presumably those are FreeBSD signals with no Linux
 * counterpart -- confirm.
 */
int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
	0, LINUX_SIGUSR1, LINUX_SIGUSR2
};
170 
/*
 * Linux-to-FreeBSD signal number translation.
 * NOTE(review): presumably indexed like bsd_to_linux_signal, i.e. by
 * Linux signal number through _SIG_IDX(); its users are outside the
 * visible part of this file -- confirm.
 */
int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
	SIGHUP, SIGINT, SIGQUIT, SIGILL,
	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
	SIGIO, SIGURG, SIGSYS
};
181 
/* Value reported for trap codes that have no entry in the table below. */
#define LINUX_T_UNKNOWN  255
/*
 * Trap number translation, indexed by the FreeBSD trap code named in
 * each entry's comment; the stored value is what the sendsig routines
 * place in sc_trapno.
 */
static int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/* 0 */
	6,			/* 1  T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/* 2 */
	3,			/* 3  T_BPTFLT */
	LINUX_T_UNKNOWN,	/* 4 */
	LINUX_T_UNKNOWN,	/* 5 */
	16,			/* 6  T_ARITHTRAP */
	254,			/* 7  T_ASTFLT */
	LINUX_T_UNKNOWN,	/* 8 */
	13,			/* 9  T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};
/*
 * Bounds-checked lookup in the table above; out-of-range codes yield
 * LINUX_T_UNKNOWN.  The bound is an unsigned size, so a negative int
 * `code' is converted to a large unsigned value and is also treated as
 * out of range.  Note that `code' is evaluated twice.
 */
#define bsd_to_linux_trapcode(code) \
    ((code)<sizeof(_bsd_to_linux_trapcode)/sizeof(*_bsd_to_linux_trapcode)? \
     _bsd_to_linux_trapcode[(code)]: \
     LINUX_T_UNKNOWN)
220 
/*
 * Argument/environment bookkeeping record that linux_copyout_strings()
 * fills in at user address LINUX32_PS_STRINGS.  The string pointers are
 * stored as 32-bit values.
 */
struct linux32_ps_strings {
	u_int32_t ps_argvstr;	/* first of 0 or more argument strings */
	u_int ps_nargvstr;	/* the number of argument strings */
	u_int32_t ps_envstr;	/* first of 0 or more environment strings */
	u_int ps_nenvstr;	/* the number of environment strings */
};
227 
/*
 * Symbols declared through the linux_vdso.h helper macros.  In this file
 * linux32_sigcode and linux32_rt_sigcode are loaded into %rip when a
 * signal handler is entered, linux32_vsyscall is exported as
 * LINUX_AT_SYSINFO and linux_platform as LINUX_AT_PLATFORM.
 * NOTE(review): presumably resolved to addresses inside the shared page
 * when the vdso is installed -- confirm in linux_vdso.h.
 */
LINUX_VDSO_SYM_INTPTR(linux32_sigcode);
LINUX_VDSO_SYM_INTPTR(linux32_rt_sigcode);
LINUX_VDSO_SYM_INTPTR(linux32_vsyscall);
LINUX_VDSO_SYM_CHAR(linux_platform);
232 
233 /*
234  * If FreeBSD & Linux have a difference of opinion about what a trap
235  * means, deal with it here.
236  *
237  * MPSAFE
238  */
239 static int
240 translate_traps(int signal, int trap_code)
241 {
242 	if (signal != SIGBUS)
243 		return signal;
244 	switch (trap_code) {
245 	case T_PROTFLT:
246 	case T_TSSFLT:
247 	case T_DOUBLEFLT:
248 	case T_PAGEFLT:
249 		return SIGSEGV;
250 	default:
251 		return signal;
252 	}
253 }
254 
255 static int
256 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
257 {
258 	Elf32_Auxargs *args;
259 	Elf32_Addr *base;
260 	Elf32_Addr *pos;
261 	struct linux32_ps_strings *arginfo;
262 
263 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
264 
265 	KASSERT(curthread->td_proc == imgp->proc,
266 	    ("unsafe elf_linux_fixup(), should be curproc"));
267 	base = (Elf32_Addr *)*stack_base;
268 	args = (Elf32_Auxargs *)imgp->auxargs;
269 	pos = base + (imgp->args->argc + imgp->args->envc + 2);
270 
271 	AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO_EHDR,
272 	    imgp->proc->p_sysent->sv_shared_page_base);
273 	AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO, linux32_vsyscall);
274 	AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature);
275 
276 	/*
277 	 * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0,
278 	 * as it has appeared in the 2.4.0-rc7 first time.
279 	 * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK),
280 	 * glibc falls back to the hard-coded CLK_TCK value when aux entry
281 	 * is not present.
282 	 * Also see linux_times() implementation.
283 	 */
284 	if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
285 		AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz);
286 	AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
287 	AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
288 	AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
289 	AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
290 	AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
291 	AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
292 	AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
293 	AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, 0);
294 	AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
295 	AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
296 	AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
297 	AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
298 	AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform));
299 	if (args->execfd != -1)
300 		AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
301 	AUXARGS_ENTRY_32(pos, AT_NULL, 0);
302 
303 	free(imgp->auxargs, M_TEMP);
304 	imgp->auxargs = NULL;
305 
306 	base--;
307 	suword32(base, (uint32_t)imgp->args->argc);
308 	*stack_base = (register_t *)base;
309 	return (0);
310 }
311 
/*
 * Deliver a signal to the current thread using the Linux "rt" signal
 * frame (struct l_rt_sigframe), which carries a siginfo and a ucontext.
 * Called from linux_sendsig() for handlers listed in ps_siginfo.
 *
 * catcher: user address of the signal handler.
 * ksi:     signal information; ksi_signo, ksi_code and ksi_addr are used.
 * mask:    signal mask to record in the frame for sigreturn.
 *
 * Locking: entered with the process lock and ps_mtx held (both are
 * asserted).  ps_mtx is dropped once the frame address is chosen and
 * the process lock once the alternate stack state has been sampled; both
 * are re-acquired before returning.
 */
static void
linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct sigacts *psp;
	struct trapframe *regs;
	struct l_rt_sigframe *fp, frame;
	int oonstack;
	int sig;
	int code;

	sig = ksi->ksi_signo;
	code = ksi->ksi_code;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

#ifdef DEBUG
	if (ldebug(rt_sendsig))
		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
		    catcher, sig, (void*)mask, code);
#endif
	/*
	 * Allocate space for the signal handler context: at the top of the
	 * alternate stack if one is enabled, requested for this signal and
	 * not already in use, otherwise just below the current user %rsp.
	 */
	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
	} else
		fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
	mtx_unlock(&psp->ps_mtx);

	/*
	 * Build the argument list for the signal handler.
	 * The signal number is translated through the ABI's signal table
	 * when it falls within the table's size.
	 */
	if (p->p_sysent->sv_sigtbl)
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];

	bzero(&frame, sizeof(frame));

	frame.sf_handler = PTROUT(catcher);
	frame.sf_sig = sig;
	/* These are user addresses inside the frame that will be copied out. */
	frame.sf_siginfo = PTROUT(&fp->sf_si);
	frame.sf_ucontext = PTROUT(&fp->sf_sc);

	/* Fill in POSIX parts */
	ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);

	/*
	 * Build the signal context to be used by sigreturn
	 * and libgcc unwind.
	 */
	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
	frame.sf_sc.uc_link = 0;		/* XXX ??? */

	frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
	PROC_UNLOCK(p);

	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);

	/* Save the interrupted register state into the Linux mcontext. */
	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_rdi;
	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_rsi;
	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_rbp;
	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_rbx;
	frame.sf_sc.uc_mcontext.sc_esp    = regs->tf_rsp;
	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_rdx;
	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_rcx;
	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_rax;
	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_rip;
	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
	frame.sf_sc.uc_mcontext.sc_gs     = regs->tf_gs;
	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
	frame.sf_sc.uc_mcontext.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);

#ifdef DEBUG
	if (ldebug(rt_sendsig))
		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
#endif

	if (copyout(&frame, fp, sizeof(frame)) != 0) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 * NOTE(review): sigexit() is presumably not expected to
		 * return -- confirm.
		 */
#ifdef DEBUG
		if (ldebug(rt_sendsig))
			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
			    fp, oonstack);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	/*
	 * Build context to run handler in: resume in the rt signal
	 * trampoline with the stack pointer at the new frame and the
	 * 32-bit user segment selectors loaded.
	 */
	regs->tf_rsp = PTROUT(fp);
	regs->tf_rip = linux32_rt_sigcode;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucode32sel;
	regs->tf_ss = _udatasel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	/* Restore the lock state the caller entered with. */
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
440 
441 
/*
 * Deliver a signal to the current thread (the sv_sendsig hook of
 * elf_linux_sysvec).
 *
 * Handlers listed in ps_siginfo are handed to linux_rt_sendsig().
 * Otherwise a struct l_sigframe holding the handler address, the
 * translated signal number and the interrupted register state is copied
 * to the user stack, and the thread is redirected to the linux32_sigcode
 * trampoline.  The saved context is what linux_sigreturn() later uses to
 * restore the signal mask and registers.
 *
 * catcher: user address of the signal handler.
 * ksi:     signal information; ksi_signo, ksi_code and ksi_addr are used.
 * mask:    signal mask to record in the frame for sigreturn.
 *
 * Locking: entered with the process lock and ps_mtx held (both are
 * asserted); both are dropped while the frame is built and copied out
 * and are re-acquired before returning.
 */
static void
linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct sigacts *psp;
	struct trapframe *regs;
	struct l_sigframe *fp, frame;
	l_sigset_t lmask;
	int oonstack, i;
	int sig, code;

	sig = ksi->ksi_signo;
	code = ksi->ksi_code;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		linux_rt_sendsig(catcher, ksi, mask);
		return;
	}

	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

#ifdef DEBUG
	if (ldebug(sendsig))
		printf(ARGS(sendsig, "%p, %d, %p, %u"),
		    catcher, sig, (void*)mask, code);
#endif

	/*
	 * Allocate space for the signal handler context: at the top of the
	 * alternate stack if one is enabled, requested for this signal and
	 * not already in use, otherwise just below the current user %rsp.
	 */
	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
	} else
		fp = (struct l_sigframe *)regs->tf_rsp - 1;
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Build the argument list for the signal handler.
	 * The signal number is translated through the ABI's signal table
	 * when it falls within the table's size.
	 */
	if (p->p_sysent->sv_sigtbl)
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];

	bzero(&frame, sizeof(frame));

	frame.sf_handler = PTROUT(catcher);
	frame.sf_sig = sig;

	bsd_to_linux_sigset(mask, &lmask);

	/*
	 * Build the signal context to be used by sigreturn.
	 */
	frame.sf_sc.sc_mask   = lmask.__bits[0];
	frame.sf_sc.sc_gs     = regs->tf_gs;
	frame.sf_sc.sc_fs     = regs->tf_fs;
	frame.sf_sc.sc_es     = regs->tf_es;
	frame.sf_sc.sc_ds     = regs->tf_ds;
	frame.sf_sc.sc_edi    = regs->tf_rdi;
	frame.sf_sc.sc_esi    = regs->tf_rsi;
	frame.sf_sc.sc_ebp    = regs->tf_rbp;
	frame.sf_sc.sc_ebx    = regs->tf_rbx;
	frame.sf_sc.sc_esp    = regs->tf_rsp;
	frame.sf_sc.sc_edx    = regs->tf_rdx;
	frame.sf_sc.sc_ecx    = regs->tf_rcx;
	frame.sf_sc.sc_eax    = regs->tf_rax;
	frame.sf_sc.sc_eip    = regs->tf_rip;
	frame.sf_sc.sc_cs     = regs->tf_cs;
	frame.sf_sc.sc_eflags = regs->tf_rflags;
	frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
	frame.sf_sc.sc_ss     = regs->tf_ss;
	frame.sf_sc.sc_err    = regs->tf_err;
	frame.sf_sc.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);

	/* Mask words beyond the first do not fit in sc_mask. */
	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
		frame.sf_extramask[i] = lmask.__bits[i+1];

	if (copyout(&frame, fp, sizeof(frame)) != 0) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 * NOTE(review): sigexit() is presumably not expected to
		 * return -- confirm.
		 */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	/*
	 * Build context to run handler in: resume in the signal trampoline
	 * with the stack pointer at the new frame and the 32-bit user
	 * segment selectors loaded.
	 */
	regs->tf_rsp = PTROUT(fp);
	regs->tf_rip = linux32_sigcode;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucode32sel;
	regs->tf_ss = _udatasel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	/* Restore the lock state the caller entered with. */
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
564 
565 /*
566  * System call to cleanup state after a signal
567  * has been taken.  Reset signal mask and
568  * stack state from context left by sendsig (above).
569  * Return to previous pc and psl as specified by
570  * context left by sendsig. Check carefully to
571  * make sure that the user has not modified the
572  * psl to gain improper privileges or to cause
573  * a machine fault.
574  */
575 int
576 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
577 {
578 	struct l_sigframe frame;
579 	struct trapframe *regs;
580 	sigset_t bmask;
581 	l_sigset_t lmask;
582 	int eflags, i;
583 	ksiginfo_t ksi;
584 
585 	regs = td->td_frame;
586 
587 #ifdef DEBUG
588 	if (ldebug(sigreturn))
589 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
590 #endif
591 	/*
592 	 * The trampoline code hands us the sigframe.
593 	 * It is unsafe to keep track of it ourselves, in the event that a
594 	 * program jumps out of a signal handler.
595 	 */
596 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
597 		return (EFAULT);
598 
599 	/*
600 	 * Check for security violations.
601 	 */
602 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
603 	eflags = frame.sf_sc.sc_eflags;
604 	if (!EFLAGS_SECURE(eflags, regs->tf_rflags))
605 		return(EINVAL);
606 
607 	/*
608 	 * Don't allow users to load a valid privileged %cs.  Let the
609 	 * hardware check for invalid selectors, excess privilege in
610 	 * other selectors, invalid %eip's and invalid %esp's.
611 	 */
612 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
613 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
614 		ksiginfo_init_trap(&ksi);
615 		ksi.ksi_signo = SIGBUS;
616 		ksi.ksi_code = BUS_OBJERR;
617 		ksi.ksi_trapno = T_PROTFLT;
618 		ksi.ksi_addr = (void *)regs->tf_rip;
619 		trapsignal(td, &ksi);
620 		return(EINVAL);
621 	}
622 
623 	lmask.__bits[0] = frame.sf_sc.sc_mask;
624 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
625 		lmask.__bits[i+1] = frame.sf_extramask[i];
626 	linux_to_bsd_sigset(&lmask, &bmask);
627 	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
628 
629 	/*
630 	 * Restore signal context.
631 	 */
632 	regs->tf_rdi    = frame.sf_sc.sc_edi;
633 	regs->tf_rsi    = frame.sf_sc.sc_esi;
634 	regs->tf_rbp    = frame.sf_sc.sc_ebp;
635 	regs->tf_rbx    = frame.sf_sc.sc_ebx;
636 	regs->tf_rdx    = frame.sf_sc.sc_edx;
637 	regs->tf_rcx    = frame.sf_sc.sc_ecx;
638 	regs->tf_rax    = frame.sf_sc.sc_eax;
639 	regs->tf_rip    = frame.sf_sc.sc_eip;
640 	regs->tf_cs     = frame.sf_sc.sc_cs;
641 	regs->tf_ds     = frame.sf_sc.sc_ds;
642 	regs->tf_es     = frame.sf_sc.sc_es;
643 	regs->tf_fs     = frame.sf_sc.sc_fs;
644 	regs->tf_gs     = frame.sf_sc.sc_gs;
645 	regs->tf_rflags = eflags;
646 	regs->tf_rsp    = frame.sf_sc.sc_esp_at_signal;
647 	regs->tf_ss     = frame.sf_sc.sc_ss;
648 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
649 
650 	return (EJUSTRETURN);
651 }
652 
653 /*
654  * System call to cleanup state after a signal
655  * has been taken.  Reset signal mask and
656  * stack state from context left by rt_sendsig (above).
657  * Return to previous pc and psl as specified by
658  * context left by sendsig. Check carefully to
659  * make sure that the user has not modified the
660  * psl to gain improper privileges or to cause
661  * a machine fault.
662  */
663 int
664 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
665 {
666 	struct l_ucontext uc;
667 	struct l_sigcontext *context;
668 	sigset_t bmask;
669 	l_stack_t *lss;
670 	stack_t ss;
671 	struct trapframe *regs;
672 	int eflags;
673 	ksiginfo_t ksi;
674 
675 	regs = td->td_frame;
676 
677 #ifdef DEBUG
678 	if (ldebug(rt_sigreturn))
679 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
680 #endif
681 	/*
682 	 * The trampoline code hands us the ucontext.
683 	 * It is unsafe to keep track of it ourselves, in the event that a
684 	 * program jumps out of a signal handler.
685 	 */
686 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
687 		return (EFAULT);
688 
689 	context = &uc.uc_mcontext;
690 
691 	/*
692 	 * Check for security violations.
693 	 */
694 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
695 	eflags = context->sc_eflags;
696 	if (!EFLAGS_SECURE(eflags, regs->tf_rflags))
697 		return(EINVAL);
698 
699 	/*
700 	 * Don't allow users to load a valid privileged %cs.  Let the
701 	 * hardware check for invalid selectors, excess privilege in
702 	 * other selectors, invalid %eip's and invalid %esp's.
703 	 */
704 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
705 	if (!CS_SECURE(context->sc_cs)) {
706 		ksiginfo_init_trap(&ksi);
707 		ksi.ksi_signo = SIGBUS;
708 		ksi.ksi_code = BUS_OBJERR;
709 		ksi.ksi_trapno = T_PROTFLT;
710 		ksi.ksi_addr = (void *)regs->tf_rip;
711 		trapsignal(td, &ksi);
712 		return(EINVAL);
713 	}
714 
715 	linux_to_bsd_sigset(&uc.uc_sigmask, &bmask);
716 	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
717 
718 	/*
719 	 * Restore signal context
720 	 */
721 	regs->tf_gs	= context->sc_gs;
722 	regs->tf_fs	= context->sc_fs;
723 	regs->tf_es	= context->sc_es;
724 	regs->tf_ds	= context->sc_ds;
725 	regs->tf_rdi    = context->sc_edi;
726 	regs->tf_rsi    = context->sc_esi;
727 	regs->tf_rbp    = context->sc_ebp;
728 	regs->tf_rbx    = context->sc_ebx;
729 	regs->tf_rdx    = context->sc_edx;
730 	regs->tf_rcx    = context->sc_ecx;
731 	regs->tf_rax    = context->sc_eax;
732 	regs->tf_rip    = context->sc_eip;
733 	regs->tf_cs     = context->sc_cs;
734 	regs->tf_rflags = eflags;
735 	regs->tf_rsp    = context->sc_esp_at_signal;
736 	regs->tf_ss     = context->sc_ss;
737 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
738 
739 	/*
740 	 * call sigaltstack & ignore results..
741 	 */
742 	lss = &uc.uc_stack;
743 	ss.ss_sp = PTRIN(lss->ss_sp);
744 	ss.ss_size = lss->ss_size;
745 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
746 
747 #ifdef DEBUG
748 	if (ldebug(rt_sigreturn))
749 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
750 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
751 #endif
752 	(void)kern_sigaltstack(td, &ss, NULL);
753 
754 	return (EJUSTRETURN);
755 }
756 
757 static int
758 linux32_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
759 {
760 	struct proc *p;
761 	struct trapframe *frame;
762 
763 	p = td->td_proc;
764 	frame = td->td_frame;
765 
766 	sa->args[0] = frame->tf_rbx;
767 	sa->args[1] = frame->tf_rcx;
768 	sa->args[2] = frame->tf_rdx;
769 	sa->args[3] = frame->tf_rsi;
770 	sa->args[4] = frame->tf_rdi;
771 	sa->args[5] = frame->tf_rbp;	/* Unconfirmed */
772 	sa->code = frame->tf_rax;
773 
774 	if (sa->code >= p->p_sysent->sv_size)
775 		sa->callp = &p->p_sysent->sv_table[0];
776 	else
777 		sa->callp = &p->p_sysent->sv_table[sa->code];
778 	sa->narg = sa->callp->sy_narg;
779 
780 	td->td_retval[0] = 0;
781 	td->td_retval[1] = frame->tf_rdx;
782 
783 	return (0);
784 }
785 
786 /*
787  * If a linux binary is exec'ing something, try this image activator
788  * first.  We override standard shell script execution in order to
789  * be able to modify the interpreter path.  We only do this if a linux
790  * binary is doing the exec, so we do not create an EXEC module for it.
791  */
792 static int	exec_linux_imgact_try(struct image_params *iparams);
793 
794 static int
795 exec_linux_imgact_try(struct image_params *imgp)
796 {
797 	const char *head = (const char *)imgp->image_header;
798 	char *rpath;
799 	int error = -1;
800 
801 	/*
802 	* The interpreter for shell scripts run from a linux binary needs
803 	* to be located in /compat/linux if possible in order to recursively
804 	* maintain linux path emulation.
805 	*/
806 	if (((const short *)head)[0] == SHELLMAGIC) {
807 		/*
808 		* Run our normal shell image activator.  If it succeeds attempt
809 		* to use the alternate path for the interpreter.  If an
810 		* alternate * path is found, use our stringspace to store it.
811 		*/
812 		if ((error = exec_shell_imgact(imgp)) == 0) {
813 			linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
814 			    imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0,
815 			    AT_FDCWD);
816 			if (rpath != NULL)
817 				imgp->args->fname_buf =
818 				    imgp->interpreter_name = rpath;
819 		}
820 	}
821 	return (error);
822 }
823 
824 /*
825  * Clear registers on exec
826  * XXX copied from ia32_signal.c.
827  */
828 static void
829 exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack)
830 {
831 	struct trapframe *regs = td->td_frame;
832 	struct pcb *pcb = td->td_pcb;
833 
834 	mtx_lock(&dt_lock);
835 	if (td->td_proc->p_md.md_ldt != NULL)
836 		user_ldt_free(td);
837 	else
838 		mtx_unlock(&dt_lock);
839 
840 	critical_enter();
841 	wrmsr(MSR_FSBASE, 0);
842 	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
843 	pcb->pcb_fsbase = 0;
844 	pcb->pcb_gsbase = 0;
845 	critical_exit();
846 	pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
847 
848 	bzero((char *)regs, sizeof(struct trapframe));
849 	regs->tf_rip = imgp->entry_addr;
850 	regs->tf_rsp = stack;
851 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
852 	regs->tf_gs = _ugssel;
853 	regs->tf_fs = _ufssel;
854 	regs->tf_es = _udatasel;
855 	regs->tf_ds = _udatasel;
856 	regs->tf_ss = _udatasel;
857 	regs->tf_flags = TF_HASSEGS;
858 	regs->tf_cs = _ucode32sel;
859 	regs->tf_rbx = imgp->ps_strings;
860 
861 	fpstate_drop(td);
862 
863 	/* Do full restore on return so that we can change to a different %cs */
864 	set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET);
865 	td->td_retval[1] = 0;
866 }
867 
868 /*
869  * XXX copied from ia32_sysvec.c.
870  */
871 static register_t *
872 linux_copyout_strings(struct image_params *imgp)
873 {
874 	int argc, envc;
875 	u_int32_t *vectp;
876 	char *stringp, *destp;
877 	u_int32_t *stack_base;
878 	struct linux32_ps_strings *arginfo;
879 
880 	/*
881 	 * Calculate string base and vector table pointers.
882 	 */
883 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
884 	destp =	(caddr_t)arginfo - SPARE_USRSPACE -
885 	    roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
886 
887 	/*
888 	 * If we have a valid auxargs ptr, prepare some room
889 	 * on the stack.
890 	 */
891 	if (imgp->auxargs) {
892 		/*
893 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
894 		 * lower compatibility.
895 		 */
896 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
897 		    (LINUX_AT_COUNT * 2);
898 		/*
899 		 * The '+ 2' is for the null pointers at the end of each of
900 		 * the arg and env vector sets,and imgp->auxarg_size is room
901 		 * for argument of Runtime loader.
902 		 */
903 		vectp = (u_int32_t *) (destp - (imgp->args->argc +
904 		    imgp->args->envc + 2 + imgp->auxarg_size) *
905 		    sizeof(u_int32_t));
906 
907 	} else
908 		/*
909 		 * The '+ 2' is for the null pointers at the end of each of
910 		 * the arg and env vector sets
911 		 */
912 		vectp = (u_int32_t *)(destp - (imgp->args->argc +
913 		    imgp->args->envc + 2) * sizeof(u_int32_t));
914 
915 	/*
916 	 * vectp also becomes our initial stack base
917 	 */
918 	stack_base = vectp;
919 
920 	stringp = imgp->args->begin_argv;
921 	argc = imgp->args->argc;
922 	envc = imgp->args->envc;
923 	/*
924 	 * Copy out strings - arguments and environment.
925 	 */
926 	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
927 
928 	/*
929 	 * Fill in "ps_strings" struct for ps, w, etc.
930 	 */
931 	suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp);
932 	suword32(&arginfo->ps_nargvstr, argc);
933 
934 	/*
935 	 * Fill in argument portion of vector table.
936 	 */
937 	for (; argc > 0; --argc) {
938 		suword32(vectp++, (uint32_t)(intptr_t)destp);
939 		while (*stringp++ != 0)
940 			destp++;
941 		destp++;
942 	}
943 
944 	/* a null vector table pointer separates the argp's from the envp's */
945 	suword32(vectp++, 0);
946 
947 	suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp);
948 	suword32(&arginfo->ps_nenvstr, envc);
949 
950 	/*
951 	 * Fill in environment portion of vector table.
952 	 */
953 	for (; envc > 0; --envc) {
954 		suword32(vectp++, (uint32_t)(intptr_t)destp);
955 		while (*stringp++ != 0)
956 			destp++;
957 		destp++;
958 	}
959 
960 	/* end of vector table is a null pointer */
961 	suword32(vectp, 0);
962 
963 	return ((register_t *)stack_base);
964 }
965 
966 static SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
967     "32-bit Linux emulation");
968 
969 static u_long	linux32_maxdsiz = LINUX32_MAXDSIZ;
970 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
971     &linux32_maxdsiz, 0, "");
972 static u_long	linux32_maxssiz = LINUX32_MAXSSIZ;
973 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
974     &linux32_maxssiz, 0, "");
975 static u_long	linux32_maxvmem = LINUX32_MAXVMEM;
976 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
977     &linux32_maxvmem, 0, "");
978 
979 static void
980 linux32_fixlimit(struct rlimit *rl, int which)
981 {
982 
983 	switch (which) {
984 	case RLIMIT_DATA:
985 		if (linux32_maxdsiz != 0) {
986 			if (rl->rlim_cur > linux32_maxdsiz)
987 				rl->rlim_cur = linux32_maxdsiz;
988 			if (rl->rlim_max > linux32_maxdsiz)
989 				rl->rlim_max = linux32_maxdsiz;
990 		}
991 		break;
992 	case RLIMIT_STACK:
993 		if (linux32_maxssiz != 0) {
994 			if (rl->rlim_cur > linux32_maxssiz)
995 				rl->rlim_cur = linux32_maxssiz;
996 			if (rl->rlim_max > linux32_maxssiz)
997 				rl->rlim_max = linux32_maxssiz;
998 		}
999 		break;
1000 	case RLIMIT_VMEM:
1001 		if (linux32_maxvmem != 0) {
1002 			if (rl->rlim_cur > linux32_maxvmem)
1003 				rl->rlim_cur = linux32_maxvmem;
1004 			if (rl->rlim_max > linux32_maxvmem)
1005 				rl->rlim_max = linux32_maxvmem;
1006 		}
1007 		break;
1008 	}
1009 }
1010 
1011 struct sysentvec elf_linux_sysvec = {
1012 	.sv_size	= LINUX_SYS_MAXSYSCALL,
1013 	.sv_table	= linux_sysent,
1014 	.sv_mask	= 0,
1015 	.sv_sigsize	= LINUX_SIGTBLSZ,
1016 	.sv_sigtbl	= bsd_to_linux_signal,
1017 	.sv_errsize	= ELAST + 1,
1018 	.sv_errtbl	= bsd_to_linux_errno,
1019 	.sv_transtrap	= translate_traps,
1020 	.sv_fixup	= elf_linux_fixup,
1021 	.sv_sendsig	= linux_sendsig,
1022 	.sv_sigcode	= &_binary_linux32_locore_o_start,
1023 	.sv_szsigcode	= &linux_szsigcode,
1024 	.sv_prepsyscall	= NULL,
1025 	.sv_name	= "Linux ELF32",
1026 	.sv_coredump	= elf32_coredump,
1027 	.sv_imgact_try	= exec_linux_imgact_try,
1028 	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
1029 	.sv_pagesize	= PAGE_SIZE,
1030 	.sv_minuser	= VM_MIN_ADDRESS,
1031 	.sv_maxuser	= LINUX32_MAXUSER,
1032 	.sv_usrstack	= LINUX32_USRSTACK,
1033 	.sv_psstrings	= LINUX32_PS_STRINGS,
1034 	.sv_stackprot	= VM_PROT_ALL,
1035 	.sv_copyout_strings = linux_copyout_strings,
1036 	.sv_setregs	= exec_linux_setregs,
1037 	.sv_fixlimit	= linux32_fixlimit,
1038 	.sv_maxssiz	= &linux32_maxssiz,
1039 	.sv_flags	= SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP,
1040 	.sv_set_syscall_retval = cpu_set_syscall_retval,
1041 	.sv_fetch_syscall_args = linux32_fetch_syscall_args,
1042 	.sv_syscallnames = NULL,
1043 	.sv_shared_page_base = LINUX32_SHAREDPAGE,
1044 	.sv_shared_page_len = PAGE_SIZE,
1045 	.sv_schedtail	= linux_schedtail,
1046 	.sv_thread_detach = linux_thread_detach,
1047 };
1048 
1049 static void
1050 linux_vdso_install(void *param)
1051 {
1052 
1053 	linux_szsigcode = (&_binary_linux32_locore_o_end -
1054 	    &_binary_linux32_locore_o_start);
1055 
1056 	if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len)
1057 		panic("Linux invalid vdso size\n");
1058 
1059 	__elfN(linux_vdso_fixup)(&elf_linux_sysvec);
1060 
1061 	linux_shared_page_obj = __elfN(linux_shared_page_init)
1062 	    (&linux_shared_page_mapping);
1063 
1064 	__elfN(linux_vdso_reloc)(&elf_linux_sysvec, LINUX32_SHAREDPAGE);
1065 
1066 	bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping,
1067 	    linux_szsigcode);
1068 	elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj;
1069 
1070 	linux_kplatform = linux_shared_page_mapping +
1071 	    (linux_platform - (caddr_t)LINUX32_SHAREDPAGE);
1072 }
1073 SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY,
1074     (sysinit_cfunc_t)linux_vdso_install, NULL);
1075 
1076 static void
1077 linux_vdso_deinstall(void *param)
1078 {
1079 
1080 	__elfN(linux_shared_page_fini)(linux_shared_page_obj);
1081 };
1082 SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST,
1083     (sysinit_cfunc_t)linux_vdso_deinstall, NULL);
1084 
1085 static char GNU_ABI_VENDOR[] = "GNU";
1086 static int GNULINUX_ABI_DESC = 0;
1087 
1088 static boolean_t
1089 linux32_trans_osrel(const Elf_Note *note, int32_t *osrel)
1090 {
1091 	const Elf32_Word *desc;
1092 	uintptr_t p;
1093 
1094 	p = (uintptr_t)(note + 1);
1095 	p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
1096 
1097 	desc = (const Elf32_Word *)p;
1098 	if (desc[0] != GNULINUX_ABI_DESC)
1099 		return (FALSE);
1100 
1101 	/*
1102 	 * For linux we encode osrel as follows (see linux_mib.c):
1103 	 * VVVMMMIII (version, major, minor), see linux_mib.c.
1104 	 */
1105 	*osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
1106 
1107 	return (TRUE);
1108 }
1109 
1110 static Elf_Brandnote linux32_brandnote = {
1111 	.hdr.n_namesz	= sizeof(GNU_ABI_VENDOR),
1112 	.hdr.n_descsz	= 16,	/* XXX at least 16 */
1113 	.hdr.n_type	= 1,
1114 	.vendor		= GNU_ABI_VENDOR,
1115 	.flags		= BN_TRANSLATE_OSREL,
1116 	.trans_osrel	= linux32_trans_osrel
1117 };
1118 
1119 static Elf32_Brandinfo linux_brand = {
1120 	.brand		= ELFOSABI_LINUX,
1121 	.machine	= EM_386,
1122 	.compat_3_brand	= "Linux",
1123 	.emul_path	= "/compat/linux",
1124 	.interp_path	= "/lib/ld-linux.so.1",
1125 	.sysvec		= &elf_linux_sysvec,
1126 	.interp_newpath	= NULL,
1127 	.brand_note	= &linux32_brandnote,
1128 	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1129 };
1130 
1131 static Elf32_Brandinfo linux_glibc2brand = {
1132 	.brand		= ELFOSABI_LINUX,
1133 	.machine	= EM_386,
1134 	.compat_3_brand	= "Linux",
1135 	.emul_path	= "/compat/linux",
1136 	.interp_path	= "/lib/ld-linux.so.2",
1137 	.sysvec		= &elf_linux_sysvec,
1138 	.interp_newpath	= NULL,
1139 	.brand_note	= &linux32_brandnote,
1140 	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1141 };
1142 
1143 Elf32_Brandinfo *linux_brandlist[] = {
1144 	&linux_brand,
1145 	&linux_glibc2brand,
1146 	NULL
1147 };
1148 
1149 static int
1150 linux_elf_modevent(module_t mod, int type, void *data)
1151 {
1152 	Elf32_Brandinfo **brandinfo;
1153 	int error;
1154 	struct linux_ioctl_handler **lihp;
1155 	struct linux_device_handler **ldhp;
1156 
1157 	error = 0;
1158 
1159 	switch(type) {
1160 	case MOD_LOAD:
1161 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1162 		     ++brandinfo)
1163 			if (elf32_insert_brand_entry(*brandinfo) < 0)
1164 				error = EINVAL;
1165 		if (error == 0) {
1166 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1167 				linux_ioctl_register_handler(*lihp);
1168 			SET_FOREACH(ldhp, linux_device_handler_set)
1169 				linux_device_register_handler(*ldhp);
1170 			LIST_INIT(&futex_list);
1171 			mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
1172 			linux_exit_tag = EVENTHANDLER_REGISTER(process_exit,
1173 			    linux_proc_exit, NULL, 1000);
1174 			linux_exec_tag = EVENTHANDLER_REGISTER(process_exec,
1175 			    linux_proc_exec, NULL, 1000);
1176 			linux_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor,
1177 			    linux_thread_dtor, NULL, EVENTHANDLER_PRI_ANY);
1178 			linux_osd_jail_register();
1179 			stclohz = (stathz ? stathz : hz);
1180 			if (bootverbose)
1181 				printf("Linux ELF exec handler installed\n");
1182 		} else
1183 			printf("cannot insert Linux ELF brand handler\n");
1184 		break;
1185 	case MOD_UNLOAD:
1186 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1187 		     ++brandinfo)
1188 			if (elf32_brand_inuse(*brandinfo))
1189 				error = EBUSY;
1190 		if (error == 0) {
1191 			for (brandinfo = &linux_brandlist[0];
1192 			     *brandinfo != NULL; ++brandinfo)
1193 				if (elf32_remove_brand_entry(*brandinfo) < 0)
1194 					error = EINVAL;
1195 		}
1196 		if (error == 0) {
1197 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1198 				linux_ioctl_unregister_handler(*lihp);
1199 			SET_FOREACH(ldhp, linux_device_handler_set)
1200 				linux_device_unregister_handler(*ldhp);
1201 			mtx_destroy(&futex_mtx);
1202 			EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
1203 			EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
1204 			EVENTHANDLER_DEREGISTER(thread_dtor, linux_thread_dtor_tag);
1205 			linux_osd_jail_deregister();
1206 			if (bootverbose)
1207 				printf("Linux ELF exec handler removed\n");
1208 		} else
1209 			printf("Could not deinstall ELF interpreter entry\n");
1210 		break;
1211 	default:
1212 		return (EOPNOTSUPP);
1213 	}
1214 	return (error);
1215 }
1216 
1217 static moduledata_t linux_elf_mod = {
1218 	"linuxelf",
1219 	linux_elf_modevent,
1220 	0
1221 };
1222 
1223 DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1224