xref: /freebsd/sys/amd64/linux32/linux32_sysvec.c (revision 0020bdf13a648162724210025bad2380c778961e)
1 /*-
2  * Copyright (c) 2004 Tim J. Robbins
3  * Copyright (c) 2003 Peter Wemm
4  * Copyright (c) 2002 Doug Rabson
5  * Copyright (c) 1998-1999 Andrew Gallatin
6  * Copyright (c) 1994-1996 Søren Schmidt
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer
14  *    in this position and unchanged.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. The name of the author may not be used to endorse or promote products
19  *    derived from this software without specific prior written permission
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 #include "opt_compat.h"
36 
37 #ifndef COMPAT_FREEBSD32
38 #error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!"
39 #endif
40 
41 #define	__ELF_WORD_SIZE	32
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/exec.h>
46 #include <sys/fcntl.h>
47 #include <sys/imgact.h>
48 #include <sys/imgact_elf.h>
49 #include <sys/kernel.h>
50 #include <sys/lock.h>
51 #include <sys/malloc.h>
52 #include <sys/module.h>
53 #include <sys/mutex.h>
54 #include <sys/proc.h>
55 #include <sys/resourcevar.h>
56 #include <sys/signalvar.h>
57 #include <sys/sysctl.h>
58 #include <sys/syscallsubr.h>
59 #include <sys/sysent.h>
60 #include <sys/sysproto.h>
61 #include <sys/vnode.h>
62 #include <sys/eventhandler.h>
63 
64 #include <vm/vm.h>
65 #include <vm/pmap.h>
66 #include <vm/vm_extern.h>
67 #include <vm/vm_map.h>
68 #include <vm/vm_object.h>
69 #include <vm/vm_page.h>
70 #include <vm/vm_param.h>
71 
72 #include <machine/cpu.h>
73 #include <machine/md_var.h>
74 #include <machine/pcb.h>
75 #include <machine/specialreg.h>
76 
77 #include <amd64/linux32/linux.h>
78 #include <amd64/linux32/linux32_proto.h>
79 #include <compat/linux/linux_emul.h>
80 #include <compat/linux/linux_futex.h>
81 #include <compat/linux/linux_ioctl.h>
82 #include <compat/linux/linux_mib.h>
83 #include <compat/linux/linux_misc.h>
84 #include <compat/linux/linux_signal.h>
85 #include <compat/linux/linux_util.h>
86 #include <compat/linux/linux_vdso.h>
87 
88 MODULE_VERSION(linux, 1);
89 
90 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
91 
/*
 * Store one (id, val) pair of 32-bit words at the user address pos with
 * suword32() and advance pos past both words.
 *
 * pos must be a modifiable pointer lvalue; it is incremented twice.
 * id and val are each evaluated exactly once.  The suword32() results
 * are discarded, so a faulting store is not reported to the caller.
 */
#define	AUXARGS_ENTRY_32(pos, id, val)		\
	do {					\
		suword32((pos)++, (id));	\
		suword32((pos)++, (val));	\
	} while (0)
97 
/*
 * The two characters "#!" as they appear when the first two bytes of an
 * image header are loaded as a single 16-bit value; compared against the
 * header in exec_linux_imgact_try().
 */
#if BYTE_ORDER == LITTLE_ENDIAN
#define SHELLMAGIC      0x2123 /* #! */
#else
#define SHELLMAGIC      0x2321 /* same two bytes, '#' in the high byte */
#endif
103 
/*
 * The sendsig functions are not syscalls themselves, so they have no
 * LINUX_SYS_* number of their own.  Map both of them to syscall 0 so
 * that they can still use the ldebug() facility.  This is slightly
 * less bogus than using ldebug(sigreturn).
 */
#define	LINUX_SYS_linux_rt_sendsig	0
#define	LINUX_SYS_linux_sendsig		0
112 
113 const char *linux_kplatform;
114 static int linux_szsigcode;
115 static vm_object_t linux_shared_page_obj;
116 static char *linux_shared_page_mapping;
117 extern char _binary_linux32_locore_o_start;
118 extern char _binary_linux32_locore_o_end;
119 
120 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
121 
122 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
123 SET_DECLARE(linux_device_handler_set, struct linux_device_handler);
124 
125 static int	elf_linux_fixup(register_t **stack_base,
126 		    struct image_params *iparams);
127 static register_t *linux_copyout_strings(struct image_params *imgp);
128 static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
129 static void	exec_linux_setregs(struct thread *td,
130 				   struct image_params *imgp, u_long stack);
131 static void	linux32_fixlimit(struct rlimit *rl, int which);
132 static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel);
133 static void	linux_vdso_install(void *param);
134 static void	linux_vdso_deinstall(void *param);
135 
136 static eventhandler_tag linux_exit_tag;
137 static eventhandler_tag linux_exec_tag;
138 static eventhandler_tag linux_thread_dtor_tag;
139 
/*
 * Linux syscalls return negative errno values, while FreeBSD uses
 * positive ones.  This table has one slot per FreeBSD errno (0 .. ELAST)
 * holding the already negated Linux errno; it is installed as sv_errtbl
 * in elf_linux_sysvec.  Where the same value appears more than once
 * (e.g. -6, -11) several FreeBSD errors share one Linux errno.
 *
 * Reference:
 *   FreeBSD: src/sys/sys/errno.h
 *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
 *            linux-2.6.17.8/include/asm-generic/errno.h
 */
static int bsd_to_linux_errno[ELAST + 1] = {
	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,	/*  0 -  9 */
	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,	/* 10 - 19 */
	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,	/* 20 - 29 */
	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,	/* 30 - 39 */
	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,	/* 40 - 49 */
	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,	/* 50 - 59 */
	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,	/* 60 - 69 */
	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,	/* 70 - 79 */
	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,	/* 80 - 89 */
	 -72, -67, -71						/* 90 - 92 */
};
159 
/*
 * FreeBSD -> Linux signal number translation.  Installed as sv_sigtbl in
 * elf_linux_sysvec; the sendsig routines in this file index it with
 * _SIG_IDX(sig) for signals up to sv_sigsize.  A 0 entry means the table
 * provides no Linux signal for that slot.
 * NOTE(review): the two 0 slots presumably correspond to SIGEMT and
 * SIGINFO - confirm against sys/signal.h.
 */
int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
	0, LINUX_SIGUSR1, LINUX_SIGUSR2
};
170 
/*
 * Linux -> FreeBSD signal number translation; the counterpart of
 * bsd_to_linux_signal.  It is not referenced in this part of the file.
 * NOTE(review): presumably indexed by (Linux signal number - 1), like
 * its sibling - confirm against the users in compat/linux.
 */
int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
	SIGHUP, SIGINT, SIGQUIT, SIGILL,
	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
	SIGIO, SIGURG, SIGSYS
};
181 
/* Linux trap number reported when FreeBSD's code has no counterpart. */
#define LINUX_T_UNKNOWN  255

/*
 * FreeBSD trap code -> Linux trap number, indexed by the FreeBSD T_*
 * value named in each row's comment.  Rows without a T_* name have no
 * Linux equivalent.
 */
static const int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/* 0 */
	6,			/* 1  T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/* 2 */
	3,			/* 3  T_BPTFLT */
	LINUX_T_UNKNOWN,	/* 4 */
	LINUX_T_UNKNOWN,	/* 5 */
	16,			/* 6  T_ARITHTRAP */
	254,			/* 7  T_ASTFLT */
	LINUX_T_UNKNOWN,	/* 8 */
	13,			/* 9  T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};

/*
 * Translate a FreeBSD trap code into the Linux trap number.
 *
 * Any code outside the table, including negative values, yields
 * LINUX_T_UNKNOWN.  This is a function rather than a macro so that the
 * argument is evaluated once and the range check does not depend on an
 * implicit signed-to-unsigned conversion.
 */
static inline int
bsd_to_linux_trapcode(int code)
{

	if (code < 0 || (size_t)code >= sizeof(_bsd_to_linux_trapcode) /
	    sizeof(_bsd_to_linux_trapcode[0]))
		return (LINUX_T_UNKNOWN);
	return (_bsd_to_linux_trapcode[code]);
}
220 
/*
 * 32-bit layout of the "ps_strings" block that linux_copyout_strings()
 * fills in at LINUX32_PS_STRINGS.  Every member is one 32-bit word.
 */
struct linux32_ps_strings {
	uint32_t	ps_argvstr;	/* user address of the argument vector */
	uint32_t	ps_nargvstr;	/* number of argument strings */
	uint32_t	ps_envstr;	/* user address of the environment vector */
	uint32_t	ps_nenvstr;	/* number of environment strings */
};
227 
228 LINUX_VDSO_SYM_INTPTR(linux32_sigcode);
229 LINUX_VDSO_SYM_INTPTR(linux32_rt_sigcode);
230 LINUX_VDSO_SYM_INTPTR(linux32_vsyscall);
231 LINUX_VDSO_SYM_CHAR(linux_platform);
232 
233 /*
234  * If FreeBSD & Linux have a difference of opinion about what a trap
235  * means, deal with it here.
236  *
237  * MPSAFE
238  */
239 static int
240 translate_traps(int signal, int trap_code)
241 {
242 	if (signal != SIGBUS)
243 		return signal;
244 	switch (trap_code) {
245 	case T_PROTFLT:
246 	case T_TSSFLT:
247 	case T_DOUBLEFLT:
248 	case T_PAGEFLT:
249 		return SIGSEGV;
250 	default:
251 		return signal;
252 	}
253 }
254 
255 static int
256 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
257 {
258 	Elf32_Auxargs *args;
259 	Elf32_Addr *base;
260 	Elf32_Addr *pos;
261 	struct linux32_ps_strings *arginfo;
262 
263 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
264 
265 	KASSERT(curthread->td_proc == imgp->proc,
266 	    ("unsafe elf_linux_fixup(), should be curproc"));
267 	base = (Elf32_Addr *)*stack_base;
268 	args = (Elf32_Auxargs *)imgp->auxargs;
269 	pos = base + (imgp->args->argc + imgp->args->envc + 2);
270 
271 	AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO_EHDR,
272 	    imgp->proc->p_sysent->sv_shared_page_base);
273 	AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO, linux32_vsyscall);
274 	AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature);
275 
276 	/*
277 	 * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0,
278 	 * as it has appeared in the 2.4.0-rc7 first time.
279 	 * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK),
280 	 * glibc falls back to the hard-coded CLK_TCK value when aux entry
281 	 * is not present.
282 	 * Also see linux_times() implementation.
283 	 */
284 	if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
285 		AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz);
286 	AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
287 	AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
288 	AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
289 	AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
290 	AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
291 	AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
292 	AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
293 	AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, 0);
294 	AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
295 	AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
296 	AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
297 	AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
298 	AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform));
299 	if (args->execfd != -1)
300 		AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
301 	AUXARGS_ENTRY_32(pos, AT_NULL, 0);
302 
303 	free(imgp->auxargs, M_TEMP);
304 	imgp->auxargs = NULL;
305 
306 	base--;
307 	suword32(base, (uint32_t)imgp->args->argc);
308 	*stack_base = (register_t *)base;
309 	return (0);
310 }
311 
312 static void
313 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
314 {
315 	struct thread *td = curthread;
316 	struct proc *p = td->td_proc;
317 	struct sigacts *psp;
318 	struct trapframe *regs;
319 	struct l_rt_sigframe *fp, frame;
320 	int oonstack;
321 	int sig;
322 	int code;
323 
324 	sig = ksi->ksi_signo;
325 	code = ksi->ksi_code;
326 	PROC_LOCK_ASSERT(p, MA_OWNED);
327 	psp = p->p_sigacts;
328 	mtx_assert(&psp->ps_mtx, MA_OWNED);
329 	regs = td->td_frame;
330 	oonstack = sigonstack(regs->tf_rsp);
331 
332 #ifdef DEBUG
333 	if (ldebug(rt_sendsig))
334 		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
335 		    catcher, sig, (void*)mask, code);
336 #endif
337 	/*
338 	 * Allocate space for the signal handler context.
339 	 */
340 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
341 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
342 		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
343 		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
344 	} else
345 		fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
346 	mtx_unlock(&psp->ps_mtx);
347 
348 	/*
349 	 * Build the argument list for the signal handler.
350 	 */
351 	if (p->p_sysent->sv_sigtbl)
352 		if (sig <= p->p_sysent->sv_sigsize)
353 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
354 
355 	bzero(&frame, sizeof(frame));
356 
357 	frame.sf_handler = PTROUT(catcher);
358 	frame.sf_sig = sig;
359 	frame.sf_siginfo = PTROUT(&fp->sf_si);
360 	frame.sf_ucontext = PTROUT(&fp->sf_sc);
361 
362 	/* Fill in POSIX parts */
363 	ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);
364 
365 	/*
366 	 * Build the signal context to be used by sigreturn
367 	 * and libgcc unwind.
368 	 */
369 	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
370 	frame.sf_sc.uc_link = 0;		/* XXX ??? */
371 
372 	frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
373 	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
374 	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
375 	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
376 	PROC_UNLOCK(p);
377 
378 	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
379 
380 	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
381 	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_rdi;
382 	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_rsi;
383 	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_rbp;
384 	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_rbx;
385 	frame.sf_sc.uc_mcontext.sc_esp    = regs->tf_rsp;
386 	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_rdx;
387 	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_rcx;
388 	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_rax;
389 	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_rip;
390 	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
391 	frame.sf_sc.uc_mcontext.sc_gs     = regs->tf_gs;
392 	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
393 	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
394 	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
395 	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
396 	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
397 	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
398 	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
399 	frame.sf_sc.uc_mcontext.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
400 	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
401 
402 #ifdef DEBUG
403 	if (ldebug(rt_sendsig))
404 		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
405 		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
406 		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
407 #endif
408 
409 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
410 		/*
411 		 * Process has trashed its stack; give it an illegal
412 		 * instruction to halt it in its tracks.
413 		 */
414 #ifdef DEBUG
415 		if (ldebug(rt_sendsig))
416 			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
417 			    fp, oonstack);
418 #endif
419 		PROC_LOCK(p);
420 		sigexit(td, SIGILL);
421 	}
422 
423 	/*
424 	 * Build context to run handler in.
425 	 */
426 	regs->tf_rsp = PTROUT(fp);
427 	regs->tf_rip = linux32_rt_sigcode;
428 	regs->tf_rflags &= ~(PSL_T | PSL_D);
429 	regs->tf_cs = _ucode32sel;
430 	regs->tf_ss = _udatasel;
431 	regs->tf_ds = _udatasel;
432 	regs->tf_es = _udatasel;
433 	regs->tf_fs = _ufssel;
434 	regs->tf_gs = _ugssel;
435 	regs->tf_flags = TF_HASSEGS;
436 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
437 	PROC_LOCK(p);
438 	mtx_lock(&psp->ps_mtx);
439 }
440 
441 
442 /*
443  * Send an interrupt to process.
444  *
445  * Stack is set up to allow sigcode stored
446  * in u. to call routine, followed by kcall
447  * to sigreturn routine below.  After sigreturn
448  * resets the signal mask, the stack, and the
449  * frame pointer, it returns to the user
450  * specified pc, psl.
451  */
452 static void
453 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
454 {
455 	struct thread *td = curthread;
456 	struct proc *p = td->td_proc;
457 	struct sigacts *psp;
458 	struct trapframe *regs;
459 	struct l_sigframe *fp, frame;
460 	l_sigset_t lmask;
461 	int oonstack, i;
462 	int sig, code;
463 
464 	sig = ksi->ksi_signo;
465 	code = ksi->ksi_code;
466 	PROC_LOCK_ASSERT(p, MA_OWNED);
467 	psp = p->p_sigacts;
468 	mtx_assert(&psp->ps_mtx, MA_OWNED);
469 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
470 		/* Signal handler installed with SA_SIGINFO. */
471 		linux_rt_sendsig(catcher, ksi, mask);
472 		return;
473 	}
474 
475 	regs = td->td_frame;
476 	oonstack = sigonstack(regs->tf_rsp);
477 
478 #ifdef DEBUG
479 	if (ldebug(sendsig))
480 		printf(ARGS(sendsig, "%p, %d, %p, %u"),
481 		    catcher, sig, (void*)mask, code);
482 #endif
483 
484 	/*
485 	 * Allocate space for the signal handler context.
486 	 */
487 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
488 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
489 		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
490 		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
491 	} else
492 		fp = (struct l_sigframe *)regs->tf_rsp - 1;
493 	mtx_unlock(&psp->ps_mtx);
494 	PROC_UNLOCK(p);
495 
496 	/*
497 	 * Build the argument list for the signal handler.
498 	 */
499 	if (p->p_sysent->sv_sigtbl)
500 		if (sig <= p->p_sysent->sv_sigsize)
501 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
502 
503 	bzero(&frame, sizeof(frame));
504 
505 	frame.sf_handler = PTROUT(catcher);
506 	frame.sf_sig = sig;
507 
508 	bsd_to_linux_sigset(mask, &lmask);
509 
510 	/*
511 	 * Build the signal context to be used by sigreturn.
512 	 */
513 	frame.sf_sc.sc_mask   = lmask.__bits[0];
514 	frame.sf_sc.sc_gs     = regs->tf_gs;
515 	frame.sf_sc.sc_fs     = regs->tf_fs;
516 	frame.sf_sc.sc_es     = regs->tf_es;
517 	frame.sf_sc.sc_ds     = regs->tf_ds;
518 	frame.sf_sc.sc_edi    = regs->tf_rdi;
519 	frame.sf_sc.sc_esi    = regs->tf_rsi;
520 	frame.sf_sc.sc_ebp    = regs->tf_rbp;
521 	frame.sf_sc.sc_ebx    = regs->tf_rbx;
522 	frame.sf_sc.sc_esp    = regs->tf_rsp;
523 	frame.sf_sc.sc_edx    = regs->tf_rdx;
524 	frame.sf_sc.sc_ecx    = regs->tf_rcx;
525 	frame.sf_sc.sc_eax    = regs->tf_rax;
526 	frame.sf_sc.sc_eip    = regs->tf_rip;
527 	frame.sf_sc.sc_cs     = regs->tf_cs;
528 	frame.sf_sc.sc_eflags = regs->tf_rflags;
529 	frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
530 	frame.sf_sc.sc_ss     = regs->tf_ss;
531 	frame.sf_sc.sc_err    = regs->tf_err;
532 	frame.sf_sc.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
533 	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
534 
535 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
536 		frame.sf_extramask[i] = lmask.__bits[i+1];
537 
538 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
539 		/*
540 		 * Process has trashed its stack; give it an illegal
541 		 * instruction to halt it in its tracks.
542 		 */
543 		PROC_LOCK(p);
544 		sigexit(td, SIGILL);
545 	}
546 
547 	/*
548 	 * Build context to run handler in.
549 	 */
550 	regs->tf_rsp = PTROUT(fp);
551 	regs->tf_rip = linux32_sigcode;
552 	regs->tf_rflags &= ~(PSL_T | PSL_D);
553 	regs->tf_cs = _ucode32sel;
554 	regs->tf_ss = _udatasel;
555 	regs->tf_ds = _udatasel;
556 	regs->tf_es = _udatasel;
557 	regs->tf_fs = _ufssel;
558 	regs->tf_gs = _ugssel;
559 	regs->tf_flags = TF_HASSEGS;
560 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
561 	PROC_LOCK(p);
562 	mtx_lock(&psp->ps_mtx);
563 }
564 
565 /*
566  * System call to cleanup state after a signal
567  * has been taken.  Reset signal mask and
568  * stack state from context left by sendsig (above).
569  * Return to previous pc and psl as specified by
570  * context left by sendsig. Check carefully to
571  * make sure that the user has not modified the
572  * psl to gain improper privileges or to cause
573  * a machine fault.
574  */
575 int
576 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
577 {
578 	struct l_sigframe frame;
579 	struct trapframe *regs;
580 	sigset_t bmask;
581 	l_sigset_t lmask;
582 	int eflags, i;
583 	ksiginfo_t ksi;
584 
585 	regs = td->td_frame;
586 
587 #ifdef DEBUG
588 	if (ldebug(sigreturn))
589 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
590 #endif
591 	/*
592 	 * The trampoline code hands us the sigframe.
593 	 * It is unsafe to keep track of it ourselves, in the event that a
594 	 * program jumps out of a signal handler.
595 	 */
596 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
597 		return (EFAULT);
598 
599 	/*
600 	 * Check for security violations.
601 	 */
602 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
603 	eflags = frame.sf_sc.sc_eflags;
604 	if (!EFLAGS_SECURE(eflags, regs->tf_rflags))
605 		return(EINVAL);
606 
607 	/*
608 	 * Don't allow users to load a valid privileged %cs.  Let the
609 	 * hardware check for invalid selectors, excess privilege in
610 	 * other selectors, invalid %eip's and invalid %esp's.
611 	 */
612 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
613 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
614 		ksiginfo_init_trap(&ksi);
615 		ksi.ksi_signo = SIGBUS;
616 		ksi.ksi_code = BUS_OBJERR;
617 		ksi.ksi_trapno = T_PROTFLT;
618 		ksi.ksi_addr = (void *)regs->tf_rip;
619 		trapsignal(td, &ksi);
620 		return(EINVAL);
621 	}
622 
623 	lmask.__bits[0] = frame.sf_sc.sc_mask;
624 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
625 		lmask.__bits[i+1] = frame.sf_extramask[i];
626 	linux_to_bsd_sigset(&lmask, &bmask);
627 	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
628 
629 	/*
630 	 * Restore signal context.
631 	 */
632 	regs->tf_rdi    = frame.sf_sc.sc_edi;
633 	regs->tf_rsi    = frame.sf_sc.sc_esi;
634 	regs->tf_rbp    = frame.sf_sc.sc_ebp;
635 	regs->tf_rbx    = frame.sf_sc.sc_ebx;
636 	regs->tf_rdx    = frame.sf_sc.sc_edx;
637 	regs->tf_rcx    = frame.sf_sc.sc_ecx;
638 	regs->tf_rax    = frame.sf_sc.sc_eax;
639 	regs->tf_rip    = frame.sf_sc.sc_eip;
640 	regs->tf_cs     = frame.sf_sc.sc_cs;
641 	regs->tf_ds     = frame.sf_sc.sc_ds;
642 	regs->tf_es     = frame.sf_sc.sc_es;
643 	regs->tf_fs     = frame.sf_sc.sc_fs;
644 	regs->tf_gs     = frame.sf_sc.sc_gs;
645 	regs->tf_rflags = eflags;
646 	regs->tf_rsp    = frame.sf_sc.sc_esp_at_signal;
647 	regs->tf_ss     = frame.sf_sc.sc_ss;
648 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
649 
650 	return (EJUSTRETURN);
651 }
652 
653 /*
654  * System call to cleanup state after a signal
655  * has been taken.  Reset signal mask and
656  * stack state from context left by rt_sendsig (above).
657  * Return to previous pc and psl as specified by
658  * context left by sendsig. Check carefully to
659  * make sure that the user has not modified the
660  * psl to gain improper privileges or to cause
661  * a machine fault.
662  */
663 int
664 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
665 {
666 	struct l_ucontext uc;
667 	struct l_sigcontext *context;
668 	sigset_t bmask;
669 	l_stack_t *lss;
670 	stack_t ss;
671 	struct trapframe *regs;
672 	int eflags;
673 	ksiginfo_t ksi;
674 
675 	regs = td->td_frame;
676 
677 #ifdef DEBUG
678 	if (ldebug(rt_sigreturn))
679 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
680 #endif
681 	/*
682 	 * The trampoline code hands us the ucontext.
683 	 * It is unsafe to keep track of it ourselves, in the event that a
684 	 * program jumps out of a signal handler.
685 	 */
686 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
687 		return (EFAULT);
688 
689 	context = &uc.uc_mcontext;
690 
691 	/*
692 	 * Check for security violations.
693 	 */
694 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
695 	eflags = context->sc_eflags;
696 	if (!EFLAGS_SECURE(eflags, regs->tf_rflags))
697 		return(EINVAL);
698 
699 	/*
700 	 * Don't allow users to load a valid privileged %cs.  Let the
701 	 * hardware check for invalid selectors, excess privilege in
702 	 * other selectors, invalid %eip's and invalid %esp's.
703 	 */
704 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
705 	if (!CS_SECURE(context->sc_cs)) {
706 		ksiginfo_init_trap(&ksi);
707 		ksi.ksi_signo = SIGBUS;
708 		ksi.ksi_code = BUS_OBJERR;
709 		ksi.ksi_trapno = T_PROTFLT;
710 		ksi.ksi_addr = (void *)regs->tf_rip;
711 		trapsignal(td, &ksi);
712 		return(EINVAL);
713 	}
714 
715 	linux_to_bsd_sigset(&uc.uc_sigmask, &bmask);
716 	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
717 
718 	/*
719 	 * Restore signal context
720 	 */
721 	regs->tf_gs	= context->sc_gs;
722 	regs->tf_fs	= context->sc_fs;
723 	regs->tf_es	= context->sc_es;
724 	regs->tf_ds	= context->sc_ds;
725 	regs->tf_rdi    = context->sc_edi;
726 	regs->tf_rsi    = context->sc_esi;
727 	regs->tf_rbp    = context->sc_ebp;
728 	regs->tf_rbx    = context->sc_ebx;
729 	regs->tf_rdx    = context->sc_edx;
730 	regs->tf_rcx    = context->sc_ecx;
731 	regs->tf_rax    = context->sc_eax;
732 	regs->tf_rip    = context->sc_eip;
733 	regs->tf_cs     = context->sc_cs;
734 	regs->tf_rflags = eflags;
735 	regs->tf_rsp    = context->sc_esp_at_signal;
736 	regs->tf_ss     = context->sc_ss;
737 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
738 
739 	/*
740 	 * call sigaltstack & ignore results..
741 	 */
742 	lss = &uc.uc_stack;
743 	ss.ss_sp = PTRIN(lss->ss_sp);
744 	ss.ss_size = lss->ss_size;
745 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
746 
747 #ifdef DEBUG
748 	if (ldebug(rt_sigreturn))
749 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
750 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
751 #endif
752 	(void)kern_sigaltstack(td, &ss, NULL);
753 
754 	return (EJUSTRETURN);
755 }
756 
757 static int
758 linux32_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
759 {
760 	struct proc *p;
761 	struct trapframe *frame;
762 
763 	p = td->td_proc;
764 	frame = td->td_frame;
765 
766 	sa->args[0] = frame->tf_rbx;
767 	sa->args[1] = frame->tf_rcx;
768 	sa->args[2] = frame->tf_rdx;
769 	sa->args[3] = frame->tf_rsi;
770 	sa->args[4] = frame->tf_rdi;
771 	sa->args[5] = frame->tf_rbp;	/* Unconfirmed */
772 	sa->code = frame->tf_rax;
773 
774 	if (sa->code >= p->p_sysent->sv_size)
775 		sa->callp = &p->p_sysent->sv_table[0];
776 	else
777 		sa->callp = &p->p_sysent->sv_table[sa->code];
778 	sa->narg = sa->callp->sy_narg;
779 
780 	td->td_retval[0] = 0;
781 	td->td_retval[1] = frame->tf_rdx;
782 
783 	return (0);
784 }
785 
786 /*
787  * If a linux binary is exec'ing something, try this image activator
788  * first.  We override standard shell script execution in order to
789  * be able to modify the interpreter path.  We only do this if a linux
790  * binary is doing the exec, so we do not create an EXEC module for it.
791  */
792 static int	exec_linux_imgact_try(struct image_params *iparams);
793 
794 static int
795 exec_linux_imgact_try(struct image_params *imgp)
796 {
797 	const char *head = (const char *)imgp->image_header;
798 	char *rpath;
799 	int error = -1;
800 
801 	/*
802 	* The interpreter for shell scripts run from a linux binary needs
803 	* to be located in /compat/linux if possible in order to recursively
804 	* maintain linux path emulation.
805 	*/
806 	if (((const short *)head)[0] == SHELLMAGIC) {
807 		/*
808 		* Run our normal shell image activator.  If it succeeds attempt
809 		* to use the alternate path for the interpreter.  If an
810 		* alternate * path is found, use our stringspace to store it.
811 		*/
812 		if ((error = exec_shell_imgact(imgp)) == 0) {
813 			linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
814 			    imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0,
815 			    AT_FDCWD);
816 			if (rpath != NULL)
817 				imgp->args->fname_buf =
818 				    imgp->interpreter_name = rpath;
819 		}
820 	}
821 	return (error);
822 }
823 
824 /*
825  * Clear registers on exec
826  * XXX copied from ia32_signal.c.
827  */
828 static void
829 exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack)
830 {
831 	struct trapframe *regs = td->td_frame;
832 	struct pcb *pcb = td->td_pcb;
833 
834 	mtx_lock(&dt_lock);
835 	if (td->td_proc->p_md.md_ldt != NULL)
836 		user_ldt_free(td);
837 	else
838 		mtx_unlock(&dt_lock);
839 
840 	critical_enter();
841 	wrmsr(MSR_FSBASE, 0);
842 	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
843 	pcb->pcb_fsbase = 0;
844 	pcb->pcb_gsbase = 0;
845 	critical_exit();
846 	pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
847 
848 	bzero((char *)regs, sizeof(struct trapframe));
849 	regs->tf_rip = imgp->entry_addr;
850 	regs->tf_rsp = stack;
851 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
852 	regs->tf_gs = _ugssel;
853 	regs->tf_fs = _ufssel;
854 	regs->tf_es = _udatasel;
855 	regs->tf_ds = _udatasel;
856 	regs->tf_ss = _udatasel;
857 	regs->tf_flags = TF_HASSEGS;
858 	regs->tf_cs = _ucode32sel;
859 	regs->tf_rbx = imgp->ps_strings;
860 
861 	fpstate_drop(td);
862 
863 	/* Do full restore on return so that we can change to a different %cs */
864 	set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET);
865 	td->td_retval[1] = 0;
866 }
867 
868 /*
869  * XXX copied from ia32_sysvec.c.
870  */
871 static register_t *
872 linux_copyout_strings(struct image_params *imgp)
873 {
874 	int argc, envc;
875 	u_int32_t *vectp;
876 	char *stringp, *destp;
877 	u_int32_t *stack_base;
878 	struct linux32_ps_strings *arginfo;
879 
880 	/*
881 	 * Calculate string base and vector table pointers.
882 	 * Also deal with signal trampoline code for this exec type.
883 	 */
884 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
885 	destp =	(caddr_t)arginfo - SPARE_USRSPACE -
886 	    roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
887 
888 	/*
889 	 * If we have a valid auxargs ptr, prepare some room
890 	 * on the stack.
891 	 */
892 	if (imgp->auxargs) {
893 		/*
894 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
895 		 * lower compatibility.
896 		 */
897 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
898 		    (LINUX_AT_COUNT * 2);
899 		/*
900 		 * The '+ 2' is for the null pointers at the end of each of
901 		 * the arg and env vector sets,and imgp->auxarg_size is room
902 		 * for argument of Runtime loader.
903 		 */
904 		vectp = (u_int32_t *) (destp - (imgp->args->argc +
905 		    imgp->args->envc + 2 + imgp->auxarg_size) *
906 		    sizeof(u_int32_t));
907 
908 	} else
909 		/*
910 		 * The '+ 2' is for the null pointers at the end of each of
911 		 * the arg and env vector sets
912 		 */
913 		vectp = (u_int32_t *)(destp - (imgp->args->argc +
914 		    imgp->args->envc + 2) * sizeof(u_int32_t));
915 
916 	/*
917 	 * vectp also becomes our initial stack base
918 	 */
919 	stack_base = vectp;
920 
921 	stringp = imgp->args->begin_argv;
922 	argc = imgp->args->argc;
923 	envc = imgp->args->envc;
924 	/*
925 	 * Copy out strings - arguments and environment.
926 	 */
927 	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
928 
929 	/*
930 	 * Fill in "ps_strings" struct for ps, w, etc.
931 	 */
932 	suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp);
933 	suword32(&arginfo->ps_nargvstr, argc);
934 
935 	/*
936 	 * Fill in argument portion of vector table.
937 	 */
938 	for (; argc > 0; --argc) {
939 		suword32(vectp++, (uint32_t)(intptr_t)destp);
940 		while (*stringp++ != 0)
941 			destp++;
942 		destp++;
943 	}
944 
945 	/* a null vector table pointer separates the argp's from the envp's */
946 	suword32(vectp++, 0);
947 
948 	suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp);
949 	suword32(&arginfo->ps_nenvstr, envc);
950 
951 	/*
952 	 * Fill in environment portion of vector table.
953 	 */
954 	for (; envc > 0; --envc) {
955 		suword32(vectp++, (uint32_t)(intptr_t)destp);
956 		while (*stringp++ != 0)
957 			destp++;
958 		destp++;
959 	}
960 
961 	/* end of vector table is a null pointer */
962 	suword32(vectp, 0);
963 
964 	return ((register_t *)stack_base);
965 }
966 
967 static SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
968     "32-bit Linux emulation");
969 
970 static u_long	linux32_maxdsiz = LINUX32_MAXDSIZ;
971 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
972     &linux32_maxdsiz, 0, "");
973 static u_long	linux32_maxssiz = LINUX32_MAXSSIZ;
974 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
975     &linux32_maxssiz, 0, "");
976 static u_long	linux32_maxvmem = LINUX32_MAXVMEM;
977 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
978     &linux32_maxvmem, 0, "");
979 
980 static void
981 linux32_fixlimit(struct rlimit *rl, int which)
982 {
983 
984 	switch (which) {
985 	case RLIMIT_DATA:
986 		if (linux32_maxdsiz != 0) {
987 			if (rl->rlim_cur > linux32_maxdsiz)
988 				rl->rlim_cur = linux32_maxdsiz;
989 			if (rl->rlim_max > linux32_maxdsiz)
990 				rl->rlim_max = linux32_maxdsiz;
991 		}
992 		break;
993 	case RLIMIT_STACK:
994 		if (linux32_maxssiz != 0) {
995 			if (rl->rlim_cur > linux32_maxssiz)
996 				rl->rlim_cur = linux32_maxssiz;
997 			if (rl->rlim_max > linux32_maxssiz)
998 				rl->rlim_max = linux32_maxssiz;
999 		}
1000 		break;
1001 	case RLIMIT_VMEM:
1002 		if (linux32_maxvmem != 0) {
1003 			if (rl->rlim_cur > linux32_maxvmem)
1004 				rl->rlim_cur = linux32_maxvmem;
1005 			if (rl->rlim_max > linux32_maxvmem)
1006 				rl->rlim_max = linux32_maxvmem;
1007 		}
1008 		break;
1009 	}
1010 }
1011 
1012 struct sysentvec elf_linux_sysvec = {
1013 	.sv_size	= LINUX_SYS_MAXSYSCALL,
1014 	.sv_table	= linux_sysent,
1015 	.sv_mask	= 0,
1016 	.sv_sigsize	= LINUX_SIGTBLSZ,
1017 	.sv_sigtbl	= bsd_to_linux_signal,
1018 	.sv_errsize	= ELAST + 1,
1019 	.sv_errtbl	= bsd_to_linux_errno,
1020 	.sv_transtrap	= translate_traps,
1021 	.sv_fixup	= elf_linux_fixup,
1022 	.sv_sendsig	= linux_sendsig,
1023 	.sv_sigcode	= &_binary_linux32_locore_o_start,
1024 	.sv_szsigcode	= &linux_szsigcode,
1025 	.sv_prepsyscall	= NULL,
1026 	.sv_name	= "Linux ELF32",
1027 	.sv_coredump	= elf32_coredump,
1028 	.sv_imgact_try	= exec_linux_imgact_try,
1029 	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
1030 	.sv_pagesize	= PAGE_SIZE,
1031 	.sv_minuser	= VM_MIN_ADDRESS,
1032 	.sv_maxuser	= LINUX32_MAXUSER,
1033 	.sv_usrstack	= LINUX32_USRSTACK,
1034 	.sv_psstrings	= LINUX32_PS_STRINGS,
1035 	.sv_stackprot	= VM_PROT_ALL,
1036 	.sv_copyout_strings = linux_copyout_strings,
1037 	.sv_setregs	= exec_linux_setregs,
1038 	.sv_fixlimit	= linux32_fixlimit,
1039 	.sv_maxssiz	= &linux32_maxssiz,
1040 	.sv_flags	= SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP,
1041 	.sv_set_syscall_retval = cpu_set_syscall_retval,
1042 	.sv_fetch_syscall_args = linux32_fetch_syscall_args,
1043 	.sv_syscallnames = NULL,
1044 	.sv_shared_page_base = LINUX32_SHAREDPAGE,
1045 	.sv_shared_page_len = PAGE_SIZE,
1046 	.sv_schedtail	= linux_schedtail,
1047 	.sv_thread_detach = linux_thread_detach,
1048 };
1049 
1050 static void
1051 linux_vdso_install(void *param)
1052 {
1053 
1054 	linux_szsigcode = (&_binary_linux32_locore_o_end -
1055 	    &_binary_linux32_locore_o_start);
1056 
1057 	if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len)
1058 		panic("Linux invalid vdso size\n");
1059 
1060 	__elfN(linux_vdso_fixup)(&elf_linux_sysvec);
1061 
1062 	linux_shared_page_obj = __elfN(linux_shared_page_init)
1063 	    (&linux_shared_page_mapping);
1064 
1065 	__elfN(linux_vdso_reloc)(&elf_linux_sysvec, LINUX32_SHAREDPAGE);
1066 
1067 	bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping,
1068 	    linux_szsigcode);
1069 	elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj;
1070 
1071 	linux_kplatform = linux_shared_page_mapping +
1072 	    (linux_platform - (caddr_t)LINUX32_SHAREDPAGE);
1073 }
1074 SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY,
1075     (sysinit_cfunc_t)linux_vdso_install, NULL);
1076 
1077 static void
1078 linux_vdso_deinstall(void *param)
1079 {
1080 
1081 	__elfN(linux_shared_page_fini)(linux_shared_page_obj);
1082 };
1083 SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST,
1084     (sysinit_cfunc_t)linux_vdso_deinstall, NULL);
1085 
1086 static char GNU_ABI_VENDOR[] = "GNU";
1087 static int GNULINUX_ABI_DESC = 0;
1088 
1089 static boolean_t
1090 linux32_trans_osrel(const Elf_Note *note, int32_t *osrel)
1091 {
1092 	const Elf32_Word *desc;
1093 	uintptr_t p;
1094 
1095 	p = (uintptr_t)(note + 1);
1096 	p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
1097 
1098 	desc = (const Elf32_Word *)p;
1099 	if (desc[0] != GNULINUX_ABI_DESC)
1100 		return (FALSE);
1101 
1102 	/*
1103 	 * For linux we encode osrel as follows (see linux_mib.c):
1104 	 * VVVMMMIII (version, major, minor), see linux_mib.c.
1105 	 */
1106 	*osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
1107 
1108 	return (TRUE);
1109 }
1110 
1111 static Elf_Brandnote linux32_brandnote = {
1112 	.hdr.n_namesz	= sizeof(GNU_ABI_VENDOR),
1113 	.hdr.n_descsz	= 16,	/* XXX at least 16 */
1114 	.hdr.n_type	= 1,
1115 	.vendor		= GNU_ABI_VENDOR,
1116 	.flags		= BN_TRANSLATE_OSREL,
1117 	.trans_osrel	= linux32_trans_osrel
1118 };
1119 
1120 static Elf32_Brandinfo linux_brand = {
1121 	.brand		= ELFOSABI_LINUX,
1122 	.machine	= EM_386,
1123 	.compat_3_brand	= "Linux",
1124 	.emul_path	= "/compat/linux",
1125 	.interp_path	= "/lib/ld-linux.so.1",
1126 	.sysvec		= &elf_linux_sysvec,
1127 	.interp_newpath	= NULL,
1128 	.brand_note	= &linux32_brandnote,
1129 	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1130 };
1131 
1132 static Elf32_Brandinfo linux_glibc2brand = {
1133 	.brand		= ELFOSABI_LINUX,
1134 	.machine	= EM_386,
1135 	.compat_3_brand	= "Linux",
1136 	.emul_path	= "/compat/linux",
1137 	.interp_path	= "/lib/ld-linux.so.2",
1138 	.sysvec		= &elf_linux_sysvec,
1139 	.interp_newpath	= NULL,
1140 	.brand_note	= &linux32_brandnote,
1141 	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1142 };
1143 
1144 Elf32_Brandinfo *linux_brandlist[] = {
1145 	&linux_brand,
1146 	&linux_glibc2brand,
1147 	NULL
1148 };
1149 
1150 static int
1151 linux_elf_modevent(module_t mod, int type, void *data)
1152 {
1153 	Elf32_Brandinfo **brandinfo;
1154 	int error;
1155 	struct linux_ioctl_handler **lihp;
1156 	struct linux_device_handler **ldhp;
1157 
1158 	error = 0;
1159 
1160 	switch(type) {
1161 	case MOD_LOAD:
1162 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1163 		     ++brandinfo)
1164 			if (elf32_insert_brand_entry(*brandinfo) < 0)
1165 				error = EINVAL;
1166 		if (error == 0) {
1167 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1168 				linux_ioctl_register_handler(*lihp);
1169 			SET_FOREACH(ldhp, linux_device_handler_set)
1170 				linux_device_register_handler(*ldhp);
1171 			LIST_INIT(&futex_list);
1172 			mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
1173 			linux_exit_tag = EVENTHANDLER_REGISTER(process_exit,
1174 			    linux_proc_exit, NULL, 1000);
1175 			linux_exec_tag = EVENTHANDLER_REGISTER(process_exec,
1176 			    linux_proc_exec, NULL, 1000);
1177 			linux_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor,
1178 			    linux_thread_dtor, NULL, EVENTHANDLER_PRI_ANY);
1179 			linux_osd_jail_register();
1180 			stclohz = (stathz ? stathz : hz);
1181 			if (bootverbose)
1182 				printf("Linux ELF exec handler installed\n");
1183 		} else
1184 			printf("cannot insert Linux ELF brand handler\n");
1185 		break;
1186 	case MOD_UNLOAD:
1187 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1188 		     ++brandinfo)
1189 			if (elf32_brand_inuse(*brandinfo))
1190 				error = EBUSY;
1191 		if (error == 0) {
1192 			for (brandinfo = &linux_brandlist[0];
1193 			     *brandinfo != NULL; ++brandinfo)
1194 				if (elf32_remove_brand_entry(*brandinfo) < 0)
1195 					error = EINVAL;
1196 		}
1197 		if (error == 0) {
1198 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1199 				linux_ioctl_unregister_handler(*lihp);
1200 			SET_FOREACH(ldhp, linux_device_handler_set)
1201 				linux_device_unregister_handler(*ldhp);
1202 			mtx_destroy(&futex_mtx);
1203 			EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
1204 			EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
1205 			EVENTHANDLER_DEREGISTER(thread_dtor, linux_thread_dtor_tag);
1206 			linux_osd_jail_deregister();
1207 			if (bootverbose)
1208 				printf("Linux ELF exec handler removed\n");
1209 		} else
1210 			printf("Could not deinstall ELF interpreter entry\n");
1211 		break;
1212 	default:
1213 		return (EOPNOTSUPP);
1214 	}
1215 	return (error);
1216 }
1217 
1218 static moduledata_t linux_elf_mod = {
1219 	"linuxelf",
1220 	linux_elf_modevent,
1221 	0
1222 };
1223 
1224 DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1225