xref: /freebsd/sys/amd64/linux32/linux32_sysvec.c (revision f5f7c05209ca2c3748fd8b27c5e80ffad49120eb)
1 /*-
2  * Copyright (c) 2004 Tim J. Robbins
3  * Copyright (c) 2003 Peter Wemm
4  * Copyright (c) 2002 Doug Rabson
5  * Copyright (c) 1998-1999 Andrew Gallatin
6  * Copyright (c) 1994-1996 Søren Schmidt
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer
14  *    in this position and unchanged.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. The name of the author may not be used to endorse or promote products
19  *    derived from this software without specific prior written permission
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 #include "opt_compat.h"
36 
37 #ifndef COMPAT_FREEBSD32
38 #error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!"
39 #endif
40 
41 #define	__ELF_WORD_SIZE	32
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/exec.h>
46 #include <sys/fcntl.h>
47 #include <sys/imgact.h>
48 #include <sys/imgact_elf.h>
49 #include <sys/kernel.h>
50 #include <sys/lock.h>
51 #include <sys/malloc.h>
52 #include <sys/module.h>
53 #include <sys/mutex.h>
54 #include <sys/proc.h>
55 #include <sys/resourcevar.h>
56 #include <sys/signalvar.h>
57 #include <sys/sysctl.h>
58 #include <sys/syscallsubr.h>
59 #include <sys/sysent.h>
60 #include <sys/sysproto.h>
61 #include <sys/vnode.h>
62 #include <sys/eventhandler.h>
63 
64 #include <vm/vm.h>
65 #include <vm/pmap.h>
66 #include <vm/vm_extern.h>
67 #include <vm/vm_map.h>
68 #include <vm/vm_object.h>
69 #include <vm/vm_page.h>
70 #include <vm/vm_param.h>
71 
72 #include <machine/cpu.h>
73 #include <machine/md_var.h>
74 #include <machine/pcb.h>
75 #include <machine/specialreg.h>
76 
77 #include <amd64/linux32/linux.h>
78 #include <amd64/linux32/linux32_proto.h>
79 #include <compat/linux/linux_emul.h>
80 #include <compat/linux/linux_futex.h>
81 #include <compat/linux/linux_ioctl.h>
82 #include <compat/linux/linux_mib.h>
83 #include <compat/linux/linux_misc.h>
84 #include <compat/linux/linux_signal.h>
85 #include <compat/linux/linux_util.h>
86 
/* Declare version 1 of the "linux" module. */
MODULE_VERSION(linux, 1);

/* Define the M_LINUX malloc type (short name "linux"). */
MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
90 
/*
 * Append one (id, val) pair to a 32-bit ELF auxiliary vector.
 *
 * `pos` must be a modifiable pointer lvalue; it is advanced by two slots,
 * one for the id and one for the value.  Every argument is parenthesized
 * so that expression arguments expand safely.  The status returned by
 * suword32() is not checked here.
 */
#define	AUXARGS_ENTRY_32(pos, id, val)		\
	do {					\
		suword32((pos)++, (id));	\
		suword32((pos)++, (val));	\
	} while (0)
96 
/*
 * The two leading bytes of a script ("#!") read as a single 16-bit
 * quantity; the numeric value depends on the host byte order.
 */
#if BYTE_ORDER != LITTLE_ENDIAN
#define SHELLMAGIC      0x2321
#else
#define SHELLMAGIC      0x2123 /* #! */
#endif

/*
 * The sendsig functions are not syscalls themselves, but they still
 * use the ldebug() facility.  Map both of them to syscall 0, which is
 * slightly less bogus than using ldebug(sigreturn).
 */
#define	LINUX_SYS_linux_sendsig		0
#define	LINUX_SYS_linux_rt_sendsig	0
111 
/* Platform name; copied out just below the ps_strings area at exec time. */
const char *linux_platform = "i686";
/*
 * Number of bytes of linux_platform that are copied out.  It is not
 * assigned anywhere in this part of the file.
 */
static int linux_szplatform;
/* Defined elsewhere; presumably the signal trampoline and its size. */
extern char linux_sigcode[];
extern int linux_szsigcode;

/* Linux system call table, defined elsewhere. */
extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];

/* Linker sets of ioctl and device handlers (consumers are not shown here). */
SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
SET_DECLARE(linux_device_handler_set, struct linux_device_handler);

/* Forward declarations of file-local functions. */
static int	elf_linux_fixup(register_t **stack_base,
		    struct image_params *iparams);
static register_t *linux_copyout_strings(struct image_params *imgp);
static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
static void	exec_linux_setregs(struct thread *td,
				   struct image_params *imgp, u_long stack);
static void	linux32_fixlimit(struct rlimit *rl, int which);
static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel);

/*
 * Event handler tags; presumably for process exit and exec hooks
 * (the registration is not visible in this part of the file).
 */
static eventhandler_tag linux_exit_tag;
static eventhandler_tag linux_exec_tag;
133 
/*
 * Linux syscalls return negative errno values; FreeBSD uses positive
 * ones.  This table is indexed by the FreeBSD errno and holds the
 * negated Linux errno to hand back.
 * Reference:
 *   FreeBSD: src/sys/sys/errno.h
 *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
 *            linux-2.6.17.8/include/asm-generic/errno.h
 *
 * NOTE(review): 93 initializers are listed (indices 0-92).  If ELAST is
 * larger than 92, the remaining slots are implicitly zero-initialized,
 * so those errno values would translate to 0 -- confirm against
 * sys/errno.h.
 */
static int bsd_to_linux_errno[ELAST + 1] = {
	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,	/*  0 -  9 */
	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,	/* 10 - 19 */
	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,	/* 20 - 29 */
	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,	/* 30 - 39 */
	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,	/* 40 - 49 */
	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,	/* 50 - 59 */
	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,	/* 60 - 69 */
	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,	/* 70 - 79 */
	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,	/* 80 - 89 */
	 -72, -67, -71						/* 90 - 92 */
};
153 
/*
 * FreeBSD-to-Linux signal number translation.
 *
 * The sendsig routines in this file look signals up with
 * sv_sigtbl[_SIG_IDX(sig)]; presumably this is the table installed as
 * sv_sigtbl (confirm in the sysentvec definition).  Two slots hold 0
 * instead of a LINUX_SIG* constant, presumably because there is no
 * Linux counterpart for those signals.
 */
int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
	0, LINUX_SIGUSR1, LINUX_SIGUSR2
};
164 
/*
 * Linux-to-FreeBSD signal number translation; the reverse direction of
 * bsd_to_linux_signal[].  Presumably indexed by the Linux signal number
 * minus one, mirroring the other table -- confirm against the users in
 * the signal compatibility code.  Some FreeBSD signals (SIGBUS, SIGURG)
 * appear more than once, so the mapping is not one-to-one.
 */
int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
	SIGHUP, SIGINT, SIGQUIT, SIGILL,
	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
	SIGIO, SIGURG, SIGSYS
};
175 
/* Value reported when a FreeBSD trap code has no Linux equivalent. */
#define LINUX_T_UNKNOWN  255

/*
 * Linux trap numbers indexed by FreeBSD trap code.  The table is
 * read-only, hence const.
 */
static const int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/* 0 */
	6,			/* 1  T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/* 2 */
	3,			/* 3  T_BPTFLT */
	LINUX_T_UNKNOWN,	/* 4 */
	LINUX_T_UNKNOWN,	/* 5 */
	16,			/* 6  T_ARITHTRAP */
	254,			/* 7  T_ASTFLT */
	LINUX_T_UNKNOWN,	/* 8 */
	13,			/* 9  T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};

/*
 * Translate a FreeBSD trap code into the Linux trap number.
 *
 * Returns the table entry for `code`, or LINUX_T_UNKNOWN when `code` is
 * negative or past the end of the table.  This is a function rather
 * than a macro so the argument is evaluated exactly once and the range
 * check does not depend on an implicit signed-to-unsigned conversion.
 */
static inline int
bsd_to_linux_trapcode(int code)
{
	const size_t ncodes = sizeof(_bsd_to_linux_trapcode) /
	    sizeof(_bsd_to_linux_trapcode[0]);

	if (code < 0 || (size_t)code >= ncodes)
		return (LINUX_T_UNKNOWN);
	return (_bsd_to_linux_trapcode[code]);
}
214 
/*
 * Argument/environment bookkeeping block addressed through
 * LINUX32_PS_STRINGS.  The code in this file fills every member with
 * suword32(), storing user addresses truncated to 32 bits in the
 * *str members.
 */
struct linux32_ps_strings {
	u_int32_t ps_argvstr;	/* first of 0 or more argument strings */
	u_int ps_nargvstr;	/* the number of argument strings */
	u_int32_t ps_envstr;	/* first of 0 or more environment strings */
	u_int ps_nenvstr;	/* the number of environment strings */
};
221 
222 /*
223  * If FreeBSD & Linux have a difference of opinion about what a trap
224  * means, deal with it here.
225  *
226  * MPSAFE
227  */
228 static int
229 translate_traps(int signal, int trap_code)
230 {
231 	if (signal != SIGBUS)
232 		return signal;
233 	switch (trap_code) {
234 	case T_PROTFLT:
235 	case T_TSSFLT:
236 	case T_DOUBLEFLT:
237 	case T_PAGEFLT:
238 		return SIGSEGV;
239 	default:
240 		return signal;
241 	}
242 }
243 
244 static int
245 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
246 {
247 	Elf32_Auxargs *args;
248 	Elf32_Addr *base;
249 	Elf32_Addr *pos, *uplatform;
250 	struct linux32_ps_strings *arginfo;
251 
252 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
253 	uplatform = (Elf32_Addr *)((caddr_t)arginfo - linux_szplatform);
254 
255 	KASSERT(curthread->td_proc == imgp->proc,
256 	    ("unsafe elf_linux_fixup(), should be curproc"));
257 	base = (Elf32_Addr *)*stack_base;
258 	args = (Elf32_Auxargs *)imgp->auxargs;
259 	pos = base + (imgp->args->argc + imgp->args->envc + 2);
260 
261 	AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature);
262 
263 	/*
264 	 * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0,
265 	 * as it has appeared in the 2.4.0-rc7 first time.
266 	 * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK),
267 	 * glibc falls back to the hard-coded CLK_TCK value when aux entry
268 	 * is not present.
269 	 * Also see linux_times() implementation.
270 	 */
271 	if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
272 		AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz);
273 	AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
274 	AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
275 	AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
276 	AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
277 	AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
278 	AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
279 	AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
280 	AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, 0);
281 	AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
282 	AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
283 	AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
284 	AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
285 	AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(uplatform));
286 	if (args->execfd != -1)
287 		AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
288 	AUXARGS_ENTRY_32(pos, AT_NULL, 0);
289 
290 	free(imgp->auxargs, M_TEMP);
291 	imgp->auxargs = NULL;
292 
293 	base--;
294 	suword32(base, (uint32_t)imgp->args->argc);
295 	*stack_base = (register_t *)base;
296 	return 0;
297 }
298 
299 extern unsigned long linux_sznonrtsigcode;
300 
301 static void
302 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
303 {
304 	struct thread *td = curthread;
305 	struct proc *p = td->td_proc;
306 	struct sigacts *psp;
307 	struct trapframe *regs;
308 	struct l_rt_sigframe *fp, frame;
309 	int oonstack;
310 	int sig;
311 	int code;
312 
313 	sig = ksi->ksi_signo;
314 	code = ksi->ksi_code;
315 	PROC_LOCK_ASSERT(p, MA_OWNED);
316 	psp = p->p_sigacts;
317 	mtx_assert(&psp->ps_mtx, MA_OWNED);
318 	regs = td->td_frame;
319 	oonstack = sigonstack(regs->tf_rsp);
320 
321 #ifdef DEBUG
322 	if (ldebug(rt_sendsig))
323 		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
324 		    catcher, sig, (void*)mask, code);
325 #endif
326 	/*
327 	 * Allocate space for the signal handler context.
328 	 */
329 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
330 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
331 		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
332 		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
333 	} else
334 		fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
335 	mtx_unlock(&psp->ps_mtx);
336 
337 	/*
338 	 * Build the argument list for the signal handler.
339 	 */
340 	if (p->p_sysent->sv_sigtbl)
341 		if (sig <= p->p_sysent->sv_sigsize)
342 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
343 
344 	bzero(&frame, sizeof(frame));
345 
346 	frame.sf_handler = PTROUT(catcher);
347 	frame.sf_sig = sig;
348 	frame.sf_siginfo = PTROUT(&fp->sf_si);
349 	frame.sf_ucontext = PTROUT(&fp->sf_sc);
350 
351 	/* Fill in POSIX parts */
352 	ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);
353 
354 	/*
355 	 * Build the signal context to be used by sigreturn.
356 	 */
357 	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
358 	frame.sf_sc.uc_link = 0;		/* XXX ??? */
359 
360 	frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
361 	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
362 	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
363 	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
364 	PROC_UNLOCK(p);
365 
366 	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
367 
368 	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
369 	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_rdi;
370 	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_rsi;
371 	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_rbp;
372 	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_rbx;
373 	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_rdx;
374 	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_rcx;
375 	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_rax;
376 	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_rip;
377 	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
378 	frame.sf_sc.uc_mcontext.sc_gs     = regs->tf_gs;
379 	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
380 	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
381 	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
382 	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
383 	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
384 	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
385 	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
386 	frame.sf_sc.uc_mcontext.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
387 	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
388 
389 #ifdef DEBUG
390 	if (ldebug(rt_sendsig))
391 		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
392 		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
393 		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
394 #endif
395 
396 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
397 		/*
398 		 * Process has trashed its stack; give it an illegal
399 		 * instruction to halt it in its tracks.
400 		 */
401 #ifdef DEBUG
402 		if (ldebug(rt_sendsig))
403 			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
404 			    fp, oonstack);
405 #endif
406 		PROC_LOCK(p);
407 		sigexit(td, SIGILL);
408 	}
409 
410 	/*
411 	 * Build context to run handler in.
412 	 */
413 	regs->tf_rsp = PTROUT(fp);
414 	regs->tf_rip = p->p_sysent->sv_sigcode_base + linux_sznonrtsigcode;
415 	regs->tf_rflags &= ~(PSL_T | PSL_D);
416 	regs->tf_cs = _ucode32sel;
417 	regs->tf_ss = _udatasel;
418 	regs->tf_ds = _udatasel;
419 	regs->tf_es = _udatasel;
420 	regs->tf_fs = _ufssel;
421 	regs->tf_gs = _ugssel;
422 	regs->tf_flags = TF_HASSEGS;
423 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
424 	PROC_LOCK(p);
425 	mtx_lock(&psp->ps_mtx);
426 }
427 
428 
429 /*
430  * Send an interrupt to process.
431  *
432  * Stack is set up to allow sigcode stored
433  * in u. to call routine, followed by kcall
434  * to sigreturn routine below.  After sigreturn
435  * resets the signal mask, the stack, and the
436  * frame pointer, it returns to the user
437  * specified pc, psl.
438  */
439 static void
440 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
441 {
442 	struct thread *td = curthread;
443 	struct proc *p = td->td_proc;
444 	struct sigacts *psp;
445 	struct trapframe *regs;
446 	struct l_sigframe *fp, frame;
447 	l_sigset_t lmask;
448 	int oonstack, i;
449 	int sig, code;
450 
451 	sig = ksi->ksi_signo;
452 	code = ksi->ksi_code;
453 	PROC_LOCK_ASSERT(p, MA_OWNED);
454 	psp = p->p_sigacts;
455 	mtx_assert(&psp->ps_mtx, MA_OWNED);
456 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
457 		/* Signal handler installed with SA_SIGINFO. */
458 		linux_rt_sendsig(catcher, ksi, mask);
459 		return;
460 	}
461 
462 	regs = td->td_frame;
463 	oonstack = sigonstack(regs->tf_rsp);
464 
465 #ifdef DEBUG
466 	if (ldebug(sendsig))
467 		printf(ARGS(sendsig, "%p, %d, %p, %u"),
468 		    catcher, sig, (void*)mask, code);
469 #endif
470 
471 	/*
472 	 * Allocate space for the signal handler context.
473 	 */
474 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
475 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
476 		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
477 		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
478 	} else
479 		fp = (struct l_sigframe *)regs->tf_rsp - 1;
480 	mtx_unlock(&psp->ps_mtx);
481 	PROC_UNLOCK(p);
482 
483 	/*
484 	 * Build the argument list for the signal handler.
485 	 */
486 	if (p->p_sysent->sv_sigtbl)
487 		if (sig <= p->p_sysent->sv_sigsize)
488 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
489 
490 	bzero(&frame, sizeof(frame));
491 
492 	frame.sf_handler = PTROUT(catcher);
493 	frame.sf_sig = sig;
494 
495 	bsd_to_linux_sigset(mask, &lmask);
496 
497 	/*
498 	 * Build the signal context to be used by sigreturn.
499 	 */
500 	frame.sf_sc.sc_mask   = lmask.__bits[0];
501 	frame.sf_sc.sc_gs     = regs->tf_gs;
502 	frame.sf_sc.sc_fs     = regs->tf_fs;
503 	frame.sf_sc.sc_es     = regs->tf_es;
504 	frame.sf_sc.sc_ds     = regs->tf_ds;
505 	frame.sf_sc.sc_edi    = regs->tf_rdi;
506 	frame.sf_sc.sc_esi    = regs->tf_rsi;
507 	frame.sf_sc.sc_ebp    = regs->tf_rbp;
508 	frame.sf_sc.sc_ebx    = regs->tf_rbx;
509 	frame.sf_sc.sc_edx    = regs->tf_rdx;
510 	frame.sf_sc.sc_ecx    = regs->tf_rcx;
511 	frame.sf_sc.sc_eax    = regs->tf_rax;
512 	frame.sf_sc.sc_eip    = regs->tf_rip;
513 	frame.sf_sc.sc_cs     = regs->tf_cs;
514 	frame.sf_sc.sc_eflags = regs->tf_rflags;
515 	frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
516 	frame.sf_sc.sc_ss     = regs->tf_ss;
517 	frame.sf_sc.sc_err    = regs->tf_err;
518 	frame.sf_sc.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
519 	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
520 
521 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
522 		frame.sf_extramask[i] = lmask.__bits[i+1];
523 
524 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
525 		/*
526 		 * Process has trashed its stack; give it an illegal
527 		 * instruction to halt it in its tracks.
528 		 */
529 		PROC_LOCK(p);
530 		sigexit(td, SIGILL);
531 	}
532 
533 	/*
534 	 * Build context to run handler in.
535 	 */
536 	regs->tf_rsp = PTROUT(fp);
537 	regs->tf_rip = p->p_sysent->sv_sigcode_base;
538 	regs->tf_rflags &= ~(PSL_T | PSL_D);
539 	regs->tf_cs = _ucode32sel;
540 	regs->tf_ss = _udatasel;
541 	regs->tf_ds = _udatasel;
542 	regs->tf_es = _udatasel;
543 	regs->tf_fs = _ufssel;
544 	regs->tf_gs = _ugssel;
545 	regs->tf_flags = TF_HASSEGS;
546 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
547 	PROC_LOCK(p);
548 	mtx_lock(&psp->ps_mtx);
549 }
550 
551 /*
552  * System call to cleanup state after a signal
553  * has been taken.  Reset signal mask and
554  * stack state from context left by sendsig (above).
555  * Return to previous pc and psl as specified by
556  * context left by sendsig. Check carefully to
557  * make sure that the user has not modified the
558  * psl to gain improper privileges or to cause
559  * a machine fault.
560  */
561 int
562 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
563 {
564 	struct l_sigframe frame;
565 	struct trapframe *regs;
566 	sigset_t bmask;
567 	l_sigset_t lmask;
568 	int eflags, i;
569 	ksiginfo_t ksi;
570 
571 	regs = td->td_frame;
572 
573 #ifdef DEBUG
574 	if (ldebug(sigreturn))
575 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
576 #endif
577 	/*
578 	 * The trampoline code hands us the sigframe.
579 	 * It is unsafe to keep track of it ourselves, in the event that a
580 	 * program jumps out of a signal handler.
581 	 */
582 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
583 		return (EFAULT);
584 
585 	/*
586 	 * Check for security violations.
587 	 */
588 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
589 	eflags = frame.sf_sc.sc_eflags;
590 	/*
591 	 * XXX do allow users to change the privileged flag PSL_RF.  The
592 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
593 	 * sometimes set it there too.  tf_eflags is kept in the signal
594 	 * context during signal handling and there is no other place
595 	 * to remember it, so the PSL_RF bit may be corrupted by the
596 	 * signal handler without us knowing.  Corruption of the PSL_RF
597 	 * bit at worst causes one more or one less debugger trap, so
598 	 * allowing it is fairly harmless.
599 	 */
600 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
601 		return(EINVAL);
602 
603 	/*
604 	 * Don't allow users to load a valid privileged %cs.  Let the
605 	 * hardware check for invalid selectors, excess privilege in
606 	 * other selectors, invalid %eip's and invalid %esp's.
607 	 */
608 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
609 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
610 		ksiginfo_init_trap(&ksi);
611 		ksi.ksi_signo = SIGBUS;
612 		ksi.ksi_code = BUS_OBJERR;
613 		ksi.ksi_trapno = T_PROTFLT;
614 		ksi.ksi_addr = (void *)regs->tf_rip;
615 		trapsignal(td, &ksi);
616 		return(EINVAL);
617 	}
618 
619 	lmask.__bits[0] = frame.sf_sc.sc_mask;
620 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
621 		lmask.__bits[i+1] = frame.sf_extramask[i];
622 	linux_to_bsd_sigset(&lmask, &bmask);
623 	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
624 
625 	/*
626 	 * Restore signal context.
627 	 */
628 	regs->tf_rdi    = frame.sf_sc.sc_edi;
629 	regs->tf_rsi    = frame.sf_sc.sc_esi;
630 	regs->tf_rbp    = frame.sf_sc.sc_ebp;
631 	regs->tf_rbx    = frame.sf_sc.sc_ebx;
632 	regs->tf_rdx    = frame.sf_sc.sc_edx;
633 	regs->tf_rcx    = frame.sf_sc.sc_ecx;
634 	regs->tf_rax    = frame.sf_sc.sc_eax;
635 	regs->tf_rip    = frame.sf_sc.sc_eip;
636 	regs->tf_cs     = frame.sf_sc.sc_cs;
637 	regs->tf_ds     = frame.sf_sc.sc_ds;
638 	regs->tf_es     = frame.sf_sc.sc_es;
639 	regs->tf_fs     = frame.sf_sc.sc_fs;
640 	regs->tf_gs     = frame.sf_sc.sc_gs;
641 	regs->tf_rflags = eflags;
642 	regs->tf_rsp    = frame.sf_sc.sc_esp_at_signal;
643 	regs->tf_ss     = frame.sf_sc.sc_ss;
644 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
645 
646 	return (EJUSTRETURN);
647 }
648 
649 /*
650  * System call to cleanup state after a signal
651  * has been taken.  Reset signal mask and
652  * stack state from context left by rt_sendsig (above).
653  * Return to previous pc and psl as specified by
654  * context left by sendsig. Check carefully to
655  * make sure that the user has not modified the
656  * psl to gain improper privileges or to cause
657  * a machine fault.
658  */
659 int
660 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
661 {
662 	struct l_ucontext uc;
663 	struct l_sigcontext *context;
664 	sigset_t bmask;
665 	l_stack_t *lss;
666 	stack_t ss;
667 	struct trapframe *regs;
668 	int eflags;
669 	ksiginfo_t ksi;
670 
671 	regs = td->td_frame;
672 
673 #ifdef DEBUG
674 	if (ldebug(rt_sigreturn))
675 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
676 #endif
677 	/*
678 	 * The trampoline code hands us the ucontext.
679 	 * It is unsafe to keep track of it ourselves, in the event that a
680 	 * program jumps out of a signal handler.
681 	 */
682 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
683 		return (EFAULT);
684 
685 	context = &uc.uc_mcontext;
686 
687 	/*
688 	 * Check for security violations.
689 	 */
690 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
691 	eflags = context->sc_eflags;
692 	/*
693 	 * XXX do allow users to change the privileged flag PSL_RF.  The
694 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
695 	 * sometimes set it there too.  tf_eflags is kept in the signal
696 	 * context during signal handling and there is no other place
697 	 * to remember it, so the PSL_RF bit may be corrupted by the
698 	 * signal handler without us knowing.  Corruption of the PSL_RF
699 	 * bit at worst causes one more or one less debugger trap, so
700 	 * allowing it is fairly harmless.
701 	 */
702 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
703 		return(EINVAL);
704 
705 	/*
706 	 * Don't allow users to load a valid privileged %cs.  Let the
707 	 * hardware check for invalid selectors, excess privilege in
708 	 * other selectors, invalid %eip's and invalid %esp's.
709 	 */
710 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
711 	if (!CS_SECURE(context->sc_cs)) {
712 		ksiginfo_init_trap(&ksi);
713 		ksi.ksi_signo = SIGBUS;
714 		ksi.ksi_code = BUS_OBJERR;
715 		ksi.ksi_trapno = T_PROTFLT;
716 		ksi.ksi_addr = (void *)regs->tf_rip;
717 		trapsignal(td, &ksi);
718 		return(EINVAL);
719 	}
720 
721 	linux_to_bsd_sigset(&uc.uc_sigmask, &bmask);
722 	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
723 
724 	/*
725 	 * Restore signal context
726 	 */
727 	regs->tf_gs	= context->sc_gs;
728 	regs->tf_fs	= context->sc_fs;
729 	regs->tf_es	= context->sc_es;
730 	regs->tf_ds	= context->sc_ds;
731 	regs->tf_rdi    = context->sc_edi;
732 	regs->tf_rsi    = context->sc_esi;
733 	regs->tf_rbp    = context->sc_ebp;
734 	regs->tf_rbx    = context->sc_ebx;
735 	regs->tf_rdx    = context->sc_edx;
736 	regs->tf_rcx    = context->sc_ecx;
737 	regs->tf_rax    = context->sc_eax;
738 	regs->tf_rip    = context->sc_eip;
739 	regs->tf_cs     = context->sc_cs;
740 	regs->tf_rflags = eflags;
741 	regs->tf_rsp    = context->sc_esp_at_signal;
742 	regs->tf_ss     = context->sc_ss;
743 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
744 
745 	/*
746 	 * call sigaltstack & ignore results..
747 	 */
748 	lss = &uc.uc_stack;
749 	ss.ss_sp = PTRIN(lss->ss_sp);
750 	ss.ss_size = lss->ss_size;
751 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
752 
753 #ifdef DEBUG
754 	if (ldebug(rt_sigreturn))
755 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
756 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
757 #endif
758 	(void)kern_sigaltstack(td, &ss, NULL);
759 
760 	return (EJUSTRETURN);
761 }
762 
763 static int
764 linux32_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
765 {
766 	struct proc *p;
767 	struct trapframe *frame;
768 
769 	p = td->td_proc;
770 	frame = td->td_frame;
771 
772 	sa->args[0] = frame->tf_rbx;
773 	sa->args[1] = frame->tf_rcx;
774 	sa->args[2] = frame->tf_rdx;
775 	sa->args[3] = frame->tf_rsi;
776 	sa->args[4] = frame->tf_rdi;
777 	sa->args[5] = frame->tf_rbp;	/* Unconfirmed */
778 	sa->code = frame->tf_rax;
779 
780 	if (sa->code >= p->p_sysent->sv_size)
781 		sa->callp = &p->p_sysent->sv_table[0];
782 	else
783 		sa->callp = &p->p_sysent->sv_table[sa->code];
784 	sa->narg = sa->callp->sy_narg;
785 
786 	td->td_retval[0] = 0;
787 	td->td_retval[1] = frame->tf_rdx;
788 
789 	return (0);
790 }
791 
792 /*
793  * If a linux binary is exec'ing something, try this image activator
794  * first.  We override standard shell script execution in order to
795  * be able to modify the interpreter path.  We only do this if a linux
796  * binary is doing the exec, so we do not create an EXEC module for it.
797  */
798 static int	exec_linux_imgact_try(struct image_params *iparams);
799 
800 static int
801 exec_linux_imgact_try(struct image_params *imgp)
802 {
803 	const char *head = (const char *)imgp->image_header;
804 	char *rpath;
805 	int error = -1;
806 
807 	/*
808 	* The interpreter for shell scripts run from a linux binary needs
809 	* to be located in /compat/linux if possible in order to recursively
810 	* maintain linux path emulation.
811 	*/
812 	if (((const short *)head)[0] == SHELLMAGIC) {
813 		/*
814 		* Run our normal shell image activator.  If it succeeds attempt
815 		* to use the alternate path for the interpreter.  If an
816 		* alternate * path is found, use our stringspace to store it.
817 		*/
818 		if ((error = exec_shell_imgact(imgp)) == 0) {
819 			linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
820 			    imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0,
821 			    AT_FDCWD);
822 			if (rpath != NULL)
823 				imgp->args->fname_buf =
824 				    imgp->interpreter_name = rpath;
825 		}
826 	}
827 	return (error);
828 }
829 
830 /*
831  * Clear registers on exec
832  * XXX copied from ia32_signal.c.
833  */
834 static void
835 exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack)
836 {
837 	struct trapframe *regs = td->td_frame;
838 	struct pcb *pcb = td->td_pcb;
839 
840 	mtx_lock(&dt_lock);
841 	if (td->td_proc->p_md.md_ldt != NULL)
842 		user_ldt_free(td);
843 	else
844 		mtx_unlock(&dt_lock);
845 
846 	critical_enter();
847 	wrmsr(MSR_FSBASE, 0);
848 	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
849 	pcb->pcb_fsbase = 0;
850 	pcb->pcb_gsbase = 0;
851 	critical_exit();
852 	pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
853 
854 	bzero((char *)regs, sizeof(struct trapframe));
855 	regs->tf_rip = imgp->entry_addr;
856 	regs->tf_rsp = stack;
857 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
858 	regs->tf_gs = _ugssel;
859 	regs->tf_fs = _ufssel;
860 	regs->tf_es = _udatasel;
861 	regs->tf_ds = _udatasel;
862 	regs->tf_ss = _udatasel;
863 	regs->tf_flags = TF_HASSEGS;
864 	regs->tf_cs = _ucode32sel;
865 	regs->tf_rbx = imgp->ps_strings;
866 
867 	fpstate_drop(td);
868 
869 	/* Do full restore on return so that we can change to a different %cs */
870 	set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET);
871 	clear_pcb_flags(pcb, PCB_GS32BIT);
872 	td->td_retval[1] = 0;
873 }
874 
875 /*
876  * XXX copied from ia32_sysvec.c.
877  */
878 static register_t *
879 linux_copyout_strings(struct image_params *imgp)
880 {
881 	int argc, envc;
882 	u_int32_t *vectp;
883 	char *stringp, *destp;
884 	u_int32_t *stack_base;
885 	struct linux32_ps_strings *arginfo;
886 
887 	/*
888 	 * Calculate string base and vector table pointers.
889 	 * Also deal with signal trampoline code for this exec type.
890 	 */
891 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
892 	destp =	(caddr_t)arginfo - SPARE_USRSPACE - linux_szplatform -
893 	    roundup((ARG_MAX - imgp->args->stringspace),
894 	    sizeof(char *));
895 
896 	/*
897 	 * Install LINUX_PLATFORM
898 	 */
899 	copyout(linux_platform, ((caddr_t)arginfo - linux_szplatform),
900 	    linux_szplatform);
901 
902 	/*
903 	 * If we have a valid auxargs ptr, prepare some room
904 	 * on the stack.
905 	 */
906 	if (imgp->auxargs) {
907 		/*
908 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
909 		 * lower compatibility.
910 		 */
911 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
912 		    (LINUX_AT_COUNT * 2);
913 		/*
914 		 * The '+ 2' is for the null pointers at the end of each of
915 		 * the arg and env vector sets,and imgp->auxarg_size is room
916 		 * for argument of Runtime loader.
917 		 */
918 		vectp = (u_int32_t *) (destp - (imgp->args->argc +
919 		    imgp->args->envc + 2 + imgp->auxarg_size) *
920 		    sizeof(u_int32_t));
921 
922 	} else
923 		/*
924 		 * The '+ 2' is for the null pointers at the end of each of
925 		 * the arg and env vector sets
926 		 */
927 		vectp = (u_int32_t *)(destp - (imgp->args->argc +
928 		    imgp->args->envc + 2) * sizeof(u_int32_t));
929 
930 	/*
931 	 * vectp also becomes our initial stack base
932 	 */
933 	stack_base = vectp;
934 
935 	stringp = imgp->args->begin_argv;
936 	argc = imgp->args->argc;
937 	envc = imgp->args->envc;
938 	/*
939 	 * Copy out strings - arguments and environment.
940 	 */
941 	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
942 
943 	/*
944 	 * Fill in "ps_strings" struct for ps, w, etc.
945 	 */
946 	suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp);
947 	suword32(&arginfo->ps_nargvstr, argc);
948 
949 	/*
950 	 * Fill in argument portion of vector table.
951 	 */
952 	for (; argc > 0; --argc) {
953 		suword32(vectp++, (uint32_t)(intptr_t)destp);
954 		while (*stringp++ != 0)
955 			destp++;
956 		destp++;
957 	}
958 
959 	/* a null vector table pointer separates the argp's from the envp's */
960 	suword32(vectp++, 0);
961 
962 	suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp);
963 	suword32(&arginfo->ps_nenvstr, envc);
964 
965 	/*
966 	 * Fill in environment portion of vector table.
967 	 */
968 	for (; envc > 0; --envc) {
969 		suword32(vectp++, (uint32_t)(intptr_t)destp);
970 		while (*stringp++ != 0)
971 			destp++;
972 		destp++;
973 	}
974 
975 	/* end of vector table is a null pointer */
976 	suword32(vectp, 0);
977 
978 	return ((register_t *)stack_base);
979 }
980 
/*
 * Sysctl node "linux32" under the _compat parent, holding the tunables
 * for 32-bit Linux emulation declared below.
 */
static SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
    "32-bit Linux emulation");

/*
 * Read/write upper bounds used by linux32_fixlimit() to clamp the data,
 * stack and virtual-memory resource limits.  A value of zero disables
 * the corresponding clamp.  The description strings are intentionally
 * left as found (empty).
 */
static u_long	linux32_maxdsiz = LINUX32_MAXDSIZ;
SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
    &linux32_maxdsiz, 0, "");
static u_long	linux32_maxssiz = LINUX32_MAXSSIZ;
SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
    &linux32_maxssiz, 0, "");
static u_long	linux32_maxvmem = LINUX32_MAXVMEM;
SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
    &linux32_maxvmem, 0, "");
992     &linux32_maxvmem, 0, "");
993 
994 static void
995 linux32_fixlimit(struct rlimit *rl, int which)
996 {
997 
998 	switch (which) {
999 	case RLIMIT_DATA:
1000 		if (linux32_maxdsiz != 0) {
1001 			if (rl->rlim_cur > linux32_maxdsiz)
1002 				rl->rlim_cur = linux32_maxdsiz;
1003 			if (rl->rlim_max > linux32_maxdsiz)
1004 				rl->rlim_max = linux32_maxdsiz;
1005 		}
1006 		break;
1007 	case RLIMIT_STACK:
1008 		if (linux32_maxssiz != 0) {
1009 			if (rl->rlim_cur > linux32_maxssiz)
1010 				rl->rlim_cur = linux32_maxssiz;
1011 			if (rl->rlim_max > linux32_maxssiz)
1012 				rl->rlim_max = linux32_maxssiz;
1013 		}
1014 		break;
1015 	case RLIMIT_VMEM:
1016 		if (linux32_maxvmem != 0) {
1017 			if (rl->rlim_cur > linux32_maxvmem)
1018 				rl->rlim_cur = linux32_maxvmem;
1019 			if (rl->rlim_max > linux32_maxvmem)
1020 				rl->rlim_max = linux32_maxvmem;
1021 		}
1022 		break;
1023 	}
1024 }
1025 
/*
 * System entry vector for the "Linux ELF32" ABI.  Both Linux brand
 * descriptors in this file (linux_brand and linux_glibc2brand) point at
 * it through their .sysvec member.
 */
struct sysentvec elf_linux_sysvec = {
	.sv_size	= LINUX_SYS_MAXSYSCALL,
	.sv_table	= linux_sysent,
	.sv_mask	= 0,
	.sv_sigsize	= LINUX_SIGTBLSZ,
	.sv_sigtbl	= bsd_to_linux_signal,
	.sv_errsize	= ELAST + 1,
	.sv_errtbl	= bsd_to_linux_errno,
	.sv_transtrap	= translate_traps,
	.sv_fixup	= elf_linux_fixup,
	.sv_sendsig	= linux_sendsig,
	.sv_sigcode	= linux_sigcode,
	.sv_szsigcode	= &linux_szsigcode,
	.sv_prepsyscall	= NULL,
	.sv_name	= "Linux ELF32",
	.sv_coredump	= elf32_coredump,
	.sv_imgact_try	= exec_linux_imgact_try,
	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
	.sv_pagesize	= PAGE_SIZE,
	.sv_minuser	= VM_MIN_ADDRESS,
	.sv_maxuser	= LINUX32_MAXUSER,
	.sv_usrstack	= LINUX32_USRSTACK,
	.sv_psstrings	= LINUX32_PS_STRINGS,
	.sv_stackprot	= VM_PROT_ALL,
	.sv_copyout_strings = linux_copyout_strings,
	.sv_setregs	= exec_linux_setregs,
	/* Clamps data/stack/vmem limits to the linux32 sysctl tunables. */
	.sv_fixlimit	= linux32_fixlimit,
	/* Same variable that backs the linux32 "maxssiz" sysctl. */
	.sv_maxssiz	= &linux32_maxssiz,
	.sv_flags	= SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP,
	.sv_set_syscall_retval = cpu_set_syscall_retval,
	.sv_fetch_syscall_args = linux32_fetch_syscall_args,
	.sv_syscallnames = NULL,
	/* Shared page: one page located at LINUX32_SHAREDPAGE. */
	.sv_shared_page_base = LINUX32_SHAREDPAGE,
	.sv_shared_page_len = PAGE_SIZE,
	.sv_schedtail	= linux_schedtail,
};
/*
 * NOTE(review): INIT_SYSENTVEC is defined outside this file chunk;
 * presumably it arranges for this sysentvec to be initialized at
 * startup -- confirm against its definition.
 */
INIT_SYSENTVEC(elf_sysvec, &elf_linux_sysvec);
1063 
/* Note vendor name; its size (including the NUL) is the brand note's n_namesz. */
static char GNU_ABI_VENDOR[] = "GNU";
/* Value linux32_trans_osrel() requires in the first note descriptor word. */
static int GNULINUX_ABI_DESC = 0;
1066 
1067 static boolean_t
1068 linux32_trans_osrel(const Elf_Note *note, int32_t *osrel)
1069 {
1070 	const Elf32_Word *desc;
1071 	uintptr_t p;
1072 
1073 	p = (uintptr_t)(note + 1);
1074 	p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
1075 
1076 	desc = (const Elf32_Word *)p;
1077 	if (desc[0] != GNULINUX_ABI_DESC)
1078 		return (FALSE);
1079 
1080 	/*
1081 	 * For linux we encode osrel as follows (see linux_mib.c):
1082 	 * VVVMMMIII (version, major, minor), see linux_mib.c.
1083 	 */
1084 	*osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
1085 
1086 	return (TRUE);
1087 }
1088 
/*
 * Brand note description referenced by both Linux brand entries through
 * .brand_note.  The header template uses GNU_ABI_VENDOR as the note
 * name, note type 1, and a 16-byte descriptor, which matches the four
 * 32-bit words read by linux32_trans_osrel().  BN_TRANSLATE_OSREL is set
 * and .trans_osrel names that routine as the osrel translator.
 */
static Elf_Brandnote linux32_brandnote = {
	.hdr.n_namesz	= sizeof(GNU_ABI_VENDOR),
	.hdr.n_descsz	= 16,	/* XXX at least 16 */
	.hdr.n_type	= 1,
	.vendor		= GNU_ABI_VENDOR,
	.flags		= BN_TRANSLATE_OSREL,
	.trans_osrel	= linux32_trans_osrel
};
1097 
/*
 * ELF brand entry for EM_386 / ELFOSABI_LINUX binaries with interpreter
 * path "/lib/ld-linux.so.1".  It differs from linux_glibc2brand only in
 * .interp_path.  Registered and removed by linux_elf_modevent() via
 * linux_brandlist.
 */
static Elf32_Brandinfo linux_brand = {
	.brand		= ELFOSABI_LINUX,
	.machine	= EM_386,
	.compat_3_brand	= "Linux",
	.emul_path	= "/compat/linux",
	.interp_path	= "/lib/ld-linux.so.1",
	.sysvec		= &elf_linux_sysvec,
	.interp_newpath	= NULL,
	.brand_note	= &linux32_brandnote,
	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
1109 
/*
 * ELF brand entry for EM_386 / ELFOSABI_LINUX binaries with interpreter
 * path "/lib/ld-linux.so.2".  It differs from linux_brand only in
 * .interp_path.  Registered and removed by linux_elf_modevent() via
 * linux_brandlist.
 */
static Elf32_Brandinfo linux_glibc2brand = {
	.brand		= ELFOSABI_LINUX,
	.machine	= EM_386,
	.compat_3_brand	= "Linux",
	.emul_path	= "/compat/linux",
	.interp_path	= "/lib/ld-linux.so.2",
	.sysvec		= &elf_linux_sysvec,
	.interp_newpath	= NULL,
	.brand_note	= &linux32_brandnote,
	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
1121 
/*
 * NULL-terminated list of the Linux brand entries; linux_elf_modevent()
 * walks it to insert the brands on load and to check/remove them on
 * unload.
 */
Elf32_Brandinfo *linux_brandlist[] = {
	&linux_brand,
	&linux_glibc2brand,
	NULL
};
1127 
1128 static int
1129 linux_elf_modevent(module_t mod, int type, void *data)
1130 {
1131 	Elf32_Brandinfo **brandinfo;
1132 	int error;
1133 	struct linux_ioctl_handler **lihp;
1134 	struct linux_device_handler **ldhp;
1135 
1136 	error = 0;
1137 
1138 	switch(type) {
1139 	case MOD_LOAD:
1140 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1141 		     ++brandinfo)
1142 			if (elf32_insert_brand_entry(*brandinfo) < 0)
1143 				error = EINVAL;
1144 		if (error == 0) {
1145 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1146 				linux_ioctl_register_handler(*lihp);
1147 			SET_FOREACH(ldhp, linux_device_handler_set)
1148 				linux_device_register_handler(*ldhp);
1149 			mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF);
1150 			sx_init(&emul_shared_lock, "emuldata->shared lock");
1151 			LIST_INIT(&futex_list);
1152 			mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
1153 			linux_exit_tag = EVENTHANDLER_REGISTER(process_exit,
1154 			    linux_proc_exit, NULL, 1000);
1155 			linux_exec_tag = EVENTHANDLER_REGISTER(process_exec,
1156 			    linux_proc_exec, NULL, 1000);
1157 			linux_szplatform = roundup(strlen(linux_platform) + 1,
1158 			    sizeof(char *));
1159 			linux_osd_jail_register();
1160 			stclohz = (stathz ? stathz : hz);
1161 			if (bootverbose)
1162 				printf("Linux ELF exec handler installed\n");
1163 		} else
1164 			printf("cannot insert Linux ELF brand handler\n");
1165 		break;
1166 	case MOD_UNLOAD:
1167 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1168 		     ++brandinfo)
1169 			if (elf32_brand_inuse(*brandinfo))
1170 				error = EBUSY;
1171 		if (error == 0) {
1172 			for (brandinfo = &linux_brandlist[0];
1173 			     *brandinfo != NULL; ++brandinfo)
1174 				if (elf32_remove_brand_entry(*brandinfo) < 0)
1175 					error = EINVAL;
1176 		}
1177 		if (error == 0) {
1178 			SET_FOREACH(lihp, linux_ioctl_handler_set)
1179 				linux_ioctl_unregister_handler(*lihp);
1180 			SET_FOREACH(ldhp, linux_device_handler_set)
1181 				linux_device_unregister_handler(*ldhp);
1182 			mtx_destroy(&emul_lock);
1183 			sx_destroy(&emul_shared_lock);
1184 			mtx_destroy(&futex_mtx);
1185 			EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
1186 			EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
1187 			linux_osd_jail_deregister();
1188 			if (bootverbose)
1189 				printf("Linux ELF exec handler removed\n");
1190 		} else
1191 			printf("Could not deinstall ELF interpreter entry\n");
1192 		break;
1193 	default:
1194 		return EOPNOTSUPP;
1195 	}
1196 	return error;
1197 }
1198 
/*
 * Module description: name "linuxelf", event handler
 * linux_elf_modevent(), and 0 for the third (last) field.
 */
static moduledata_t linux_elf_mod = {
	"linuxelf",
	linux_elf_modevent,
	0
};

/*
 * Declares the module with subsystem SI_SUB_EXEC and order SI_ORDER_ANY.
 * NOTE(review): the exact semantics of the _TIED variant are defined
 * outside this file -- confirm against the DECLARE_MODULE_TIED macro.
 */
DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1206