1 /*- 2 * Copyright (c) 2004 Tim J. Robbins 3 * Copyright (c) 2003 Peter Wemm 4 * Copyright (c) 2002 Doug Rabson 5 * Copyright (c) 1998-1999 Andrew Gallatin 6 * Copyright (c) 1994-1996 S�ren Schmidt 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer 14 * in this position and unchanged. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. The name of the author may not be used to endorse or promote products 19 * derived from this software without specific prior written permission 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 22 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 23 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 24 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 26 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 30 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 #include <sys/cdefs.h> 34 __FBSDID("$FreeBSD$"); 35 #include "opt_compat.h" 36 37 #ifndef COMPAT_IA32 38 #error "Unable to compile Linux-emulator due to missing COMPAT_IA32 option!" 39 #endif 40 41 #define __ELF_WORD_SIZE 32 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/exec.h> 46 #include <sys/imgact.h> 47 #include <sys/imgact_elf.h> 48 #include <sys/kernel.h> 49 #include <sys/lock.h> 50 #include <sys/malloc.h> 51 #include <sys/module.h> 52 #include <sys/mutex.h> 53 #include <sys/proc.h> 54 #include <sys/resourcevar.h> 55 #include <sys/signalvar.h> 56 #include <sys/sysctl.h> 57 #include <sys/syscallsubr.h> 58 #include <sys/sysent.h> 59 #include <sys/sysproto.h> 60 #include <sys/vnode.h> 61 #include <sys/eventhandler.h> 62 63 #include <vm/vm.h> 64 #include <vm/pmap.h> 65 #include <vm/vm_extern.h> 66 #include <vm/vm_map.h> 67 #include <vm/vm_object.h> 68 #include <vm/vm_page.h> 69 #include <vm/vm_param.h> 70 71 #include <machine/cpu.h> 72 #include <machine/md_var.h> 73 #include <machine/pcb.h> 74 #include <machine/specialreg.h> 75 76 #include <amd64/linux32/linux.h> 77 #include <amd64/linux32/linux32_proto.h> 78 #include <compat/linux/linux_emul.h> 79 #include <compat/linux/linux_mib.h> 80 #include <compat/linux/linux_signal.h> 81 #include <compat/linux/linux_util.h> 82 83 MODULE_VERSION(linux, 1); 84 85 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures"); 86 87 #define AUXARGS_ENTRY_32(pos, id, val) \ 88 do { \ 89 suword32(pos++, id); \ 90 suword32(pos++, val); \ 91 } while (0) 92 93 #if BYTE_ORDER == LITTLE_ENDIAN 94 #define SHELLMAGIC 0x2123 /* #! */ 95 #else 96 #define SHELLMAGIC 0x2321 97 #endif 98 99 /* 100 * Allow the sendsig functions to use the ldebug() facility 101 * even though they are not syscalls themselves. Map them 102 * to syscall 0. This is slightly less bogus than using 103 * ldebug(sigreturn). 104 */ 105 #define LINUX_SYS_linux_rt_sendsig 0 106 #define LINUX_SYS_linux_sendsig 0 107 108 extern char linux_sigcode[]; 109 extern int linux_szsigcode; 110 111 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; 112 113 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); 114 SET_DECLARE(linux_device_handler_set, struct linux_device_handler); 115 116 static int elf_linux_fixup(register_t **stack_base, 117 struct image_params *iparams); 118 static register_t *linux_copyout_strings(struct image_params *imgp); 119 static void linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, 120 caddr_t *params); 121 static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask); 122 static void exec_linux_setregs(struct thread *td, u_long entry, 123 u_long stack, u_long ps_strings); 124 static void linux32_fixlimit(struct rlimit *rl, int which); 125 126 extern LIST_HEAD(futex_list, futex) futex_list; 127 extern struct sx futex_sx; 128 129 static eventhandler_tag linux_exit_tag; 130 static eventhandler_tag linux_schedtail_tag; 131 static eventhandler_tag linux_exec_tag; 132 133 /* 134 * Linux syscalls return negative errno's, we do positive and map them 135 * Reference: 136 * FreeBSD: src/sys/sys/errno.h 137 * Linux: linux-2.6.17.8/include/asm-generic/errno-base.h 138 * linux-2.6.17.8/include/asm-generic/errno.h 139 */ 140 static int bsd_to_linux_errno[ELAST + 1] = { 141 -0, -1, -2, -3, -4, -5, -6, -7, -8, -9, 142 -10, -35, -12, -13, -14, -15, -16, -17, -18, -19, 143 -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, 144 -30, -31, -32, -33, -34, -11,-115,-114, -88, -89, 145 -90, -91, -92, -93, -94, -95, -96, -97, -98, -99, 146 -100,-101,-102,-103,-104,-105,-106,-107,-108,-109, 147 -110,-111, -40, -36,-112,-113, -39, -11, -87,-122, 148 -116, -66, -6, -6, -6, -6, -6, -37, -38, -9, 149 -6, -6, -43, -42, -75,-125, -84, -95, -16, -74, 150 -72, -67, -71 151 }; 152 153 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = { 154 LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL, 155 LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE, 156 LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS, 157 LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG, 158 LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD, 159 LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU, 160 LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH, 161 0, LINUX_SIGUSR1, LINUX_SIGUSR2 162 }; 163 164 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = { 165 SIGHUP, SIGINT, SIGQUIT, SIGILL, 166 SIGTRAP, SIGABRT, SIGBUS, SIGFPE, 167 SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2, 168 SIGPIPE, SIGALRM, SIGTERM, SIGBUS, 169 SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP, 170 SIGTTIN, SIGTTOU, SIGURG, SIGXCPU, 171 SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH, 172 SIGIO, SIGURG, SIGSYS 173 }; 174 175 #define LINUX_T_UNKNOWN 255 176 static int _bsd_to_linux_trapcode[] = { 177 LINUX_T_UNKNOWN, /* 0 */ 178 6, /* 1 T_PRIVINFLT */ 179 LINUX_T_UNKNOWN, /* 2 */ 180 3, /* 3 T_BPTFLT */ 181 LINUX_T_UNKNOWN, /* 4 */ 182 LINUX_T_UNKNOWN, /* 5 */ 183 16, /* 6 T_ARITHTRAP */ 184 254, /* 7 T_ASTFLT */ 185 LINUX_T_UNKNOWN, /* 8 */ 186 13, /* 9 T_PROTFLT */ 187 1, /* 10 T_TRCTRAP */ 188 LINUX_T_UNKNOWN, /* 11 */ 189 14, /* 12 T_PAGEFLT */ 190 LINUX_T_UNKNOWN, /* 13 */ 191 17, /* 14 T_ALIGNFLT */ 192 LINUX_T_UNKNOWN, /* 15 */ 193 LINUX_T_UNKNOWN, /* 16 */ 194 LINUX_T_UNKNOWN, /* 17 */ 195 0, /* 18 T_DIVIDE */ 196 2, /* 19 T_NMI */ 197 4, /* 20 T_OFLOW */ 198 5, /* 21 T_BOUND */ 199 7, /* 22 T_DNA */ 200 8, /* 23 T_DOUBLEFLT */ 201 9, /* 24 T_FPOPFLT */ 202 10, /* 25 T_TSSFLT */ 203 11, /* 26 T_SEGNPFLT */ 204 12, /* 27 T_STKFLT */ 205 18, /* 28 T_MCHK */ 206 19, /* 29 T_XMMFLT */ 207 15 /* 30 T_RESERVED */ 208 }; 209 #define bsd_to_linux_trapcode(code) \ 210 ((code)<sizeof(_bsd_to_linux_trapcode)/sizeof(*_bsd_to_linux_trapcode)? \ 211 _bsd_to_linux_trapcode[(code)]: \ 212 LINUX_T_UNKNOWN) 213 214 struct linux32_ps_strings { 215 u_int32_t ps_argvstr; /* first of 0 or more argument strings */ 216 u_int ps_nargvstr; /* the number of argument strings */ 217 u_int32_t ps_envstr; /* first of 0 or more environment strings */ 218 u_int ps_nenvstr; /* the number of environment strings */ 219 }; 220 221 /* 222 * If FreeBSD & Linux have a difference of opinion about what a trap 223 * means, deal with it here. 224 * 225 * MPSAFE 226 */ 227 static int 228 translate_traps(int signal, int trap_code) 229 { 230 if (signal != SIGBUS) 231 return signal; 232 switch (trap_code) { 233 case T_PROTFLT: 234 case T_TSSFLT: 235 case T_DOUBLEFLT: 236 case T_PAGEFLT: 237 return SIGSEGV; 238 default: 239 return signal; 240 } 241 } 242 243 static int 244 elf_linux_fixup(register_t **stack_base, struct image_params *imgp) 245 { 246 Elf32_Auxargs *args; 247 Elf32_Addr *base; 248 Elf32_Addr *pos; 249 250 KASSERT(curthread->td_proc == imgp->proc && 251 (curthread->td_proc->p_flag & P_SA) == 0, 252 ("unsafe elf_linux_fixup(), should be curproc")); 253 base = (Elf32_Addr *)*stack_base; 254 args = (Elf32_Auxargs *)imgp->auxargs; 255 pos = base + (imgp->args->argc + imgp->args->envc + 2); 256 257 if (args->trace) 258 AUXARGS_ENTRY_32(pos, AT_DEBUG, 1); 259 if (args->execfd != -1) 260 AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd); 261 AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr); 262 AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent); 263 AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum); 264 AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz); 265 AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags); 266 AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry); 267 AUXARGS_ENTRY_32(pos, AT_BASE, args->base); 268 AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); 269 AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); 270 AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); 271 AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); 272 AUXARGS_ENTRY_32(pos, AT_NULL, 0); 273 274 free(imgp->auxargs, M_TEMP); 275 imgp->auxargs = NULL; 276 277 base--; 278 suword32(base, (uint32_t)imgp->args->argc); 279 *stack_base = (register_t *)base; 280 return 0; 281 } 282 283 extern int _ucodesel, _ucode32sel, _udatasel; 284 extern unsigned long linux_sznonrtsigcode; 285 286 static void 287 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 288 { 289 struct thread *td = curthread; 290 struct proc *p = td->td_proc; 291 struct sigacts *psp; 292 struct trapframe *regs; 293 struct l_rt_sigframe *fp, frame; 294 int oonstack; 295 int sig; 296 int code; 297 298 sig = ksi->ksi_signo; 299 code = ksi->ksi_code; 300 PROC_LOCK_ASSERT(p, MA_OWNED); 301 psp = p->p_sigacts; 302 mtx_assert(&psp->ps_mtx, MA_OWNED); 303 regs = td->td_frame; 304 oonstack = sigonstack(regs->tf_rsp); 305 306 #ifdef DEBUG 307 if (ldebug(rt_sendsig)) 308 printf(ARGS(rt_sendsig, "%p, %d, %p, %u"), 309 catcher, sig, (void*)mask, code); 310 #endif 311 /* 312 * Allocate space for the signal handler context. 313 */ 314 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && 315 SIGISMEMBER(psp->ps_sigonstack, sig)) { 316 fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp + 317 td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe)); 318 } else 319 fp = (struct l_rt_sigframe *)regs->tf_rsp - 1; 320 mtx_unlock(&psp->ps_mtx); 321 322 /* 323 * Build the argument list for the signal handler. 324 */ 325 if (p->p_sysent->sv_sigtbl) 326 if (sig <= p->p_sysent->sv_sigsize) 327 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 328 329 bzero(&frame, sizeof(frame)); 330 331 frame.sf_handler = PTROUT(catcher); 332 frame.sf_sig = sig; 333 frame.sf_siginfo = PTROUT(&fp->sf_si); 334 frame.sf_ucontext = PTROUT(&fp->sf_sc); 335 336 /* Fill in POSIX parts */ 337 frame.sf_si.lsi_signo = sig; 338 frame.sf_si.lsi_code = code; 339 frame.sf_si.lsi_addr = PTROUT(ksi->ksi_addr); 340 341 /* 342 * Build the signal context to be used by sigreturn. 343 */ 344 frame.sf_sc.uc_flags = 0; /* XXX ??? */ 345 frame.sf_sc.uc_link = 0; /* XXX ??? */ 346 347 frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); 348 frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; 349 frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 350 ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; 351 PROC_UNLOCK(p); 352 353 bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask); 354 355 frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__bits[0]; 356 frame.sf_sc.uc_mcontext.sc_gs = rgs(); 357 frame.sf_sc.uc_mcontext.sc_fs = rfs(); 358 __asm __volatile("movl %%es,%0" : 359 "=rm" (frame.sf_sc.uc_mcontext.sc_es)); 360 __asm __volatile("movl %%ds,%0" : 361 "=rm" (frame.sf_sc.uc_mcontext.sc_ds)); 362 frame.sf_sc.uc_mcontext.sc_edi = regs->tf_rdi; 363 frame.sf_sc.uc_mcontext.sc_esi = regs->tf_rsi; 364 frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_rbp; 365 frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_rbx; 366 frame.sf_sc.uc_mcontext.sc_edx = regs->tf_rdx; 367 frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_rcx; 368 frame.sf_sc.uc_mcontext.sc_eax = regs->tf_rax; 369 frame.sf_sc.uc_mcontext.sc_eip = regs->tf_rip; 370 frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; 371 frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags; 372 frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp; 373 frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss; 374 frame.sf_sc.uc_mcontext.sc_err = regs->tf_err; 375 frame.sf_sc.uc_mcontext.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr; 376 frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); 377 378 #ifdef DEBUG 379 if (ldebug(rt_sendsig)) 380 printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"), 381 frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp, 382 td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask); 383 #endif 384 385 if (copyout(&frame, fp, sizeof(frame)) != 0) { 386 /* 387 * Process has trashed its stack; give it an illegal 388 * instruction to halt it in its tracks. 389 */ 390 #ifdef DEBUG 391 if (ldebug(rt_sendsig)) 392 printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"), 393 fp, oonstack); 394 #endif 395 PROC_LOCK(p); 396 sigexit(td, SIGILL); 397 } 398 399 /* 400 * Build context to run handler in. 401 */ 402 regs->tf_rsp = PTROUT(fp); 403 regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode) + 404 linux_sznonrtsigcode; 405 regs->tf_rflags &= ~PSL_T; 406 regs->tf_cs = _ucode32sel; 407 regs->tf_ss = _udatasel; 408 load_ds(_udatasel); 409 td->td_pcb->pcb_ds = _udatasel; 410 load_es(_udatasel); 411 td->td_pcb->pcb_es = _udatasel; 412 /* leave user %fs and %gs untouched */ 413 PROC_LOCK(p); 414 mtx_lock(&psp->ps_mtx); 415 } 416 417 418 /* 419 * Send an interrupt to process. 420 * 421 * Stack is set up to allow sigcode stored 422 * in u. to call routine, followed by kcall 423 * to sigreturn routine below. After sigreturn 424 * resets the signal mask, the stack, and the 425 * frame pointer, it returns to the user 426 * specified pc, psl. 427 */ 428 static void 429 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 430 { 431 struct thread *td = curthread; 432 struct proc *p = td->td_proc; 433 struct sigacts *psp; 434 struct trapframe *regs; 435 struct l_sigframe *fp, frame; 436 l_sigset_t lmask; 437 int oonstack, i; 438 int sig, code; 439 440 sig = ksi->ksi_signo; 441 code = ksi->ksi_code; 442 PROC_LOCK_ASSERT(p, MA_OWNED); 443 psp = p->p_sigacts; 444 mtx_assert(&psp->ps_mtx, MA_OWNED); 445 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 446 /* Signal handler installed with SA_SIGINFO. */ 447 linux_rt_sendsig(catcher, ksi, mask); 448 return; 449 } 450 451 regs = td->td_frame; 452 oonstack = sigonstack(regs->tf_rsp); 453 454 #ifdef DEBUG 455 if (ldebug(sendsig)) 456 printf(ARGS(sendsig, "%p, %d, %p, %u"), 457 catcher, sig, (void*)mask, code); 458 #endif 459 460 /* 461 * Allocate space for the signal handler context. 462 */ 463 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && 464 SIGISMEMBER(psp->ps_sigonstack, sig)) { 465 fp = (struct l_sigframe *)(td->td_sigstk.ss_sp + 466 td->td_sigstk.ss_size - sizeof(struct l_sigframe)); 467 } else 468 fp = (struct l_sigframe *)regs->tf_rsp - 1; 469 mtx_unlock(&psp->ps_mtx); 470 PROC_UNLOCK(p); 471 472 /* 473 * Build the argument list for the signal handler. 474 */ 475 if (p->p_sysent->sv_sigtbl) 476 if (sig <= p->p_sysent->sv_sigsize) 477 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 478 479 bzero(&frame, sizeof(frame)); 480 481 frame.sf_handler = PTROUT(catcher); 482 frame.sf_sig = sig; 483 484 bsd_to_linux_sigset(mask, &lmask); 485 486 /* 487 * Build the signal context to be used by sigreturn. 488 */ 489 frame.sf_sc.sc_mask = lmask.__bits[0]; 490 frame.sf_sc.sc_gs = rgs(); 491 frame.sf_sc.sc_fs = rfs(); 492 __asm __volatile("movl %%es,%0" : "=rm" (frame.sf_sc.sc_es)); 493 __asm __volatile("movl %%ds,%0" : "=rm" (frame.sf_sc.sc_ds)); 494 frame.sf_sc.sc_edi = regs->tf_rdi; 495 frame.sf_sc.sc_esi = regs->tf_rsi; 496 frame.sf_sc.sc_ebp = regs->tf_rbp; 497 frame.sf_sc.sc_ebx = regs->tf_rbx; 498 frame.sf_sc.sc_edx = regs->tf_rdx; 499 frame.sf_sc.sc_ecx = regs->tf_rcx; 500 frame.sf_sc.sc_eax = regs->tf_rax; 501 frame.sf_sc.sc_eip = regs->tf_rip; 502 frame.sf_sc.sc_cs = regs->tf_cs; 503 frame.sf_sc.sc_eflags = regs->tf_rflags; 504 frame.sf_sc.sc_esp_at_signal = regs->tf_rsp; 505 frame.sf_sc.sc_ss = regs->tf_ss; 506 frame.sf_sc.sc_err = regs->tf_err; 507 frame.sf_sc.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr; 508 frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code); 509 510 for (i = 0; i < (LINUX_NSIG_WORDS-1); i++) 511 frame.sf_extramask[i] = lmask.__bits[i+1]; 512 513 if (copyout(&frame, fp, sizeof(frame)) != 0) { 514 /* 515 * Process has trashed its stack; give it an illegal 516 * instruction to halt it in its tracks. 517 */ 518 PROC_LOCK(p); 519 sigexit(td, SIGILL); 520 } 521 522 /* 523 * Build context to run handler in. 524 */ 525 regs->tf_rsp = PTROUT(fp); 526 regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode); 527 regs->tf_rflags &= ~PSL_T; 528 regs->tf_cs = _ucode32sel; 529 regs->tf_ss = _udatasel; 530 load_ds(_udatasel); 531 td->td_pcb->pcb_ds = _udatasel; 532 load_es(_udatasel); 533 td->td_pcb->pcb_es = _udatasel; 534 /* leave user %fs and %gs untouched */ 535 PROC_LOCK(p); 536 mtx_lock(&psp->ps_mtx); 537 } 538 539 /* 540 * System call to cleanup state after a signal 541 * has been taken. Reset signal mask and 542 * stack state from context left by sendsig (above). 543 * Return to previous pc and psl as specified by 544 * context left by sendsig. Check carefully to 545 * make sure that the user has not modified the 546 * psl to gain improper privileges or to cause 547 * a machine fault. 548 */ 549 int 550 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) 551 { 552 struct proc *p = td->td_proc; 553 struct l_sigframe frame; 554 struct trapframe *regs; 555 l_sigset_t lmask; 556 int eflags, i; 557 ksiginfo_t ksi; 558 559 regs = td->td_frame; 560 561 #ifdef DEBUG 562 if (ldebug(sigreturn)) 563 printf(ARGS(sigreturn, "%p"), (void *)args->sfp); 564 #endif 565 /* 566 * The trampoline code hands us the sigframe. 567 * It is unsafe to keep track of it ourselves, in the event that a 568 * program jumps out of a signal handler. 569 */ 570 if (copyin(args->sfp, &frame, sizeof(frame)) != 0) 571 return (EFAULT); 572 573 /* 574 * Check for security violations. 575 */ 576 #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) 577 eflags = frame.sf_sc.sc_eflags; 578 /* 579 * XXX do allow users to change the privileged flag PSL_RF. The 580 * cpu sets PSL_RF in tf_eflags for faults. Debuggers should 581 * sometimes set it there too. tf_eflags is kept in the signal 582 * context during signal handling and there is no other place 583 * to remember it, so the PSL_RF bit may be corrupted by the 584 * signal handler without us knowing. Corruption of the PSL_RF 585 * bit at worst causes one more or one less debugger trap, so 586 * allowing it is fairly harmless. 587 */ 588 if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) 589 return(EINVAL); 590 591 /* 592 * Don't allow users to load a valid privileged %cs. Let the 593 * hardware check for invalid selectors, excess privilege in 594 * other selectors, invalid %eip's and invalid %esp's. 595 */ 596 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) 597 if (!CS_SECURE(frame.sf_sc.sc_cs)) { 598 ksiginfo_init_trap(&ksi); 599 ksi.ksi_signo = SIGBUS; 600 ksi.ksi_code = BUS_OBJERR; 601 ksi.ksi_trapno = T_PROTFLT; 602 ksi.ksi_addr = (void *)regs->tf_rip; 603 trapsignal(td, &ksi); 604 return(EINVAL); 605 } 606 607 lmask.__bits[0] = frame.sf_sc.sc_mask; 608 for (i = 0; i < (LINUX_NSIG_WORDS-1); i++) 609 lmask.__bits[i+1] = frame.sf_extramask[i]; 610 PROC_LOCK(p); 611 linux_to_bsd_sigset(&lmask, &td->td_sigmask); 612 SIG_CANTMASK(td->td_sigmask); 613 signotify(td); 614 PROC_UNLOCK(p); 615 616 /* 617 * Restore signal context. 618 */ 619 /* Selectors were restored by the trampoline. */ 620 regs->tf_rdi = frame.sf_sc.sc_edi; 621 regs->tf_rsi = frame.sf_sc.sc_esi; 622 regs->tf_rbp = frame.sf_sc.sc_ebp; 623 regs->tf_rbx = frame.sf_sc.sc_ebx; 624 regs->tf_rdx = frame.sf_sc.sc_edx; 625 regs->tf_rcx = frame.sf_sc.sc_ecx; 626 regs->tf_rax = frame.sf_sc.sc_eax; 627 regs->tf_rip = frame.sf_sc.sc_eip; 628 regs->tf_cs = frame.sf_sc.sc_cs; 629 regs->tf_rflags = eflags; 630 regs->tf_rsp = frame.sf_sc.sc_esp_at_signal; 631 regs->tf_ss = frame.sf_sc.sc_ss; 632 633 return (EJUSTRETURN); 634 } 635 636 /* 637 * System call to cleanup state after a signal 638 * has been taken. Reset signal mask and 639 * stack state from context left by rt_sendsig (above). 640 * Return to previous pc and psl as specified by 641 * context left by sendsig. Check carefully to 642 * make sure that the user has not modified the 643 * psl to gain improper privileges or to cause 644 * a machine fault. 645 */ 646 int 647 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) 648 { 649 struct proc *p = td->td_proc; 650 struct l_ucontext uc; 651 struct l_sigcontext *context; 652 l_stack_t *lss; 653 stack_t ss; 654 struct trapframe *regs; 655 int eflags; 656 ksiginfo_t ksi; 657 658 regs = td->td_frame; 659 660 #ifdef DEBUG 661 if (ldebug(rt_sigreturn)) 662 printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp); 663 #endif 664 /* 665 * The trampoline code hands us the ucontext. 666 * It is unsafe to keep track of it ourselves, in the event that a 667 * program jumps out of a signal handler. 668 */ 669 if (copyin(args->ucp, &uc, sizeof(uc)) != 0) 670 return (EFAULT); 671 672 context = &uc.uc_mcontext; 673 674 /* 675 * Check for security violations. 676 */ 677 #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) 678 eflags = context->sc_eflags; 679 /* 680 * XXX do allow users to change the privileged flag PSL_RF. The 681 * cpu sets PSL_RF in tf_eflags for faults. Debuggers should 682 * sometimes set it there too. tf_eflags is kept in the signal 683 * context during signal handling and there is no other place 684 * to remember it, so the PSL_RF bit may be corrupted by the 685 * signal handler without us knowing. Corruption of the PSL_RF 686 * bit at worst causes one more or one less debugger trap, so 687 * allowing it is fairly harmless. 688 */ 689 if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) 690 return(EINVAL); 691 692 /* 693 * Don't allow users to load a valid privileged %cs. Let the 694 * hardware check for invalid selectors, excess privilege in 695 * other selectors, invalid %eip's and invalid %esp's. 696 */ 697 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) 698 if (!CS_SECURE(context->sc_cs)) { 699 ksiginfo_init_trap(&ksi); 700 ksi.ksi_signo = SIGBUS; 701 ksi.ksi_code = BUS_OBJERR; 702 ksi.ksi_trapno = T_PROTFLT; 703 ksi.ksi_addr = (void *)regs->tf_rip; 704 trapsignal(td, &ksi); 705 return(EINVAL); 706 } 707 708 PROC_LOCK(p); 709 linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask); 710 SIG_CANTMASK(td->td_sigmask); 711 signotify(td); 712 PROC_UNLOCK(p); 713 714 /* 715 * Restore signal context 716 */ 717 /* Selectors were restored by the trampoline. */ 718 regs->tf_rdi = context->sc_edi; 719 regs->tf_rsi = context->sc_esi; 720 regs->tf_rbp = context->sc_ebp; 721 regs->tf_rbx = context->sc_ebx; 722 regs->tf_rdx = context->sc_edx; 723 regs->tf_rcx = context->sc_ecx; 724 regs->tf_rax = context->sc_eax; 725 regs->tf_rip = context->sc_eip; 726 regs->tf_cs = context->sc_cs; 727 regs->tf_rflags = eflags; 728 regs->tf_rsp = context->sc_esp_at_signal; 729 regs->tf_ss = context->sc_ss; 730 731 /* 732 * call sigaltstack & ignore results.. 733 */ 734 lss = &uc.uc_stack; 735 ss.ss_sp = PTRIN(lss->ss_sp); 736 ss.ss_size = lss->ss_size; 737 ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags); 738 739 #ifdef DEBUG 740 if (ldebug(rt_sigreturn)) 741 printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"), 742 ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask); 743 #endif 744 (void)kern_sigaltstack(td, &ss, NULL); 745 746 return (EJUSTRETURN); 747 } 748 749 /* 750 * MPSAFE 751 */ 752 static void 753 linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params) 754 { 755 args[0] = tf->tf_rbx; 756 args[1] = tf->tf_rcx; 757 args[2] = tf->tf_rdx; 758 args[3] = tf->tf_rsi; 759 args[4] = tf->tf_rdi; 760 args[5] = tf->tf_rbp; /* Unconfirmed */ 761 *params = NULL; /* no copyin */ 762 } 763 764 /* 765 * If a linux binary is exec'ing something, try this image activator 766 * first. We override standard shell script execution in order to 767 * be able to modify the interpreter path. We only do this if a linux 768 * binary is doing the exec, so we do not create an EXEC module for it. 769 */ 770 static int exec_linux_imgact_try(struct image_params *iparams); 771 772 static int 773 exec_linux_imgact_try(struct image_params *imgp) 774 { 775 const char *head = (const char *)imgp->image_header; 776 char *rpath; 777 int error = -1, len; 778 779 /* 780 * The interpreter for shell scripts run from a linux binary needs 781 * to be located in /compat/linux if possible in order to recursively 782 * maintain linux path emulation. 783 */ 784 if (((const short *)head)[0] == SHELLMAGIC) { 785 /* 786 * Run our normal shell image activator. If it succeeds attempt 787 * to use the alternate path for the interpreter. If an alternate 788 * path is found, use our stringspace to store it. 789 */ 790 if ((error = exec_shell_imgact(imgp)) == 0) { 791 linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc), 792 imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0); 793 if (rpath != NULL) { 794 len = strlen(rpath) + 1; 795 796 if (len <= MAXSHELLCMDLEN) { 797 memcpy(imgp->interpreter_name, rpath, len); 798 } 799 free(rpath, M_TEMP); 800 } 801 } 802 } 803 return(error); 804 } 805 806 /* 807 * Clear registers on exec 808 * XXX copied from ia32_signal.c. 809 */ 810 static void 811 exec_linux_setregs(td, entry, stack, ps_strings) 812 struct thread *td; 813 u_long entry; 814 u_long stack; 815 u_long ps_strings; 816 { 817 struct trapframe *regs = td->td_frame; 818 struct pcb *pcb = td->td_pcb; 819 820 critical_enter(); 821 wrmsr(MSR_FSBASE, 0); 822 wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */ 823 pcb->pcb_fsbase = 0; 824 pcb->pcb_gsbase = 0; 825 critical_exit(); 826 load_ds(_udatasel); 827 load_es(_udatasel); 828 load_fs(_udatasel); 829 load_gs(_udatasel); 830 pcb->pcb_ds = _udatasel; 831 pcb->pcb_es = _udatasel; 832 pcb->pcb_fs = _udatasel; 833 pcb->pcb_gs = _udatasel; 834 835 bzero((char *)regs, sizeof(struct trapframe)); 836 regs->tf_rip = entry; 837 regs->tf_rsp = stack; 838 regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); 839 regs->tf_ss = _udatasel; 840 regs->tf_cs = _ucode32sel; 841 regs->tf_rbx = ps_strings; 842 load_cr0(rcr0() | CR0_MP | CR0_TS); 843 fpstate_drop(td); 844 845 /* Return via doreti so that we can change to a different %cs */ 846 pcb->pcb_flags |= PCB_FULLCTX; 847 td->td_retval[1] = 0; 848 } 849 850 /* 851 * XXX copied from ia32_sysvec.c. 852 */ 853 static register_t * 854 linux_copyout_strings(struct image_params *imgp) 855 { 856 int argc, envc; 857 u_int32_t *vectp; 858 char *stringp, *destp; 859 u_int32_t *stack_base; 860 struct linux32_ps_strings *arginfo; 861 int sigcodesz; 862 863 /* 864 * Calculate string base and vector table pointers. 865 * Also deal with signal trampoline code for this exec type. 866 */ 867 arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS; 868 sigcodesz = *(imgp->proc->p_sysent->sv_szsigcode); 869 destp = (caddr_t)arginfo - sigcodesz - SPARE_USRSPACE - 870 roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *)); 871 872 /* 873 * install sigcode 874 */ 875 if (sigcodesz) 876 copyout(imgp->proc->p_sysent->sv_sigcode, 877 ((caddr_t)arginfo - sigcodesz), sigcodesz); 878 879 /* 880 * If we have a valid auxargs ptr, prepare some room 881 * on the stack. 882 */ 883 if (imgp->auxargs) { 884 /* 885 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for 886 * lower compatibility. 887 */ 888 imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size 889 : (AT_COUNT * 2); 890 /* 891 * The '+ 2' is for the null pointers at the end of each of 892 * the arg and env vector sets,and imgp->auxarg_size is room 893 * for argument of Runtime loader. 894 */ 895 vectp = (u_int32_t *) (destp - (imgp->args->argc + imgp->args->envc + 2 + 896 imgp->auxarg_size) * sizeof(u_int32_t)); 897 898 } else 899 /* 900 * The '+ 2' is for the null pointers at the end of each of 901 * the arg and env vector sets 902 */ 903 vectp = (u_int32_t *) 904 (destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(u_int32_t)); 905 906 /* 907 * vectp also becomes our initial stack base 908 */ 909 stack_base = vectp; 910 911 stringp = imgp->args->begin_argv; 912 argc = imgp->args->argc; 913 envc = imgp->args->envc; 914 /* 915 * Copy out strings - arguments and environment. 916 */ 917 copyout(stringp, destp, ARG_MAX - imgp->args->stringspace); 918 919 /* 920 * Fill in "ps_strings" struct for ps, w, etc. 921 */ 922 suword32(&arginfo->ps_argvstr, (u_int32_t)(intptr_t)vectp); 923 suword32(&arginfo->ps_nargvstr, argc); 924 925 /* 926 * Fill in argument portion of vector table. 927 */ 928 for (; argc > 0; --argc) { 929 suword32(vectp++, (u_int32_t)(intptr_t)destp); 930 while (*stringp++ != 0) 931 destp++; 932 destp++; 933 } 934 935 /* a null vector table pointer separates the argp's from the envp's */ 936 suword32(vectp++, 0); 937 938 suword32(&arginfo->ps_envstr, (u_int32_t)(intptr_t)vectp); 939 suword32(&arginfo->ps_nenvstr, envc); 940 941 /* 942 * Fill in environment portion of vector table. 943 */ 944 for (; envc > 0; --envc) { 945 suword32(vectp++, (u_int32_t)(intptr_t)destp); 946 while (*stringp++ != 0) 947 destp++; 948 destp++; 949 } 950 951 /* end of vector table is a null pointer */ 952 suword32(vectp, 0); 953 954 return ((register_t *)stack_base); 955 } 956 957 SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0, 958 "32-bit Linux emulation"); 959 960 static u_long linux32_maxdsiz = LINUX32_MAXDSIZ; 961 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW, 962 &linux32_maxdsiz, 0, ""); 963 static u_long linux32_maxssiz = LINUX32_MAXSSIZ; 964 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW, 965 &linux32_maxssiz, 0, ""); 966 static u_long linux32_maxvmem = LINUX32_MAXVMEM; 967 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW, 968 &linux32_maxvmem, 0, ""); 969 970 static void 971 linux32_fixlimit(struct rlimit *rl, int which) 972 { 973 974 switch (which) { 975 case RLIMIT_DATA: 976 if (linux32_maxdsiz != 0) { 977 if (rl->rlim_cur > linux32_maxdsiz) 978 rl->rlim_cur = linux32_maxdsiz; 979 if (rl->rlim_max > linux32_maxdsiz) 980 rl->rlim_max = linux32_maxdsiz; 981 } 982 break; 983 case RLIMIT_STACK: 984 if (linux32_maxssiz != 0) { 985 if (rl->rlim_cur > linux32_maxssiz) 986 rl->rlim_cur = linux32_maxssiz; 987 if (rl->rlim_max > linux32_maxssiz) 988 rl->rlim_max = linux32_maxssiz; 989 } 990 break; 991 case RLIMIT_VMEM: 992 if (linux32_maxvmem != 0) { 993 if (rl->rlim_cur > linux32_maxvmem) 994 rl->rlim_cur = linux32_maxvmem; 995 if (rl->rlim_max > linux32_maxvmem) 996 rl->rlim_max = linux32_maxvmem; 997 } 998 break; 999 } 1000 } 1001 1002 struct sysentvec elf_linux_sysvec = { 1003 LINUX_SYS_MAXSYSCALL, 1004 linux_sysent, 1005 0, 1006 LINUX_SIGTBLSZ, 1007 bsd_to_linux_signal, 1008 ELAST + 1, 1009 bsd_to_linux_errno, 1010 translate_traps, 1011 elf_linux_fixup, 1012 linux_sendsig, 1013 linux_sigcode, 1014 &linux_szsigcode, 1015 linux_prepsyscall, 1016 "Linux ELF32", 1017 elf32_coredump, 1018 exec_linux_imgact_try, 1019 LINUX_MINSIGSTKSZ, 1020 PAGE_SIZE, 1021 VM_MIN_ADDRESS, 1022 LINUX32_USRSTACK, 1023 LINUX32_USRSTACK, 1024 LINUX32_PS_STRINGS, 1025 VM_PROT_ALL, 1026 linux_copyout_strings, 1027 exec_linux_setregs, 1028 linux32_fixlimit, 1029 &linux32_maxssiz, 1030 }; 1031 1032 static Elf32_Brandinfo linux_brand = { 1033 ELFOSABI_LINUX, 1034 EM_386, 1035 "Linux", 1036 "/compat/linux", 1037 "/lib/ld-linux.so.1", 1038 &elf_linux_sysvec, 1039 NULL, 1040 BI_CAN_EXEC_DYN, 1041 }; 1042 1043 static Elf32_Brandinfo linux_glibc2brand = { 1044 ELFOSABI_LINUX, 1045 EM_386, 1046 "Linux", 1047 "/compat/linux", 1048 "/lib/ld-linux.so.2", 1049 &elf_linux_sysvec, 1050 NULL, 1051 BI_CAN_EXEC_DYN, 1052 }; 1053 1054 Elf32_Brandinfo *linux_brandlist[] = { 1055 &linux_brand, 1056 &linux_glibc2brand, 1057 NULL 1058 }; 1059 1060 static int 1061 linux_elf_modevent(module_t mod, int type, void *data) 1062 { 1063 Elf32_Brandinfo **brandinfo; 1064 int error; 1065 struct linux_ioctl_handler **lihp; 1066 struct linux_device_handler **ldhp; 1067 1068 error = 0; 1069 1070 switch(type) { 1071 case MOD_LOAD: 1072 for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; 1073 ++brandinfo) 1074 if (elf32_insert_brand_entry(*brandinfo) < 0) 1075 error = EINVAL; 1076 if (error == 0) { 1077 SET_FOREACH(lihp, linux_ioctl_handler_set) 1078 linux_ioctl_register_handler(*lihp); 1079 SET_FOREACH(ldhp, linux_device_handler_set) 1080 linux_device_register_handler(*ldhp); 1081 mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF); 1082 sx_init(&emul_shared_lock, "emuldata->shared lock"); 1083 LIST_INIT(&futex_list); 1084 sx_init(&futex_sx, "futex protection lock"); 1085 linux_exit_tag = EVENTHANDLER_REGISTER(process_exit, linux_proc_exit, 1086 NULL, 1000); 1087 linux_schedtail_tag = EVENTHANDLER_REGISTER(schedtail, linux_schedtail, 1088 NULL, 1000); 1089 linux_exec_tag = EVENTHANDLER_REGISTER(process_exec, linux_proc_exec, 1090 NULL, 1000); 1091 if (bootverbose) 1092 printf("Linux ELF exec handler installed\n"); 1093 } else 1094 printf("cannot insert Linux ELF brand handler\n"); 1095 break; 1096 case MOD_UNLOAD: 1097 for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; 1098 ++brandinfo) 1099 if (elf32_brand_inuse(*brandinfo)) 1100 error = EBUSY; 1101 if (error == 0) { 1102 for (brandinfo = &linux_brandlist[0]; 1103 *brandinfo != NULL; ++brandinfo) 1104 if (elf32_remove_brand_entry(*brandinfo) < 0) 1105 error = EINVAL; 1106 } 1107 if (error == 0) { 1108 SET_FOREACH(lihp, linux_ioctl_handler_set) 1109 linux_ioctl_unregister_handler(*lihp); 1110 SET_FOREACH(ldhp, linux_device_handler_set) 1111 linux_device_unregister_handler(*ldhp); 1112 mtx_destroy(&emul_lock); 1113 sx_destroy(&emul_shared_lock); 1114 sx_destroy(&futex_sx); 1115 EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag); 1116 EVENTHANDLER_DEREGISTER(schedtail, linux_schedtail_tag); 1117 EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag); 1118 if (bootverbose) 1119 printf("Linux ELF exec handler removed\n"); 1120 } else 1121 printf("Could not deinstall ELF interpreter entry\n"); 1122 break; 1123 default: 1124 return EOPNOTSUPP; 1125 } 1126 return error; 1127 } 1128 1129 static moduledata_t linux_elf_mod = { 1130 "linuxelf", 1131 linux_elf_modevent, 1132 0 1133 }; 1134 1135 DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); 1136