/*-
 * Copyright (c) 2004 Tim J. Robbins
 * Copyright (c) 2003 Peter Wemm
 * Copyright (c) 2002 Doug Rabson
 * Copyright (c) 1998-1999 Andrew Gallatin
 * Copyright (c) 1994-1996 Søren Schmidt
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer
 *    in this position and unchanged.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"

/* This file can only be built when the kernel has COMPAT_IA32 configured. */
#ifndef COMPAT_IA32
#error "Unable to compile Linux-emulator due to missing COMPAT_IA32 option!"
#endif

#define	__ELF_WORD_SIZE	32

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/imgact_elf.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <sys/eventhandler.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_param.h>

#include <machine/cpu.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>

#include <amd64/linux32/linux.h>
#include <amd64/linux32/linux32_proto.h>
#include <compat/linux/linux_emul.h>
#include <compat/linux/linux_mib.h>
#include <compat/linux/linux_signal.h>
#include <compat/linux/linux_util.h>

MODULE_VERSION(linux, 1);

MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");

/*
 * Store one (id, val) auxiliary-vector pair as two consecutive 32-bit
 * words in user space and advance `pos' past them.
 *
 * `pos' is expanded twice with a post-increment, so it must be a plain
 * modifiable pointer variable, never an expression with side effects.
 * The return values of suword32() are not checked.
 */
#define	AUXARGS_ENTRY_32(pos, id, val)	\
	do {				\
		suword32(pos++, id);	\
		suword32(pos++, val);	\
	} while (0)

/*
 * The two bytes '#' (0x23) and '!' (0x21) as they appear when the start
 * of an image header is read as one 16-bit quantity; the value depends
 * on the host byte order.  Used by exec_linux_imgact_try().
 */
#if BYTE_ORDER == LITTLE_ENDIAN
#define	SHELLMAGIC	0x2123	/* #! */
#else
#define	SHELLMAGIC	0x2321
#endif

/*
 * Allow the sendsig functions to use the ldebug() facility
 * even though they are not syscalls themselves. Map them
 * to syscall 0. This is slightly less bogus than using
 * ldebug(sigreturn).
 */
#define	LINUX_SYS_linux_rt_sendsig	0
#define	LINUX_SYS_linux_sendsig		0

/* Signal trampoline image and its size; both are defined outside this file. */
extern char linux_sigcode[];
extern int linux_szsigcode;

extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];

SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
SET_DECLARE(linux_device_handler_set, struct linux_device_handler);

static int	elf_linux_fixup(register_t **stack_base,
		    struct image_params *iparams);
static register_t *linux_copyout_strings(struct image_params *imgp);
static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
		    caddr_t *params);
static void	linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
static void	exec_linux_setregs(struct thread *td, u_long entry,
		    u_long stack, u_long ps_strings);
static void	linux32_fixlimit(struct rlimit *rl, int which);

extern LIST_HEAD(futex_list, futex) futex_list;
extern struct sx futex_sx;

/* Event handler registrations made on MOD_LOAD and undone on MOD_UNLOAD. */
static eventhandler_tag linux_exit_tag;
static eventhandler_tag linux_schedtail_tag;
static eventhandler_tag linux_exec_tag;

/*
 * Linux syscalls return negative errno's, we do positive and map them
 * Reference:
 *   FreeBSD: src/sys/sys/errno.h
 *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
 *            linux-2.6.17.8/include/asm-generic/errno.h
 *
 * The table has ELAST + 1 entries and is installed in elf_linux_sysvec
 * together with that size.
 */
static int bsd_to_linux_errno[ELAST + 1] = {
	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,
	 -72, -67, -71
};

/*
 * FreeBSD -> Linux signal numbers.  The sendsig routines below index
 * this table (through sv_sigtbl) with _SIG_IDX(sig).  Two slots hold 0;
 * presumably those FreeBSD signals have no Linux counterpart -- confirm
 * against the signal headers.
 */
int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
	0, LINUX_SIGUSR1, LINUX_SIGUSR2
};

/*
 * Linux -> FreeBSD signal numbers; the reverse direction of the table
 * above.  It is not referenced in this file (it has external linkage).
 */
int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
	SIGHUP, SIGINT, SIGQUIT, SIGILL,
	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
	SIGIO, SIGURG, SIGSYS
};

/*
 * FreeBSD trap code -> value stored in the Linux sigcontext sc_trapno
 * field.  Codes without an equivalent map to LINUX_T_UNKNOWN.
 */
#define LINUX_T_UNKNOWN  255
static int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/* 0 */
	6,			/* 1  T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/* 2 */
	3,			/* 3  T_BPTFLT */
	LINUX_T_UNKNOWN,	/* 4 */
	LINUX_T_UNKNOWN,	/* 5 */
	16,			/* 6  T_ARITHTRAP */
	254,			/* 7  T_ASTFLT */
	LINUX_T_UNKNOWN,	/* 8 */
	13,			/* 9  T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};
/*
 * Bounds-checked lookup in the table above.  The comparison is made
 * against a size_t, so a negative int `code' converts to a large
 * unsigned value and also yields LINUX_T_UNKNOWN.  `code' is expanded
 * twice.
 */
#define bsd_to_linux_trapcode(code) \
    ((code)<sizeof(_bsd_to_linux_trapcode)/sizeof(*_bsd_to_linux_trapcode)? \
     _bsd_to_linux_trapcode[(code)]: \
     LINUX_T_UNKNOWN)

/*
 * 32-bit layout of the ps_strings area located at LINUX32_PS_STRINGS;
 * it is filled in by linux_copyout_strings().
 */
struct linux32_ps_strings {
	u_int32_t ps_argvstr;	/* first of 0 or more argument strings */
	u_int ps_nargvstr;	/* the number of argument strings */
	u_int32_t ps_envstr;	/* first of 0 or more environment strings */
	u_int ps_nenvstr;	/* the number of environment strings */
};

/*
 * If FreeBSD & Linux have a difference of opinion about what a trap
 * means, deal with it here.
 *
 * Only SIGBUS is ever rewritten: when it was raised by a protection,
 * TSS, double or page fault it becomes SIGSEGV.  Every other
 * (signal, trap_code) combination returns `signal' unchanged.
 *
 * MPSAFE
 */
static int
translate_traps(int signal, int trap_code)
{
	if (signal != SIGBUS)
		return signal;
	switch (trap_code) {
	case T_PROTFLT:
	case T_TSSFLT:
	case T_DOUBLEFLT:
	case T_PAGEFLT:
		return SIGSEGV;
	default:
		return signal;
	}
}

/*
 * Finish the initial user stack of a freshly exec'd image.
 *
 * Writes the ELF auxiliary vector right after the argv and envp pointer
 * arrays (the "+ 2" skips their two NULL terminators), releases
 * imgp->auxargs, stores argc in the 32-bit word just below the old base
 * and returns the new base through *stack_base.  Always returns 0; the
 * results of the user-space stores are not checked.
 */
static int
elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
{
	Elf32_Auxargs *args;
	Elf32_Addr *base;
	Elf32_Addr *pos;

	KASSERT(curthread->td_proc == imgp->proc,
	    ("unsafe elf_linux_fixup(), should be curproc"));
	base = (Elf32_Addr *)*stack_base;
	args = (Elf32_Auxargs *)imgp->auxargs;
	pos = base + (imgp->args->argc + imgp->args->envc + 2);

	if (args->trace)
		AUXARGS_ENTRY_32(pos, AT_DEBUG, 1);
	if (args->execfd != -1)
		AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
	AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
	AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
	AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
	AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
	AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
	AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
	AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
	/* NOTE(review): the "effective" ids are taken from the saved ids. */
	AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
	AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
	AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
	AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
	AUXARGS_ENTRY_32(pos, AT_NULL, 0);

	free(imgp->auxargs, M_TEMP);
	imgp->auxargs = NULL;

	/* argc goes in the word immediately below argv[0]. */
	base--;
	suword32(base, (uint32_t)imgp->args->argc);
	*stack_base = (register_t *)base;
	return 0;
}

extern int _ucodesel, _ucode32sel, _udatasel;
extern unsigned long linux_sznonrtsigcode;

/*
 * Deliver a signal to a handler that was installed with SA_SIGINFO;
 * reached only through linux_sendsig().
 *
 * Builds a struct l_rt_sigframe (handler arguments, siginfo and a
 * ucontext holding the interrupted register state), copies it to the
 * user stack or the alternate signal stack, and points the trapframe at
 * the rt trampoline, which sits linux_sznonrtsigcode bytes into the
 * sigcode placed just below LINUX32_PS_STRINGS.
 *
 * Locking: entered with the proc lock and ps_mtx held (asserted).
 * ps_mtx is dropped once the frame address is known, the proc lock once
 * the td_sigstk fields have been read; both are re-acquired before
 * returning.  If the frame cannot be copied out the process is killed
 * with SIGILL via sigexit().
 */
static void
linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct sigacts *psp;
	struct trapframe *regs;
	struct l_rt_sigframe *fp, frame;
	int oonstack;
	int sig;
	int code;

	sig = ksi->ksi_signo;
	code = ksi->ksi_code;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

#ifdef DEBUG
	if (ldebug(rt_sendsig))
		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
		    catcher, sig, (void*)mask, code);
#endif
	/*
	 * Allocate space for the signal handler context.
	 */
	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
	} else
		fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
	mtx_unlock(&psp->ps_mtx);

	/*
	 * Build the argument list for the signal handler.
	 */
	if (p->p_sysent->sv_sigtbl)
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];

	bzero(&frame, sizeof(frame));

	/* The pointers stored in the frame are user addresses inside *fp. */
	frame.sf_handler = PTROUT(catcher);
	frame.sf_sig = sig;
	frame.sf_siginfo = PTROUT(&fp->sf_si);
	frame.sf_ucontext = PTROUT(&fp->sf_sc);

	/* Fill in POSIX parts */
	frame.sf_si.lsi_signo = sig;
	frame.sf_si.lsi_code = code;
	frame.sf_si.lsi_addr = PTROUT(ksi->ksi_addr);

	/*
	 * Build the signal context to be used by sigreturn.
	 */
	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
	frame.sf_sc.uc_link = 0;		/* XXX ??? */

	frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
	PROC_UNLOCK(p);

	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);

	/* %es and %ds are read straight from the CPU, not the trapframe. */
	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
	frame.sf_sc.uc_mcontext.sc_fs     = rfs();
	__asm __volatile("movl %%es,%0" :
	    "=rm" (frame.sf_sc.uc_mcontext.sc_es));
	__asm __volatile("movl %%ds,%0" :
	    "=rm" (frame.sf_sc.uc_mcontext.sc_ds));
	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_rdi;
	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_rsi;
	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_rbp;
	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_rbx;
	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_rdx;
	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_rcx;
	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_rax;
	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_rip;
	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
	frame.sf_sc.uc_mcontext.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);

#ifdef DEBUG
	if (ldebug(rt_sendsig))
		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
#endif

	if (copyout(&frame, fp, sizeof(frame)) != 0) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
#ifdef DEBUG
		if (ldebug(rt_sendsig))
			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
			    fp, oonstack);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	/*
	 * Build context to run handler in.
	 */
	regs->tf_rsp = PTROUT(fp);
	regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
	    linux_sznonrtsigcode;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucode32sel;
	regs->tf_ss = _udatasel;
	load_ds(_udatasel);
	td->td_pcb->pcb_ds = _udatasel;
	load_es(_udatasel);
	td->td_pcb->pcb_es = _udatasel;
	/* leave user %fs and %gs untouched */
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}


/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * in u. to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 *
 * This is the sv_sendsig entry of elf_linux_sysvec.  Signals whose
 * handler was installed with SA_SIGINFO are handed to
 * linux_rt_sendsig(); all others get a struct l_sigframe and the
 * trampoline at the very start of the sigcode.
 *
 * Locking: entered with the proc lock and ps_mtx held (asserted); both
 * are dropped once the frame address is known and re-acquired before
 * returning.  A failed copyout kills the process with SIGILL.
 */
static void
linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct sigacts *psp;
	struct trapframe *regs;
	struct l_sigframe *fp, frame;
	l_sigset_t lmask;
	int oonstack, i;
	int sig, code;

	sig = ksi->ksi_signo;
	code = ksi->ksi_code;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		linux_rt_sendsig(catcher, ksi, mask);
		return;
	}

	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

#ifdef DEBUG
	if (ldebug(sendsig))
		printf(ARGS(sendsig, "%p, %d, %p, %u"),
		    catcher, sig, (void*)mask, code);
#endif

	/*
	 * Allocate space for the signal handler context.
	 */
	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
	} else
		fp = (struct l_sigframe *)regs->tf_rsp - 1;
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Build the argument list for the signal handler.
	 */
	if (p->p_sysent->sv_sigtbl)
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];

	bzero(&frame, sizeof(frame));

	frame.sf_handler = PTROUT(catcher);
	frame.sf_sig = sig;

	bsd_to_linux_sigset(mask, &lmask);

	/*
	 * Build the signal context to be used by sigreturn.
	 */
	frame.sf_sc.sc_mask   = lmask.__bits[0];
	frame.sf_sc.sc_gs     = rgs();
	frame.sf_sc.sc_fs     = rfs();
	__asm __volatile("movl %%es,%0" : "=rm" (frame.sf_sc.sc_es));
	__asm __volatile("movl %%ds,%0" : "=rm" (frame.sf_sc.sc_ds));
	frame.sf_sc.sc_edi    = regs->tf_rdi;
	frame.sf_sc.sc_esi    = regs->tf_rsi;
	frame.sf_sc.sc_ebp    = regs->tf_rbp;
	frame.sf_sc.sc_ebx    = regs->tf_rbx;
	frame.sf_sc.sc_edx    = regs->tf_rdx;
	frame.sf_sc.sc_ecx    = regs->tf_rcx;
	frame.sf_sc.sc_eax    = regs->tf_rax;
	frame.sf_sc.sc_eip    = regs->tf_rip;
	frame.sf_sc.sc_cs     = regs->tf_cs;
	frame.sf_sc.sc_eflags = regs->tf_rflags;
	frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
	frame.sf_sc.sc_ss     = regs->tf_ss;
	frame.sf_sc.sc_err    = regs->tf_err;
	frame.sf_sc.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);

	/* Mask words beyond the first travel in sf_extramask[]. */
	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
		frame.sf_extramask[i] = lmask.__bits[i+1];

	if (copyout(&frame, fp, sizeof(frame)) != 0) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	/*
	 * Build context to run handler in.
	 */
	regs->tf_rsp = PTROUT(fp);
	regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode);
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucode32sel;
	regs->tf_ss = _udatasel;
	load_ds(_udatasel);
	td->td_pcb->pcb_ds = _udatasel;
	load_es(_udatasel);
	td->td_pcb->pcb_es = _udatasel;
	/* leave user %fs and %gs untouched */
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig. Check carefully to
 * make sure that the user has not modified the
 * psl to gain improper privileges or to cause
 * a machine fault.
 *
 * Returns EFAULT when the frame cannot be read, EINVAL when the eflags
 * or %cs check fails (the %cs failure also posts SIGBUS), and
 * EJUSTRETURN once the trapframe has been reloaded from the frame.
 */
int
linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
{
	struct proc *p = td->td_proc;
	struct l_sigframe frame;
	struct trapframe *regs;
	l_sigset_t lmask;
	int eflags, i;
	ksiginfo_t ksi;

	regs = td->td_frame;

#ifdef DEBUG
	if (ldebug(sigreturn))
		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
#endif
	/*
	 * The trampoline code hands us the sigframe.
	 * It is unsafe to keep track of it ourselves, in the event that a
	 * program jumps out of a signal handler.
	 */
	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
		return (EFAULT);

	/*
	 * Check for security violations.
	 */
#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
	eflags = frame.sf_sc.sc_eflags;
	/*
	 * XXX do allow users to change the privileged flag PSL_RF.  The
	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
	 * sometimes set it there too.  tf_eflags is kept in the signal
	 * context during signal handling and there is no other place
	 * to remember it, so the PSL_RF bit may be corrupted by the
	 * signal handler without us knowing.  Corruption of the PSL_RF
	 * bit at worst causes one more or one less debugger trap, so
	 * allowing it is fairly harmless.
	 */
	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
		return(EINVAL);

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_rip;
		trapsignal(td, &ksi);
		return(EINVAL);
	}

	/* Reassemble the full mask from sc_mask plus sf_extramask[]. */
	lmask.__bits[0] = frame.sf_sc.sc_mask;
	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
		lmask.__bits[i+1] = frame.sf_extramask[i];
	PROC_LOCK(p);
	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
	SIG_CANTMASK(td->td_sigmask);
	signotify(td);
	PROC_UNLOCK(p);

	/*
	 * Restore signal context.
	 */
	/* Selectors were restored by the trampoline. */
	regs->tf_rdi    = frame.sf_sc.sc_edi;
	regs->tf_rsi    = frame.sf_sc.sc_esi;
	regs->tf_rbp    = frame.sf_sc.sc_ebp;
	regs->tf_rbx    = frame.sf_sc.sc_ebx;
	regs->tf_rdx    = frame.sf_sc.sc_edx;
	regs->tf_rcx    = frame.sf_sc.sc_ecx;
	regs->tf_rax    = frame.sf_sc.sc_eax;
	regs->tf_rip    = frame.sf_sc.sc_eip;
	regs->tf_cs     = frame.sf_sc.sc_cs;
	regs->tf_rflags = eflags;
	regs->tf_rsp    = frame.sf_sc.sc_esp_at_signal;
	regs->tf_ss     = frame.sf_sc.sc_ss;

	return (EJUSTRETURN);
}

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by rt_sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.
Check carefully to 641 * make sure that the user has not modified the 642 * psl to gain improper privileges or to cause 643 * a machine fault. 644 */ 645 int 646 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) 647 { 648 struct proc *p = td->td_proc; 649 struct l_ucontext uc; 650 struct l_sigcontext *context; 651 l_stack_t *lss; 652 stack_t ss; 653 struct trapframe *regs; 654 int eflags; 655 ksiginfo_t ksi; 656 657 regs = td->td_frame; 658 659 #ifdef DEBUG 660 if (ldebug(rt_sigreturn)) 661 printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp); 662 #endif 663 /* 664 * The trampoline code hands us the ucontext. 665 * It is unsafe to keep track of it ourselves, in the event that a 666 * program jumps out of a signal handler. 667 */ 668 if (copyin(args->ucp, &uc, sizeof(uc)) != 0) 669 return (EFAULT); 670 671 context = &uc.uc_mcontext; 672 673 /* 674 * Check for security violations. 675 */ 676 #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) 677 eflags = context->sc_eflags; 678 /* 679 * XXX do allow users to change the privileged flag PSL_RF. The 680 * cpu sets PSL_RF in tf_eflags for faults. Debuggers should 681 * sometimes set it there too. tf_eflags is kept in the signal 682 * context during signal handling and there is no other place 683 * to remember it, so the PSL_RF bit may be corrupted by the 684 * signal handler without us knowing. Corruption of the PSL_RF 685 * bit at worst causes one more or one less debugger trap, so 686 * allowing it is fairly harmless. 687 */ 688 if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) 689 return(EINVAL); 690 691 /* 692 * Don't allow users to load a valid privileged %cs. Let the 693 * hardware check for invalid selectors, excess privilege in 694 * other selectors, invalid %eip's and invalid %esp's. 
695 */ 696 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) 697 if (!CS_SECURE(context->sc_cs)) { 698 ksiginfo_init_trap(&ksi); 699 ksi.ksi_signo = SIGBUS; 700 ksi.ksi_code = BUS_OBJERR; 701 ksi.ksi_trapno = T_PROTFLT; 702 ksi.ksi_addr = (void *)regs->tf_rip; 703 trapsignal(td, &ksi); 704 return(EINVAL); 705 } 706 707 PROC_LOCK(p); 708 linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask); 709 SIG_CANTMASK(td->td_sigmask); 710 signotify(td); 711 PROC_UNLOCK(p); 712 713 /* 714 * Restore signal context 715 */ 716 /* Selectors were restored by the trampoline. */ 717 regs->tf_rdi = context->sc_edi; 718 regs->tf_rsi = context->sc_esi; 719 regs->tf_rbp = context->sc_ebp; 720 regs->tf_rbx = context->sc_ebx; 721 regs->tf_rdx = context->sc_edx; 722 regs->tf_rcx = context->sc_ecx; 723 regs->tf_rax = context->sc_eax; 724 regs->tf_rip = context->sc_eip; 725 regs->tf_cs = context->sc_cs; 726 regs->tf_rflags = eflags; 727 regs->tf_rsp = context->sc_esp_at_signal; 728 regs->tf_ss = context->sc_ss; 729 730 /* 731 * call sigaltstack & ignore results.. 732 */ 733 lss = &uc.uc_stack; 734 ss.ss_sp = PTRIN(lss->ss_sp); 735 ss.ss_size = lss->ss_size; 736 ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags); 737 738 #ifdef DEBUG 739 if (ldebug(rt_sigreturn)) 740 printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"), 741 ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask); 742 #endif 743 (void)kern_sigaltstack(td, &ss, NULL); 744 745 return (EJUSTRETURN); 746 } 747 748 /* 749 * MPSAFE 750 */ 751 static void 752 linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params) 753 { 754 args[0] = tf->tf_rbx; 755 args[1] = tf->tf_rcx; 756 args[2] = tf->tf_rdx; 757 args[3] = tf->tf_rsi; 758 args[4] = tf->tf_rdi; 759 args[5] = tf->tf_rbp; /* Unconfirmed */ 760 *params = NULL; /* no copyin */ 761 } 762 763 /* 764 * If a linux binary is exec'ing something, try this image activator 765 * first. 
We override standard shell script execution in order to 766 * be able to modify the interpreter path. We only do this if a linux 767 * binary is doing the exec, so we do not create an EXEC module for it. 768 */ 769 static int exec_linux_imgact_try(struct image_params *iparams); 770 771 static int 772 exec_linux_imgact_try(struct image_params *imgp) 773 { 774 const char *head = (const char *)imgp->image_header; 775 char *rpath; 776 int error = -1, len; 777 778 /* 779 * The interpreter for shell scripts run from a linux binary needs 780 * to be located in /compat/linux if possible in order to recursively 781 * maintain linux path emulation. 782 */ 783 if (((const short *)head)[0] == SHELLMAGIC) { 784 /* 785 * Run our normal shell image activator. If it succeeds attempt 786 * to use the alternate path for the interpreter. If an alternate 787 * path is found, use our stringspace to store it. 788 */ 789 if ((error = exec_shell_imgact(imgp)) == 0) { 790 linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc), 791 imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0); 792 if (rpath != NULL) { 793 len = strlen(rpath) + 1; 794 795 if (len <= MAXSHELLCMDLEN) { 796 memcpy(imgp->interpreter_name, rpath, len); 797 } 798 free(rpath, M_TEMP); 799 } 800 } 801 } 802 return(error); 803 } 804 805 /* 806 * Clear registers on exec 807 * XXX copied from ia32_signal.c. 
808 */ 809 static void 810 exec_linux_setregs(td, entry, stack, ps_strings) 811 struct thread *td; 812 u_long entry; 813 u_long stack; 814 u_long ps_strings; 815 { 816 struct trapframe *regs = td->td_frame; 817 struct pcb *pcb = td->td_pcb; 818 819 critical_enter(); 820 wrmsr(MSR_FSBASE, 0); 821 wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */ 822 pcb->pcb_fsbase = 0; 823 pcb->pcb_gsbase = 0; 824 critical_exit(); 825 load_ds(_udatasel); 826 load_es(_udatasel); 827 load_fs(_udatasel); 828 load_gs(_udatasel); 829 pcb->pcb_ds = _udatasel; 830 pcb->pcb_es = _udatasel; 831 pcb->pcb_fs = _udatasel; 832 pcb->pcb_gs = _udatasel; 833 834 bzero((char *)regs, sizeof(struct trapframe)); 835 regs->tf_rip = entry; 836 regs->tf_rsp = stack; 837 regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); 838 regs->tf_ss = _udatasel; 839 regs->tf_cs = _ucode32sel; 840 regs->tf_rbx = ps_strings; 841 load_cr0(rcr0() | CR0_MP | CR0_TS); 842 fpstate_drop(td); 843 844 /* Return via doreti so that we can change to a different %cs */ 845 pcb->pcb_flags |= PCB_FULLCTX; 846 td->td_retval[1] = 0; 847 } 848 849 /* 850 * XXX copied from ia32_sysvec.c. 851 */ 852 static register_t * 853 linux_copyout_strings(struct image_params *imgp) 854 { 855 int argc, envc; 856 u_int32_t *vectp; 857 char *stringp, *destp; 858 u_int32_t *stack_base; 859 struct linux32_ps_strings *arginfo; 860 int sigcodesz; 861 862 /* 863 * Calculate string base and vector table pointers. 864 * Also deal with signal trampoline code for this exec type. 
865 */ 866 arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS; 867 sigcodesz = *(imgp->proc->p_sysent->sv_szsigcode); 868 destp = (caddr_t)arginfo - sigcodesz - SPARE_USRSPACE - 869 roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *)); 870 871 /* 872 * install sigcode 873 */ 874 if (sigcodesz) 875 copyout(imgp->proc->p_sysent->sv_sigcode, 876 ((caddr_t)arginfo - sigcodesz), sigcodesz); 877 878 /* 879 * If we have a valid auxargs ptr, prepare some room 880 * on the stack. 881 */ 882 if (imgp->auxargs) { 883 /* 884 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for 885 * lower compatibility. 886 */ 887 imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size 888 : (AT_COUNT * 2); 889 /* 890 * The '+ 2' is for the null pointers at the end of each of 891 * the arg and env vector sets,and imgp->auxarg_size is room 892 * for argument of Runtime loader. 893 */ 894 vectp = (u_int32_t *) (destp - (imgp->args->argc + imgp->args->envc + 2 + 895 imgp->auxarg_size) * sizeof(u_int32_t)); 896 897 } else 898 /* 899 * The '+ 2' is for the null pointers at the end of each of 900 * the arg and env vector sets 901 */ 902 vectp = (u_int32_t *) 903 (destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(u_int32_t)); 904 905 /* 906 * vectp also becomes our initial stack base 907 */ 908 stack_base = vectp; 909 910 stringp = imgp->args->begin_argv; 911 argc = imgp->args->argc; 912 envc = imgp->args->envc; 913 /* 914 * Copy out strings - arguments and environment. 915 */ 916 copyout(stringp, destp, ARG_MAX - imgp->args->stringspace); 917 918 /* 919 * Fill in "ps_strings" struct for ps, w, etc. 920 */ 921 suword32(&arginfo->ps_argvstr, (u_int32_t)(intptr_t)vectp); 922 suword32(&arginfo->ps_nargvstr, argc); 923 924 /* 925 * Fill in argument portion of vector table. 
926 */ 927 for (; argc > 0; --argc) { 928 suword32(vectp++, (u_int32_t)(intptr_t)destp); 929 while (*stringp++ != 0) 930 destp++; 931 destp++; 932 } 933 934 /* a null vector table pointer separates the argp's from the envp's */ 935 suword32(vectp++, 0); 936 937 suword32(&arginfo->ps_envstr, (u_int32_t)(intptr_t)vectp); 938 suword32(&arginfo->ps_nenvstr, envc); 939 940 /* 941 * Fill in environment portion of vector table. 942 */ 943 for (; envc > 0; --envc) { 944 suword32(vectp++, (u_int32_t)(intptr_t)destp); 945 while (*stringp++ != 0) 946 destp++; 947 destp++; 948 } 949 950 /* end of vector table is a null pointer */ 951 suword32(vectp, 0); 952 953 return ((register_t *)stack_base); 954 } 955 956 SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0, 957 "32-bit Linux emulation"); 958 959 static u_long linux32_maxdsiz = LINUX32_MAXDSIZ; 960 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW, 961 &linux32_maxdsiz, 0, ""); 962 static u_long linux32_maxssiz = LINUX32_MAXSSIZ; 963 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW, 964 &linux32_maxssiz, 0, ""); 965 static u_long linux32_maxvmem = LINUX32_MAXVMEM; 966 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW, 967 &linux32_maxvmem, 0, ""); 968 969 static void 970 linux32_fixlimit(struct rlimit *rl, int which) 971 { 972 973 switch (which) { 974 case RLIMIT_DATA: 975 if (linux32_maxdsiz != 0) { 976 if (rl->rlim_cur > linux32_maxdsiz) 977 rl->rlim_cur = linux32_maxdsiz; 978 if (rl->rlim_max > linux32_maxdsiz) 979 rl->rlim_max = linux32_maxdsiz; 980 } 981 break; 982 case RLIMIT_STACK: 983 if (linux32_maxssiz != 0) { 984 if (rl->rlim_cur > linux32_maxssiz) 985 rl->rlim_cur = linux32_maxssiz; 986 if (rl->rlim_max > linux32_maxssiz) 987 rl->rlim_max = linux32_maxssiz; 988 } 989 break; 990 case RLIMIT_VMEM: 991 if (linux32_maxvmem != 0) { 992 if (rl->rlim_cur > linux32_maxvmem) 993 rl->rlim_cur = linux32_maxvmem; 994 if (rl->rlim_max > linux32_maxvmem) 995 rl->rlim_max = linux32_maxvmem; 996 
} 997 break; 998 } 999 } 1000 1001 struct sysentvec elf_linux_sysvec = { 1002 LINUX_SYS_MAXSYSCALL, 1003 linux_sysent, 1004 0, 1005 LINUX_SIGTBLSZ, 1006 bsd_to_linux_signal, 1007 ELAST + 1, 1008 bsd_to_linux_errno, 1009 translate_traps, 1010 elf_linux_fixup, 1011 linux_sendsig, 1012 linux_sigcode, 1013 &linux_szsigcode, 1014 linux_prepsyscall, 1015 "Linux ELF32", 1016 elf32_coredump, 1017 exec_linux_imgact_try, 1018 LINUX_MINSIGSTKSZ, 1019 PAGE_SIZE, 1020 VM_MIN_ADDRESS, 1021 LINUX32_USRSTACK, 1022 LINUX32_USRSTACK, 1023 LINUX32_PS_STRINGS, 1024 VM_PROT_ALL, 1025 linux_copyout_strings, 1026 exec_linux_setregs, 1027 linux32_fixlimit, 1028 &linux32_maxssiz, 1029 }; 1030 1031 static Elf32_Brandinfo linux_brand = { 1032 ELFOSABI_LINUX, 1033 EM_386, 1034 "Linux", 1035 "/compat/linux", 1036 "/lib/ld-linux.so.1", 1037 &elf_linux_sysvec, 1038 NULL, 1039 BI_CAN_EXEC_DYN, 1040 }; 1041 1042 static Elf32_Brandinfo linux_glibc2brand = { 1043 ELFOSABI_LINUX, 1044 EM_386, 1045 "Linux", 1046 "/compat/linux", 1047 "/lib/ld-linux.so.2", 1048 &elf_linux_sysvec, 1049 NULL, 1050 BI_CAN_EXEC_DYN, 1051 }; 1052 1053 Elf32_Brandinfo *linux_brandlist[] = { 1054 &linux_brand, 1055 &linux_glibc2brand, 1056 NULL 1057 }; 1058 1059 static int 1060 linux_elf_modevent(module_t mod, int type, void *data) 1061 { 1062 Elf32_Brandinfo **brandinfo; 1063 int error; 1064 struct linux_ioctl_handler **lihp; 1065 struct linux_device_handler **ldhp; 1066 1067 error = 0; 1068 1069 switch(type) { 1070 case MOD_LOAD: 1071 for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; 1072 ++brandinfo) 1073 if (elf32_insert_brand_entry(*brandinfo) < 0) 1074 error = EINVAL; 1075 if (error == 0) { 1076 SET_FOREACH(lihp, linux_ioctl_handler_set) 1077 linux_ioctl_register_handler(*lihp); 1078 SET_FOREACH(ldhp, linux_device_handler_set) 1079 linux_device_register_handler(*ldhp); 1080 mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF); 1081 sx_init(&emul_shared_lock, "emuldata->shared lock"); 1082 LIST_INIT(&futex_list); 
1083 sx_init(&futex_sx, "futex protection lock"); 1084 linux_exit_tag = EVENTHANDLER_REGISTER(process_exit, linux_proc_exit, 1085 NULL, 1000); 1086 linux_schedtail_tag = EVENTHANDLER_REGISTER(schedtail, linux_schedtail, 1087 NULL, 1000); 1088 linux_exec_tag = EVENTHANDLER_REGISTER(process_exec, linux_proc_exec, 1089 NULL, 1000); 1090 if (bootverbose) 1091 printf("Linux ELF exec handler installed\n"); 1092 } else 1093 printf("cannot insert Linux ELF brand handler\n"); 1094 break; 1095 case MOD_UNLOAD: 1096 for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; 1097 ++brandinfo) 1098 if (elf32_brand_inuse(*brandinfo)) 1099 error = EBUSY; 1100 if (error == 0) { 1101 for (brandinfo = &linux_brandlist[0]; 1102 *brandinfo != NULL; ++brandinfo) 1103 if (elf32_remove_brand_entry(*brandinfo) < 0) 1104 error = EINVAL; 1105 } 1106 if (error == 0) { 1107 SET_FOREACH(lihp, linux_ioctl_handler_set) 1108 linux_ioctl_unregister_handler(*lihp); 1109 SET_FOREACH(ldhp, linux_device_handler_set) 1110 linux_device_unregister_handler(*ldhp); 1111 mtx_destroy(&emul_lock); 1112 sx_destroy(&emul_shared_lock); 1113 sx_destroy(&futex_sx); 1114 EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag); 1115 EVENTHANDLER_DEREGISTER(schedtail, linux_schedtail_tag); 1116 EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag); 1117 if (bootverbose) 1118 printf("Linux ELF exec handler removed\n"); 1119 } else 1120 printf("Could not deinstall ELF interpreter entry\n"); 1121 break; 1122 default: 1123 return EOPNOTSUPP; 1124 } 1125 return error; 1126 } 1127 1128 static moduledata_t linux_elf_mod = { 1129 "linuxelf", 1130 linux_elf_modevent, 1131 0 1132 }; 1133 1134 DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); 1135