1 /*- 2 * Copyright (c) 2004 Tim J. Robbins 3 * Copyright (c) 2003 Peter Wemm 4 * Copyright (c) 2002 Doug Rabson 5 * Copyright (c) 1998-1999 Andrew Gallatin 6 * Copyright (c) 1994-1996 Søren Schmidt 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer 14 * in this position and unchanged. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. The name of the author may not be used to endorse or promote products 19 * derived from this software without specific prior written permission 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 22 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 23 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 24 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 26 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 30 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 #include <sys/cdefs.h> 34 __FBSDID("$FreeBSD$"); 35 #include "opt_compat.h" 36 37 #ifndef COMPAT_IA32 38 #error "Unable to compile Linux-emulator due to missing COMPAT_IA32 option!" 
#endif

#define __ELF_WORD_SIZE 32

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/imgact_elf.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_param.h>

#include <machine/cpu.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>

#include <amd64/linux32/linux.h>
#include <amd64/linux32/linux32_proto.h>
#include <compat/linux/linux_mib.h>
#include <compat/linux/linux_signal.h>
#include <compat/linux/linux_util.h>

MODULE_VERSION(linux, 1);

MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");

/*
 * Store one (id, val) auxiliary-vector pair as two consecutive 32-bit
 * words through suword32(), advancing pos past both.  pos is evaluated
 * (and incremented) twice, so it must be a plain lvalue.
 */
#define AUXARGS_ENTRY_32(pos, id, val) \
	do { \
		suword32(pos++, id); \
		suword32(pos++, val); \
	} while (0)

/*
 * The two characters "#!" as they appear when the start of an image
 * header is read as a single short in host byte order (see
 * exec_linux_imgact_try()).
 */
#if BYTE_ORDER == LITTLE_ENDIAN
#define SHELLMAGIC 0x2123 /* #! */
#else
#define SHELLMAGIC 0x2321
#endif

/*
 * Allow the sendsig functions to use the ldebug() facility
 * even though they are not syscalls themselves. Map them
 * to syscall 0. This is slightly less bogus than using
 * ldebug(sigreturn).
 */
#define LINUX_SYS_linux_rt_sendsig 0
#define LINUX_SYS_linux_sendsig 0

/* Signal trampoline code and its size; both are defined outside this file. */
extern char linux_sigcode[];
extern int linux_szsigcode;

extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];

SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
SET_DECLARE(linux_device_handler_set, struct linux_device_handler);

/* Forward declarations for the routines referenced from elf_linux_sysvec. */
static int elf_linux_fixup(register_t **stack_base,
    struct image_params *iparams);
static register_t *linux_copyout_strings(struct image_params *imgp);
static void linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
    caddr_t *params);
static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
static void exec_linux_setregs(struct thread *td, u_long entry,
    u_long stack, u_long ps_strings);
static void linux32_fixlimits(struct proc *p);

/*
 * Linux syscalls return negative errno's, we do positive and map them
 *
 * The table has ELAST + 1 entries; entry N holds the (already negated)
 * Linux value used for FreeBSD errno N.
 */
static int bsd_to_linux_errno[ELAST + 1] = {
	-0, -1, -2, -3, -4, -5, -6, -7, -8, -9,
	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
	-116, -66, -6, -6, -6, -6, -6, -37, -38, -9,
	-6, -6, -43, -42, -75, -6, -84
};

/*
 * FreeBSD -> Linux signal numbers.  The sendsig routines in this file
 * index the table with _SIG_IDX(sig) through the sysentvec's sv_sigtbl.
 * Two slots hold 0 rather than a LINUX_SIG* constant; presumably those
 * FreeBSD signals have no Linux counterpart -- confirm before relying
 * on it.
 */
int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
	0, LINUX_SIGUSR1, LINUX_SIGUSR2
};

/*
 * Linux -> FreeBSD signal numbers; the reverse direction of
 * bsd_to_linux_signal.  NOTE(review): presumably indexed by
 * (Linux signal number - 1), mirroring the other table -- the lookup
 * itself is not in this file.
 */
int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
	SIGHUP, SIGINT, SIGQUIT, SIGILL,
	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
	SIGIO, SIGURG, SIGSYS
};

/*
 * Translation from FreeBSD trap numbers (the T_* values named in the
 * per-entry comments) to the trap numbers stored in a Linux signal
 * context.  Slots without a counterpart hold LINUX_T_UNKNOWN.
 */
#define LINUX_T_UNKNOWN 255
static int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN, /* 0 */
	6, /* 1 T_PRIVINFLT */
	LINUX_T_UNKNOWN, /* 2 */
	3, /* 3 T_BPTFLT */
	LINUX_T_UNKNOWN, /* 4 */
	LINUX_T_UNKNOWN, /* 5 */
	16, /* 6 T_ARITHTRAP */
	254, /* 7 T_ASTFLT */
	LINUX_T_UNKNOWN, /* 8 */
	13, /* 9 T_PROTFLT */
	1, /* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN, /* 11 */
	14, /* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN, /* 13 */
	17, /* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN, /* 15 */
	LINUX_T_UNKNOWN, /* 16 */
	LINUX_T_UNKNOWN, /* 17 */
	0, /* 18 T_DIVIDE */
	2, /* 19 T_NMI */
	4, /* 20 T_OFLOW */
	5, /* 21 T_BOUND */
	7, /* 22 T_DNA */
	8, /* 23 T_DOUBLEFLT */
	9, /* 24 T_FPOPFLT */
	10, /* 25 T_TSSFLT */
	11, /* 26 T_SEGNPFLT */
	12, /* 27 T_STKFLT */
	18, /* 28 T_MCHK */
	19, /* 29 T_XMMFLT */
	15 /* 30 T_RESERVED */
};
/*
 * Bounds-checked table lookup.  The index is compared against a size_t
 * element count, so a negative int argument converts to a large unsigned
 * value and yields LINUX_T_UNKNOWN as well.  The argument is evaluated
 * twice.
 */
#define bsd_to_linux_trapcode(code) \
	((code)<sizeof(_bsd_to_linux_trapcode)/sizeof(*_bsd_to_linux_trapcode)? \
	 _bsd_to_linux_trapcode[(code)]: \
	 LINUX_T_UNKNOWN)

/*
 * 32-bit layout of the ps_strings block that linux_copyout_strings()
 * fills in at LINUX32_PS_STRINGS; the string pointers are stored as
 * 32-bit user addresses.
 */
struct linux32_ps_strings {
	u_int32_t ps_argvstr; /* first of 0 or more argument strings */
	u_int ps_nargvstr; /* the number of argument strings */
	u_int32_t ps_envstr; /* first of 0 or more environment strings */
	u_int ps_nenvstr; /* the number of environment strings */
};

/*
 * If FreeBSD & Linux have a difference of opinion about what a trap
 * means, deal with it here.
210 * 211 * MPSAFE 212 */ 213 static int 214 translate_traps(int signal, int trap_code) 215 { 216 if (signal != SIGBUS) 217 return signal; 218 switch (trap_code) { 219 case T_PROTFLT: 220 case T_TSSFLT: 221 case T_DOUBLEFLT: 222 case T_PAGEFLT: 223 return SIGSEGV; 224 default: 225 return signal; 226 } 227 } 228 229 static int 230 elf_linux_fixup(register_t **stack_base, struct image_params *imgp) 231 { 232 Elf32_Auxargs *args; 233 Elf32_Addr *base; 234 Elf32_Addr *pos; 235 236 KASSERT(curthread->td_proc == imgp->proc && 237 (curthread->td_proc->p_flag & P_SA) == 0, 238 ("unsafe elf_linux_fixup(), should be curproc")); 239 base = (Elf32_Addr *)*stack_base; 240 args = (Elf32_Auxargs *)imgp->auxargs; 241 pos = base + (imgp->args->argc + imgp->args->envc + 2); 242 243 if (args->trace) 244 AUXARGS_ENTRY_32(pos, AT_DEBUG, 1); 245 if (args->execfd != -1) 246 AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd); 247 AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr); 248 AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent); 249 AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum); 250 AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz); 251 AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags); 252 AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry); 253 AUXARGS_ENTRY_32(pos, AT_BASE, args->base); 254 AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); 255 AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); 256 AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); 257 AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); 258 AUXARGS_ENTRY_32(pos, AT_NULL, 0); 259 260 free(imgp->auxargs, M_TEMP); 261 imgp->auxargs = NULL; 262 263 base--; 264 suword32(base, (uint32_t)imgp->args->argc); 265 *stack_base = (register_t *)base; 266 return 0; 267 } 268 269 extern int _ucodesel, _ucode32sel, _udatasel; 270 extern unsigned long linux_sznonrtsigcode; 271 272 static void 273 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 274 { 275 struct thread *td = curthread; 276 struct proc *p = 
td->td_proc; 277 struct sigacts *psp; 278 struct trapframe *regs; 279 struct l_rt_sigframe *fp, frame; 280 int oonstack; 281 int sig; 282 int code; 283 284 sig = ksi->ksi_signo; 285 code = ksi->ksi_code; 286 PROC_LOCK_ASSERT(p, MA_OWNED); 287 psp = p->p_sigacts; 288 mtx_assert(&psp->ps_mtx, MA_OWNED); 289 regs = td->td_frame; 290 oonstack = sigonstack(regs->tf_rsp); 291 292 #ifdef DEBUG 293 if (ldebug(rt_sendsig)) 294 printf(ARGS(rt_sendsig, "%p, %d, %p, %u"), 295 catcher, sig, (void*)mask, code); 296 #endif 297 /* 298 * Allocate space for the signal handler context. 299 */ 300 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && 301 SIGISMEMBER(psp->ps_sigonstack, sig)) { 302 fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp + 303 td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe)); 304 } else 305 fp = (struct l_rt_sigframe *)regs->tf_rsp - 1; 306 mtx_unlock(&psp->ps_mtx); 307 308 /* 309 * Build the argument list for the signal handler. 310 */ 311 if (p->p_sysent->sv_sigtbl) 312 if (sig <= p->p_sysent->sv_sigsize) 313 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 314 315 bzero(&frame, sizeof(frame)); 316 317 frame.sf_handler = PTROUT(catcher); 318 frame.sf_sig = sig; 319 frame.sf_siginfo = PTROUT(&fp->sf_si); 320 frame.sf_ucontext = PTROUT(&fp->sf_sc); 321 322 /* Fill in POSIX parts */ 323 frame.sf_si.lsi_signo = sig; 324 frame.sf_si.lsi_code = code; 325 frame.sf_si.lsi_addr = PTROUT(ksi->ksi_addr); 326 327 /* 328 * Build the signal context to be used by sigreturn. 329 */ 330 frame.sf_sc.uc_flags = 0; /* XXX ??? */ 331 frame.sf_sc.uc_link = 0; /* XXX ??? */ 332 333 frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); 334 frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; 335 frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 336 ? ((oonstack) ? 
LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; 337 PROC_UNLOCK(p); 338 339 bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask); 340 341 frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__bits[0]; 342 frame.sf_sc.uc_mcontext.sc_gs = rgs(); 343 frame.sf_sc.uc_mcontext.sc_fs = rfs(); 344 __asm __volatile("movl %%es,%0" : 345 "=rm" (frame.sf_sc.uc_mcontext.sc_es)); 346 __asm __volatile("movl %%ds,%0" : 347 "=rm" (frame.sf_sc.uc_mcontext.sc_ds)); 348 frame.sf_sc.uc_mcontext.sc_edi = regs->tf_rdi; 349 frame.sf_sc.uc_mcontext.sc_esi = regs->tf_rsi; 350 frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_rbp; 351 frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_rbx; 352 frame.sf_sc.uc_mcontext.sc_edx = regs->tf_rdx; 353 frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_rcx; 354 frame.sf_sc.uc_mcontext.sc_eax = regs->tf_rax; 355 frame.sf_sc.uc_mcontext.sc_eip = regs->tf_rip; 356 frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; 357 frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags; 358 frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp; 359 frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss; 360 frame.sf_sc.uc_mcontext.sc_err = regs->tf_err; 361 frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); 362 363 #ifdef DEBUG 364 if (ldebug(rt_sendsig)) 365 printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"), 366 frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp, 367 td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask); 368 #endif 369 370 if (copyout(&frame, fp, sizeof(frame)) != 0) { 371 /* 372 * Process has trashed its stack; give it an illegal 373 * instruction to halt it in its tracks. 374 */ 375 #ifdef DEBUG 376 if (ldebug(rt_sendsig)) 377 printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"), 378 fp, oonstack); 379 #endif 380 PROC_LOCK(p); 381 sigexit(td, SIGILL); 382 } 383 384 /* 385 * Build context to run handler in. 
386 */ 387 regs->tf_rsp = PTROUT(fp); 388 regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode) + 389 linux_sznonrtsigcode; 390 regs->tf_rflags &= ~PSL_T; 391 regs->tf_cs = _ucode32sel; 392 regs->tf_ss = _udatasel; 393 load_ds(_udatasel); 394 td->td_pcb->pcb_ds = _udatasel; 395 load_es(_udatasel); 396 td->td_pcb->pcb_es = _udatasel; 397 PROC_LOCK(p); 398 mtx_lock(&psp->ps_mtx); 399 } 400 401 402 /* 403 * Send an interrupt to process. 404 * 405 * Stack is set up to allow sigcode stored 406 * in u. to call routine, followed by kcall 407 * to sigreturn routine below. After sigreturn 408 * resets the signal mask, the stack, and the 409 * frame pointer, it returns to the user 410 * specified pc, psl. 411 */ 412 static void 413 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 414 { 415 struct thread *td = curthread; 416 struct proc *p = td->td_proc; 417 struct sigacts *psp; 418 struct trapframe *regs; 419 struct l_sigframe *fp, frame; 420 l_sigset_t lmask; 421 int oonstack, i; 422 int sig, code; 423 424 sig = ksi->ksi_signo; 425 code = ksi->ksi_code; 426 PROC_LOCK_ASSERT(p, MA_OWNED); 427 psp = p->p_sigacts; 428 mtx_assert(&psp->ps_mtx, MA_OWNED); 429 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 430 /* Signal handler installed with SA_SIGINFO. */ 431 linux_rt_sendsig(catcher, ksi, mask); 432 return; 433 } 434 435 regs = td->td_frame; 436 oonstack = sigonstack(regs->tf_rsp); 437 438 #ifdef DEBUG 439 if (ldebug(sendsig)) 440 printf(ARGS(sendsig, "%p, %d, %p, %u"), 441 catcher, sig, (void*)mask, code); 442 #endif 443 444 /* 445 * Allocate space for the signal handler context. 
446 */ 447 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && 448 SIGISMEMBER(psp->ps_sigonstack, sig)) { 449 fp = (struct l_sigframe *)(td->td_sigstk.ss_sp + 450 td->td_sigstk.ss_size - sizeof(struct l_sigframe)); 451 } else 452 fp = (struct l_sigframe *)regs->tf_rsp - 1; 453 mtx_unlock(&psp->ps_mtx); 454 PROC_UNLOCK(p); 455 456 /* 457 * Build the argument list for the signal handler. 458 */ 459 if (p->p_sysent->sv_sigtbl) 460 if (sig <= p->p_sysent->sv_sigsize) 461 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 462 463 bzero(&frame, sizeof(frame)); 464 465 frame.sf_handler = PTROUT(catcher); 466 frame.sf_sig = sig; 467 468 bsd_to_linux_sigset(mask, &lmask); 469 470 /* 471 * Build the signal context to be used by sigreturn. 472 */ 473 frame.sf_sc.sc_mask = lmask.__bits[0]; 474 frame.sf_sc.sc_gs = rgs(); 475 frame.sf_sc.sc_fs = rfs(); 476 __asm __volatile("movl %%es,%0" : "=rm" (frame.sf_sc.sc_es)); 477 __asm __volatile("movl %%ds,%0" : "=rm" (frame.sf_sc.sc_ds)); 478 frame.sf_sc.sc_edi = regs->tf_rdi; 479 frame.sf_sc.sc_esi = regs->tf_rsi; 480 frame.sf_sc.sc_ebp = regs->tf_rbp; 481 frame.sf_sc.sc_ebx = regs->tf_rbx; 482 frame.sf_sc.sc_edx = regs->tf_rdx; 483 frame.sf_sc.sc_ecx = regs->tf_rcx; 484 frame.sf_sc.sc_eax = regs->tf_rax; 485 frame.sf_sc.sc_eip = regs->tf_rip; 486 frame.sf_sc.sc_cs = regs->tf_cs; 487 frame.sf_sc.sc_eflags = regs->tf_rflags; 488 frame.sf_sc.sc_esp_at_signal = regs->tf_rsp; 489 frame.sf_sc.sc_ss = regs->tf_ss; 490 frame.sf_sc.sc_err = regs->tf_err; 491 frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code); 492 493 for (i = 0; i < (LINUX_NSIG_WORDS-1); i++) 494 frame.sf_extramask[i] = lmask.__bits[i+1]; 495 496 if (copyout(&frame, fp, sizeof(frame)) != 0) { 497 /* 498 * Process has trashed its stack; give it an illegal 499 * instruction to halt it in its tracks. 500 */ 501 PROC_LOCK(p); 502 sigexit(td, SIGILL); 503 } 504 505 /* 506 * Build context to run handler in. 
507 */ 508 regs->tf_rsp = PTROUT(fp); 509 regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode); 510 regs->tf_rflags &= ~PSL_T; 511 regs->tf_cs = _ucode32sel; 512 regs->tf_ss = _udatasel; 513 load_ds(_udatasel); 514 td->td_pcb->pcb_ds = _udatasel; 515 load_es(_udatasel); 516 td->td_pcb->pcb_es = _udatasel; 517 PROC_LOCK(p); 518 mtx_lock(&psp->ps_mtx); 519 } 520 521 /* 522 * System call to cleanup state after a signal 523 * has been taken. Reset signal mask and 524 * stack state from context left by sendsig (above). 525 * Return to previous pc and psl as specified by 526 * context left by sendsig. Check carefully to 527 * make sure that the user has not modified the 528 * psl to gain improper privileges or to cause 529 * a machine fault. 530 */ 531 int 532 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) 533 { 534 struct proc *p = td->td_proc; 535 struct l_sigframe frame; 536 struct trapframe *regs; 537 l_sigset_t lmask; 538 int eflags, i; 539 ksiginfo_t ksi; 540 541 regs = td->td_frame; 542 543 #ifdef DEBUG 544 if (ldebug(sigreturn)) 545 printf(ARGS(sigreturn, "%p"), (void *)args->sfp); 546 #endif 547 /* 548 * The trampoline code hands us the sigframe. 549 * It is unsafe to keep track of it ourselves, in the event that a 550 * program jumps out of a signal handler. 551 */ 552 if (copyin(args->sfp, &frame, sizeof(frame)) != 0) 553 return (EFAULT); 554 555 /* 556 * Check for security violations. 557 */ 558 #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) 559 eflags = frame.sf_sc.sc_eflags; 560 /* 561 * XXX do allow users to change the privileged flag PSL_RF. The 562 * cpu sets PSL_RF in tf_eflags for faults. Debuggers should 563 * sometimes set it there too. tf_eflags is kept in the signal 564 * context during signal handling and there is no other place 565 * to remember it, so the PSL_RF bit may be corrupted by the 566 * signal handler without us knowing. 
Corruption of the PSL_RF 567 * bit at worst causes one more or one less debugger trap, so 568 * allowing it is fairly harmless. 569 */ 570 if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) 571 return(EINVAL); 572 573 /* 574 * Don't allow users to load a valid privileged %cs. Let the 575 * hardware check for invalid selectors, excess privilege in 576 * other selectors, invalid %eip's and invalid %esp's. 577 */ 578 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) 579 if (!CS_SECURE(frame.sf_sc.sc_cs)) { 580 ksiginfo_init_trap(&ksi); 581 ksi.ksi_signo = SIGBUS; 582 ksi.ksi_code = BUS_OBJERR; 583 ksi.ksi_trapno = T_PROTFLT; 584 ksi.ksi_addr = (void *)regs->tf_rip; 585 trapsignal(td, &ksi); 586 return(EINVAL); 587 } 588 589 lmask.__bits[0] = frame.sf_sc.sc_mask; 590 for (i = 0; i < (LINUX_NSIG_WORDS-1); i++) 591 lmask.__bits[i+1] = frame.sf_extramask[i]; 592 PROC_LOCK(p); 593 linux_to_bsd_sigset(&lmask, &td->td_sigmask); 594 SIG_CANTMASK(td->td_sigmask); 595 signotify(td); 596 PROC_UNLOCK(p); 597 598 /* 599 * Restore signal context. 600 */ 601 /* Selectors were restored by the trampoline. */ 602 regs->tf_rdi = frame.sf_sc.sc_edi; 603 regs->tf_rsi = frame.sf_sc.sc_esi; 604 regs->tf_rbp = frame.sf_sc.sc_ebp; 605 regs->tf_rbx = frame.sf_sc.sc_ebx; 606 regs->tf_rdx = frame.sf_sc.sc_edx; 607 regs->tf_rcx = frame.sf_sc.sc_ecx; 608 regs->tf_rax = frame.sf_sc.sc_eax; 609 regs->tf_rip = frame.sf_sc.sc_eip; 610 regs->tf_cs = frame.sf_sc.sc_cs; 611 regs->tf_rflags = eflags; 612 regs->tf_rsp = frame.sf_sc.sc_esp_at_signal; 613 regs->tf_ss = frame.sf_sc.sc_ss; 614 615 return (EJUSTRETURN); 616 } 617 618 /* 619 * System call to cleanup state after a signal 620 * has been taken. Reset signal mask and 621 * stack state from context left by rt_sendsig (above). 622 * Return to previous pc and psl as specified by 623 * context left by sendsig. 
 * Check carefully to
 * make sure that the user has not modified the
 * psl to gain improper privileges or to cause
 * a machine fault.
 *
 * Returns EFAULT if the ucontext cannot be read, EINVAL if the saved
 * eflags or %cs fail the checks, and EJUSTRETURN once the trapframe
 * has been restored.
 */
int
linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
{
	struct proc *p = td->td_proc;
	struct l_ucontext uc;
	struct l_sigcontext *context;
	l_stack_t *lss;
	stack_t ss;
	struct trapframe *regs;
	int eflags;
	ksiginfo_t ksi;

	regs = td->td_frame;

#ifdef DEBUG
	if (ldebug(rt_sigreturn))
		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
#endif
	/*
	 * The trampoline code hands us the ucontext.
	 * It is unsafe to keep track of it ourselves, in the event that a
	 * program jumps out of a signal handler.
	 */
	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
		return (EFAULT);

	context = &uc.uc_mcontext;

	/*
	 * Check for security violations.
	 */
#define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
	eflags = context->sc_eflags;
	/*
	 * XXX do allow users to change the privileged flag PSL_RF. The
	 * cpu sets PSL_RF in tf_eflags for faults. Debuggers should
	 * sometimes set it there too. tf_eflags is kept in the signal
	 * context during signal handling and there is no other place
	 * to remember it, so the PSL_RF bit may be corrupted by the
	 * signal handler without us knowing. Corruption of the PSL_RF
	 * bit at worst causes one more or one less debugger trap, so
	 * allowing it is fairly harmless.
	 */
	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
		return(EINVAL);

	/*
	 * Don't allow users to load a valid privileged %cs. Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
	if (!CS_SECURE(context->sc_cs)) {
		/* Report the bad selector to the process as a SIGBUS. */
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_rip;
		trapsignal(td, &ksi);
		return(EINVAL);
	}

	/* Install the saved signal mask; SIG_CANTMASK() is applied to it. */
	PROC_LOCK(p);
	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
	SIG_CANTMASK(td->td_sigmask);
	signotify(td);
	PROC_UNLOCK(p);

	/*
	 * Restore signal context
	 */
	/* Selectors were restored by the trampoline. */
	regs->tf_rdi = context->sc_edi;
	regs->tf_rsi = context->sc_esi;
	regs->tf_rbp = context->sc_ebp;
	regs->tf_rbx = context->sc_ebx;
	regs->tf_rdx = context->sc_edx;
	regs->tf_rcx = context->sc_ecx;
	regs->tf_rax = context->sc_eax;
	regs->tf_rip = context->sc_eip;
	regs->tf_cs = context->sc_cs;
	regs->tf_rflags = eflags;
	regs->tf_rsp = context->sc_esp_at_signal;
	regs->tf_ss = context->sc_ss;

	/*
	 * call sigaltstack & ignore results..
	 */
	lss = &uc.uc_stack;
	ss.ss_sp = PTRIN(lss->ss_sp);
	ss.ss_size = lss->ss_size;
	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);

#ifdef DEBUG
	if (ldebug(rt_sigreturn))
		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
#endif
	(void)kern_sigaltstack(td, &ss, NULL);

	return (EJUSTRETURN);
}

/*
 * Load the six syscall arguments from the registers %rbx, %rcx, %rdx,
 * %rsi, %rdi and %rbp of the trapframe, in that order, and clear
 * *params so that no argument copyin is done.  The code argument is
 * not touched.
 *
 * MPSAFE
 */
static void
linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
{
	args[0] = tf->tf_rbx;
	args[1] = tf->tf_rcx;
	args[2] = tf->tf_rdx;
	args[3] = tf->tf_rsi;
	args[4] = tf->tf_rdi;
	args[5] = tf->tf_rbp; /* Unconfirmed */
	*params = NULL; /* no copyin */
}

/*
 * If a linux binary is exec'ing something, try this image activator
 * first.
We override standard shell script execution in order to 749 * be able to modify the interpreter path. We only do this if a linux 750 * binary is doing the exec, so we do not create an EXEC module for it. 751 */ 752 static int exec_linux_imgact_try(struct image_params *iparams); 753 754 static int 755 exec_linux_imgact_try(struct image_params *imgp) 756 { 757 const char *head = (const char *)imgp->image_header; 758 char *rpath; 759 int error = -1, len; 760 761 /* 762 * The interpreter for shell scripts run from a linux binary needs 763 * to be located in /compat/linux if possible in order to recursively 764 * maintain linux path emulation. 765 */ 766 if (((const short *)head)[0] == SHELLMAGIC) { 767 /* 768 * Run our normal shell image activator. If it succeeds attempt 769 * to use the alternate path for the interpreter. If an alternate 770 * path is found, use our stringspace to store it. 771 */ 772 if ((error = exec_shell_imgact(imgp)) == 0) { 773 linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc), 774 imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0); 775 if (rpath != NULL) { 776 len = strlen(rpath) + 1; 777 778 if (len <= MAXSHELLCMDLEN) { 779 memcpy(imgp->interpreter_name, rpath, len); 780 } 781 free(rpath, M_TEMP); 782 } 783 } 784 } 785 return(error); 786 } 787 788 /* 789 * Clear registers on exec 790 * XXX copied from ia32_signal.c. 
791 */ 792 static void 793 exec_linux_setregs(td, entry, stack, ps_strings) 794 struct thread *td; 795 u_long entry; 796 u_long stack; 797 u_long ps_strings; 798 { 799 struct trapframe *regs = td->td_frame; 800 struct pcb *pcb = td->td_pcb; 801 802 wrmsr(MSR_FSBASE, 0); 803 wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */ 804 pcb->pcb_fsbase = 0; 805 pcb->pcb_gsbase = 0; 806 load_ds(_udatasel); 807 load_es(_udatasel); 808 load_fs(_udatasel); 809 load_gs(0); 810 pcb->pcb_ds = _udatasel; 811 pcb->pcb_es = _udatasel; 812 pcb->pcb_fs = _udatasel; 813 pcb->pcb_gs = 0; 814 815 bzero((char *)regs, sizeof(struct trapframe)); 816 regs->tf_rip = entry; 817 regs->tf_rsp = stack; 818 regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); 819 regs->tf_ss = _udatasel; 820 regs->tf_cs = _ucode32sel; 821 regs->tf_rbx = ps_strings; 822 load_cr0(rcr0() | CR0_MP | CR0_TS); 823 fpstate_drop(td); 824 825 /* Return via doreti so that we can change to a different %cs */ 826 pcb->pcb_flags |= PCB_FULLCTX; 827 td->td_retval[1] = 0; 828 } 829 830 /* 831 * XXX copied from ia32_sysvec.c. 832 */ 833 static register_t * 834 linux_copyout_strings(struct image_params *imgp) 835 { 836 int argc, envc; 837 u_int32_t *vectp; 838 char *stringp, *destp; 839 u_int32_t *stack_base; 840 struct linux32_ps_strings *arginfo; 841 int sigcodesz; 842 843 /* 844 * Calculate string base and vector table pointers. 845 * Also deal with signal trampoline code for this exec type. 846 */ 847 arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS; 848 sigcodesz = *(imgp->proc->p_sysent->sv_szsigcode); 849 destp = (caddr_t)arginfo - sigcodesz - SPARE_USRSPACE - 850 roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *)); 851 852 /* 853 * install sigcode 854 */ 855 if (sigcodesz) 856 copyout(imgp->proc->p_sysent->sv_sigcode, 857 ((caddr_t)arginfo - sigcodesz), szsigcode); 858 859 /* 860 * If we have a valid auxargs ptr, prepare some room 861 * on the stack. 
862 */ 863 if (imgp->auxargs) { 864 /* 865 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for 866 * lower compatibility. 867 */ 868 imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size 869 : (AT_COUNT * 2); 870 /* 871 * The '+ 2' is for the null pointers at the end of each of 872 * the arg and env vector sets,and imgp->auxarg_size is room 873 * for argument of Runtime loader. 874 */ 875 vectp = (u_int32_t *) (destp - (imgp->args->argc + imgp->args->envc + 2 + 876 imgp->auxarg_size) * sizeof(u_int32_t)); 877 878 } else 879 /* 880 * The '+ 2' is for the null pointers at the end of each of 881 * the arg and env vector sets 882 */ 883 vectp = (u_int32_t *) 884 (destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(u_int32_t)); 885 886 /* 887 * vectp also becomes our initial stack base 888 */ 889 stack_base = vectp; 890 891 stringp = imgp->args->begin_argv; 892 argc = imgp->args->argc; 893 envc = imgp->args->envc; 894 /* 895 * Copy out strings - arguments and environment. 896 */ 897 copyout(stringp, destp, ARG_MAX - imgp->args->stringspace); 898 899 /* 900 * Fill in "ps_strings" struct for ps, w, etc. 901 */ 902 suword32(&arginfo->ps_argvstr, (u_int32_t)(intptr_t)vectp); 903 suword32(&arginfo->ps_nargvstr, argc); 904 905 /* 906 * Fill in argument portion of vector table. 907 */ 908 for (; argc > 0; --argc) { 909 suword32(vectp++, (u_int32_t)(intptr_t)destp); 910 while (*stringp++ != 0) 911 destp++; 912 destp++; 913 } 914 915 /* a null vector table pointer separates the argp's from the envp's */ 916 suword32(vectp++, 0); 917 918 suword32(&arginfo->ps_envstr, (u_int32_t)(intptr_t)vectp); 919 suword32(&arginfo->ps_nenvstr, envc); 920 921 /* 922 * Fill in environment portion of vector table. 
923 */ 924 for (; envc > 0; --envc) { 925 suword32(vectp++, (u_int32_t)(intptr_t)destp); 926 while (*stringp++ != 0) 927 destp++; 928 destp++; 929 } 930 931 /* end of vector table is a null pointer */ 932 suword32(vectp, 0); 933 934 return ((register_t *)stack_base); 935 } 936 937 SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0, 938 "32-bit Linux emulation"); 939 940 static u_long linux32_maxdsiz = LINUX32_MAXDSIZ; 941 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW, 942 &linux32_maxdsiz, 0, ""); 943 static u_long linux32_maxssiz = LINUX32_MAXSSIZ; 944 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW, 945 &linux32_maxssiz, 0, ""); 946 static u_long linux32_maxvmem = LINUX32_MAXVMEM; 947 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW, 948 &linux32_maxvmem, 0, ""); 949 950 /* 951 * XXX copied from ia32_sysvec.c. 952 */ 953 static void 954 linux32_fixlimits(struct proc *p) 955 { 956 struct plimit *oldlim, *newlim; 957 958 if (linux32_maxdsiz == 0 && linux32_maxssiz == 0 && 959 linux32_maxvmem == 0) 960 return; 961 newlim = lim_alloc(); 962 PROC_LOCK(p); 963 oldlim = p->p_limit; 964 lim_copy(newlim, oldlim); 965 if (linux32_maxdsiz != 0) { 966 if (newlim->pl_rlimit[RLIMIT_DATA].rlim_cur > linux32_maxdsiz) 967 newlim->pl_rlimit[RLIMIT_DATA].rlim_cur = linux32_maxdsiz; 968 if (newlim->pl_rlimit[RLIMIT_DATA].rlim_max > linux32_maxdsiz) 969 newlim->pl_rlimit[RLIMIT_DATA].rlim_max = linux32_maxdsiz; 970 } 971 if (linux32_maxssiz != 0) { 972 if (newlim->pl_rlimit[RLIMIT_STACK].rlim_cur > linux32_maxssiz) 973 newlim->pl_rlimit[RLIMIT_STACK].rlim_cur = linux32_maxssiz; 974 if (newlim->pl_rlimit[RLIMIT_STACK].rlim_max > linux32_maxssiz) 975 newlim->pl_rlimit[RLIMIT_STACK].rlim_max = linux32_maxssiz; 976 } 977 if (linux32_maxvmem != 0) { 978 if (newlim->pl_rlimit[RLIMIT_VMEM].rlim_cur > linux32_maxvmem) 979 newlim->pl_rlimit[RLIMIT_VMEM].rlim_cur = linux32_maxvmem; 980 if (newlim->pl_rlimit[RLIMIT_VMEM].rlim_max > linux32_maxvmem) 981 
newlim->pl_rlimit[RLIMIT_VMEM].rlim_max = linux32_maxvmem; 982 } 983 p->p_limit = newlim; 984 PROC_UNLOCK(p); 985 lim_free(oldlim); 986 } 987 988 struct sysentvec elf_linux_sysvec = { 989 LINUX_SYS_MAXSYSCALL, 990 linux_sysent, 991 0xff, 992 LINUX_SIGTBLSZ, 993 bsd_to_linux_signal, 994 ELAST + 1, 995 bsd_to_linux_errno, 996 translate_traps, 997 elf_linux_fixup, 998 linux_sendsig, 999 linux_sigcode, 1000 &linux_szsigcode, 1001 linux_prepsyscall, 1002 "Linux ELF32", 1003 elf32_coredump, 1004 exec_linux_imgact_try, 1005 LINUX_MINSIGSTKSZ, 1006 PAGE_SIZE, 1007 VM_MIN_ADDRESS, 1008 LINUX32_USRSTACK, 1009 LINUX32_USRSTACK, 1010 LINUX32_PS_STRINGS, 1011 VM_PROT_ALL, 1012 linux_copyout_strings, 1013 exec_linux_setregs, 1014 linux32_fixlimits 1015 }; 1016 1017 static Elf32_Brandinfo linux_brand = { 1018 ELFOSABI_LINUX, 1019 EM_386, 1020 "Linux", 1021 "/compat/linux", 1022 "/lib/ld-linux.so.1", 1023 &elf_linux_sysvec, 1024 NULL, 1025 BI_CAN_EXEC_DYN, 1026 }; 1027 1028 static Elf32_Brandinfo linux_glibc2brand = { 1029 ELFOSABI_LINUX, 1030 EM_386, 1031 "Linux", 1032 "/compat/linux", 1033 "/lib/ld-linux.so.2", 1034 &elf_linux_sysvec, 1035 NULL, 1036 BI_CAN_EXEC_DYN, 1037 }; 1038 1039 Elf32_Brandinfo *linux_brandlist[] = { 1040 &linux_brand, 1041 &linux_glibc2brand, 1042 NULL 1043 }; 1044 1045 static int 1046 linux_elf_modevent(module_t mod, int type, void *data) 1047 { 1048 Elf32_Brandinfo **brandinfo; 1049 int error; 1050 struct linux_ioctl_handler **lihp; 1051 struct linux_device_handler **ldhp; 1052 1053 error = 0; 1054 1055 switch(type) { 1056 case MOD_LOAD: 1057 for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; 1058 ++brandinfo) 1059 if (elf32_insert_brand_entry(*brandinfo) < 0) 1060 error = EINVAL; 1061 if (error == 0) { 1062 SET_FOREACH(lihp, linux_ioctl_handler_set) 1063 linux_ioctl_register_handler(*lihp); 1064 SET_FOREACH(ldhp, linux_device_handler_set) 1065 linux_device_register_handler(*ldhp); 1066 if (bootverbose) 1067 printf("Linux ELF exec handler 
installed\n"); 1068 } else 1069 printf("cannot insert Linux ELF brand handler\n"); 1070 break; 1071 case MOD_UNLOAD: 1072 for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; 1073 ++brandinfo) 1074 if (elf32_brand_inuse(*brandinfo)) 1075 error = EBUSY; 1076 if (error == 0) { 1077 for (brandinfo = &linux_brandlist[0]; 1078 *brandinfo != NULL; ++brandinfo) 1079 if (elf32_remove_brand_entry(*brandinfo) < 0) 1080 error = EINVAL; 1081 } 1082 if (error == 0) { 1083 SET_FOREACH(lihp, linux_ioctl_handler_set) 1084 linux_ioctl_unregister_handler(*lihp); 1085 SET_FOREACH(ldhp, linux_device_handler_set) 1086 linux_device_unregister_handler(*ldhp); 1087 if (bootverbose) 1088 printf("Linux ELF exec handler removed\n"); 1089 } else 1090 printf("Could not deinstall ELF interpreter entry\n"); 1091 break; 1092 default: 1093 break; 1094 } 1095 return error; 1096 } 1097 1098 static moduledata_t linux_elf_mod = { 1099 "linuxelf", 1100 linux_elf_modevent, 1101 0 1102 }; 1103 1104 DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); 1105