1 /*- 2 * Copyright (c) 2004 Tim J. Robbins 3 * Copyright (c) 2003 Peter Wemm 4 * Copyright (c) 2002 Doug Rabson 5 * Copyright (c) 1998-1999 Andrew Gallatin 6 * Copyright (c) 1994-1996 S�ren Schmidt 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer 14 * in this position and unchanged. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. The name of the author may not be used to endorse or promote products 19 * derived from this software without specific prior written permission 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 22 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 23 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 24 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 26 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 30 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 #include <sys/cdefs.h> 34 __FBSDID("$FreeBSD$"); 35 #include "opt_compat.h" 36 37 #ifndef COMPAT_IA32 38 #error "Unable to compile Linux-emulator due to missing COMPAT_IA32 option!" 39 #endif 40 41 #define __ELF_WORD_SIZE 32 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/exec.h> 46 #include <sys/imgact.h> 47 #include <sys/imgact_elf.h> 48 #include <sys/kernel.h> 49 #include <sys/lock.h> 50 #include <sys/malloc.h> 51 #include <sys/module.h> 52 #include <sys/mutex.h> 53 #include <sys/proc.h> 54 #include <sys/resourcevar.h> 55 #include <sys/signalvar.h> 56 #include <sys/sysctl.h> 57 #include <sys/syscallsubr.h> 58 #include <sys/sysent.h> 59 #include <sys/sysproto.h> 60 #include <sys/vnode.h> 61 62 #include <vm/vm.h> 63 #include <vm/pmap.h> 64 #include <vm/vm_extern.h> 65 #include <vm/vm_map.h> 66 #include <vm/vm_object.h> 67 #include <vm/vm_page.h> 68 #include <vm/vm_param.h> 69 70 #include <machine/cpu.h> 71 #include <machine/md_var.h> 72 #include <machine/pcb.h> 73 #include <machine/specialreg.h> 74 75 #include <amd64/linux32/linux.h> 76 #include <amd64/linux32/linux32_proto.h> 77 #include <compat/linux/linux_mib.h> 78 #include <compat/linux/linux_signal.h> 79 #include <compat/linux/linux_util.h> 80 81 MODULE_VERSION(linux, 1); 82 83 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures"); 84 85 #define AUXARGS_ENTRY_32(pos, id, val) \ 86 do { \ 87 suword32(pos++, id); \ 88 suword32(pos++, val); \ 89 } while (0) 90 91 #if BYTE_ORDER == LITTLE_ENDIAN 92 #define SHELLMAGIC 0x2123 /* #! */ 93 #else 94 #define SHELLMAGIC 0x2321 95 #endif 96 97 /* 98 * Allow the sendsig functions to use the ldebug() facility 99 * even though they are not syscalls themselves. Map them 100 * to syscall 0. This is slightly less bogus than using 101 * ldebug(sigreturn). 102 */ 103 #define LINUX_SYS_linux_rt_sendsig 0 104 #define LINUX_SYS_linux_sendsig 0 105 106 extern char linux_sigcode[]; 107 extern int linux_szsigcode; 108 109 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; 110 111 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); 112 113 static int elf_linux_fixup(register_t **stack_base, 114 struct image_params *iparams); 115 static register_t *linux_copyout_strings(struct image_params *imgp); 116 static void linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, 117 caddr_t *params); 118 static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask); 119 static void exec_linux_setregs(struct thread *td, u_long entry, 120 u_long stack, u_long ps_strings); 121 static void linux32_fixlimits(struct proc *p); 122 123 /* 124 * Linux syscalls return negative errno's, we do positive and map them 125 */ 126 static int bsd_to_linux_errno[ELAST + 1] = { 127 -0, -1, -2, -3, -4, -5, -6, -7, -8, -9, 128 -10, -35, -12, -13, -14, -15, -16, -17, -18, -19, 129 -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, 130 -30, -31, -32, -33, -34, -11,-115,-114, -88, -89, 131 -90, -91, -92, -93, -94, -95, -96, -97, -98, -99, 132 -100,-101,-102,-103,-104,-105,-106,-107,-108,-109, 133 -110,-111, -40, -36,-112,-113, -39, -11, -87,-122, 134 -116, -66, -6, -6, -6, -6, -6, -37, -38, -9, 135 -6, -6, -43, -42, -75, -6, -84 136 }; 137 138 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = { 139 LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL, 140 LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE, 141 LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS, 142 LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG, 143 LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD, 144 LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU, 145 LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH, 146 0, LINUX_SIGUSR1, LINUX_SIGUSR2 147 }; 148 149 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = { 150 SIGHUP, SIGINT, SIGQUIT, SIGILL, 151 SIGTRAP, SIGABRT, SIGBUS, SIGFPE, 152 SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2, 153 SIGPIPE, SIGALRM, SIGTERM, SIGBUS, 154 SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP, 155 SIGTTIN, SIGTTOU, SIGURG, SIGXCPU, 156 SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH, 157 SIGIO, SIGURG, SIGSYS 158 }; 159 160 #define LINUX_T_UNKNOWN 255 161 static int _bsd_to_linux_trapcode[] = { 162 LINUX_T_UNKNOWN, /* 0 */ 163 6, /* 1 T_PRIVINFLT */ 164 LINUX_T_UNKNOWN, /* 2 */ 165 3, /* 3 T_BPTFLT */ 166 LINUX_T_UNKNOWN, /* 4 */ 167 LINUX_T_UNKNOWN, /* 5 */ 168 16, /* 6 T_ARITHTRAP */ 169 254, /* 7 T_ASTFLT */ 170 LINUX_T_UNKNOWN, /* 8 */ 171 13, /* 9 T_PROTFLT */ 172 1, /* 10 T_TRCTRAP */ 173 LINUX_T_UNKNOWN, /* 11 */ 174 14, /* 12 T_PAGEFLT */ 175 LINUX_T_UNKNOWN, /* 13 */ 176 17, /* 14 T_ALIGNFLT */ 177 LINUX_T_UNKNOWN, /* 15 */ 178 LINUX_T_UNKNOWN, /* 16 */ 179 LINUX_T_UNKNOWN, /* 17 */ 180 0, /* 18 T_DIVIDE */ 181 2, /* 19 T_NMI */ 182 4, /* 20 T_OFLOW */ 183 5, /* 21 T_BOUND */ 184 7, /* 22 T_DNA */ 185 8, /* 23 T_DOUBLEFLT */ 186 9, /* 24 T_FPOPFLT */ 187 10, /* 25 T_TSSFLT */ 188 11, /* 26 T_SEGNPFLT */ 189 12, /* 27 T_STKFLT */ 190 18, /* 28 T_MCHK */ 191 19, /* 29 T_XMMFLT */ 192 15 /* 30 T_RESERVED */ 193 }; 194 #define bsd_to_linux_trapcode(code) \ 195 ((code)<sizeof(_bsd_to_linux_trapcode)/sizeof(*_bsd_to_linux_trapcode)? \ 196 _bsd_to_linux_trapcode[(code)]: \ 197 LINUX_T_UNKNOWN) 198 199 struct linux32_ps_strings { 200 u_int32_t ps_argvstr; /* first of 0 or more argument strings */ 201 u_int ps_nargvstr; /* the number of argument strings */ 202 u_int32_t ps_envstr; /* first of 0 or more environment strings */ 203 u_int ps_nenvstr; /* the number of environment strings */ 204 }; 205 206 /* 207 * If FreeBSD & Linux have a difference of opinion about what a trap 208 * means, deal with it here. 209 * 210 * MPSAFE 211 */ 212 static int 213 translate_traps(int signal, int trap_code) 214 { 215 if (signal != SIGBUS) 216 return signal; 217 switch (trap_code) { 218 case T_PROTFLT: 219 case T_TSSFLT: 220 case T_DOUBLEFLT: 221 case T_PAGEFLT: 222 return SIGSEGV; 223 default: 224 return signal; 225 } 226 } 227 228 static int 229 elf_linux_fixup(register_t **stack_base, struct image_params *imgp) 230 { 231 Elf32_Auxargs *args; 232 Elf32_Addr *base; 233 Elf32_Addr *pos; 234 235 KASSERT(curthread->td_proc == imgp->proc && 236 (curthread->td_proc->p_flag & P_SA) == 0, 237 ("unsafe elf_linux_fixup(), should be curproc")); 238 base = (Elf32_Addr *)*stack_base; 239 args = (Elf32_Auxargs *)imgp->auxargs; 240 pos = base + (imgp->args->argc + imgp->args->envc + 2); 241 242 if (args->trace) 243 AUXARGS_ENTRY_32(pos, AT_DEBUG, 1); 244 if (args->execfd != -1) 245 AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd); 246 AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr); 247 AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent); 248 AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum); 249 AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz); 250 AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags); 251 AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry); 252 AUXARGS_ENTRY_32(pos, AT_BASE, args->base); 253 AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); 254 AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); 255 AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); 256 AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); 257 AUXARGS_ENTRY_32(pos, AT_NULL, 0); 258 259 free(imgp->auxargs, M_TEMP); 260 imgp->auxargs = NULL; 261 262 base--; 263 suword32(base, (uint32_t)imgp->args->argc); 264 *stack_base = (register_t *)base; 265 return 0; 266 } 267 268 extern int _ucodesel, _ucode32sel, _udatasel; 269 extern unsigned long linux_sznonrtsigcode; 270 271 static void 272 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 273 { 274 struct thread *td = curthread; 275 struct proc *p = td->td_proc; 276 struct sigacts *psp; 277 struct trapframe *regs; 278 struct l_rt_sigframe *fp, frame; 279 int oonstack; 280 int sig; 281 int code; 282 283 sig = ksi->ksi_signo; 284 code = ksi->ksi_code; 285 PROC_LOCK_ASSERT(p, MA_OWNED); 286 psp = p->p_sigacts; 287 mtx_assert(&psp->ps_mtx, MA_OWNED); 288 regs = td->td_frame; 289 oonstack = sigonstack(regs->tf_rsp); 290 291 #ifdef DEBUG 292 if (ldebug(rt_sendsig)) 293 printf(ARGS(rt_sendsig, "%p, %d, %p, %u"), 294 catcher, sig, (void*)mask, code); 295 #endif 296 /* 297 * Allocate space for the signal handler context. 298 */ 299 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && 300 SIGISMEMBER(psp->ps_sigonstack, sig)) { 301 fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp + 302 td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe)); 303 } else 304 fp = (struct l_rt_sigframe *)regs->tf_rsp - 1; 305 mtx_unlock(&psp->ps_mtx); 306 307 /* 308 * Build the argument list for the signal handler. 309 */ 310 if (p->p_sysent->sv_sigtbl) 311 if (sig <= p->p_sysent->sv_sigsize) 312 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 313 314 bzero(&frame, sizeof(frame)); 315 316 frame.sf_handler = PTROUT(catcher); 317 frame.sf_sig = sig; 318 frame.sf_siginfo = PTROUT(&fp->sf_si); 319 frame.sf_ucontext = PTROUT(&fp->sf_sc); 320 321 /* Fill in POSIX parts */ 322 frame.sf_si.lsi_signo = sig; 323 frame.sf_si.lsi_code = code; 324 frame.sf_si.lsi_addr = PTROUT(ksi->ksi_addr); 325 326 /* 327 * Build the signal context to be used by sigreturn. 328 */ 329 frame.sf_sc.uc_flags = 0; /* XXX ??? */ 330 frame.sf_sc.uc_link = 0; /* XXX ??? */ 331 332 frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); 333 frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; 334 frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 335 ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; 336 PROC_UNLOCK(p); 337 338 bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask); 339 340 frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__bits[0]; 341 frame.sf_sc.uc_mcontext.sc_gs = rgs(); 342 frame.sf_sc.uc_mcontext.sc_fs = rfs(); 343 __asm __volatile("movl %%es,%0" : 344 "=rm" (frame.sf_sc.uc_mcontext.sc_es)); 345 __asm __volatile("movl %%ds,%0" : 346 "=rm" (frame.sf_sc.uc_mcontext.sc_ds)); 347 frame.sf_sc.uc_mcontext.sc_edi = regs->tf_rdi; 348 frame.sf_sc.uc_mcontext.sc_esi = regs->tf_rsi; 349 frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_rbp; 350 frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_rbx; 351 frame.sf_sc.uc_mcontext.sc_edx = regs->tf_rdx; 352 frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_rcx; 353 frame.sf_sc.uc_mcontext.sc_eax = regs->tf_rax; 354 frame.sf_sc.uc_mcontext.sc_eip = regs->tf_rip; 355 frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; 356 frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags; 357 frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp; 358 frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss; 359 frame.sf_sc.uc_mcontext.sc_err = regs->tf_err; 360 frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); 361 362 #ifdef DEBUG 363 if (ldebug(rt_sendsig)) 364 printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"), 365 frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp, 366 td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask); 367 #endif 368 369 if (copyout(&frame, fp, sizeof(frame)) != 0) { 370 /* 371 * Process has trashed its stack; give it an illegal 372 * instruction to halt it in its tracks. 373 */ 374 #ifdef DEBUG 375 if (ldebug(rt_sendsig)) 376 printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"), 377 fp, oonstack); 378 #endif 379 PROC_LOCK(p); 380 sigexit(td, SIGILL); 381 } 382 383 /* 384 * Build context to run handler in. 385 */ 386 regs->tf_rsp = PTROUT(fp); 387 regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode) + 388 linux_sznonrtsigcode; 389 regs->tf_rflags &= ~PSL_T; 390 regs->tf_cs = _ucode32sel; 391 regs->tf_ss = _udatasel; 392 load_ds(_udatasel); 393 td->td_pcb->pcb_ds = _udatasel; 394 load_es(_udatasel); 395 td->td_pcb->pcb_es = _udatasel; 396 PROC_LOCK(p); 397 mtx_lock(&psp->ps_mtx); 398 } 399 400 401 /* 402 * Send an interrupt to process. 403 * 404 * Stack is set up to allow sigcode stored 405 * in u. to call routine, followed by kcall 406 * to sigreturn routine below. After sigreturn 407 * resets the signal mask, the stack, and the 408 * frame pointer, it returns to the user 409 * specified pc, psl. 410 */ 411 static void 412 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 413 { 414 struct thread *td = curthread; 415 struct proc *p = td->td_proc; 416 struct sigacts *psp; 417 struct trapframe *regs; 418 struct l_sigframe *fp, frame; 419 l_sigset_t lmask; 420 int oonstack, i; 421 int sig, code; 422 423 sig = ksi->ksi_signo; 424 code = ksi->ksi_code; 425 PROC_LOCK_ASSERT(p, MA_OWNED); 426 psp = p->p_sigacts; 427 mtx_assert(&psp->ps_mtx, MA_OWNED); 428 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 429 /* Signal handler installed with SA_SIGINFO. */ 430 linux_rt_sendsig(catcher, ksi, mask); 431 return; 432 } 433 434 regs = td->td_frame; 435 oonstack = sigonstack(regs->tf_rsp); 436 437 #ifdef DEBUG 438 if (ldebug(sendsig)) 439 printf(ARGS(sendsig, "%p, %d, %p, %u"), 440 catcher, sig, (void*)mask, code); 441 #endif 442 443 /* 444 * Allocate space for the signal handler context. 445 */ 446 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && 447 SIGISMEMBER(psp->ps_sigonstack, sig)) { 448 fp = (struct l_sigframe *)(td->td_sigstk.ss_sp + 449 td->td_sigstk.ss_size - sizeof(struct l_sigframe)); 450 } else 451 fp = (struct l_sigframe *)regs->tf_rsp - 1; 452 mtx_unlock(&psp->ps_mtx); 453 PROC_UNLOCK(p); 454 455 /* 456 * Build the argument list for the signal handler. 457 */ 458 if (p->p_sysent->sv_sigtbl) 459 if (sig <= p->p_sysent->sv_sigsize) 460 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 461 462 bzero(&frame, sizeof(frame)); 463 464 frame.sf_handler = PTROUT(catcher); 465 frame.sf_sig = sig; 466 467 bsd_to_linux_sigset(mask, &lmask); 468 469 /* 470 * Build the signal context to be used by sigreturn. 471 */ 472 frame.sf_sc.sc_mask = lmask.__bits[0]; 473 frame.sf_sc.sc_gs = rgs(); 474 frame.sf_sc.sc_fs = rfs(); 475 __asm __volatile("movl %%es,%0" : "=rm" (frame.sf_sc.sc_es)); 476 __asm __volatile("movl %%ds,%0" : "=rm" (frame.sf_sc.sc_ds)); 477 frame.sf_sc.sc_edi = regs->tf_rdi; 478 frame.sf_sc.sc_esi = regs->tf_rsi; 479 frame.sf_sc.sc_ebp = regs->tf_rbp; 480 frame.sf_sc.sc_ebx = regs->tf_rbx; 481 frame.sf_sc.sc_edx = regs->tf_rdx; 482 frame.sf_sc.sc_ecx = regs->tf_rcx; 483 frame.sf_sc.sc_eax = regs->tf_rax; 484 frame.sf_sc.sc_eip = regs->tf_rip; 485 frame.sf_sc.sc_cs = regs->tf_cs; 486 frame.sf_sc.sc_eflags = regs->tf_rflags; 487 frame.sf_sc.sc_esp_at_signal = regs->tf_rsp; 488 frame.sf_sc.sc_ss = regs->tf_ss; 489 frame.sf_sc.sc_err = regs->tf_err; 490 frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code); 491 492 for (i = 0; i < (LINUX_NSIG_WORDS-1); i++) 493 frame.sf_extramask[i] = lmask.__bits[i+1]; 494 495 if (copyout(&frame, fp, sizeof(frame)) != 0) { 496 /* 497 * Process has trashed its stack; give it an illegal 498 * instruction to halt it in its tracks. 499 */ 500 PROC_LOCK(p); 501 sigexit(td, SIGILL); 502 } 503 504 /* 505 * Build context to run handler in. 506 */ 507 regs->tf_rsp = PTROUT(fp); 508 regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode); 509 regs->tf_rflags &= ~PSL_T; 510 regs->tf_cs = _ucode32sel; 511 regs->tf_ss = _udatasel; 512 load_ds(_udatasel); 513 td->td_pcb->pcb_ds = _udatasel; 514 load_es(_udatasel); 515 td->td_pcb->pcb_es = _udatasel; 516 PROC_LOCK(p); 517 mtx_lock(&psp->ps_mtx); 518 } 519 520 /* 521 * System call to cleanup state after a signal 522 * has been taken. Reset signal mask and 523 * stack state from context left by sendsig (above). 524 * Return to previous pc and psl as specified by 525 * context left by sendsig. Check carefully to 526 * make sure that the user has not modified the 527 * psl to gain improper privileges or to cause 528 * a machine fault. 529 */ 530 int 531 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) 532 { 533 struct proc *p = td->td_proc; 534 struct l_sigframe frame; 535 struct trapframe *regs; 536 l_sigset_t lmask; 537 int eflags, i; 538 ksiginfo_t ksi; 539 540 regs = td->td_frame; 541 542 #ifdef DEBUG 543 if (ldebug(sigreturn)) 544 printf(ARGS(sigreturn, "%p"), (void *)args->sfp); 545 #endif 546 /* 547 * The trampoline code hands us the sigframe. 548 * It is unsafe to keep track of it ourselves, in the event that a 549 * program jumps out of a signal handler. 550 */ 551 if (copyin(args->sfp, &frame, sizeof(frame)) != 0) 552 return (EFAULT); 553 554 /* 555 * Check for security violations. 556 */ 557 #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) 558 eflags = frame.sf_sc.sc_eflags; 559 /* 560 * XXX do allow users to change the privileged flag PSL_RF. The 561 * cpu sets PSL_RF in tf_eflags for faults. Debuggers should 562 * sometimes set it there too. tf_eflags is kept in the signal 563 * context during signal handling and there is no other place 564 * to remember it, so the PSL_RF bit may be corrupted by the 565 * signal handler without us knowing. Corruption of the PSL_RF 566 * bit at worst causes one more or one less debugger trap, so 567 * allowing it is fairly harmless. 568 */ 569 if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) 570 return(EINVAL); 571 572 /* 573 * Don't allow users to load a valid privileged %cs. Let the 574 * hardware check for invalid selectors, excess privilege in 575 * other selectors, invalid %eip's and invalid %esp's. 576 */ 577 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) 578 if (!CS_SECURE(frame.sf_sc.sc_cs)) { 579 ksiginfo_init_trap(&ksi); 580 ksi.ksi_signo = SIGBUS; 581 ksi.ksi_code = BUS_OBJERR; 582 ksi.ksi_trapno = T_PROTFLT; 583 ksi.ksi_addr = (void *)regs->tf_rip; 584 trapsignal(td, &ksi); 585 return(EINVAL); 586 } 587 588 lmask.__bits[0] = frame.sf_sc.sc_mask; 589 for (i = 0; i < (LINUX_NSIG_WORDS-1); i++) 590 lmask.__bits[i+1] = frame.sf_extramask[i]; 591 PROC_LOCK(p); 592 linux_to_bsd_sigset(&lmask, &td->td_sigmask); 593 SIG_CANTMASK(td->td_sigmask); 594 signotify(td); 595 PROC_UNLOCK(p); 596 597 /* 598 * Restore signal context. 599 */ 600 /* Selectors were restored by the trampoline. */ 601 regs->tf_rdi = frame.sf_sc.sc_edi; 602 regs->tf_rsi = frame.sf_sc.sc_esi; 603 regs->tf_rbp = frame.sf_sc.sc_ebp; 604 regs->tf_rbx = frame.sf_sc.sc_ebx; 605 regs->tf_rdx = frame.sf_sc.sc_edx; 606 regs->tf_rcx = frame.sf_sc.sc_ecx; 607 regs->tf_rax = frame.sf_sc.sc_eax; 608 regs->tf_rip = frame.sf_sc.sc_eip; 609 regs->tf_cs = frame.sf_sc.sc_cs; 610 regs->tf_rflags = eflags; 611 regs->tf_rsp = frame.sf_sc.sc_esp_at_signal; 612 regs->tf_ss = frame.sf_sc.sc_ss; 613 614 return (EJUSTRETURN); 615 } 616 617 /* 618 * System call to cleanup state after a signal 619 * has been taken. Reset signal mask and 620 * stack state from context left by rt_sendsig (above). 621 * Return to previous pc and psl as specified by 622 * context left by sendsig. Check carefully to 623 * make sure that the user has not modified the 624 * psl to gain improper privileges or to cause 625 * a machine fault. 626 */ 627 int 628 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) 629 { 630 struct proc *p = td->td_proc; 631 struct l_ucontext uc; 632 struct l_sigcontext *context; 633 l_stack_t *lss; 634 stack_t ss; 635 struct trapframe *regs; 636 int eflags; 637 ksiginfo_t ksi; 638 639 regs = td->td_frame; 640 641 #ifdef DEBUG 642 if (ldebug(rt_sigreturn)) 643 printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp); 644 #endif 645 /* 646 * The trampoline code hands us the ucontext. 647 * It is unsafe to keep track of it ourselves, in the event that a 648 * program jumps out of a signal handler. 649 */ 650 if (copyin(args->ucp, &uc, sizeof(uc)) != 0) 651 return (EFAULT); 652 653 context = &uc.uc_mcontext; 654 655 /* 656 * Check for security violations. 657 */ 658 #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) 659 eflags = context->sc_eflags; 660 /* 661 * XXX do allow users to change the privileged flag PSL_RF. The 662 * cpu sets PSL_RF in tf_eflags for faults. Debuggers should 663 * sometimes set it there too. tf_eflags is kept in the signal 664 * context during signal handling and there is no other place 665 * to remember it, so the PSL_RF bit may be corrupted by the 666 * signal handler without us knowing. Corruption of the PSL_RF 667 * bit at worst causes one more or one less debugger trap, so 668 * allowing it is fairly harmless. 669 */ 670 if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) 671 return(EINVAL); 672 673 /* 674 * Don't allow users to load a valid privileged %cs. Let the 675 * hardware check for invalid selectors, excess privilege in 676 * other selectors, invalid %eip's and invalid %esp's. 677 */ 678 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) 679 if (!CS_SECURE(context->sc_cs)) { 680 ksiginfo_init_trap(&ksi); 681 ksi.ksi_signo = SIGBUS; 682 ksi.ksi_code = BUS_OBJERR; 683 ksi.ksi_trapno = T_PROTFLT; 684 ksi.ksi_addr = (void *)regs->tf_rip; 685 trapsignal(td, &ksi); 686 return(EINVAL); 687 } 688 689 PROC_LOCK(p); 690 linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask); 691 SIG_CANTMASK(td->td_sigmask); 692 signotify(td); 693 PROC_UNLOCK(p); 694 695 /* 696 * Restore signal context 697 */ 698 /* Selectors were restored by the trampoline. */ 699 regs->tf_rdi = context->sc_edi; 700 regs->tf_rsi = context->sc_esi; 701 regs->tf_rbp = context->sc_ebp; 702 regs->tf_rbx = context->sc_ebx; 703 regs->tf_rdx = context->sc_edx; 704 regs->tf_rcx = context->sc_ecx; 705 regs->tf_rax = context->sc_eax; 706 regs->tf_rip = context->sc_eip; 707 regs->tf_cs = context->sc_cs; 708 regs->tf_rflags = eflags; 709 regs->tf_rsp = context->sc_esp_at_signal; 710 regs->tf_ss = context->sc_ss; 711 712 /* 713 * call sigaltstack & ignore results.. 714 */ 715 lss = &uc.uc_stack; 716 ss.ss_sp = PTRIN(lss->ss_sp); 717 ss.ss_size = lss->ss_size; 718 ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags); 719 720 #ifdef DEBUG 721 if (ldebug(rt_sigreturn)) 722 printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"), 723 ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask); 724 #endif 725 (void)kern_sigaltstack(td, &ss, NULL); 726 727 return (EJUSTRETURN); 728 } 729 730 /* 731 * MPSAFE 732 */ 733 static void 734 linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params) 735 { 736 args[0] = tf->tf_rbx; 737 args[1] = tf->tf_rcx; 738 args[2] = tf->tf_rdx; 739 args[3] = tf->tf_rsi; 740 args[4] = tf->tf_rdi; 741 args[5] = tf->tf_rbp; /* Unconfirmed */ 742 *params = NULL; /* no copyin */ 743 } 744 745 /* 746 * If a linux binary is exec'ing something, try this image activator 747 * first. We override standard shell script execution in order to 748 * be able to modify the interpreter path. We only do this if a linux 749 * binary is doing the exec, so we do not create an EXEC module for it. 750 */ 751 static int exec_linux_imgact_try(struct image_params *iparams); 752 753 static int 754 exec_linux_imgact_try(struct image_params *imgp) 755 { 756 const char *head = (const char *)imgp->image_header; 757 char *rpath; 758 int error = -1, len; 759 760 /* 761 * The interpreter for shell scripts run from a linux binary needs 762 * to be located in /compat/linux if possible in order to recursively 763 * maintain linux path emulation. 764 */ 765 if (((const short *)head)[0] == SHELLMAGIC) { 766 /* 767 * Run our normal shell image activator. If it succeeds attempt 768 * to use the alternate path for the interpreter. If an alternate 769 * path is found, use our stringspace to store it. 770 */ 771 if ((error = exec_shell_imgact(imgp)) == 0) { 772 linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc), 773 imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0); 774 if (rpath != NULL) { 775 len = strlen(rpath) + 1; 776 777 if (len <= MAXSHELLCMDLEN) { 778 memcpy(imgp->interpreter_name, rpath, len); 779 } 780 free(rpath, M_TEMP); 781 } 782 } 783 } 784 return(error); 785 } 786 787 /* 788 * Clear registers on exec 789 * XXX copied from ia32_signal.c. 790 */ 791 static void 792 exec_linux_setregs(td, entry, stack, ps_strings) 793 struct thread *td; 794 u_long entry; 795 u_long stack; 796 u_long ps_strings; 797 { 798 struct trapframe *regs = td->td_frame; 799 struct pcb *pcb = td->td_pcb; 800 801 wrmsr(MSR_FSBASE, 0); 802 wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */ 803 pcb->pcb_fsbase = 0; 804 pcb->pcb_gsbase = 0; 805 load_ds(_udatasel); 806 load_es(_udatasel); 807 load_fs(_udatasel); 808 load_gs(0); 809 pcb->pcb_ds = _udatasel; 810 pcb->pcb_es = _udatasel; 811 pcb->pcb_fs = _udatasel; 812 pcb->pcb_gs = 0; 813 814 bzero((char *)regs, sizeof(struct trapframe)); 815 regs->tf_rip = entry; 816 regs->tf_rsp = stack; 817 regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); 818 regs->tf_ss = _udatasel; 819 regs->tf_cs = _ucode32sel; 820 regs->tf_rbx = ps_strings; 821 load_cr0(rcr0() | CR0_MP | CR0_TS); 822 fpstate_drop(td); 823 824 /* Return via doreti so that we can change to a different %cs */ 825 pcb->pcb_flags |= PCB_FULLCTX; 826 td->td_retval[1] = 0; 827 } 828 829 /* 830 * XXX copied from ia32_sysvec.c. 831 */ 832 static register_t * 833 linux_copyout_strings(struct image_params *imgp) 834 { 835 int argc, envc; 836 u_int32_t *vectp; 837 char *stringp, *destp; 838 u_int32_t *stack_base; 839 struct linux32_ps_strings *arginfo; 840 int sigcodesz; 841 842 /* 843 * Calculate string base and vector table pointers. 844 * Also deal with signal trampoline code for this exec type. 845 */ 846 arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS; 847 sigcodesz = *(imgp->proc->p_sysent->sv_szsigcode); 848 destp = (caddr_t)arginfo - sigcodesz - SPARE_USRSPACE - 849 roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *)); 850 851 /* 852 * install sigcode 853 */ 854 if (sigcodesz) 855 copyout(imgp->proc->p_sysent->sv_sigcode, 856 ((caddr_t)arginfo - sigcodesz), szsigcode); 857 858 /* 859 * If we have a valid auxargs ptr, prepare some room 860 * on the stack. 861 */ 862 if (imgp->auxargs) { 863 /* 864 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for 865 * lower compatibility. 866 */ 867 imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size 868 : (AT_COUNT * 2); 869 /* 870 * The '+ 2' is for the null pointers at the end of each of 871 * the arg and env vector sets,and imgp->auxarg_size is room 872 * for argument of Runtime loader. 873 */ 874 vectp = (u_int32_t *) (destp - (imgp->args->argc + imgp->args->envc + 2 + 875 imgp->auxarg_size) * sizeof(u_int32_t)); 876 877 } else 878 /* 879 * The '+ 2' is for the null pointers at the end of each of 880 * the arg and env vector sets 881 */ 882 vectp = (u_int32_t *) 883 (destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(u_int32_t)); 884 885 /* 886 * vectp also becomes our initial stack base 887 */ 888 stack_base = vectp; 889 890 stringp = imgp->args->begin_argv; 891 argc = imgp->args->argc; 892 envc = imgp->args->envc; 893 /* 894 * Copy out strings - arguments and environment. 895 */ 896 copyout(stringp, destp, ARG_MAX - imgp->args->stringspace); 897 898 /* 899 * Fill in "ps_strings" struct for ps, w, etc. 900 */ 901 suword32(&arginfo->ps_argvstr, (u_int32_t)(intptr_t)vectp); 902 suword32(&arginfo->ps_nargvstr, argc); 903 904 /* 905 * Fill in argument portion of vector table. 906 */ 907 for (; argc > 0; --argc) { 908 suword32(vectp++, (u_int32_t)(intptr_t)destp); 909 while (*stringp++ != 0) 910 destp++; 911 destp++; 912 } 913 914 /* a null vector table pointer separates the argp's from the envp's */ 915 suword32(vectp++, 0); 916 917 suword32(&arginfo->ps_envstr, (u_int32_t)(intptr_t)vectp); 918 suword32(&arginfo->ps_nenvstr, envc); 919 920 /* 921 * Fill in environment portion of vector table. 922 */ 923 for (; envc > 0; --envc) { 924 suword32(vectp++, (u_int32_t)(intptr_t)destp); 925 while (*stringp++ != 0) 926 destp++; 927 destp++; 928 } 929 930 /* end of vector table is a null pointer */ 931 suword32(vectp, 0); 932 933 return ((register_t *)stack_base); 934 } 935 936 SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0, 937 "32-bit Linux emulation"); 938 939 static u_long linux32_maxdsiz = LINUX32_MAXDSIZ; 940 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW, 941 &linux32_maxdsiz, 0, ""); 942 static u_long linux32_maxssiz = LINUX32_MAXSSIZ; 943 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW, 944 &linux32_maxssiz, 0, ""); 945 static u_long linux32_maxvmem = LINUX32_MAXVMEM; 946 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW, 947 &linux32_maxvmem, 0, ""); 948 949 /* 950 * XXX copied from ia32_sysvec.c. 951 */ 952 static void 953 linux32_fixlimits(struct proc *p) 954 { 955 struct plimit *oldlim, *newlim; 956 957 if (linux32_maxdsiz == 0 && linux32_maxssiz == 0 && 958 linux32_maxvmem == 0) 959 return; 960 newlim = lim_alloc(); 961 PROC_LOCK(p); 962 oldlim = p->p_limit; 963 lim_copy(newlim, oldlim); 964 if (linux32_maxdsiz != 0) { 965 if (newlim->pl_rlimit[RLIMIT_DATA].rlim_cur > linux32_maxdsiz) 966 newlim->pl_rlimit[RLIMIT_DATA].rlim_cur = linux32_maxdsiz; 967 if (newlim->pl_rlimit[RLIMIT_DATA].rlim_max > linux32_maxdsiz) 968 newlim->pl_rlimit[RLIMIT_DATA].rlim_max = linux32_maxdsiz; 969 } 970 if (linux32_maxssiz != 0) { 971 if (newlim->pl_rlimit[RLIMIT_STACK].rlim_cur > linux32_maxssiz) 972 newlim->pl_rlimit[RLIMIT_STACK].rlim_cur = linux32_maxssiz; 973 if (newlim->pl_rlimit[RLIMIT_STACK].rlim_max > linux32_maxssiz) 974 newlim->pl_rlimit[RLIMIT_STACK].rlim_max = linux32_maxssiz; 975 } 976 if (linux32_maxvmem != 0) { 977 if (newlim->pl_rlimit[RLIMIT_VMEM].rlim_cur > linux32_maxvmem) 978 newlim->pl_rlimit[RLIMIT_VMEM].rlim_cur = linux32_maxvmem; 979 if (newlim->pl_rlimit[RLIMIT_VMEM].rlim_max > linux32_maxvmem) 980 newlim->pl_rlimit[RLIMIT_VMEM].rlim_max = linux32_maxvmem; 981 } 982 p->p_limit = newlim; 983 PROC_UNLOCK(p); 984 lim_free(oldlim); 985 } 986 987 struct sysentvec elf_linux_sysvec = { 988 LINUX_SYS_MAXSYSCALL, 989 linux_sysent, 990 0xff, 991 LINUX_SIGTBLSZ, 992 bsd_to_linux_signal, 993 ELAST + 1, 994 bsd_to_linux_errno, 995 translate_traps, 996 elf_linux_fixup, 997 linux_sendsig, 998 linux_sigcode, 999 &linux_szsigcode, 1000 linux_prepsyscall, 1001 "Linux ELF32", 1002 elf32_coredump, 1003 exec_linux_imgact_try, 1004 LINUX_MINSIGSTKSZ, 1005 PAGE_SIZE, 1006 VM_MIN_ADDRESS, 1007 LINUX32_USRSTACK, 1008 LINUX32_USRSTACK, 1009 LINUX32_PS_STRINGS, 1010 VM_PROT_ALL, 1011 linux_copyout_strings, 1012 exec_linux_setregs, 1013 linux32_fixlimits 1014 }; 1015 1016 static Elf32_Brandinfo linux_brand = { 1017 ELFOSABI_LINUX, 1018 EM_386, 1019 "Linux", 1020 "/compat/linux", 1021 "/lib/ld-linux.so.1", 1022 &elf_linux_sysvec, 1023 NULL, 1024 BI_CAN_EXEC_DYN, 1025 }; 1026 1027 static Elf32_Brandinfo linux_glibc2brand = { 1028 ELFOSABI_LINUX, 1029 EM_386, 1030 "Linux", 1031 "/compat/linux", 1032 "/lib/ld-linux.so.2", 1033 &elf_linux_sysvec, 1034 NULL, 1035 BI_CAN_EXEC_DYN, 1036 }; 1037 1038 Elf32_Brandinfo *linux_brandlist[] = { 1039 &linux_brand, 1040 &linux_glibc2brand, 1041 NULL 1042 }; 1043 1044 static int 1045 linux_elf_modevent(module_t mod, int type, void *data) 1046 { 1047 Elf32_Brandinfo **brandinfo; 1048 int error; 1049 struct linux_ioctl_handler **lihp; 1050 1051 error = 0; 1052 1053 switch(type) { 1054 case MOD_LOAD: 1055 for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; 1056 ++brandinfo) 1057 if (elf32_insert_brand_entry(*brandinfo) < 0) 1058 error = EINVAL; 1059 if (error == 0) { 1060 SET_FOREACH(lihp, linux_ioctl_handler_set) 1061 linux_ioctl_register_handler(*lihp); 1062 if (bootverbose) 1063 printf("Linux ELF exec handler installed\n"); 1064 } else 1065 printf("cannot insert Linux ELF brand handler\n"); 1066 break; 1067 case MOD_UNLOAD: 1068 for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; 1069 ++brandinfo) 1070 if (elf32_brand_inuse(*brandinfo)) 1071 error = EBUSY; 1072 if (error == 0) { 1073 for (brandinfo = &linux_brandlist[0]; 1074 *brandinfo != NULL; ++brandinfo) 1075 if (elf32_remove_brand_entry(*brandinfo) < 0) 1076 error = EINVAL; 1077 } 1078 if (error == 0) { 1079 SET_FOREACH(lihp, linux_ioctl_handler_set) 1080 linux_ioctl_unregister_handler(*lihp); 1081 if (bootverbose) 1082 printf("Linux ELF exec handler removed\n"); 1083 } else 1084 printf("Could not deinstall ELF interpreter entry\n"); 1085 break; 1086 default: 1087 break; 1088 } 1089 return error; 1090 } 1091 1092 static moduledata_t linux_elf_mod = { 1093 "linuxelf", 1094 linux_elf_modevent, 1095 0 1096 }; 1097 1098 DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); 1099