1 /*- 2 * Copyright (c) 2013 Dmitry Chagin 3 * Copyright (c) 2004 Tim J. Robbins 4 * Copyright (c) 2002 Doug Rabson 5 * Copyright (c) 2000 Marcel Moolenaar 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer 13 * in this position and unchanged. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. The name of the author may not be used to endorse or promote products 18 * derived from this software without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 21 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 23 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 24 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 25 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 29 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 __FBSDID("$FreeBSD$"); 34 35 #include <sys/param.h> 36 #include <sys/kernel.h> 37 #include <sys/systm.h> 38 #include <sys/capability.h> 39 #include <sys/dirent.h> 40 #include <sys/file.h> 41 #include <sys/fcntl.h> 42 #include <sys/filedesc.h> 43 #include <sys/clock.h> 44 #include <sys/imgact.h> 45 #include <sys/ktr.h> 46 #include <sys/limits.h> 47 #include <sys/lock.h> 48 #include <sys/malloc.h> 49 #include <sys/mman.h> 50 #include <sys/mutex.h> 51 #include <sys/priv.h> 52 #include <sys/proc.h> 53 #include <sys/resource.h> 54 #include <sys/resourcevar.h> 55 #include <sys/sched.h> 56 #include <sys/syscallsubr.h> 57 #include <sys/sysproto.h> 58 #include <sys/vnode.h> 59 #include <sys/unistd.h> 60 #include <sys/wait.h> 61 62 #include <security/mac/mac_framework.h> 63 64 #include <ufs/ufs/extattr.h> 65 #include <ufs/ufs/quota.h> 66 #include <ufs/ufs/ufsmount.h> 67 68 #include <machine/frame.h> 69 #include <machine/md_var.h> 70 #include <machine/pcb.h> 71 #include <machine/psl.h> 72 #include <machine/segments.h> 73 #include <machine/specialreg.h> 74 75 #include <vm/vm.h> 76 #include <vm/pmap.h> 77 #include <vm/vm_extern.h> 78 #include <vm/vm_kern.h> 79 #include <vm/vm_map.h> 80 81 #include <amd64/linux/linux.h> 82 #include <amd64/linux/linux_proto.h> 83 #include <compat/linux/linux_ipc.h> 84 #include <compat/linux/linux_file.h> 85 #include <compat/linux/linux_misc.h> 86 #include <compat/linux/linux_signal.h> 87 #include <compat/linux/linux_util.h> 88 #include <compat/linux/linux_emul.h> 89 90 91 int 92 linux_execve(struct thread *td, struct linux_execve_args *args) 93 { 94 struct image_args eargs; 95 char *path; 96 int error; 97 98 LCONVPATHEXIST(td, args->path, &path); 99 100 LINUX_CTR(execve); 101 102 error = exec_copyin_args(&eargs, path, UIO_SYSSPACE, args->argp, 103 args->envp); 104 free(path, M_TEMP); 105 if (error == 0) 106 error = linux_common_execve(td, &eargs); 107 return (error); 108 } 109 110 int 111 linux_set_upcall_kse(struct thread *td, register_t stack) 112 { 113 114 if (stack) 115 td->td_frame->tf_rsp = stack; 116 117 /* 118 * The newly created Linux thread returns 119 * to the user space by the same path that a parent do. 120 */ 121 td->td_frame->tf_rax = 0; 122 return (0); 123 } 124 125 #define STACK_SIZE (2 * 1024 * 1024) 126 #define GUARD_SIZE (4 * PAGE_SIZE) 127 128 int 129 linux_mmap2(struct thread *td, struct linux_mmap2_args *args) 130 { 131 struct proc *p = td->td_proc; 132 struct mmap_args /* { 133 caddr_t addr; 134 size_t len; 135 int prot; 136 int flags; 137 int fd; 138 long pad; 139 off_t pos; 140 } */ bsd_args; 141 int error; 142 struct file *fp; 143 cap_rights_t rights; 144 145 LINUX_CTR6(mmap2, "0x%lx, %ld, %ld, 0x%08lx, %ld, 0x%lx", 146 args->addr, args->len, args->prot, 147 args->flags, args->fd, args->pgoff); 148 149 error = 0; 150 bsd_args.flags = 0; 151 fp = NULL; 152 153 /* 154 * Linux mmap(2): 155 * You must specify exactly one of MAP_SHARED and MAP_PRIVATE 156 */ 157 if (! ((args->flags & LINUX_MAP_SHARED) ^ 158 (args->flags & LINUX_MAP_PRIVATE))) 159 return (EINVAL); 160 161 if (args->flags & LINUX_MAP_SHARED) 162 bsd_args.flags |= MAP_SHARED; 163 if (args->flags & LINUX_MAP_PRIVATE) 164 bsd_args.flags |= MAP_PRIVATE; 165 if (args->flags & LINUX_MAP_FIXED) 166 bsd_args.flags |= MAP_FIXED; 167 if (args->flags & LINUX_MAP_ANON) 168 bsd_args.flags |= MAP_ANON; 169 else 170 bsd_args.flags |= MAP_NOSYNC; 171 if (args->flags & LINUX_MAP_GROWSDOWN) 172 bsd_args.flags |= MAP_STACK; 173 174 /* 175 * PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC 176 * on Linux/i386. We do this to ensure maximum compatibility. 177 * Linux/ia64 does the same in i386 emulation mode. 178 */ 179 bsd_args.prot = args->prot; 180 if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) 181 bsd_args.prot |= PROT_READ | PROT_EXEC; 182 183 /* Linux does not check file descriptor when MAP_ANONYMOUS is set. */ 184 bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : args->fd; 185 if (bsd_args.fd != -1) { 186 /* 187 * Linux follows Solaris mmap(2) description: 188 * The file descriptor fildes is opened with 189 * read permission, regardless of the 190 * protection options specified. 191 */ 192 193 error = fget(td, bsd_args.fd, 194 cap_rights_init(&rights, CAP_MMAP), &fp); 195 if (error != 0 ) 196 return (error); 197 if (fp->f_type != DTYPE_VNODE) { 198 fdrop(fp, td); 199 return (EINVAL); 200 } 201 202 /* Linux mmap() just fails for O_WRONLY files */ 203 if (!(fp->f_flag & FREAD)) { 204 fdrop(fp, td); 205 return (EACCES); 206 } 207 208 fdrop(fp, td); 209 } 210 211 if (args->flags & LINUX_MAP_GROWSDOWN) { 212 /* 213 * The Linux MAP_GROWSDOWN option does not limit auto 214 * growth of the region. Linux mmap with this option 215 * takes as addr the inital BOS, and as len, the initial 216 * region size. It can then grow down from addr without 217 * limit. However, Linux threads has an implicit internal 218 * limit to stack size of STACK_SIZE. Its just not 219 * enforced explicitly in Linux. But, here we impose 220 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack 221 * region, since we can do this with our mmap. 222 * 223 * Our mmap with MAP_STACK takes addr as the maximum 224 * downsize limit on BOS, and as len the max size of 225 * the region. It then maps the top SGROWSIZ bytes, 226 * and auto grows the region down, up to the limit 227 * in addr. 228 * 229 * If we don't use the MAP_STACK option, the effect 230 * of this code is to allocate a stack region of a 231 * fixed size of (STACK_SIZE - GUARD_SIZE). 232 */ 233 234 if ((caddr_t)PTRIN(args->addr) + args->len > 235 p->p_vmspace->vm_maxsaddr) { 236 /* 237 * Some Linux apps will attempt to mmap 238 * thread stacks near the top of their 239 * address space. If their TOS is greater 240 * than vm_maxsaddr, vm_map_growstack() 241 * will confuse the thread stack with the 242 * process stack and deliver a SEGV if they 243 * attempt to grow the thread stack past their 244 * current stacksize rlimit. To avoid this, 245 * adjust vm_maxsaddr upwards to reflect 246 * the current stacksize rlimit rather 247 * than the maximum possible stacksize. 248 * It would be better to adjust the 249 * mmap'ed region, but some apps do not check 250 * mmap's return value. 251 */ 252 PROC_LOCK(p); 253 p->p_vmspace->vm_maxsaddr = (char *)USRSTACK - 254 lim_cur_proc(p, RLIMIT_STACK); 255 PROC_UNLOCK(p); 256 } 257 258 /* 259 * This gives us our maximum stack size and a new BOS. 260 * If we're using VM_STACK, then mmap will just map 261 * the top SGROWSIZ bytes, and let the stack grow down 262 * to the limit at BOS. If we're not using VM_STACK 263 * we map the full stack, since we don't have a way 264 * to autogrow it. 265 */ 266 if (args->len > STACK_SIZE - GUARD_SIZE) { 267 bsd_args.addr = (caddr_t)PTRIN(args->addr); 268 bsd_args.len = args->len; 269 } else { 270 bsd_args.addr = (caddr_t)PTRIN(args->addr) - 271 (STACK_SIZE - GUARD_SIZE - args->len); 272 bsd_args.len = STACK_SIZE - GUARD_SIZE; 273 } 274 } else { 275 bsd_args.addr = (caddr_t)PTRIN(args->addr); 276 bsd_args.len = args->len; 277 } 278 bsd_args.pos = (off_t)args->pgoff; 279 280 error = sys_mmap(td, &bsd_args); 281 282 LINUX_CTR2(mmap2, "return: %d (%p)", 283 error, td->td_retval[0]); 284 return (error); 285 } 286 287 int 288 linux_mprotect(struct thread *td, struct linux_mprotect_args *uap) 289 { 290 struct mprotect_args bsd_args; 291 292 LINUX_CTR(mprotect); 293 294 bsd_args.addr = uap->addr; 295 bsd_args.len = uap->len; 296 bsd_args.prot = uap->prot; 297 if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) 298 bsd_args.prot |= PROT_READ | PROT_EXEC; 299 return (sys_mprotect(td, &bsd_args)); 300 } 301 302 int 303 linux_iopl(struct thread *td, struct linux_iopl_args *args) 304 { 305 int error; 306 307 LINUX_CTR(iopl); 308 309 if (args->level > 3) 310 return (EINVAL); 311 if ((error = priv_check(td, PRIV_IO)) != 0) 312 return (error); 313 if ((error = securelevel_gt(td->td_ucred, 0)) != 0) 314 return (error); 315 td->td_frame->tf_rflags = (td->td_frame->tf_rflags & ~PSL_IOPL) | 316 (args->level * (PSL_IOPL / 3)); 317 318 return (0); 319 } 320 321 int 322 linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap) 323 { 324 l_sigset_t lmask; 325 sigset_t sigmask; 326 int error; 327 328 LINUX_CTR2(rt_sigsuspend, "%p, %ld", 329 uap->newset, uap->sigsetsize); 330 331 if (uap->sigsetsize != sizeof(l_sigset_t)) 332 return (EINVAL); 333 334 error = copyin(uap->newset, &lmask, sizeof(l_sigset_t)); 335 if (error) 336 return (error); 337 338 linux_to_bsd_sigset(&lmask, &sigmask); 339 return (kern_sigsuspend(td, sigmask)); 340 } 341 342 int 343 linux_pause(struct thread *td, struct linux_pause_args *args) 344 { 345 struct proc *p = td->td_proc; 346 sigset_t sigmask; 347 348 LINUX_CTR(pause); 349 350 PROC_LOCK(p); 351 sigmask = td->td_sigmask; 352 PROC_UNLOCK(p); 353 return (kern_sigsuspend(td, sigmask)); 354 } 355 356 int 357 linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap) 358 { 359 stack_t ss, oss; 360 l_stack_t lss; 361 int error; 362 363 LINUX_CTR2(sigaltstack, "%p, %p", uap->uss, uap->uoss); 364 365 if (uap->uss != NULL) { 366 error = copyin(uap->uss, &lss, sizeof(l_stack_t)); 367 if (error) 368 return (error); 369 370 ss.ss_sp = PTRIN(lss.ss_sp); 371 ss.ss_size = lss.ss_size; 372 ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags); 373 } 374 error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL, 375 (uap->uoss != NULL) ? &oss : NULL); 376 if (!error && uap->uoss != NULL) { 377 lss.ss_sp = PTROUT(oss.ss_sp); 378 lss.ss_size = oss.ss_size; 379 lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags); 380 error = copyout(&lss, uap->uoss, sizeof(l_stack_t)); 381 } 382 383 return (error); 384 } 385 386 int 387 linux_arch_prctl(struct thread *td, struct linux_arch_prctl_args *args) 388 { 389 int error; 390 struct pcb *pcb; 391 392 LINUX_CTR2(arch_prctl, "0x%x, %p", args->code, args->addr); 393 394 error = ENOTSUP; 395 pcb = td->td_pcb; 396 397 switch (args->code) { 398 case LINUX_ARCH_GET_GS: 399 error = copyout(&pcb->pcb_gsbase, (unsigned long *)args->addr, 400 sizeof(args->addr)); 401 break; 402 case LINUX_ARCH_SET_GS: 403 if (args->addr >= VM_MAXUSER_ADDRESS) 404 return(EPERM); 405 break; 406 case LINUX_ARCH_GET_FS: 407 error = copyout(&pcb->pcb_fsbase, (unsigned long *)args->addr, 408 sizeof(args->addr)); 409 break; 410 case LINUX_ARCH_SET_FS: 411 error = linux_set_cloned_tls(td, (void *)args->addr); 412 break; 413 default: 414 error = EINVAL; 415 } 416 return (error); 417 } 418 419 int 420 linux_set_cloned_tls(struct thread *td, void *desc) 421 { 422 struct pcb *pcb; 423 424 if ((uint64_t)desc >= VM_MAXUSER_ADDRESS) 425 return (EPERM); 426 427 pcb = td->td_pcb; 428 pcb->pcb_fsbase = (register_t)desc; 429 td->td_frame->tf_fs = _ufssel; 430 431 return (0); 432 } 433