1 /*- 2 * Copyright (c) 2004 Tim J. Robbins 3 * Copyright (c) 2002 Doug Rabson 4 * Copyright (c) 2000 Marcel Moolenaar 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer 12 * in this position and unchanged. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. The name of the author may not be used to endorse or promote products 17 * derived from this software without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 #include <sys/cdefs.h> 32 __FBSDID("$FreeBSD$"); 33 34 #include <sys/param.h> 35 #include <sys/kernel.h> 36 #include <sys/systm.h> 37 #include <sys/file.h> 38 #include <sys/fcntl.h> 39 #include <sys/clock.h> 40 #include <sys/imgact.h> 41 #include <sys/limits.h> 42 #include <sys/lock.h> 43 #include <sys/malloc.h> 44 #include <sys/mman.h> 45 #include <sys/mutex.h> 46 #include <sys/priv.h> 47 #include <sys/proc.h> 48 #include <sys/resource.h> 49 #include <sys/resourcevar.h> 50 #include <sys/sched.h> 51 #include <sys/syscallsubr.h> 52 #include <sys/sysproto.h> 53 #include <sys/unistd.h> 54 55 #include <machine/frame.h> 56 #include <machine/pcb.h> 57 #include <machine/psl.h> 58 #include <machine/segments.h> 59 #include <machine/specialreg.h> 60 61 #include <vm/vm.h> 62 #include <vm/pmap.h> 63 #include <vm/vm_map.h> 64 65 #include <compat/freebsd32/freebsd32_util.h> 66 #include <amd64/linux32/linux.h> 67 #include <amd64/linux32/linux32_proto.h> 68 #include <compat/linux/linux_ipc.h> 69 #include <compat/linux/linux_signal.h> 70 #include <compat/linux/linux_util.h> 71 #include <compat/linux/linux_emul.h> 72 73 struct l_old_select_argv { 74 l_int nfds; 75 l_uintptr_t readfds; 76 l_uintptr_t writefds; 77 l_uintptr_t exceptfds; 78 l_uintptr_t timeout; 79 } __packed; 80 81 int 82 linux_to_bsd_sigaltstack(int lsa) 83 { 84 int bsa = 0; 85 86 if (lsa & LINUX_SS_DISABLE) 87 bsa |= SS_DISABLE; 88 if (lsa & LINUX_SS_ONSTACK) 89 bsa |= SS_ONSTACK; 90 return (bsa); 91 } 92 93 static int linux_mmap_common(struct thread *td, l_uintptr_t addr, 94 l_size_t len, l_int prot, l_int flags, l_int fd, 95 l_loff_t pos); 96 97 int 98 bsd_to_linux_sigaltstack(int bsa) 99 { 100 int lsa = 0; 101 102 if (bsa & SS_DISABLE) 103 lsa |= LINUX_SS_DISABLE; 104 if (bsa & SS_ONSTACK) 105 lsa |= LINUX_SS_ONSTACK; 106 return (lsa); 107 } 108 109 int 110 linux_execve(struct thread *td, struct linux_execve_args *args) 111 { 112 struct image_args eargs; 113 char *path; 114 int error; 115 116 LCONVPATHEXIST(td, args->path, &path); 117 118 #ifdef DEBUG 119 if (ldebug(execve)) 120 printf(ARGS(execve, "%s"), path); 121 #endif 122 123 error = freebsd32_exec_copyin_args(&eargs, path, UIO_SYSSPACE, 124 args->argp, args->envp); 125 free(path, M_TEMP); 126 if (error == 0) 127 error = kern_execve(td, &eargs, NULL); 128 if (error == 0) 129 /* Linux process can execute FreeBSD one, do not attempt 130 * to create emuldata for such process using 131 * linux_proc_init, this leads to a panic on KASSERT 132 * because such process has p->p_emuldata == NULL. 133 */ 134 if (td->td_proc->p_sysent == &elf_linux_sysvec) 135 error = linux_proc_init(td, 0, 0); 136 return (error); 137 } 138 139 CTASSERT(sizeof(struct l_iovec32) == 8); 140 141 static int 142 linux32_copyinuio(struct l_iovec32 *iovp, l_ulong iovcnt, struct uio **uiop) 143 { 144 struct l_iovec32 iov32; 145 struct iovec *iov; 146 struct uio *uio; 147 uint32_t iovlen; 148 int error, i; 149 150 *uiop = NULL; 151 if (iovcnt > UIO_MAXIOV) 152 return (EINVAL); 153 iovlen = iovcnt * sizeof(struct iovec); 154 uio = malloc(iovlen + sizeof(*uio), M_IOV, M_WAITOK); 155 iov = (struct iovec *)(uio + 1); 156 for (i = 0; i < iovcnt; i++) { 157 error = copyin(&iovp[i], &iov32, sizeof(struct l_iovec32)); 158 if (error) { 159 free(uio, M_IOV); 160 return (error); 161 } 162 iov[i].iov_base = PTRIN(iov32.iov_base); 163 iov[i].iov_len = iov32.iov_len; 164 } 165 uio->uio_iov = iov; 166 uio->uio_iovcnt = iovcnt; 167 uio->uio_segflg = UIO_USERSPACE; 168 uio->uio_offset = -1; 169 uio->uio_resid = 0; 170 for (i = 0; i < iovcnt; i++) { 171 if (iov->iov_len > INT_MAX - uio->uio_resid) { 172 free(uio, M_IOV); 173 return (EINVAL); 174 } 175 uio->uio_resid += iov->iov_len; 176 iov++; 177 } 178 *uiop = uio; 179 return (0); 180 } 181 182 int 183 linux32_copyiniov(struct l_iovec32 *iovp32, l_ulong iovcnt, struct iovec **iovp, 184 int error) 185 { 186 struct l_iovec32 iov32; 187 struct iovec *iov; 188 uint32_t iovlen; 189 int i; 190 191 *iovp = NULL; 192 if (iovcnt > UIO_MAXIOV) 193 return (error); 194 iovlen = iovcnt * sizeof(struct iovec); 195 iov = malloc(iovlen, M_IOV, M_WAITOK); 196 for (i = 0; i < iovcnt; i++) { 197 error = copyin(&iovp32[i], &iov32, sizeof(struct l_iovec32)); 198 if (error) { 199 free(iov, M_IOV); 200 return (error); 201 } 202 iov[i].iov_base = PTRIN(iov32.iov_base); 203 iov[i].iov_len = iov32.iov_len; 204 } 205 *iovp = iov; 206 return(0); 207 208 } 209 210 int 211 linux_readv(struct thread *td, struct linux_readv_args *uap) 212 { 213 struct uio *auio; 214 int error; 215 216 error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio); 217 if (error) 218 return (error); 219 error = kern_readv(td, uap->fd, auio); 220 free(auio, M_IOV); 221 return (error); 222 } 223 224 int 225 linux_writev(struct thread *td, struct linux_writev_args *uap) 226 { 227 struct uio *auio; 228 int error; 229 230 error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio); 231 if (error) 232 return (error); 233 error = kern_writev(td, uap->fd, auio); 234 free(auio, M_IOV); 235 return (error); 236 } 237 238 struct l_ipc_kludge { 239 l_uintptr_t msgp; 240 l_long msgtyp; 241 } __packed; 242 243 int 244 linux_ipc(struct thread *td, struct linux_ipc_args *args) 245 { 246 247 switch (args->what & 0xFFFF) { 248 case LINUX_SEMOP: { 249 struct linux_semop_args a; 250 251 a.semid = args->arg1; 252 a.tsops = args->ptr; 253 a.nsops = args->arg2; 254 return (linux_semop(td, &a)); 255 } 256 case LINUX_SEMGET: { 257 struct linux_semget_args a; 258 259 a.key = args->arg1; 260 a.nsems = args->arg2; 261 a.semflg = args->arg3; 262 return (linux_semget(td, &a)); 263 } 264 case LINUX_SEMCTL: { 265 struct linux_semctl_args a; 266 int error; 267 268 a.semid = args->arg1; 269 a.semnum = args->arg2; 270 a.cmd = args->arg3; 271 error = copyin(args->ptr, &a.arg, sizeof(a.arg)); 272 if (error) 273 return (error); 274 return (linux_semctl(td, &a)); 275 } 276 case LINUX_MSGSND: { 277 struct linux_msgsnd_args a; 278 279 a.msqid = args->arg1; 280 a.msgp = args->ptr; 281 a.msgsz = args->arg2; 282 a.msgflg = args->arg3; 283 return (linux_msgsnd(td, &a)); 284 } 285 case LINUX_MSGRCV: { 286 struct linux_msgrcv_args a; 287 288 a.msqid = args->arg1; 289 a.msgsz = args->arg2; 290 a.msgflg = args->arg3; 291 if ((args->what >> 16) == 0) { 292 struct l_ipc_kludge tmp; 293 int error; 294 295 if (args->ptr == 0) 296 return (EINVAL); 297 error = copyin(args->ptr, &tmp, sizeof(tmp)); 298 if (error) 299 return (error); 300 a.msgp = PTRIN(tmp.msgp); 301 a.msgtyp = tmp.msgtyp; 302 } else { 303 a.msgp = args->ptr; 304 a.msgtyp = args->arg5; 305 } 306 return (linux_msgrcv(td, &a)); 307 } 308 case LINUX_MSGGET: { 309 struct linux_msgget_args a; 310 311 a.key = args->arg1; 312 a.msgflg = args->arg2; 313 return (linux_msgget(td, &a)); 314 } 315 case LINUX_MSGCTL: { 316 struct linux_msgctl_args a; 317 318 a.msqid = args->arg1; 319 a.cmd = args->arg2; 320 a.buf = args->ptr; 321 return (linux_msgctl(td, &a)); 322 } 323 case LINUX_SHMAT: { 324 struct linux_shmat_args a; 325 326 a.shmid = args->arg1; 327 a.shmaddr = args->ptr; 328 a.shmflg = args->arg2; 329 a.raddr = PTRIN((l_uint)args->arg3); 330 return (linux_shmat(td, &a)); 331 } 332 case LINUX_SHMDT: { 333 struct linux_shmdt_args a; 334 335 a.shmaddr = args->ptr; 336 return (linux_shmdt(td, &a)); 337 } 338 case LINUX_SHMGET: { 339 struct linux_shmget_args a; 340 341 a.key = args->arg1; 342 a.size = args->arg2; 343 a.shmflg = args->arg3; 344 return (linux_shmget(td, &a)); 345 } 346 case LINUX_SHMCTL: { 347 struct linux_shmctl_args a; 348 349 a.shmid = args->arg1; 350 a.cmd = args->arg2; 351 a.buf = args->ptr; 352 return (linux_shmctl(td, &a)); 353 } 354 default: 355 break; 356 } 357 358 return (EINVAL); 359 } 360 361 int 362 linux_old_select(struct thread *td, struct linux_old_select_args *args) 363 { 364 struct l_old_select_argv linux_args; 365 struct linux_select_args newsel; 366 int error; 367 368 #ifdef DEBUG 369 if (ldebug(old_select)) 370 printf(ARGS(old_select, "%p"), args->ptr); 371 #endif 372 373 error = copyin(args->ptr, &linux_args, sizeof(linux_args)); 374 if (error) 375 return (error); 376 377 newsel.nfds = linux_args.nfds; 378 newsel.readfds = PTRIN(linux_args.readfds); 379 newsel.writefds = PTRIN(linux_args.writefds); 380 newsel.exceptfds = PTRIN(linux_args.exceptfds); 381 newsel.timeout = PTRIN(linux_args.timeout); 382 return (linux_select(td, &newsel)); 383 } 384 385 int 386 linux_fork(struct thread *td, struct linux_fork_args *args) 387 { 388 int error; 389 struct proc *p2; 390 struct thread *td2; 391 392 #ifdef DEBUG 393 if (ldebug(fork)) 394 printf(ARGS(fork, "")); 395 #endif 396 397 if ((error = fork1(td, RFFDG | RFPROC | RFSTOPPED, 0, &p2)) != 0) 398 return (error); 399 400 if (error == 0) { 401 td->td_retval[0] = p2->p_pid; 402 td->td_retval[1] = 0; 403 } 404 405 if (td->td_retval[1] == 1) 406 td->td_retval[0] = 0; 407 error = linux_proc_init(td, td->td_retval[0], 0); 408 if (error) 409 return (error); 410 411 td2 = FIRST_THREAD_IN_PROC(p2); 412 413 /* 414 * Make this runnable after we are finished with it. 415 */ 416 thread_lock(td2); 417 TD_SET_CAN_RUN(td2); 418 sched_add(td2, SRQ_BORING); 419 thread_unlock(td2); 420 421 return (0); 422 } 423 424 int 425 linux_vfork(struct thread *td, struct linux_vfork_args *args) 426 { 427 int error; 428 struct proc *p2; 429 struct thread *td2; 430 431 #ifdef DEBUG 432 if (ldebug(vfork)) 433 printf(ARGS(vfork, "")); 434 #endif 435 436 /* Exclude RFPPWAIT */ 437 if ((error = fork1(td, RFFDG | RFPROC | RFMEM | RFSTOPPED, 0, &p2)) != 0) 438 return (error); 439 if (error == 0) { 440 td->td_retval[0] = p2->p_pid; 441 td->td_retval[1] = 0; 442 } 443 /* Are we the child? */ 444 if (td->td_retval[1] == 1) 445 td->td_retval[0] = 0; 446 error = linux_proc_init(td, td->td_retval[0], 0); 447 if (error) 448 return (error); 449 450 PROC_LOCK(p2); 451 p2->p_flag |= P_PPWAIT; 452 PROC_UNLOCK(p2); 453 454 td2 = FIRST_THREAD_IN_PROC(p2); 455 456 /* 457 * Make this runnable after we are finished with it. 458 */ 459 thread_lock(td2); 460 TD_SET_CAN_RUN(td2); 461 sched_add(td2, SRQ_BORING); 462 thread_unlock(td2); 463 464 /* wait for the children to exit, ie. emulate vfork */ 465 PROC_LOCK(p2); 466 while (p2->p_flag & P_PPWAIT) 467 cv_wait(&p2->p_pwait, &p2->p_mtx); 468 PROC_UNLOCK(p2); 469 470 return (0); 471 } 472 473 int 474 linux_clone(struct thread *td, struct linux_clone_args *args) 475 { 476 int error, ff = RFPROC | RFSTOPPED; 477 struct proc *p2; 478 struct thread *td2; 479 int exit_signal; 480 struct linux_emuldata *em; 481 482 #ifdef DEBUG 483 if (ldebug(clone)) { 484 printf(ARGS(clone, "flags %x, stack %p, parent tid: %p, " 485 "child tid: %p"), (unsigned)args->flags, 486 args->stack, args->parent_tidptr, args->child_tidptr); 487 } 488 #endif 489 490 exit_signal = args->flags & 0x000000ff; 491 if (LINUX_SIG_VALID(exit_signal)) { 492 if (exit_signal <= LINUX_SIGTBLSZ) 493 exit_signal = 494 linux_to_bsd_signal[_SIG_IDX(exit_signal)]; 495 } else if (exit_signal != 0) 496 return (EINVAL); 497 498 if (args->flags & LINUX_CLONE_VM) 499 ff |= RFMEM; 500 if (args->flags & LINUX_CLONE_SIGHAND) 501 ff |= RFSIGSHARE; 502 /* 503 * XXX: In Linux, sharing of fs info (chroot/cwd/umask) 504 * and open files is independant. In FreeBSD, its in one 505 * structure but in reality it does not cause any problems 506 * because both of these flags are usually set together. 507 */ 508 if (!(args->flags & (LINUX_CLONE_FILES | LINUX_CLONE_FS))) 509 ff |= RFFDG; 510 511 /* 512 * Attempt to detect when linux_clone(2) is used for creating 513 * kernel threads. Unfortunately despite the existence of the 514 * CLONE_THREAD flag, version of linuxthreads package used in 515 * most popular distros as of beginning of 2005 doesn't make 516 * any use of it. Therefore, this detection relies on 517 * empirical observation that linuxthreads sets certain 518 * combination of flags, so that we can make more or less 519 * precise detection and notify the FreeBSD kernel that several 520 * processes are in fact part of the same threading group, so 521 * that special treatment is necessary for signal delivery 522 * between those processes and fd locking. 523 */ 524 if ((args->flags & 0xffffff00) == LINUX_THREADING_FLAGS) 525 ff |= RFTHREAD; 526 527 if (args->flags & LINUX_CLONE_PARENT_SETTID) 528 if (args->parent_tidptr == NULL) 529 return (EINVAL); 530 531 error = fork1(td, ff, 0, &p2); 532 if (error) 533 return (error); 534 535 if (args->flags & (LINUX_CLONE_PARENT | LINUX_CLONE_THREAD)) { 536 sx_xlock(&proctree_lock); 537 PROC_LOCK(p2); 538 proc_reparent(p2, td->td_proc->p_pptr); 539 PROC_UNLOCK(p2); 540 sx_xunlock(&proctree_lock); 541 } 542 543 /* create the emuldata */ 544 error = linux_proc_init(td, p2->p_pid, args->flags); 545 /* reference it - no need to check this */ 546 em = em_find(p2, EMUL_DOLOCK); 547 KASSERT(em != NULL, ("clone: emuldata not found.\n")); 548 /* and adjust it */ 549 550 if (args->flags & LINUX_CLONE_THREAD) { 551 #ifdef notyet 552 PROC_LOCK(p2); 553 p2->p_pgrp = td->td_proc->p_pgrp; 554 PROC_UNLOCK(p2); 555 #endif 556 exit_signal = 0; 557 } 558 559 if (args->flags & LINUX_CLONE_CHILD_SETTID) 560 em->child_set_tid = args->child_tidptr; 561 else 562 em->child_set_tid = NULL; 563 564 if (args->flags & LINUX_CLONE_CHILD_CLEARTID) 565 em->child_clear_tid = args->child_tidptr; 566 else 567 em->child_clear_tid = NULL; 568 569 EMUL_UNLOCK(&emul_lock); 570 571 if (args->flags & LINUX_CLONE_PARENT_SETTID) { 572 error = copyout(&p2->p_pid, args->parent_tidptr, 573 sizeof(p2->p_pid)); 574 if (error) 575 printf(LMSG("copyout failed!")); 576 } 577 578 PROC_LOCK(p2); 579 p2->p_sigparent = exit_signal; 580 PROC_UNLOCK(p2); 581 td2 = FIRST_THREAD_IN_PROC(p2); 582 /* 583 * In a case of stack = NULL, we are supposed to COW calling process 584 * stack. This is what normal fork() does, so we just keep tf_rsp arg 585 * intact. 586 */ 587 if (args->stack) 588 td2->td_frame->tf_rsp = PTROUT(args->stack); 589 590 if (args->flags & LINUX_CLONE_SETTLS) { 591 struct user_segment_descriptor sd; 592 struct l_user_desc info; 593 int a[2]; 594 595 error = copyin((void *)td->td_frame->tf_rsi, &info, 596 sizeof(struct l_user_desc)); 597 if (error) { 598 printf(LMSG("copyin failed!")); 599 } else { 600 /* We might copy out the entry_number as GUGS32_SEL. */ 601 info.entry_number = GUGS32_SEL; 602 error = copyout(&info, (void *)td->td_frame->tf_rsi, 603 sizeof(struct l_user_desc)); 604 if (error) 605 printf(LMSG("copyout failed!")); 606 607 a[0] = LINUX_LDT_entry_a(&info); 608 a[1] = LINUX_LDT_entry_b(&info); 609 610 memcpy(&sd, &a, sizeof(a)); 611 #ifdef DEBUG 612 if (ldebug(clone)) 613 printf("Segment created in clone with " 614 "CLONE_SETTLS: lobase: %x, hibase: %x, " 615 "lolimit: %x, hilimit: %x, type: %i, " 616 "dpl: %i, p: %i, xx: %i, long: %i, " 617 "def32: %i, gran: %i\n", sd.sd_lobase, 618 sd.sd_hibase, sd.sd_lolimit, sd.sd_hilimit, 619 sd.sd_type, sd.sd_dpl, sd.sd_p, sd.sd_xx, 620 sd.sd_long, sd.sd_def32, sd.sd_gran); 621 #endif 622 td2->td_pcb->pcb_gsbase = (register_t)info.base_addr; 623 /* XXXKIB td2->td_pcb->pcb_gs32sd = sd; */ 624 td2->td_frame->tf_gs = GSEL(GUGS32_SEL, SEL_UPL); 625 td2->td_pcb->pcb_flags |= PCB_GS32BIT | PCB_32BIT; 626 } 627 } 628 629 #ifdef DEBUG 630 if (ldebug(clone)) 631 printf(LMSG("clone: successful rfork to %d, " 632 "stack %p sig = %d"), (int)p2->p_pid, args->stack, 633 exit_signal); 634 #endif 635 if (args->flags & LINUX_CLONE_VFORK) { 636 PROC_LOCK(p2); 637 p2->p_flag |= P_PPWAIT; 638 PROC_UNLOCK(p2); 639 } 640 641 /* 642 * Make this runnable after we are finished with it. 643 */ 644 thread_lock(td2); 645 TD_SET_CAN_RUN(td2); 646 sched_add(td2, SRQ_BORING); 647 thread_unlock(td2); 648 649 td->td_retval[0] = p2->p_pid; 650 td->td_retval[1] = 0; 651 652 if (args->flags & LINUX_CLONE_VFORK) { 653 /* wait for the children to exit, ie. emulate vfork */ 654 PROC_LOCK(p2); 655 while (p2->p_flag & P_PPWAIT) 656 cv_wait(&p2->p_pwait, &p2->p_mtx); 657 PROC_UNLOCK(p2); 658 } 659 660 return (0); 661 } 662 663 #define STACK_SIZE (2 * 1024 * 1024) 664 #define GUARD_SIZE (4 * PAGE_SIZE) 665 666 int 667 linux_mmap2(struct thread *td, struct linux_mmap2_args *args) 668 { 669 670 #ifdef DEBUG 671 if (ldebug(mmap2)) 672 printf(ARGS(mmap2, "0x%08x, %d, %d, 0x%08x, %d, %d"), 673 args->addr, args->len, args->prot, 674 args->flags, args->fd, args->pgoff); 675 #endif 676 677 return (linux_mmap_common(td, PTROUT(args->addr), args->len, args->prot, 678 args->flags, args->fd, (uint64_t)(uint32_t)args->pgoff * 679 PAGE_SIZE)); 680 } 681 682 int 683 linux_mmap(struct thread *td, struct linux_mmap_args *args) 684 { 685 int error; 686 struct l_mmap_argv linux_args; 687 688 error = copyin(args->ptr, &linux_args, sizeof(linux_args)); 689 if (error) 690 return (error); 691 692 #ifdef DEBUG 693 if (ldebug(mmap)) 694 printf(ARGS(mmap, "0x%08x, %d, %d, 0x%08x, %d, %d"), 695 linux_args.addr, linux_args.len, linux_args.prot, 696 linux_args.flags, linux_args.fd, linux_args.pgoff); 697 #endif 698 699 return (linux_mmap_common(td, linux_args.addr, linux_args.len, 700 linux_args.prot, linux_args.flags, linux_args.fd, 701 (uint32_t)linux_args.pgoff)); 702 } 703 704 static int 705 linux_mmap_common(struct thread *td, l_uintptr_t addr, l_size_t len, l_int prot, 706 l_int flags, l_int fd, l_loff_t pos) 707 { 708 struct proc *p = td->td_proc; 709 struct mmap_args /* { 710 caddr_t addr; 711 size_t len; 712 int prot; 713 int flags; 714 int fd; 715 long pad; 716 off_t pos; 717 } */ bsd_args; 718 int error; 719 struct file *fp; 720 721 error = 0; 722 bsd_args.flags = 0; 723 fp = NULL; 724 725 /* 726 * Linux mmap(2): 727 * You must specify exactly one of MAP_SHARED and MAP_PRIVATE 728 */ 729 if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE))) 730 return (EINVAL); 731 732 if (flags & LINUX_MAP_SHARED) 733 bsd_args.flags |= MAP_SHARED; 734 if (flags & LINUX_MAP_PRIVATE) 735 bsd_args.flags |= MAP_PRIVATE; 736 if (flags & LINUX_MAP_FIXED) 737 bsd_args.flags |= MAP_FIXED; 738 if (flags & LINUX_MAP_ANON) { 739 /* Enforce pos to be on page boundary, then ignore. */ 740 if ((pos & PAGE_MASK) != 0) 741 return (EINVAL); 742 pos = 0; 743 bsd_args.flags |= MAP_ANON; 744 } else 745 bsd_args.flags |= MAP_NOSYNC; 746 if (flags & LINUX_MAP_GROWSDOWN) 747 bsd_args.flags |= MAP_STACK; 748 749 /* 750 * PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC 751 * on Linux/i386. We do this to ensure maximum compatibility. 752 * Linux/ia64 does the same in i386 emulation mode. 753 */ 754 bsd_args.prot = prot; 755 if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) 756 bsd_args.prot |= PROT_READ | PROT_EXEC; 757 758 /* Linux does not check file descriptor when MAP_ANONYMOUS is set. */ 759 bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : fd; 760 if (bsd_args.fd != -1) { 761 /* 762 * Linux follows Solaris mmap(2) description: 763 * The file descriptor fildes is opened with 764 * read permission, regardless of the 765 * protection options specified. 766 */ 767 768 if ((error = fget(td, bsd_args.fd, &fp)) != 0) 769 return (error); 770 if (fp->f_type != DTYPE_VNODE) { 771 fdrop(fp, td); 772 return (EINVAL); 773 } 774 775 /* Linux mmap() just fails for O_WRONLY files */ 776 if (!(fp->f_flag & FREAD)) { 777 fdrop(fp, td); 778 return (EACCES); 779 } 780 781 fdrop(fp, td); 782 } 783 784 if (flags & LINUX_MAP_GROWSDOWN) { 785 /* 786 * The Linux MAP_GROWSDOWN option does not limit auto 787 * growth of the region. Linux mmap with this option 788 * takes as addr the inital BOS, and as len, the initial 789 * region size. It can then grow down from addr without 790 * limit. However, Linux threads has an implicit internal 791 * limit to stack size of STACK_SIZE. Its just not 792 * enforced explicitly in Linux. But, here we impose 793 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack 794 * region, since we can do this with our mmap. 795 * 796 * Our mmap with MAP_STACK takes addr as the maximum 797 * downsize limit on BOS, and as len the max size of 798 * the region. It then maps the top SGROWSIZ bytes, 799 * and auto grows the region down, up to the limit 800 * in addr. 801 * 802 * If we don't use the MAP_STACK option, the effect 803 * of this code is to allocate a stack region of a 804 * fixed size of (STACK_SIZE - GUARD_SIZE). 805 */ 806 807 if ((caddr_t)PTRIN(addr) + len > p->p_vmspace->vm_maxsaddr) { 808 /* 809 * Some Linux apps will attempt to mmap 810 * thread stacks near the top of their 811 * address space. If their TOS is greater 812 * than vm_maxsaddr, vm_map_growstack() 813 * will confuse the thread stack with the 814 * process stack and deliver a SEGV if they 815 * attempt to grow the thread stack past their 816 * current stacksize rlimit. To avoid this, 817 * adjust vm_maxsaddr upwards to reflect 818 * the current stacksize rlimit rather 819 * than the maximum possible stacksize. 820 * It would be better to adjust the 821 * mmap'ed region, but some apps do not check 822 * mmap's return value. 823 */ 824 PROC_LOCK(p); 825 p->p_vmspace->vm_maxsaddr = (char *)LINUX32_USRSTACK - 826 lim_cur(p, RLIMIT_STACK); 827 PROC_UNLOCK(p); 828 } 829 830 /* 831 * This gives us our maximum stack size and a new BOS. 832 * If we're using VM_STACK, then mmap will just map 833 * the top SGROWSIZ bytes, and let the stack grow down 834 * to the limit at BOS. If we're not using VM_STACK 835 * we map the full stack, since we don't have a way 836 * to autogrow it. 837 */ 838 if (len > STACK_SIZE - GUARD_SIZE) { 839 bsd_args.addr = (caddr_t)PTRIN(addr); 840 bsd_args.len = len; 841 } else { 842 bsd_args.addr = (caddr_t)PTRIN(addr) - 843 (STACK_SIZE - GUARD_SIZE - len); 844 bsd_args.len = STACK_SIZE - GUARD_SIZE; 845 } 846 } else { 847 bsd_args.addr = (caddr_t)PTRIN(addr); 848 bsd_args.len = len; 849 } 850 bsd_args.pos = pos; 851 852 #ifdef DEBUG 853 if (ldebug(mmap)) 854 printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n", 855 __func__, 856 (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot, 857 bsd_args.flags, bsd_args.fd, (int)bsd_args.pos); 858 #endif 859 error = mmap(td, &bsd_args); 860 #ifdef DEBUG 861 if (ldebug(mmap)) 862 printf("-> %s() return: 0x%x (0x%08x)\n", 863 __func__, error, (u_int)td->td_retval[0]); 864 #endif 865 return (error); 866 } 867 868 int 869 linux_mprotect(struct thread *td, struct linux_mprotect_args *uap) 870 { 871 struct mprotect_args bsd_args; 872 873 bsd_args.addr = uap->addr; 874 bsd_args.len = uap->len; 875 bsd_args.prot = uap->prot; 876 if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) 877 bsd_args.prot |= PROT_READ | PROT_EXEC; 878 return (mprotect(td, &bsd_args)); 879 } 880 881 int 882 linux_iopl(struct thread *td, struct linux_iopl_args *args) 883 { 884 int error; 885 886 if (args->level < 0 || args->level > 3) 887 return (EINVAL); 888 if ((error = priv_check(td, PRIV_IO)) != 0) 889 return (error); 890 if ((error = securelevel_gt(td->td_ucred, 0)) != 0) 891 return (error); 892 td->td_frame->tf_rflags = (td->td_frame->tf_rflags & ~PSL_IOPL) | 893 (args->level * (PSL_IOPL / 3)); 894 895 return (0); 896 } 897 898 int 899 linux_pipe(struct thread *td, struct linux_pipe_args *args) 900 { 901 int error; 902 int fildes[2]; 903 904 #ifdef DEBUG 905 if (ldebug(pipe)) 906 printf(ARGS(pipe, "*")); 907 #endif 908 909 error = kern_pipe(td, fildes); 910 if (error) 911 return (error); 912 913 /* XXX: Close descriptors on error. */ 914 return (copyout(fildes, args->pipefds, sizeof fildes)); 915 } 916 917 int 918 linux_sigaction(struct thread *td, struct linux_sigaction_args *args) 919 { 920 l_osigaction_t osa; 921 l_sigaction_t act, oact; 922 int error; 923 924 #ifdef DEBUG 925 if (ldebug(sigaction)) 926 printf(ARGS(sigaction, "%d, %p, %p"), 927 args->sig, (void *)args->nsa, (void *)args->osa); 928 #endif 929 930 if (args->nsa != NULL) { 931 error = copyin(args->nsa, &osa, sizeof(l_osigaction_t)); 932 if (error) 933 return (error); 934 act.lsa_handler = osa.lsa_handler; 935 act.lsa_flags = osa.lsa_flags; 936 act.lsa_restorer = osa.lsa_restorer; 937 LINUX_SIGEMPTYSET(act.lsa_mask); 938 act.lsa_mask.__bits[0] = osa.lsa_mask; 939 } 940 941 error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL, 942 args->osa ? &oact : NULL); 943 944 if (args->osa != NULL && !error) { 945 osa.lsa_handler = oact.lsa_handler; 946 osa.lsa_flags = oact.lsa_flags; 947 osa.lsa_restorer = oact.lsa_restorer; 948 osa.lsa_mask = oact.lsa_mask.__bits[0]; 949 error = copyout(&osa, args->osa, sizeof(l_osigaction_t)); 950 } 951 952 return (error); 953 } 954 955 /* 956 * Linux has two extra args, restart and oldmask. We don't use these, 957 * but it seems that "restart" is actually a context pointer that 958 * enables the signal to happen with a different register set. 959 */ 960 int 961 linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args) 962 { 963 sigset_t sigmask; 964 l_sigset_t mask; 965 966 #ifdef DEBUG 967 if (ldebug(sigsuspend)) 968 printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask); 969 #endif 970 971 LINUX_SIGEMPTYSET(mask); 972 mask.__bits[0] = args->mask; 973 linux_to_bsd_sigset(&mask, &sigmask); 974 return (kern_sigsuspend(td, sigmask)); 975 } 976 977 int 978 linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap) 979 { 980 l_sigset_t lmask; 981 sigset_t sigmask; 982 int error; 983 984 #ifdef DEBUG 985 if (ldebug(rt_sigsuspend)) 986 printf(ARGS(rt_sigsuspend, "%p, %d"), 987 (void *)uap->newset, uap->sigsetsize); 988 #endif 989 990 if (uap->sigsetsize != sizeof(l_sigset_t)) 991 return (EINVAL); 992 993 error = copyin(uap->newset, &lmask, sizeof(l_sigset_t)); 994 if (error) 995 return (error); 996 997 linux_to_bsd_sigset(&lmask, &sigmask); 998 return (kern_sigsuspend(td, sigmask)); 999 } 1000 1001 int 1002 linux_pause(struct thread *td, struct linux_pause_args *args) 1003 { 1004 struct proc *p = td->td_proc; 1005 sigset_t sigmask; 1006 1007 #ifdef DEBUG 1008 if (ldebug(pause)) 1009 printf(ARGS(pause, "")); 1010 #endif 1011 1012 PROC_LOCK(p); 1013 sigmask = td->td_sigmask; 1014 PROC_UNLOCK(p); 1015 return (kern_sigsuspend(td, sigmask)); 1016 } 1017 1018 int 1019 linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap) 1020 { 1021 stack_t ss, oss; 1022 l_stack_t lss; 1023 int error; 1024 1025 #ifdef DEBUG 1026 if (ldebug(sigaltstack)) 1027 printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss); 1028 #endif 1029 1030 if (uap->uss != NULL) { 1031 error = copyin(uap->uss, &lss, sizeof(l_stack_t)); 1032 if (error) 1033 return (error); 1034 1035 ss.ss_sp = PTRIN(lss.ss_sp); 1036 ss.ss_size = lss.ss_size; 1037 ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags); 1038 } 1039 error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL, 1040 (uap->uoss != NULL) ? &oss : NULL); 1041 if (!error && uap->uoss != NULL) { 1042 lss.ss_sp = PTROUT(oss.ss_sp); 1043 lss.ss_size = oss.ss_size; 1044 lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags); 1045 error = copyout(&lss, uap->uoss, sizeof(l_stack_t)); 1046 } 1047 1048 return (error); 1049 } 1050 1051 int 1052 linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args) 1053 { 1054 struct ftruncate_args sa; 1055 1056 #ifdef DEBUG 1057 if (ldebug(ftruncate64)) 1058 printf(ARGS(ftruncate64, "%u, %jd"), args->fd, 1059 (intmax_t)args->length); 1060 #endif 1061 1062 sa.fd = args->fd; 1063 sa.length = args->length; 1064 return ftruncate(td, &sa); 1065 } 1066 1067 int 1068 linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap) 1069 { 1070 struct timeval atv; 1071 l_timeval atv32; 1072 struct timezone rtz; 1073 int error = 0; 1074 1075 if (uap->tp) { 1076 microtime(&atv); 1077 atv32.tv_sec = atv.tv_sec; 1078 atv32.tv_usec = atv.tv_usec; 1079 error = copyout(&atv32, uap->tp, sizeof(atv32)); 1080 } 1081 if (error == 0 && uap->tzp != NULL) { 1082 rtz.tz_minuteswest = tz_minuteswest; 1083 rtz.tz_dsttime = tz_dsttime; 1084 error = copyout(&rtz, uap->tzp, sizeof(rtz)); 1085 } 1086 return (error); 1087 } 1088 1089 int 1090 linux_settimeofday(struct thread *td, struct linux_settimeofday_args *uap) 1091 { 1092 l_timeval atv32; 1093 struct timeval atv, *tvp; 1094 struct timezone atz, *tzp; 1095 int error; 1096 1097 if (uap->tp) { 1098 error = copyin(uap->tp, &atv32, sizeof(atv32)); 1099 if (error) 1100 return (error); 1101 atv.tv_sec = atv32.tv_sec; 1102 atv.tv_usec = atv32.tv_usec; 1103 tvp = &atv; 1104 } else 1105 tvp = NULL; 1106 if (uap->tzp) { 1107 error = copyin(uap->tzp, &atz, sizeof(atz)); 1108 if (error) 1109 return (error); 1110 tzp = &atz; 1111 } else 1112 tzp = NULL; 1113 return (kern_settimeofday(td, tvp, tzp)); 1114 } 1115 1116 int 1117 linux_getrusage(struct thread *td, struct linux_getrusage_args *uap) 1118 { 1119 struct l_rusage s32; 1120 struct rusage s; 1121 int error; 1122 1123 error = kern_getrusage(td, uap->who, &s); 1124 if (error != 0) 1125 return (error); 1126 if (uap->rusage != NULL) { 1127 s32.ru_utime.tv_sec = s.ru_utime.tv_sec; 1128 s32.ru_utime.tv_usec = s.ru_utime.tv_usec; 1129 s32.ru_stime.tv_sec = s.ru_stime.tv_sec; 1130 s32.ru_stime.tv_usec = s.ru_stime.tv_usec; 1131 s32.ru_maxrss = s.ru_maxrss; 1132 s32.ru_ixrss = s.ru_ixrss; 1133 s32.ru_idrss = s.ru_idrss; 1134 s32.ru_isrss = s.ru_isrss; 1135 s32.ru_minflt = s.ru_minflt; 1136 s32.ru_majflt = s.ru_majflt; 1137 s32.ru_nswap = s.ru_nswap; 1138 s32.ru_inblock = s.ru_inblock; 1139 s32.ru_oublock = s.ru_oublock; 1140 s32.ru_msgsnd = s.ru_msgsnd; 1141 s32.ru_msgrcv = s.ru_msgrcv; 1142 s32.ru_nsignals = s.ru_nsignals; 1143 s32.ru_nvcsw = s.ru_nvcsw; 1144 s32.ru_nivcsw = s.ru_nivcsw; 1145 error = copyout(&s32, uap->rusage, sizeof(s32)); 1146 } 1147 return (error); 1148 } 1149 1150 int 1151 linux_sched_rr_get_interval(struct thread *td, 1152 struct linux_sched_rr_get_interval_args *uap) 1153 { 1154 struct timespec ts; 1155 struct l_timespec ts32; 1156 int error; 1157 1158 error = kern_sched_rr_get_interval(td, uap->pid, &ts); 1159 if (error != 0) 1160 return (error); 1161 ts32.tv_sec = ts.tv_sec; 1162 ts32.tv_nsec = ts.tv_nsec; 1163 return (copyout(&ts32, uap->interval, sizeof(ts32))); 1164 } 1165 1166 int 1167 linux_set_thread_area(struct thread *td, 1168 struct linux_set_thread_area_args *args) 1169 { 1170 struct l_user_desc info; 1171 struct user_segment_descriptor sd; 1172 int a[2]; 1173 int error; 1174 1175 error = copyin(args->desc, &info, sizeof(struct l_user_desc)); 1176 if (error) 1177 return (error); 1178 1179 #ifdef DEBUG 1180 if (ldebug(set_thread_area)) 1181 printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, " 1182 "%i, %i, %i"), info.entry_number, info.base_addr, 1183 info.limit, info.seg_32bit, info.contents, 1184 info.read_exec_only, info.limit_in_pages, 1185 info.seg_not_present, info.useable); 1186 #endif 1187 1188 /* 1189 * Semantics of Linux version: every thread in the system has array 1190 * of three TLS descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown. 1191 * This syscall loads one of the selected TLS decriptors with a value 1192 * and also loads GDT descriptors 6, 7 and 8 with the content of 1193 * the per-thread descriptors. 1194 * 1195 * Semantics of FreeBSD version: I think we can ignore that Linux has 1196 * three per-thread descriptors and use just the first one. 1197 * The tls_array[] is used only in [gs]et_thread_area() syscalls and 1198 * for loading the GDT descriptors. We use just one GDT descriptor 1199 * for TLS, so we will load just one. 1200 * 1201 * XXX: This doesn't work when a user space process tries to use more 1202 * than one TLS segment. Comment in the Linux source says wine might 1203 * do this. 1204 */ 1205 1206 /* 1207 * GLIBC reads current %gs and call set_thread_area() with it. 1208 * We should let GUDATA_SEL and GUGS32_SEL proceed as well because 1209 * we use these segments. 1210 */ 1211 switch (info.entry_number) { 1212 case GUGS32_SEL: 1213 case GUDATA_SEL: 1214 case 6: 1215 case -1: 1216 info.entry_number = GUGS32_SEL; 1217 break; 1218 default: 1219 return (EINVAL); 1220 } 1221 1222 /* 1223 * We have to copy out the GDT entry we use. 1224 * 1225 * XXX: What if a user space program does not check the return value 1226 * and tries to use 6, 7 or 8? 1227 */ 1228 error = copyout(&info, args->desc, sizeof(struct l_user_desc)); 1229 if (error) 1230 return (error); 1231 1232 if (LINUX_LDT_empty(&info)) { 1233 a[0] = 0; 1234 a[1] = 0; 1235 } else { 1236 a[0] = LINUX_LDT_entry_a(&info); 1237 a[1] = LINUX_LDT_entry_b(&info); 1238 } 1239 1240 memcpy(&sd, &a, sizeof(a)); 1241 #ifdef DEBUG 1242 if (ldebug(set_thread_area)) 1243 printf("Segment created in set_thread_area: " 1244 "lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, " 1245 "type: %i, dpl: %i, p: %i, xx: %i, long: %i, " 1246 "def32: %i, gran: %i\n", 1247 sd.sd_lobase, 1248 sd.sd_hibase, 1249 sd.sd_lolimit, 1250 sd.sd_hilimit, 1251 sd.sd_type, 1252 sd.sd_dpl, 1253 sd.sd_p, 1254 sd.sd_xx, 1255 sd.sd_long, 1256 sd.sd_def32, 1257 sd.sd_gran); 1258 #endif 1259 1260 td->td_pcb->pcb_gsbase = (register_t)info.base_addr; 1261 td->td_pcb->pcb_flags |= PCB_32BIT | PCB_GS32BIT; 1262 update_gdt_gsbase(td, info.base_addr); 1263 1264 return (0); 1265 } 1266