xref: /freebsd/sys/amd64/linux32/linux32_machdep.c (revision 57c4583f70ab9d25b3aed17f20ec7843f9673539)
1 /*-
2  * Copyright (c) 2004 Tim J. Robbins
3  * Copyright (c) 2002 Doug Rabson
4  * Copyright (c) 2000 Marcel Moolenaar
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer
12  *    in this position and unchanged.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. The name of the author may not be used to endorse or promote products
17  *    derived from this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/param.h>
35 #include <sys/kernel.h>
36 #include <sys/systm.h>
37 #include <sys/clock.h>
38 #include <sys/imgact.h>
39 #include <sys/limits.h>
40 #include <sys/lock.h>
41 #include <sys/malloc.h>
42 #include <sys/mman.h>
43 #include <sys/mutex.h>
44 #include <sys/proc.h>
45 #include <sys/resource.h>
46 #include <sys/resourcevar.h>
47 #include <sys/syscallsubr.h>
48 #include <sys/sysproto.h>
49 #include <sys/unistd.h>
50 
51 #include <machine/frame.h>
52 
53 #include <vm/vm.h>
54 #include <vm/pmap.h>
55 #include <vm/vm_extern.h>
56 #include <vm/vm_kern.h>
57 #include <vm/vm_map.h>
58 
59 #include <amd64/linux32/linux.h>
60 #include <amd64/linux32/linux32_proto.h>
61 #include <compat/linux/linux_ipc.h>
62 #include <compat/linux/linux_signal.h>
63 #include <compat/linux/linux_util.h>
64 #include <compat/linux/linux_emul.h>
65 
/*
 * Argument block read from user space by linux_old_select().  The
 * l_uintptr_t members are user addresses that linux_old_select()
 * converts with PTRIN() before calling linux_select().
 */
struct l_old_select_argv {
	l_int		nfds;		/* forwarded as linux_select's nfds */
	l_uintptr_t	readfds;	/* forwarded as readfds */
	l_uintptr_t	writefds;	/* forwarded as writefds */
	l_uintptr_t	exceptfds;	/* forwarded as exceptfds */
	l_uintptr_t	timeout;	/* forwarded as timeout */
} __packed;
73 
74 int
75 linux_to_bsd_sigaltstack(int lsa)
76 {
77 	int bsa = 0;
78 
79 	if (lsa & LINUX_SS_DISABLE)
80 		bsa |= SS_DISABLE;
81 	if (lsa & LINUX_SS_ONSTACK)
82 		bsa |= SS_ONSTACK;
83 	return (bsa);
84 }
85 
86 int
87 bsd_to_linux_sigaltstack(int bsa)
88 {
89 	int lsa = 0;
90 
91 	if (bsa & SS_DISABLE)
92 		lsa |= LINUX_SS_DISABLE;
93 	if (bsa & SS_ONSTACK)
94 		lsa |= LINUX_SS_ONSTACK;
95 	return (lsa);
96 }
97 
98 /*
99  * Custom version of exec_copyin_args() so that we can translate
100  * the pointers.
101  */
102 static int
103 linux_exec_copyin_args(struct image_args *args, char *fname,
104     enum uio_seg segflg, char **argv, char **envv)
105 {
106 	char *argp, *envp;
107 	u_int32_t *p32, arg;
108 	size_t length;
109 	int error;
110 
111 	bzero(args, sizeof(*args));
112 	if (argv == NULL)
113 		return (EFAULT);
114 
115 	/*
116 	 * Allocate temporary demand zeroed space for argument and
117 	 *	environment strings
118 	 */
119 	args->buf = (char *) kmem_alloc_wait(exec_map,
120 	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
121 	if (args->buf == NULL)
122 		return (ENOMEM);
123 	args->begin_argv = args->buf;
124 	args->endp = args->begin_argv;
125 	args->stringspace = ARG_MAX;
126 
127 	args->fname = args->buf + ARG_MAX;
128 
129 	/*
130 	 * Copy the file name.
131 	 */
132 	error = (segflg == UIO_SYSSPACE) ?
133 	    copystr(fname, args->fname, PATH_MAX, &length) :
134 	    copyinstr(fname, args->fname, PATH_MAX, &length);
135 	if (error != 0)
136 		goto err_exit;
137 
138 	/*
139 	 * extract arguments first
140 	 */
141 	p32 = (u_int32_t *)argv;
142 	for (;;) {
143 		error = copyin(p32++, &arg, sizeof(arg));
144 		if (error)
145 			goto err_exit;
146 		if (arg == 0)
147 			break;
148 		argp = PTRIN(arg);
149 		error = copyinstr(argp, args->endp, args->stringspace, &length);
150 		if (error) {
151 			if (error == ENAMETOOLONG)
152 				error = E2BIG;
153 
154 			goto err_exit;
155 		}
156 		args->stringspace -= length;
157 		args->endp += length;
158 		args->argc++;
159 	}
160 
161 	args->begin_envv = args->endp;
162 
163 	/*
164 	 * extract environment strings
165 	 */
166 	if (envv) {
167 		p32 = (u_int32_t *)envv;
168 		for (;;) {
169 			error = copyin(p32++, &arg, sizeof(arg));
170 			if (error)
171 				goto err_exit;
172 			if (arg == 0)
173 				break;
174 			envp = PTRIN(arg);
175 			error = copyinstr(envp, args->endp, args->stringspace,
176 			    &length);
177 			if (error) {
178 				if (error == ENAMETOOLONG)
179 					error = E2BIG;
180 				goto err_exit;
181 			}
182 			args->stringspace -= length;
183 			args->endp += length;
184 			args->envc++;
185 		}
186 	}
187 
188 	return (0);
189 
190 err_exit:
191 	kmem_free_wakeup(exec_map, (vm_offset_t)args->buf,
192 	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
193 	args->buf = NULL;
194 	return (error);
195 }
196 
197 int
198 linux_execve(struct thread *td, struct linux_execve_args *args)
199 {
200 	struct image_args eargs;
201 	char *path;
202 	int error;
203 
204 	LCONVPATHEXIST(td, args->path, &path);
205 
206 #ifdef DEBUG
207 	if (ldebug(execve))
208 		printf(ARGS(execve, "%s"), path);
209 #endif
210 
211 	error = linux_exec_copyin_args(&eargs, path, UIO_SYSSPACE, args->argp,
212 	    args->envp);
213 	free(path, M_TEMP);
214 	if (error == 0)
215 		error = kern_execve(td, &eargs, NULL);
216 	if (error == 0)
217 	   	/* linux process can exec fbsd one, dont attempt
218 		 * to create emuldata for such process using
219 		 * linux_proc_init, this leads to a panic on KASSERT
220 		 * because such process has p->p_emuldata == NULL
221 		 */
222 	   	if (td->td_proc->p_sysent == &elf_linux_sysvec)
223    		   	error = linux_proc_init(td, 0, 0);
224 	return (error);
225 }
226 
/*
 * I/O vector element as copied in from user space by
 * linux32_copyinuio(), which widens each one into a native struct iovec.
 */
struct iovec32 {
	u_int32_t iov_base;	/* user address, converted with PTRIN() */
	int	iov_len;	/* length, assigned to the native iov_len */
};

/* The copyin in linux32_copyinuio() relies on this exact element size. */
CTASSERT(sizeof(struct iovec32) == 8);
233 
234 static int
235 linux32_copyinuio(struct iovec32 *iovp, u_int iovcnt, struct uio **uiop)
236 {
237 	struct iovec32 iov32;
238 	struct iovec *iov;
239 	struct uio *uio;
240 	u_int iovlen;
241 	int error, i;
242 
243 	*uiop = NULL;
244 	if (iovcnt > UIO_MAXIOV)
245 		return (EINVAL);
246 	iovlen = iovcnt * sizeof(struct iovec);
247 	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
248 	iov = (struct iovec *)(uio + 1);
249 	for (i = 0; i < iovcnt; i++) {
250 		error = copyin(&iovp[i], &iov32, sizeof(struct iovec32));
251 		if (error) {
252 			free(uio, M_IOV);
253 			return (error);
254 		}
255 		iov[i].iov_base = PTRIN(iov32.iov_base);
256 		iov[i].iov_len = iov32.iov_len;
257 	}
258 	uio->uio_iov = iov;
259 	uio->uio_iovcnt = iovcnt;
260 	uio->uio_segflg = UIO_USERSPACE;
261 	uio->uio_offset = -1;
262 	uio->uio_resid = 0;
263 	for (i = 0; i < iovcnt; i++) {
264 		if (iov->iov_len > INT_MAX - uio->uio_resid) {
265 			free(uio, M_IOV);
266 			return (EINVAL);
267 		}
268 		uio->uio_resid += iov->iov_len;
269 		iov++;
270 	}
271 	*uiop = uio;
272 	return (0);
273 }
274 
275 int
276 linux_readv(struct thread *td, struct linux_readv_args *uap)
277 {
278 	struct uio *auio;
279 	int error;
280 
281 	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
282 	if (error)
283 		return (error);
284 	error = kern_readv(td, uap->fd, auio);
285 	free(auio, M_IOV);
286 	return (error);
287 }
288 
289 int
290 linux_writev(struct thread *td, struct linux_writev_args *uap)
291 {
292 	struct uio *auio;
293 	int error;
294 
295 	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
296 	if (error)
297 		return (error);
298 	error = kern_writev(td, uap->fd, auio);
299 	free(auio, M_IOV);
300 	return (error);
301 }
302 
/*
 * Block that linux_ipc() copies in for LINUX_MSGRCV when the upper 16
 * bits of "what" are zero; it carries the message buffer address and
 * the message type together.
 */
struct l_ipc_kludge {
	l_uintptr_t msgp;	/* user address, converted with PTRIN() */
	l_long msgtyp;		/* forwarded as msgtyp */
} __packed;
307 
308 int
309 linux_ipc(struct thread *td, struct linux_ipc_args *args)
310 {
311 
312 	switch (args->what & 0xFFFF) {
313 	case LINUX_SEMOP: {
314 		struct linux_semop_args a;
315 
316 		a.semid = args->arg1;
317 		a.tsops = args->ptr;
318 		a.nsops = args->arg2;
319 		return (linux_semop(td, &a));
320 	}
321 	case LINUX_SEMGET: {
322 		struct linux_semget_args a;
323 
324 		a.key = args->arg1;
325 		a.nsems = args->arg2;
326 		a.semflg = args->arg3;
327 		return (linux_semget(td, &a));
328 	}
329 	case LINUX_SEMCTL: {
330 		struct linux_semctl_args a;
331 		int error;
332 
333 		a.semid = args->arg1;
334 		a.semnum = args->arg2;
335 		a.cmd = args->arg3;
336 		error = copyin(args->ptr, &a.arg, sizeof(a.arg));
337 		if (error)
338 			return (error);
339 		return (linux_semctl(td, &a));
340 	}
341 	case LINUX_MSGSND: {
342 		struct linux_msgsnd_args a;
343 
344 		a.msqid = args->arg1;
345 		a.msgp = args->ptr;
346 		a.msgsz = args->arg2;
347 		a.msgflg = args->arg3;
348 		return (linux_msgsnd(td, &a));
349 	}
350 	case LINUX_MSGRCV: {
351 		struct linux_msgrcv_args a;
352 
353 		a.msqid = args->arg1;
354 		a.msgsz = args->arg2;
355 		a.msgflg = args->arg3;
356 		if ((args->what >> 16) == 0) {
357 			struct l_ipc_kludge tmp;
358 			int error;
359 
360 			if (args->ptr == 0)
361 				return (EINVAL);
362 			error = copyin(args->ptr, &tmp, sizeof(tmp));
363 			if (error)
364 				return (error);
365 			a.msgp = PTRIN(tmp.msgp);
366 			a.msgtyp = tmp.msgtyp;
367 		} else {
368 			a.msgp = args->ptr;
369 			a.msgtyp = args->arg5;
370 		}
371 		return (linux_msgrcv(td, &a));
372 	}
373 	case LINUX_MSGGET: {
374 		struct linux_msgget_args a;
375 
376 		a.key = args->arg1;
377 		a.msgflg = args->arg2;
378 		return (linux_msgget(td, &a));
379 	}
380 	case LINUX_MSGCTL: {
381 		struct linux_msgctl_args a;
382 
383 		a.msqid = args->arg1;
384 		a.cmd = args->arg2;
385 		a.buf = args->ptr;
386 		return (linux_msgctl(td, &a));
387 	}
388 	case LINUX_SHMAT: {
389 		struct linux_shmat_args a;
390 
391 		a.shmid = args->arg1;
392 		a.shmaddr = args->ptr;
393 		a.shmflg = args->arg2;
394 		a.raddr = PTRIN((l_uint)args->arg3);
395 		return (linux_shmat(td, &a));
396 	}
397 	case LINUX_SHMDT: {
398 		struct linux_shmdt_args a;
399 
400 		a.shmaddr = args->ptr;
401 		return (linux_shmdt(td, &a));
402 	}
403 	case LINUX_SHMGET: {
404 		struct linux_shmget_args a;
405 
406 		a.key = args->arg1;
407 		a.size = args->arg2;
408 		a.shmflg = args->arg3;
409 		return (linux_shmget(td, &a));
410 	}
411 	case LINUX_SHMCTL: {
412 		struct linux_shmctl_args a;
413 
414 		a.shmid = args->arg1;
415 		a.cmd = args->arg2;
416 		a.buf = args->ptr;
417 		return (linux_shmctl(td, &a));
418 	}
419 	default:
420 		break;
421 	}
422 
423 	return (EINVAL);
424 }
425 
426 int
427 linux_old_select(struct thread *td, struct linux_old_select_args *args)
428 {
429 	struct l_old_select_argv linux_args;
430 	struct linux_select_args newsel;
431 	int error;
432 
433 #ifdef DEBUG
434 	if (ldebug(old_select))
435 		printf(ARGS(old_select, "%p"), args->ptr);
436 #endif
437 
438 	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
439 	if (error)
440 		return (error);
441 
442 	newsel.nfds = linux_args.nfds;
443 	newsel.readfds = PTRIN(linux_args.readfds);
444 	newsel.writefds = PTRIN(linux_args.writefds);
445 	newsel.exceptfds = PTRIN(linux_args.exceptfds);
446 	newsel.timeout = PTRIN(linux_args.timeout);
447 	return (linux_select(td, &newsel));
448 }
449 
450 int
451 linux_fork(struct thread *td, struct linux_fork_args *args)
452 {
453 	int error;
454 
455 #ifdef DEBUG
456 	if (ldebug(fork))
457 		printf(ARGS(fork, ""));
458 #endif
459 
460 	if ((error = fork(td, (struct fork_args *)args)) != 0)
461 		return (error);
462 
463 	if (td->td_retval[1] == 1)
464 		td->td_retval[0] = 0;
465 	error = linux_proc_init(td, td->td_retval[0], 0);
466 	if (error)
467 		return (error);
468 
469 	return (0);
470 }
471 
472 int
473 linux_vfork(struct thread *td, struct linux_vfork_args *args)
474 {
475 	int error;
476 	struct proc *p2;
477 
478 #ifdef DEBUG
479 	if (ldebug(vfork))
480 		printf(ARGS(vfork, ""));
481 #endif
482 
483 	/* exclude RFPPWAIT */
484 	if ((error = fork1(td, RFFDG | RFPROC | RFMEM, 0, &p2)) != 0)
485 		return (error);
486 	if (error == 0) {
487 	   	td->td_retval[0] = p2->p_pid;
488 		td->td_retval[1] = 0;
489 	}
490 	/* Are we the child? */
491 	if (td->td_retval[1] == 1)
492 		td->td_retval[0] = 0;
493 	error = linux_proc_init(td, td->td_retval[0], 0);
494 	if (error)
495 		return (error);
496 	/* wait for the children to exit, ie. emulate vfork */
497 	PROC_LOCK(p2);
498 	while (p2->p_flag & P_PPWAIT)
499 	   	msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0);
500 	PROC_UNLOCK(p2);
501 	return (0);
502 }
503 
504 int
505 linux_clone(struct thread *td, struct linux_clone_args *args)
506 {
507 	int error, ff = RFPROC | RFSTOPPED;
508 	struct proc *p2;
509 	struct thread *td2;
510 	int exit_signal;
511 	struct linux_emuldata *em;
512 
513 #ifdef DEBUG
514 	if (ldebug(clone)) {
515    	   	printf(ARGS(clone, "flags %x, stack %x, parent tid: %x, child tid: %x"),
516 		    (unsigned int)args->flags, (unsigned int)(uintptr_t)args->stack,
517 		    (unsigned int)(uintptr_t)args->parent_tidptr,
518 		    (unsigned int)(uintptr_t)args->child_tidptr);
519 	}
520 #endif
521 
522 	exit_signal = args->flags & 0x000000ff;
523 	if (exit_signal >= LINUX_NSIG)
524 		return (EINVAL);
525 
526 	if (exit_signal <= LINUX_SIGTBLSZ)
527 		exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)];
528 
529 	if (args->flags & CLONE_VM)
530 		ff |= RFMEM;
531 	if (args->flags & CLONE_SIGHAND)
532 		ff |= RFSIGSHARE;
533 	/*
534 	 * XXX: in linux sharing of fs info (chroot/cwd/umask)
535 	 * and open files is independant. in fbsd its in one
536 	 * structure but in reality it doesnt make any problems
537 	 * because both this flags are set at once usually.
538 	 */
539 	if (!(args->flags & (CLONE_FILES | CLONE_FS)))
540 		ff |= RFFDG;
541 
542 	/*
543 	 * Attempt to detect when linux_clone(2) is used for creating
544 	 * kernel threads. Unfortunately despite the existence of the
545 	 * CLONE_THREAD flag, version of linuxthreads package used in
546 	 * most popular distros as of beginning of 2005 doesn't make
547 	 * any use of it. Therefore, this detection relay fully on
548 	 * empirical observation that linuxthreads sets certain
549 	 * combination of flags, so that we can make more or less
550 	 * precise detection and notify the FreeBSD kernel that several
551 	 * processes are in fact part of the same threading group, so
552 	 * that special treatment is necessary for signal delivery
553 	 * between those processes and fd locking.
554 	 */
555 	if ((args->flags & 0xffffff00) == THREADING_FLAGS)
556 		ff |= RFTHREAD;
557 
558 	error = fork1(td, ff, 0, &p2);
559 	if (error)
560 		return (error);
561 
562 	/* create the emuldata */
563 	error = linux_proc_init(td, p2->p_pid, args->flags);
564 	/* reference it - no need to check this */
565 	em = em_find(p2, EMUL_UNLOCKED);
566 	KASSERT(em != NULL, ("clone: emuldata not found.\n"));
567 	/* and adjust it */
568 	if (args->flags & CLONE_PARENT_SETTID) {
569 	   	if (args->parent_tidptr == NULL) {
570 		   	EMUL_UNLOCK(&emul_lock);
571 			return (EINVAL);
572 		}
573 		error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid));
574 		if (error) {
575 		   	EMUL_UNLOCK(&emul_lock);
576 			return (error);
577 		}
578 	}
579 
580 	if (args->flags & (CLONE_PARENT|CLONE_THREAD)) {
581 	   	sx_xlock(&proctree_lock);
582 		PROC_LOCK(p2);
583 		proc_reparent(p2, td->td_proc->p_pptr);
584 		PROC_UNLOCK(p2);
585 		sx_xunlock(&proctree_lock);
586 	}
587 
588 	if (args->flags & CLONE_THREAD) {
589 	   	/* XXX: linux mangles pgrp and pptr somehow
590 		 * I think it might be this but I am not sure.
591 		 */
592 #ifdef notyet
593 	   	PROC_LOCK(p2);
594 	   	p2->p_pgrp = td->td_proc->p_pgrp;
595 	   	PROC_UNLOCK(p2);
596 #endif
597 	 	exit_signal = 0;
598 	}
599 
600 	if (args->flags & CLONE_CHILD_SETTID)
601 		em->child_set_tid = args->child_tidptr;
602 	else
603 	   	em->child_set_tid = NULL;
604 
605 	if (args->flags & CLONE_CHILD_CLEARTID)
606 		em->child_clear_tid = args->child_tidptr;
607 	else
608 	   	em->child_clear_tid = NULL;
609 
610 	EMUL_UNLOCK(&emul_lock);
611 
612 	PROC_LOCK(p2);
613 	p2->p_sigparent = exit_signal;
614 	PROC_UNLOCK(p2);
615 	td2 = FIRST_THREAD_IN_PROC(p2);
616 	/*
617 	 * in a case of stack = NULL we are supposed to COW calling process stack
618 	 * this is what normal fork() does so we just keep the tf_rsp arg intact
619 	 */
620 	if (args->stack)
621    	   	td2->td_frame->tf_rsp = PTROUT(args->stack);
622 
623 	if (args->flags & CLONE_SETTLS) {
624 	   	/* XXX: todo */
625 	}
626 
627 #ifdef DEBUG
628 	if (ldebug(clone))
629 		printf(LMSG("clone: successful rfork to %ld, stack %p sig = %d"),
630 		    (long)p2->p_pid, args->stack, exit_signal);
631 #endif
632 
633 	/*
634 	 * Make this runnable after we are finished with it.
635 	 */
636 	mtx_lock_spin(&sched_lock);
637 	TD_SET_CAN_RUN(td2);
638 	setrunqueue(td2, SRQ_BORING);
639 	mtx_unlock_spin(&sched_lock);
640 
641 	td->td_retval[0] = p2->p_pid;
642 	td->td_retval[1] = 0;
643 
644 	if (args->flags & CLONE_VFORK) {
645    	   	/* wait for the children to exit, ie. emulate vfork */
646    	   	PROC_LOCK(p2);
647 		p2->p_flag |= P_PPWAIT;
648 		while (p2->p_flag & P_PPWAIT)
649    		   	msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0);
650 		PROC_UNLOCK(p2);
651 	}
652 
653 	return (0);
654 }
655 
/* XXX move */
/*
 * mmap arguments in the form consumed by linux_mmap_common().
 * linux_mmap() copies this structure in from user space;
 * linux_mmap2() fills it from its own syscall arguments.
 */
struct l_mmap_argv {
	l_ulong		addr;
	l_ulong		len;
	l_ulong		prot;
	l_ulong		flags;
	l_ulong		fd;
	/* In pages by the time linux_mmap_common() sees it. */
	l_ulong		pgoff;
};

/*
 * For LINUX_MAP_GROWSDOWN mappings linux_mmap_common() uses a region
 * length of at least (STACK_SIZE - GUARD_SIZE).
 */
#define STACK_SIZE  (2 * 1024 * 1024)
#define GUARD_SIZE  (4 * PAGE_SIZE)

/* Shared back end of linux_mmap() and linux_mmap2(). */
static int linux_mmap_common(struct thread *, struct l_mmap_argv *);
670 
671 int
672 linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
673 {
674 	struct l_mmap_argv linux_args;
675 
676 #ifdef DEBUG
677 	if (ldebug(mmap2))
678 		printf(ARGS(mmap2, "%p, %d, %d, 0x%08x, %d, %d"),
679 		    (void *)(intptr_t)args->addr, args->len, args->prot,
680 		    args->flags, args->fd, args->pgoff);
681 #endif
682 
683 	linux_args.addr = PTROUT(args->addr);
684 	linux_args.len = args->len;
685 	linux_args.prot = args->prot;
686 	linux_args.flags = args->flags;
687 	linux_args.fd = args->fd;
688 	linux_args.pgoff = args->pgoff;
689 
690 	return (linux_mmap_common(td, &linux_args));
691 }
692 
693 int
694 linux_mmap(struct thread *td, struct linux_mmap_args *args)
695 {
696 	int error;
697 	struct l_mmap_argv linux_args;
698 
699 	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
700 	if (error)
701 		return (error);
702 
703 #ifdef DEBUG
704 	if (ldebug(mmap))
705 		printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"),
706 		    (void *)(intptr_t)linux_args.addr, linux_args.len,
707 		    linux_args.prot, linux_args.flags, linux_args.fd,
708 		    linux_args.pgoff);
709 #endif
710 	if ((linux_args.pgoff % PAGE_SIZE) != 0)
711 		return (EINVAL);
712 	linux_args.pgoff /= PAGE_SIZE;
713 
714 	return (linux_mmap_common(td, &linux_args));
715 }
716 
717 static int
718 linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args)
719 {
720 	struct proc *p = td->td_proc;
721 	struct mmap_args /* {
722 		caddr_t addr;
723 		size_t len;
724 		int prot;
725 		int flags;
726 		int fd;
727 		long pad;
728 		off_t pos;
729 	} */ bsd_args;
730 	int error;
731 
732 	error = 0;
733 	bsd_args.flags = 0;
734 	if (linux_args->flags & LINUX_MAP_SHARED)
735 		bsd_args.flags |= MAP_SHARED;
736 	if (linux_args->flags & LINUX_MAP_PRIVATE)
737 		bsd_args.flags |= MAP_PRIVATE;
738 	if (linux_args->flags & LINUX_MAP_FIXED)
739 		bsd_args.flags |= MAP_FIXED;
740 	if (linux_args->flags & LINUX_MAP_ANON)
741 		bsd_args.flags |= MAP_ANON;
742 	else
743 		bsd_args.flags |= MAP_NOSYNC;
744 	if (linux_args->flags & LINUX_MAP_GROWSDOWN) {
745 		bsd_args.flags |= MAP_STACK;
746 
747 		/*
748 		 * The linux MAP_GROWSDOWN option does not limit auto
749 		 * growth of the region.  Linux mmap with this option
750 		 * takes as addr the inital BOS, and as len, the initial
751 		 * region size.  It can then grow down from addr without
752 		 * limit.  However, linux threads has an implicit internal
753 		 * limit to stack size of STACK_SIZE.  Its just not
754 		 * enforced explicitly in linux.  But, here we impose
755 		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
756 		 * region, since we can do this with our mmap.
757 		 *
758 		 * Our mmap with MAP_STACK takes addr as the maximum
759 		 * downsize limit on BOS, and as len the max size of
760 		 * the region.  It them maps the top SGROWSIZ bytes,
761 		 * and autgrows the region down, up to the limit
762 		 * in addr.
763 		 *
764 		 * If we don't use the MAP_STACK option, the effect
765 		 * of this code is to allocate a stack region of a
766 		 * fixed size of (STACK_SIZE - GUARD_SIZE).
767 		 */
768 
769 		/* This gives us TOS */
770 		bsd_args.addr = (caddr_t)PTRIN(linux_args->addr) +
771 		    linux_args->len;
772 
773 		if ((caddr_t)PTRIN(bsd_args.addr) >
774 		    p->p_vmspace->vm_maxsaddr) {
775 			/*
776 			 * Some linux apps will attempt to mmap
777 			 * thread stacks near the top of their
778 			 * address space.  If their TOS is greater
779 			 * than vm_maxsaddr, vm_map_growstack()
780 			 * will confuse the thread stack with the
781 			 * process stack and deliver a SEGV if they
782 			 * attempt to grow the thread stack past their
783 			 * current stacksize rlimit.  To avoid this,
784 			 * adjust vm_maxsaddr upwards to reflect
785 			 * the current stacksize rlimit rather
786 			 * than the maximum possible stacksize.
787 			 * It would be better to adjust the
788 			 * mmap'ed region, but some apps do not check
789 			 * mmap's return value.
790 			 */
791 			PROC_LOCK(p);
792 			p->p_vmspace->vm_maxsaddr =
793 			    (char *)LINUX32_USRSTACK -
794 			    lim_cur(p, RLIMIT_STACK);
795 			PROC_UNLOCK(p);
796 		}
797 
798 		/* This gives us our maximum stack size */
799 		if (linux_args->len > STACK_SIZE - GUARD_SIZE)
800 			bsd_args.len = linux_args->len;
801 		else
802 			bsd_args.len  = STACK_SIZE - GUARD_SIZE;
803 
804 		/*
805 		 * This gives us a new BOS.  If we're using VM_STACK, then
806 		 * mmap will just map the top SGROWSIZ bytes, and let
807 		 * the stack grow down to the limit at BOS.  If we're
808 		 * not using VM_STACK we map the full stack, since we
809 		 * don't have a way to autogrow it.
810 		 */
811 		bsd_args.addr -= bsd_args.len;
812 	} else {
813 		bsd_args.addr = (caddr_t)PTRIN(linux_args->addr);
814 		bsd_args.len  = linux_args->len;
815 	}
816 	/*
817 	 * XXX i386 Linux always emulator forces PROT_READ on (why?)
818 	 * so we do the same. We add PROT_EXEC to work around buggy
819 	 * applications (e.g. Java) that take advantage of the fact
820 	 * that execute permissions are not enforced by x86 CPUs.
821 	 */
822 	bsd_args.prot = linux_args->prot | PROT_EXEC | PROT_READ;
823 	if (linux_args->flags & LINUX_MAP_ANON)
824 		bsd_args.fd = -1;
825 	else
826 		bsd_args.fd = linux_args->fd;
827 	bsd_args.pos = (off_t)linux_args->pgoff * PAGE_SIZE;
828 	bsd_args.pad = 0;
829 
830 #ifdef DEBUG
831 	if (ldebug(mmap))
832 		printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
833 		    __func__,
834 		    (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot,
835 		    bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
836 #endif
837 	error = mmap(td, &bsd_args);
838 #ifdef DEBUG
839 	if (ldebug(mmap))
840 		printf("-> %s() return: 0x%x (0x%08x)\n",
841 			__func__, error, (u_int)td->td_retval[0]);
842 #endif
843 	return (error);
844 }
845 
846 int
847 linux_pipe(struct thread *td, struct linux_pipe_args *args)
848 {
849 	int pip[2];
850 	int error;
851 	register_t reg_rdx;
852 
853 #ifdef DEBUG
854 	if (ldebug(pipe))
855 		printf(ARGS(pipe, "*"));
856 #endif
857 
858 	reg_rdx = td->td_retval[1];
859 	error = pipe(td, 0);
860 	if (error) {
861 		td->td_retval[1] = reg_rdx;
862 		return (error);
863 	}
864 
865 	pip[0] = td->td_retval[0];
866 	pip[1] = td->td_retval[1];
867 	error = copyout(pip, args->pipefds, 2 * sizeof(int));
868 	if (error) {
869 		td->td_retval[1] = reg_rdx;
870 		return (error);
871 	}
872 
873 	td->td_retval[1] = reg_rdx;
874 	td->td_retval[0] = 0;
875 	return (0);
876 }
877 
878 int
879 linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
880 {
881 	l_osigaction_t osa;
882 	l_sigaction_t act, oact;
883 	int error;
884 
885 #ifdef DEBUG
886 	if (ldebug(sigaction))
887 		printf(ARGS(sigaction, "%d, %p, %p"),
888 		    args->sig, (void *)args->nsa, (void *)args->osa);
889 #endif
890 
891 	if (args->nsa != NULL) {
892 		error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
893 		if (error)
894 			return (error);
895 		act.lsa_handler = osa.lsa_handler;
896 		act.lsa_flags = osa.lsa_flags;
897 		act.lsa_restorer = osa.lsa_restorer;
898 		LINUX_SIGEMPTYSET(act.lsa_mask);
899 		act.lsa_mask.__bits[0] = osa.lsa_mask;
900 	}
901 
902 	error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
903 	    args->osa ? &oact : NULL);
904 
905 	if (args->osa != NULL && !error) {
906 		osa.lsa_handler = oact.lsa_handler;
907 		osa.lsa_flags = oact.lsa_flags;
908 		osa.lsa_restorer = oact.lsa_restorer;
909 		osa.lsa_mask = oact.lsa_mask.__bits[0];
910 		error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
911 	}
912 
913 	return (error);
914 }
915 
916 /*
917  * Linux has two extra args, restart and oldmask.  We dont use these,
918  * but it seems that "restart" is actually a context pointer that
919  * enables the signal to happen with a different register set.
920  */
921 int
922 linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
923 {
924 	sigset_t sigmask;
925 	l_sigset_t mask;
926 
927 #ifdef DEBUG
928 	if (ldebug(sigsuspend))
929 		printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
930 #endif
931 
932 	LINUX_SIGEMPTYSET(mask);
933 	mask.__bits[0] = args->mask;
934 	linux_to_bsd_sigset(&mask, &sigmask);
935 	return (kern_sigsuspend(td, sigmask));
936 }
937 
938 int
939 linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
940 {
941 	l_sigset_t lmask;
942 	sigset_t sigmask;
943 	int error;
944 
945 #ifdef DEBUG
946 	if (ldebug(rt_sigsuspend))
947 		printf(ARGS(rt_sigsuspend, "%p, %d"),
948 		    (void *)uap->newset, uap->sigsetsize);
949 #endif
950 
951 	if (uap->sigsetsize != sizeof(l_sigset_t))
952 		return (EINVAL);
953 
954 	error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
955 	if (error)
956 		return (error);
957 
958 	linux_to_bsd_sigset(&lmask, &sigmask);
959 	return (kern_sigsuspend(td, sigmask));
960 }
961 
962 int
963 linux_pause(struct thread *td, struct linux_pause_args *args)
964 {
965 	struct proc *p = td->td_proc;
966 	sigset_t sigmask;
967 
968 #ifdef DEBUG
969 	if (ldebug(pause))
970 		printf(ARGS(pause, ""));
971 #endif
972 
973 	PROC_LOCK(p);
974 	sigmask = td->td_sigmask;
975 	PROC_UNLOCK(p);
976 	return (kern_sigsuspend(td, sigmask));
977 }
978 
979 int
980 linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
981 {
982 	stack_t ss, oss;
983 	l_stack_t lss;
984 	int error;
985 
986 #ifdef DEBUG
987 	if (ldebug(sigaltstack))
988 		printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
989 #endif
990 
991 	if (uap->uss != NULL) {
992 		error = copyin(uap->uss, &lss, sizeof(l_stack_t));
993 		if (error)
994 			return (error);
995 
996 		ss.ss_sp = PTRIN(lss.ss_sp);
997 		ss.ss_size = lss.ss_size;
998 		ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
999 	}
1000 	error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
1001 	    (uap->uoss != NULL) ? &oss : NULL);
1002 	if (!error && uap->uoss != NULL) {
1003 		lss.ss_sp = PTROUT(oss.ss_sp);
1004 		lss.ss_size = oss.ss_size;
1005 		lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
1006 		error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
1007 	}
1008 
1009 	return (error);
1010 }
1011 
1012 int
1013 linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
1014 {
1015 	struct ftruncate_args sa;
1016 
1017 #ifdef DEBUG
1018 	if (ldebug(ftruncate64))
1019 		printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
1020 		    (intmax_t)args->length);
1021 #endif
1022 
1023 	sa.fd = args->fd;
1024 	sa.pad = 0;
1025 	sa.length = args->length;
1026 	return ftruncate(td, &sa);
1027 }
1028 
1029 int
1030 linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap)
1031 {
1032 	struct timeval atv;
1033 	l_timeval atv32;
1034 	struct timezone rtz;
1035 	int error = 0;
1036 
1037 	if (uap->tp) {
1038 		microtime(&atv);
1039 		atv32.tv_sec = atv.tv_sec;
1040 		atv32.tv_usec = atv.tv_usec;
1041 		error = copyout(&atv32, uap->tp, sizeof (atv32));
1042 	}
1043 	if (error == 0 && uap->tzp != NULL) {
1044 		rtz.tz_minuteswest = tz_minuteswest;
1045 		rtz.tz_dsttime = tz_dsttime;
1046 		error = copyout(&rtz, uap->tzp, sizeof (rtz));
1047 	}
1048 	return (error);
1049 }
1050 
1051 int
1052 linux_nanosleep(struct thread *td, struct linux_nanosleep_args *uap)
1053 {
1054 	struct timespec rqt, rmt;
1055 	struct l_timespec ats32;
1056 	int error;
1057 
1058 	error = copyin(uap->rqtp, &ats32, sizeof(ats32));
1059 	if (error != 0)
1060 		return (error);
1061 	rqt.tv_sec = ats32.tv_sec;
1062 	rqt.tv_nsec = ats32.tv_nsec;
1063 	error = kern_nanosleep(td, &rqt, &rmt);
1064 	if (uap->rmtp != NULL) {
1065 		ats32.tv_sec = rmt.tv_sec;
1066 		ats32.tv_nsec = rmt.tv_nsec;
1067 		error = copyout(&ats32, uap->rmtp, sizeof(ats32));
1068 	}
1069 	return (error);
1070 }
1071 
1072 int
1073 linux_getrusage(struct thread *td, struct linux_getrusage_args *uap)
1074 {
1075 	struct l_rusage s32;
1076 	struct rusage s;
1077 	int error;
1078 
1079 	error = kern_getrusage(td, uap->who, &s);
1080 	if (error != 0)
1081 		return (error);
1082 	if (uap->rusage != NULL) {
1083 		s32.ru_utime.tv_sec = s.ru_utime.tv_sec;
1084 		s32.ru_utime.tv_usec = s.ru_utime.tv_usec;
1085 		s32.ru_stime.tv_sec = s.ru_stime.tv_sec;
1086 		s32.ru_stime.tv_usec = s.ru_stime.tv_usec;
1087 		s32.ru_maxrss = s.ru_maxrss;
1088 		s32.ru_ixrss = s.ru_ixrss;
1089 		s32.ru_idrss = s.ru_idrss;
1090 		s32.ru_isrss = s.ru_isrss;
1091 		s32.ru_minflt = s.ru_minflt;
1092 		s32.ru_majflt = s.ru_majflt;
1093 		s32.ru_nswap = s.ru_nswap;
1094 		s32.ru_inblock = s.ru_inblock;
1095 		s32.ru_oublock = s.ru_oublock;
1096 		s32.ru_msgsnd = s.ru_msgsnd;
1097 		s32.ru_msgrcv = s.ru_msgrcv;
1098 		s32.ru_nsignals = s.ru_nsignals;
1099 		s32.ru_nvcsw = s.ru_nvcsw;
1100 		s32.ru_nivcsw = s.ru_nivcsw;
1101 		error = copyout(&s32, uap->rusage, sizeof(s32));
1102 	}
1103 	return (error);
1104 }
1105 
1106 int
1107 linux_sched_rr_get_interval(struct thread *td,
1108     struct linux_sched_rr_get_interval_args *uap)
1109 {
1110 	struct timespec ts;
1111 	struct l_timespec ts32;
1112 	int error;
1113 
1114 	error = kern_sched_rr_get_interval(td, uap->pid, &ts);
1115 	if (error != 0)
1116 		return (error);
1117 	ts32.tv_sec = ts.tv_sec;
1118 	ts32.tv_nsec = ts.tv_nsec;
1119 	return (copyout(&ts32, uap->interval, sizeof(ts32)));
1120 }
1121 
1122 int
1123 linux_mprotect(struct thread *td, struct linux_mprotect_args *uap)
1124 {
1125 	struct mprotect_args bsd_args;
1126 
1127 	bsd_args.addr = uap->addr;
1128 	bsd_args.len = uap->len;
1129 	bsd_args.prot = uap->prot;
1130 	/* XXX PROT_READ implies PROT_EXEC; see linux_mmap_common(). */
1131 	if ((bsd_args.prot & PROT_READ) != 0)
1132 		bsd_args.prot |= PROT_EXEC;
1133 	return (mprotect(td, &bsd_args));
1134 }
1135