xref: /freebsd/sys/amd64/linux32/linux32_machdep.c (revision 63d45d7da0eac8efdeb765ac5caddfc2c5ca021e)
1 /*-
2  * Copyright (c) 2004 Tim J. Robbins
3  * Copyright (c) 2002 Doug Rabson
4  * Copyright (c) 2000 Marcel Moolenaar
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer
12  *    in this position and unchanged.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. The name of the author may not be used to endorse or promote products
17  *    derived from this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/param.h>
35 #include <sys/kernel.h>
36 #include <sys/systm.h>
37 #include <sys/imgact.h>
38 #include <sys/lock.h>
39 #include <sys/malloc.h>
40 #include <sys/mman.h>
41 #include <sys/mutex.h>
42 #include <sys/proc.h>
43 #include <sys/resource.h>
44 #include <sys/resourcevar.h>
45 #include <sys/syscallsubr.h>
46 #include <sys/sysproto.h>
47 #include <sys/unistd.h>
48 
49 #include <machine/frame.h>
50 
51 #include <vm/vm.h>
52 #include <vm/pmap.h>
53 #include <vm/vm_extern.h>
54 #include <vm/vm_kern.h>
55 #include <vm/vm_map.h>
56 
57 #include <amd64/linux32/linux.h>
58 #include <amd64/linux32/linux32_proto.h>
59 #include <compat/linux/linux_ipc.h>
60 #include <compat/linux/linux_signal.h>
61 #include <compat/linux/linux_util.h>
62 
63 struct l_old_select_argv {
64 	l_int		nfds;
65 	l_uintptr_t	readfds;
66 	l_uintptr_t	writefds;
67 	l_uintptr_t	exceptfds;
68 	l_uintptr_t	timeout;
69 } __packed;
70 
71 int
72 linux_to_bsd_sigaltstack(int lsa)
73 {
74 	int bsa = 0;
75 
76 	if (lsa & LINUX_SS_DISABLE)
77 		bsa |= SS_DISABLE;
78 	if (lsa & LINUX_SS_ONSTACK)
79 		bsa |= SS_ONSTACK;
80 	return (bsa);
81 }
82 
83 int
84 bsd_to_linux_sigaltstack(int bsa)
85 {
86 	int lsa = 0;
87 
88 	if (bsa & SS_DISABLE)
89 		lsa |= LINUX_SS_DISABLE;
90 	if (bsa & SS_ONSTACK)
91 		lsa |= LINUX_SS_ONSTACK;
92 	return (lsa);
93 }
94 
95 /*
96  * Custom version of exec_copyin_args() so that we can translate
97  * the pointers.
98  */
99 static int
100 linux_exec_copyin_args(struct image_args *args, char *fname,
101     enum uio_seg segflg, char **argv, char **envv)
102 {
103 	char *argp, *envp;
104 	u_int32_t *p32, arg;
105 	size_t length;
106 	int error;
107 
108 	bzero(args, sizeof(*args));
109 	if (argv == NULL)
110 		return (EFAULT);
111 
112 	/*
113 	 * Allocate temporary demand zeroed space for argument and
114 	 *	environment strings
115 	 */
116 	args->buf = (char *) kmem_alloc_wait(exec_map, PATH_MAX + ARG_MAX);
117 	if (args->buf == NULL)
118 		return (ENOMEM);
119 	args->begin_argv = args->buf;
120 	args->endp = args->begin_argv;
121 	args->stringspace = ARG_MAX;
122 
123 	args->fname = args->buf + ARG_MAX;
124 
125 	/*
126 	 * Copy the file name.
127 	 */
128 	error = (segflg == UIO_SYSSPACE) ?
129 	    copystr(fname, args->fname, PATH_MAX, &length) :
130 	    copyinstr(fname, args->fname, PATH_MAX, &length);
131 	if (error != 0)
132 		return (error);
133 
134 	/*
135 	 * extract arguments first
136 	 */
137 	p32 = (u_int32_t *)argv;
138 	for (;;) {
139 		error = copyin(p32++, &arg, sizeof(arg));
140 		if (error)
141 			return (error);
142 		if (arg == 0)
143 			break;
144 		argp = PTRIN(arg);
145 		error = copyinstr(argp, args->endp, args->stringspace, &length);
146 		if (error) {
147 			if (error == ENAMETOOLONG)
148 				return (E2BIG);
149 			else
150 				return (error);
151 		}
152 		args->stringspace -= length;
153 		args->endp += length;
154 		args->argc++;
155 	}
156 
157 	args->begin_envv = args->endp;
158 
159 	/*
160 	 * extract environment strings
161 	 */
162 	if (envv) {
163 		p32 = (u_int32_t *)envv;
164 		for (;;) {
165 			error = copyin(p32++, &arg, sizeof(arg));
166 			if (error)
167 				return (error);
168 			if (arg == 0)
169 				break;
170 			envp = PTRIN(arg);
171 			error = copyinstr(envp, args->endp, args->stringspace,
172 			    &length);
173 			if (error) {
174 				if (error == ENAMETOOLONG)
175 					return (E2BIG);
176 				else
177 					return (error);
178 			}
179 			args->stringspace -= length;
180 			args->endp += length;
181 			args->envc++;
182 		}
183 	}
184 
185 	return (0);
186 }
187 
188 int
189 linux_execve(struct thread *td, struct linux_execve_args *args)
190 {
191 	struct image_args eargs;
192 	char *path;
193 	int error;
194 
195 	LCONVPATHEXIST(td, args->path, &path);
196 
197 #ifdef DEBUG
198 	if (ldebug(execve))
199 		printf(ARGS(execve, "%s"), path);
200 #endif
201 
202 	error = linux_exec_copyin_args(&eargs, path, UIO_SYSSPACE, args->argp,
203 	    args->envp);
204 	free(path, M_TEMP);
205 	if (error == 0)
206 		error = kern_execve(td, &eargs, NULL);
207 	exec_free_args(&eargs);
208 	return (error);
209 }
210 
211 struct iovec32 {
212 	u_int32_t iov_base;
213 	int	iov_len;
214 };
215 #define	STACKGAPLEN	400
216 
217 CTASSERT(sizeof(struct iovec32) == 8);
218 
219 int
220 linux_readv(struct thread *td, struct linux_readv_args *uap)
221 {
222 	int error, osize, nsize, i;
223 	caddr_t sg;
224 	struct readv_args /* {
225 		syscallarg(int) fd;
226 		syscallarg(struct iovec *) iovp;
227 		syscallarg(u_int) iovcnt;
228 	} */ a;
229 	struct iovec32 *oio;
230 	struct iovec *nio;
231 
232 	sg = stackgap_init();
233 
234 	if (uap->iovcnt > (STACKGAPLEN / sizeof (struct iovec)))
235 		return (EINVAL);
236 
237 	osize = uap->iovcnt * sizeof (struct iovec32);
238 	nsize = uap->iovcnt * sizeof (struct iovec);
239 
240 	oio = malloc(osize, M_TEMP, M_WAITOK);
241 	nio = malloc(nsize, M_TEMP, M_WAITOK);
242 
243 	error = 0;
244 	if ((error = copyin(uap->iovp, oio, osize)))
245 		goto punt;
246 	for (i = 0; i < uap->iovcnt; i++) {
247 		nio[i].iov_base = PTRIN(oio[i].iov_base);
248 		nio[i].iov_len = oio[i].iov_len;
249 	}
250 
251 	a.fd = uap->fd;
252 	a.iovp = stackgap_alloc(&sg, nsize);
253 	a.iovcnt = uap->iovcnt;
254 
255 	if ((error = copyout(nio, (caddr_t)a.iovp, nsize)))
256 		goto punt;
257 	error = readv(td, &a);
258 
259 punt:
260 	free(oio, M_TEMP);
261 	free(nio, M_TEMP);
262 	return (error);
263 }
264 
265 int
266 linux_writev(struct thread *td, struct linux_writev_args *uap)
267 {
268 	int error, i, nsize, osize;
269 	caddr_t sg;
270 	struct writev_args /* {
271 		syscallarg(int) fd;
272 		syscallarg(struct iovec *) iovp;
273 		syscallarg(u_int) iovcnt;
274 	} */ a;
275 	struct iovec32 *oio;
276 	struct iovec *nio;
277 
278 	sg = stackgap_init();
279 
280 	if (uap->iovcnt > (STACKGAPLEN / sizeof (struct iovec)))
281 		return (EINVAL);
282 
283 	osize = uap->iovcnt * sizeof (struct iovec32);
284 	nsize = uap->iovcnt * sizeof (struct iovec);
285 
286 	oio = malloc(osize, M_TEMP, M_WAITOK);
287 	nio = malloc(nsize, M_TEMP, M_WAITOK);
288 
289 	error = 0;
290 	if ((error = copyin(uap->iovp, oio, osize)))
291 		goto punt;
292 	for (i = 0; i < uap->iovcnt; i++) {
293 		nio[i].iov_base = PTRIN(oio[i].iov_base);
294 		nio[i].iov_len = oio[i].iov_len;
295 	}
296 
297 	a.fd = uap->fd;
298 	a.iovp = stackgap_alloc(&sg, nsize);
299 	a.iovcnt = uap->iovcnt;
300 
301 	if ((error = copyout(nio, (caddr_t)a.iovp, nsize)))
302 		goto punt;
303 	error = writev(td, &a);
304 
305 punt:
306 	free(oio, M_TEMP);
307 	free(nio, M_TEMP);
308 	return (error);
309 }
310 
311 struct l_ipc_kludge {
312 	l_uintptr_t msgp;
313 	l_long msgtyp;
314 } __packed;
315 
316 int
317 linux_ipc(struct thread *td, struct linux_ipc_args *args)
318 {
319 
320 	switch (args->what & 0xFFFF) {
321 	case LINUX_SEMOP: {
322 		struct linux_semop_args a;
323 
324 		a.semid = args->arg1;
325 		a.tsops = args->ptr;
326 		a.nsops = args->arg2;
327 		return (linux_semop(td, &a));
328 	}
329 	case LINUX_SEMGET: {
330 		struct linux_semget_args a;
331 
332 		a.key = args->arg1;
333 		a.nsems = args->arg2;
334 		a.semflg = args->arg3;
335 		return (linux_semget(td, &a));
336 	}
337 	case LINUX_SEMCTL: {
338 		struct linux_semctl_args a;
339 		int error;
340 
341 		a.semid = args->arg1;
342 		a.semnum = args->arg2;
343 		a.cmd = args->arg3;
344 		error = copyin(args->ptr, &a.arg, sizeof(a.arg));
345 		if (error)
346 			return (error);
347 		return (linux_semctl(td, &a));
348 	}
349 	case LINUX_MSGSND: {
350 		struct linux_msgsnd_args a;
351 
352 		a.msqid = args->arg1;
353 		a.msgp = args->ptr;
354 		a.msgsz = args->arg2;
355 		a.msgflg = args->arg3;
356 		return (linux_msgsnd(td, &a));
357 	}
358 	case LINUX_MSGRCV: {
359 		struct linux_msgrcv_args a;
360 
361 		a.msqid = args->arg1;
362 		a.msgsz = args->arg2;
363 		a.msgflg = args->arg3;
364 		if ((args->what >> 16) == 0) {
365 			struct l_ipc_kludge tmp;
366 			int error;
367 
368 			if (args->ptr == 0)
369 				return (EINVAL);
370 			error = copyin(args->ptr, &tmp, sizeof(tmp));
371 			if (error)
372 				return (error);
373 			a.msgp = PTRIN(tmp.msgp);
374 			a.msgtyp = tmp.msgtyp;
375 		} else {
376 			a.msgp = args->ptr;
377 			a.msgtyp = args->arg5;
378 		}
379 		return (linux_msgrcv(td, &a));
380 	}
381 	case LINUX_MSGGET: {
382 		struct linux_msgget_args a;
383 
384 		a.key = args->arg1;
385 		a.msgflg = args->arg2;
386 		return (linux_msgget(td, &a));
387 	}
388 	case LINUX_MSGCTL: {
389 		struct linux_msgctl_args a;
390 
391 		a.msqid = args->arg1;
392 		a.cmd = args->arg2;
393 		a.buf = args->ptr;
394 		return (linux_msgctl(td, &a));
395 	}
396 	case LINUX_SHMAT: {
397 		struct linux_shmat_args a;
398 
399 		a.shmid = args->arg1;
400 		a.shmaddr = args->ptr;
401 		a.shmflg = args->arg2;
402 		a.raddr = PTRIN(args->arg3);
403 		return (linux_shmat(td, &a));
404 	}
405 	case LINUX_SHMDT: {
406 		struct linux_shmdt_args a;
407 
408 		a.shmaddr = args->ptr;
409 		return (linux_shmdt(td, &a));
410 	}
411 	case LINUX_SHMGET: {
412 		struct linux_shmget_args a;
413 
414 		a.key = args->arg1;
415 		a.size = args->arg2;
416 		a.shmflg = args->arg3;
417 		return (linux_shmget(td, &a));
418 	}
419 	case LINUX_SHMCTL: {
420 		struct linux_shmctl_args a;
421 
422 		a.shmid = args->arg1;
423 		a.cmd = args->arg2;
424 		a.buf = args->ptr;
425 		return (linux_shmctl(td, &a));
426 	}
427 	default:
428 		break;
429 	}
430 
431 	return (EINVAL);
432 }
433 
434 int
435 linux_old_select(struct thread *td, struct linux_old_select_args *args)
436 {
437 	struct l_old_select_argv linux_args;
438 	struct linux_select_args newsel;
439 	int error;
440 
441 #ifdef DEBUG
442 	if (ldebug(old_select))
443 		printf(ARGS(old_select, "%p"), args->ptr);
444 #endif
445 
446 	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
447 	if (error)
448 		return (error);
449 
450 	newsel.nfds = linux_args.nfds;
451 	newsel.readfds = PTRIN(linux_args.readfds);
452 	newsel.writefds = PTRIN(linux_args.writefds);
453 	newsel.exceptfds = PTRIN(linux_args.exceptfds);
454 	newsel.timeout = PTRIN(linux_args.timeout);
455 	return (linux_select(td, &newsel));
456 }
457 
458 int
459 linux_fork(struct thread *td, struct linux_fork_args *args)
460 {
461 	int error;
462 
463 #ifdef DEBUG
464 	if (ldebug(fork))
465 		printf(ARGS(fork, ""));
466 #endif
467 
468 	if ((error = fork(td, (struct fork_args *)args)) != 0)
469 		return (error);
470 
471 	if (td->td_retval[1] == 1)
472 		td->td_retval[0] = 0;
473 	return (0);
474 }
475 
476 int
477 linux_vfork(struct thread *td, struct linux_vfork_args *args)
478 {
479 	int error;
480 
481 #ifdef DEBUG
482 	if (ldebug(vfork))
483 		printf(ARGS(vfork, ""));
484 #endif
485 
486 	if ((error = vfork(td, (struct vfork_args *)args)) != 0)
487 		return (error);
488 	/* Are we the child? */
489 	if (td->td_retval[1] == 1)
490 		td->td_retval[0] = 0;
491 	return (0);
492 }
493 
/*
 * Linux clone() flag bits.  linux_clone() acts on CLONE_VM,
 * CLONE_SIGHAND and CLONE_FILES; CLONE_PID is only reported in the
 * debug output and CLONE_FS is not consulted there.
 */
#define CLONE_VM	0x100
#define CLONE_FS	0x200
#define CLONE_FILES	0x400
#define CLONE_SIGHAND	0x800
#define CLONE_PID	0x1000
499 
500 int
501 linux_clone(struct thread *td, struct linux_clone_args *args)
502 {
503 	int error, ff = RFPROC | RFSTOPPED;
504 	struct proc *p2;
505 	struct thread *td2;
506 	int exit_signal;
507 
508 #ifdef DEBUG
509 	if (ldebug(clone)) {
510 		printf(ARGS(clone, "flags %x, stack %x"),
511 		    (unsigned int)(uintptr_t)args->flags,
512 		    (unsigned int)(uintptr_t)args->stack);
513 		if (args->flags & CLONE_PID)
514 			printf(LMSG("CLONE_PID not yet supported"));
515 	}
516 #endif
517 
518 	if (!args->stack)
519 		return (EINVAL);
520 
521 	exit_signal = args->flags & 0x000000ff;
522 	if (exit_signal >= LINUX_NSIG)
523 		return (EINVAL);
524 
525 	if (exit_signal <= LINUX_SIGTBLSZ)
526 		exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)];
527 
528 	if (args->flags & CLONE_VM)
529 		ff |= RFMEM;
530 	if (args->flags & CLONE_SIGHAND)
531 		ff |= RFSIGSHARE;
532 	if (!(args->flags & CLONE_FILES))
533 		ff |= RFFDG;
534 
535 	error = fork1(td, ff, 0, &p2);
536 	if (error)
537 		return (error);
538 
539 
540 	PROC_LOCK(p2);
541 	p2->p_sigparent = exit_signal;
542 	PROC_UNLOCK(p2);
543 	td2 = FIRST_THREAD_IN_PROC(p2);
544 	td2->td_frame->tf_rsp = PTROUT(args->stack);
545 
546 #ifdef DEBUG
547 	if (ldebug(clone))
548 		printf(LMSG("clone: successful rfork to %ld, stack %p sig = %d"),
549 		    (long)p2->p_pid, args->stack, exit_signal);
550 #endif
551 
552 	/*
553 	 * Make this runnable after we are finished with it.
554 	 */
555 	mtx_lock_spin(&sched_lock);
556 	TD_SET_CAN_RUN(td2);
557 	setrunqueue(td2, SRQ_BORING);
558 	mtx_unlock_spin(&sched_lock);
559 
560 	td->td_retval[0] = p2->p_pid;
561 	td->td_retval[1] = 0;
562 	return (0);
563 }
564 
565 /* XXX move */
566 struct l_mmap_argv {
567 	l_ulong		addr;
568 	l_int		len;
569 	l_int		prot;
570 	l_int		flags;
571 	l_int		fd;
572 	l_int		pos;
573 };
574 
575 #define STACK_SIZE  (2 * 1024 * 1024)
576 #define GUARD_SIZE  (4 * PAGE_SIZE)
577 
578 static int linux_mmap_common(struct thread *, struct l_mmap_argv *);
579 
580 int
581 linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
582 {
583 	struct l_mmap_argv linux_args;
584 
585 #ifdef DEBUG
586 	if (ldebug(mmap2))
587 		printf(ARGS(mmap2, "%p, %d, %d, 0x%08x, %d, %d"),
588 		    (void *)(intptr_t)args->addr, args->len, args->prot,
589 		    args->flags, args->fd, args->pgoff);
590 #endif
591 
592 	linux_args.addr = PTROUT(args->addr);
593 	linux_args.len = args->len;
594 	linux_args.prot = args->prot;
595 	linux_args.flags = args->flags;
596 	linux_args.fd = args->fd;
597 	linux_args.pos = args->pgoff * PAGE_SIZE;
598 
599 	return (linux_mmap_common(td, &linux_args));
600 }
601 
602 int
603 linux_mmap(struct thread *td, struct linux_mmap_args *args)
604 {
605 	int error;
606 	struct l_mmap_argv linux_args;
607 
608 	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
609 	if (error)
610 		return (error);
611 
612 #ifdef DEBUG
613 	if (ldebug(mmap))
614 		printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"),
615 		    (void *)(intptr_t)linux_args.addr, linux_args.len,
616 		    linux_args.prot, linux_args.flags, linux_args.fd,
617 		    linux_args.pos);
618 #endif
619 
620 	return (linux_mmap_common(td, &linux_args));
621 }
622 
623 static int
624 linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args)
625 {
626 	struct proc *p = td->td_proc;
627 	struct mmap_args /* {
628 		caddr_t addr;
629 		size_t len;
630 		int prot;
631 		int flags;
632 		int fd;
633 		long pad;
634 		off_t pos;
635 	} */ bsd_args;
636 	int error;
637 
638 	error = 0;
639 	bsd_args.flags = 0;
640 	if (linux_args->flags & LINUX_MAP_SHARED)
641 		bsd_args.flags |= MAP_SHARED;
642 	if (linux_args->flags & LINUX_MAP_PRIVATE)
643 		bsd_args.flags |= MAP_PRIVATE;
644 	if (linux_args->flags & LINUX_MAP_FIXED)
645 		bsd_args.flags |= MAP_FIXED;
646 	if (linux_args->flags & LINUX_MAP_ANON)
647 		bsd_args.flags |= MAP_ANON;
648 	else
649 		bsd_args.flags |= MAP_NOSYNC;
650 	if (linux_args->flags & LINUX_MAP_GROWSDOWN) {
651 		bsd_args.flags |= MAP_STACK;
652 
653 		/* The linux MAP_GROWSDOWN option does not limit auto
654 		 * growth of the region.  Linux mmap with this option
655 		 * takes as addr the inital BOS, and as len, the initial
656 		 * region size.  It can then grow down from addr without
657 		 * limit.  However, linux threads has an implicit internal
658 		 * limit to stack size of STACK_SIZE.  Its just not
659 		 * enforced explicitly in linux.  But, here we impose
660 		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
661 		 * region, since we can do this with our mmap.
662 		 *
663 		 * Our mmap with MAP_STACK takes addr as the maximum
664 		 * downsize limit on BOS, and as len the max size of
665 		 * the region.  It them maps the top SGROWSIZ bytes,
666 		 * and autgrows the region down, up to the limit
667 		 * in addr.
668 		 *
669 		 * If we don't use the MAP_STACK option, the effect
670 		 * of this code is to allocate a stack region of a
671 		 * fixed size of (STACK_SIZE - GUARD_SIZE).
672 		 */
673 
674 		/* This gives us TOS */
675 		bsd_args.addr = (caddr_t)PTRIN(linux_args->addr) +
676 		    linux_args->len;
677 
678 		if ((caddr_t)PTRIN(bsd_args.addr) >
679 		    p->p_vmspace->vm_maxsaddr) {
680 			/* Some linux apps will attempt to mmap
681 			 * thread stacks near the top of their
682 			 * address space.  If their TOS is greater
683 			 * than vm_maxsaddr, vm_map_growstack()
684 			 * will confuse the thread stack with the
685 			 * process stack and deliver a SEGV if they
686 			 * attempt to grow the thread stack past their
687 			 * current stacksize rlimit.  To avoid this,
688 			 * adjust vm_maxsaddr upwards to reflect
689 			 * the current stacksize rlimit rather
690 			 * than the maximum possible stacksize.
691 			 * It would be better to adjust the
692 			 * mmap'ed region, but some apps do not check
693 			 * mmap's return value.
694 			 */
695 			PROC_LOCK(p);
696 			p->p_vmspace->vm_maxsaddr =
697 			    (char *)LINUX32_USRSTACK -
698 			    lim_cur(p, RLIMIT_STACK);
699 			PROC_UNLOCK(p);
700 		}
701 
702 		/* This gives us our maximum stack size */
703 		if (linux_args->len > STACK_SIZE - GUARD_SIZE)
704 			bsd_args.len = linux_args->len;
705 		else
706 			bsd_args.len  = STACK_SIZE - GUARD_SIZE;
707 
708 		/* This gives us a new BOS.  If we're using VM_STACK, then
709 		 * mmap will just map the top SGROWSIZ bytes, and let
710 		 * the stack grow down to the limit at BOS.  If we're
711 		 * not using VM_STACK we map the full stack, since we
712 		 * don't have a way to autogrow it.
713 		 */
714 		bsd_args.addr -= bsd_args.len;
715 	} else {
716 		bsd_args.addr = (caddr_t)PTRIN(linux_args->addr);
717 		bsd_args.len  = linux_args->len;
718 	}
719 	/*
720 	 * XXX i386 Linux always emulator forces PROT_READ on (why?)
721 	 * so we do the same. We add PROT_EXEC to work around buggy
722 	 * applications (e.g. Java) that take advantage of the fact
723 	 * that execute permissions are not enforced by x86 CPUs.
724 	 */
725 	bsd_args.prot = linux_args->prot | PROT_EXEC | PROT_READ;
726 	if (linux_args->flags & LINUX_MAP_ANON)
727 		bsd_args.fd = -1;
728 	else
729 		bsd_args.fd = linux_args->fd;
730 	bsd_args.pos = linux_args->pos;
731 	bsd_args.pad = 0;
732 
733 #ifdef DEBUG
734 	if (ldebug(mmap))
735 		printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
736 		    __func__,
737 		    (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot,
738 		    bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
739 #endif
740 	error = mmap(td, &bsd_args);
741 #ifdef DEBUG
742 	if (ldebug(mmap))
743 		printf("-> %s() return: 0x%x (0x%08x)\n",
744 			__func__, error, (u_int)td->td_retval[0]);
745 #endif
746 	return (error);
747 }
748 
749 int
750 linux_pipe(struct thread *td, struct linux_pipe_args *args)
751 {
752 	int pip[2];
753 	int error;
754 	register_t reg_rdx;
755 
756 #ifdef DEBUG
757 	if (ldebug(pipe))
758 		printf(ARGS(pipe, "*"));
759 #endif
760 
761 	reg_rdx = td->td_retval[1];
762 	error = pipe(td, 0);
763 	if (error) {
764 		td->td_retval[1] = reg_rdx;
765 		return (error);
766 	}
767 
768 	pip[0] = td->td_retval[0];
769 	pip[1] = td->td_retval[1];
770 	error = copyout(pip, args->pipefds, 2 * sizeof(int));
771 	if (error) {
772 		td->td_retval[1] = reg_rdx;
773 		return (error);
774 	}
775 
776 	td->td_retval[1] = reg_rdx;
777 	td->td_retval[0] = 0;
778 	return (0);
779 }
780 
781 int
782 linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
783 {
784 	l_osigaction_t osa;
785 	l_sigaction_t act, oact;
786 	int error;
787 
788 #ifdef DEBUG
789 	if (ldebug(sigaction))
790 		printf(ARGS(sigaction, "%d, %p, %p"),
791 		    args->sig, (void *)args->nsa, (void *)args->osa);
792 #endif
793 
794 	if (args->nsa != NULL) {
795 		error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
796 		if (error)
797 			return (error);
798 		act.lsa_handler = osa.lsa_handler;
799 		act.lsa_flags = osa.lsa_flags;
800 		act.lsa_restorer = osa.lsa_restorer;
801 		LINUX_SIGEMPTYSET(act.lsa_mask);
802 		act.lsa_mask.__bits[0] = osa.lsa_mask;
803 	}
804 
805 	error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
806 	    args->osa ? &oact : NULL);
807 
808 	if (args->osa != NULL && !error) {
809 		osa.lsa_handler = oact.lsa_handler;
810 		osa.lsa_flags = oact.lsa_flags;
811 		osa.lsa_restorer = oact.lsa_restorer;
812 		osa.lsa_mask = oact.lsa_mask.__bits[0];
813 		error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
814 	}
815 
816 	return (error);
817 }
818 
819 /*
820  * Linux has two extra args, restart and oldmask.  We dont use these,
821  * but it seems that "restart" is actually a context pointer that
822  * enables the signal to happen with a different register set.
823  */
824 int
825 linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
826 {
827 	sigset_t sigmask;
828 	l_sigset_t mask;
829 
830 #ifdef DEBUG
831 	if (ldebug(sigsuspend))
832 		printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
833 #endif
834 
835 	LINUX_SIGEMPTYSET(mask);
836 	mask.__bits[0] = args->mask;
837 	linux_to_bsd_sigset(&mask, &sigmask);
838 	return (kern_sigsuspend(td, sigmask));
839 }
840 
841 int
842 linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
843 {
844 	l_sigset_t lmask;
845 	sigset_t sigmask;
846 	int error;
847 
848 #ifdef DEBUG
849 	if (ldebug(rt_sigsuspend))
850 		printf(ARGS(rt_sigsuspend, "%p, %d"),
851 		    (void *)uap->newset, uap->sigsetsize);
852 #endif
853 
854 	if (uap->sigsetsize != sizeof(l_sigset_t))
855 		return (EINVAL);
856 
857 	error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
858 	if (error)
859 		return (error);
860 
861 	linux_to_bsd_sigset(&lmask, &sigmask);
862 	return (kern_sigsuspend(td, sigmask));
863 }
864 
865 int
866 linux_pause(struct thread *td, struct linux_pause_args *args)
867 {
868 	struct proc *p = td->td_proc;
869 	sigset_t sigmask;
870 
871 #ifdef DEBUG
872 	if (ldebug(pause))
873 		printf(ARGS(pause, ""));
874 #endif
875 
876 	PROC_LOCK(p);
877 	sigmask = td->td_sigmask;
878 	PROC_UNLOCK(p);
879 	return (kern_sigsuspend(td, sigmask));
880 }
881 
882 int
883 linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
884 {
885 	stack_t ss, oss;
886 	l_stack_t lss;
887 	int error;
888 
889 #ifdef DEBUG
890 	if (ldebug(sigaltstack))
891 		printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
892 #endif
893 
894 	if (uap->uss != NULL) {
895 		error = copyin(uap->uss, &lss, sizeof(l_stack_t));
896 		if (error)
897 			return (error);
898 
899 		ss.ss_sp = PTRIN(lss.ss_sp);
900 		ss.ss_size = lss.ss_size;
901 		ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
902 	}
903 	error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
904 	    (uap->uoss != NULL) ? &oss : NULL);
905 	if (!error && uap->uoss != NULL) {
906 		lss.ss_sp = PTROUT(oss.ss_sp);
907 		lss.ss_size = oss.ss_size;
908 		lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
909 		error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
910 	}
911 
912 	return (error);
913 }
914 
915 int
916 linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
917 {
918 	struct ftruncate_args sa;
919 
920 #ifdef DEBUG
921 	if (ldebug(ftruncate64))
922 		printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
923 		    (intmax_t)args->length);
924 #endif
925 
926 	sa.fd = args->fd;
927 	sa.pad = 0;
928 	sa.length = args->length;
929 	return ftruncate(td, &sa);
930 }
931 
932 int
933 linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap)
934 {
935 	struct timeval atv;
936 	l_timeval atv32;
937 	struct timezone rtz;
938 	int error = 0;
939 
940 	if (uap->tp) {
941 		microtime(&atv);
942 		atv32.tv_sec = atv.tv_sec;
943 		atv32.tv_usec = atv.tv_usec;
944 		error = copyout(&atv32, uap->tp, sizeof (atv32));
945 	}
946 	if (error == 0 && uap->tzp != NULL) {
947 		rtz.tz_minuteswest = tz_minuteswest;
948 		rtz.tz_dsttime = tz_dsttime;
949 		error = copyout(&rtz, uap->tzp, sizeof (rtz));
950 	}
951 	return (error);
952 }
953 
954 int
955 linux_nanosleep(struct thread *td, struct linux_nanosleep_args *uap)
956 {
957 	struct timespec rqt, rmt;
958 	struct l_timespec ats32;
959 	int error;
960 
961 	error = copyin(uap->rqtp, &ats32, sizeof(ats32));
962 	if (error != 0)
963 		return (error);
964 	rqt.tv_sec = ats32.tv_sec;
965 	rqt.tv_nsec = ats32.tv_nsec;
966 	error = kern_nanosleep(td, &rqt, &rmt);
967 	if (uap->rmtp != NULL) {
968 		ats32.tv_sec = rmt.tv_sec;
969 		ats32.tv_nsec = rmt.tv_nsec;
970 		error = copyout(&ats32, uap->rmtp, sizeof(ats32));
971 	}
972 	return (error);
973 }
974 
975 int
976 linux_getrusage(struct thread *td, struct linux_getrusage_args *uap)
977 {
978 	struct l_rusage s32;
979 	struct rusage s;
980 	int error;
981 
982 	error = kern_getrusage(td, uap->who, &s);
983 	if (error != 0)
984 		return (error);
985 	if (uap->rusage != NULL) {
986 		s32.ru_utime.tv_sec = s.ru_utime.tv_sec;
987 		s32.ru_utime.tv_usec = s.ru_utime.tv_usec;
988 		s32.ru_stime.tv_sec = s.ru_stime.tv_sec;
989 		s32.ru_stime.tv_usec = s.ru_stime.tv_usec;
990 		s32.ru_maxrss = s.ru_maxrss;
991 		s32.ru_ixrss = s.ru_ixrss;
992 		s32.ru_idrss = s.ru_idrss;
993 		s32.ru_isrss = s.ru_isrss;
994 		s32.ru_minflt = s.ru_minflt;
995 		s32.ru_majflt = s.ru_majflt;
996 		s32.ru_nswap = s.ru_nswap;
997 		s32.ru_inblock = s.ru_inblock;
998 		s32.ru_oublock = s.ru_oublock;
999 		s32.ru_msgsnd = s.ru_msgsnd;
1000 		s32.ru_msgrcv = s.ru_msgrcv;
1001 		s32.ru_nsignals = s.ru_nsignals;
1002 		s32.ru_nvcsw = s.ru_nvcsw;
1003 		s32.ru_nivcsw = s.ru_nivcsw;
1004 		error = copyout(&s32, uap->rusage, sizeof(s32));
1005 	}
1006 	return (error);
1007 }
1008 
1009 int
1010 linux_sched_rr_get_interval(struct thread *td,
1011     struct linux_sched_rr_get_interval_args *uap)
1012 {
1013 	struct sched_rr_get_interval_args bsd_args;
1014 	caddr_t sg, psgts;
1015 	struct timespec ts;
1016 	struct l_timespec ts32;
1017 	int error;
1018 
1019 	sg = stackgap_init();
1020 	psgts = stackgap_alloc(&sg, sizeof(struct timespec));
1021 	bsd_args.pid = uap->pid;
1022 	bsd_args.interval = (void *)psgts;
1023 	error = sched_rr_get_interval(td, &bsd_args);
1024 	if (error != 0)
1025 		return (error);
1026 	error = copyin(psgts, &ts, sizeof(ts));
1027 	if (error != 0)
1028 		return (error);
1029 	ts32.tv_sec = ts.tv_sec;
1030 	ts32.tv_nsec = ts.tv_nsec;
1031 	return (copyout(&ts32, uap->interval, sizeof(ts32)));
1032 }
1033 
1034 int
1035 linux_mprotect(struct thread *td, struct linux_mprotect_args *uap)
1036 {
1037 	struct mprotect_args bsd_args;
1038 
1039 	bsd_args.addr = uap->addr;
1040 	bsd_args.len = uap->len;
1041 	bsd_args.prot = uap->prot;
1042 	/* XXX PROT_READ implies PROT_EXEC; see linux_mmap_common(). */
1043 	if ((bsd_args.prot & PROT_READ) != 0)
1044 		bsd_args.prot |= PROT_EXEC;
1045 	return (mprotect(td, &bsd_args));
1046 }
1047