xref: /freebsd/sys/amd64/linux32/linux32_machdep.c (revision 46c1105fbb6fbff6d6ccd0a18571342eb992d637)
1 /*-
2  * Copyright (c) 2004 Tim J. Robbins
3  * Copyright (c) 2002 Doug Rabson
4  * Copyright (c) 2000 Marcel Moolenaar
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer
12  *    in this position and unchanged.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. The name of the author may not be used to endorse or promote products
17  *    derived from this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_compat.h"
35 
36 #include <sys/param.h>
37 #include <sys/kernel.h>
38 #include <sys/systm.h>
39 #include <sys/capsicum.h>
40 #include <sys/file.h>
41 #include <sys/fcntl.h>
42 #include <sys/clock.h>
43 #include <sys/imgact.h>
44 #include <sys/limits.h>
45 #include <sys/lock.h>
46 #include <sys/malloc.h>
47 #include <sys/mman.h>
48 #include <sys/mutex.h>
49 #include <sys/priv.h>
50 #include <sys/proc.h>
51 #include <sys/resource.h>
52 #include <sys/resourcevar.h>
53 #include <sys/syscallsubr.h>
54 #include <sys/sysproto.h>
55 #include <sys/unistd.h>
56 #include <sys/wait.h>
57 
58 #include <machine/frame.h>
59 #include <machine/pcb.h>
60 #include <machine/psl.h>
61 #include <machine/segments.h>
62 #include <machine/specialreg.h>
63 
64 #include <vm/vm.h>
65 #include <vm/pmap.h>
66 #include <vm/vm_map.h>
67 
68 #include <compat/freebsd32/freebsd32_util.h>
69 #include <amd64/linux32/linux.h>
70 #include <amd64/linux32/linux32_proto.h>
71 #include <compat/linux/linux_ipc.h>
72 #include <compat/linux/linux_misc.h>
73 #include <compat/linux/linux_signal.h>
74 #include <compat/linux/linux_util.h>
75 #include <compat/linux/linux_emul.h>
76 
77 static void	bsd_to_linux_rusage(struct rusage *ru, struct l_rusage *lru);
78 
79 struct l_old_select_argv {
80 	l_int		nfds;
81 	l_uintptr_t	readfds;
82 	l_uintptr_t	writefds;
83 	l_uintptr_t	exceptfds;
84 	l_uintptr_t	timeout;
85 } __packed;
86 
87 static int	linux_mmap_common(struct thread *td, l_uintptr_t addr,
88 		    l_size_t len, l_int prot, l_int flags, l_int fd,
89 		    l_loff_t pos);
90 
91 static void
92 bsd_to_linux_rusage(struct rusage *ru, struct l_rusage *lru)
93 {
94 
95 	lru->ru_utime.tv_sec = ru->ru_utime.tv_sec;
96 	lru->ru_utime.tv_usec = ru->ru_utime.tv_usec;
97 	lru->ru_stime.tv_sec = ru->ru_stime.tv_sec;
98 	lru->ru_stime.tv_usec = ru->ru_stime.tv_usec;
99 	lru->ru_maxrss = ru->ru_maxrss;
100 	lru->ru_ixrss = ru->ru_ixrss;
101 	lru->ru_idrss = ru->ru_idrss;
102 	lru->ru_isrss = ru->ru_isrss;
103 	lru->ru_minflt = ru->ru_minflt;
104 	lru->ru_majflt = ru->ru_majflt;
105 	lru->ru_nswap = ru->ru_nswap;
106 	lru->ru_inblock = ru->ru_inblock;
107 	lru->ru_oublock = ru->ru_oublock;
108 	lru->ru_msgsnd = ru->ru_msgsnd;
109 	lru->ru_msgrcv = ru->ru_msgrcv;
110 	lru->ru_nsignals = ru->ru_nsignals;
111 	lru->ru_nvcsw = ru->ru_nvcsw;
112 	lru->ru_nivcsw = ru->ru_nivcsw;
113 }
114 
115 int
116 linux_copyout_rusage(struct rusage *ru, void *uaddr)
117 {
118 	struct l_rusage lru;
119 
120 	bsd_to_linux_rusage(ru, &lru);
121 
122 	return (copyout(&lru, uaddr, sizeof(struct l_rusage)));
123 }
124 
125 int
126 linux_execve(struct thread *td, struct linux_execve_args *args)
127 {
128 	struct image_args eargs;
129 	char *path;
130 	int error;
131 
132 	LCONVPATHEXIST(td, args->path, &path);
133 
134 #ifdef DEBUG
135 	if (ldebug(execve))
136 		printf(ARGS(execve, "%s"), path);
137 #endif
138 
139 	error = freebsd32_exec_copyin_args(&eargs, path, UIO_SYSSPACE,
140 	    args->argp, args->envp);
141 	free(path, M_TEMP);
142 	if (error == 0)
143 		error = linux_common_execve(td, &eargs);
144 	return (error);
145 }
146 
147 CTASSERT(sizeof(struct l_iovec32) == 8);
148 
149 static int
150 linux32_copyinuio(struct l_iovec32 *iovp, l_ulong iovcnt, struct uio **uiop)
151 {
152 	struct l_iovec32 iov32;
153 	struct iovec *iov;
154 	struct uio *uio;
155 	uint32_t iovlen;
156 	int error, i;
157 
158 	*uiop = NULL;
159 	if (iovcnt > UIO_MAXIOV)
160 		return (EINVAL);
161 	iovlen = iovcnt * sizeof(struct iovec);
162 	uio = malloc(iovlen + sizeof(*uio), M_IOV, M_WAITOK);
163 	iov = (struct iovec *)(uio + 1);
164 	for (i = 0; i < iovcnt; i++) {
165 		error = copyin(&iovp[i], &iov32, sizeof(struct l_iovec32));
166 		if (error) {
167 			free(uio, M_IOV);
168 			return (error);
169 		}
170 		iov[i].iov_base = PTRIN(iov32.iov_base);
171 		iov[i].iov_len = iov32.iov_len;
172 	}
173 	uio->uio_iov = iov;
174 	uio->uio_iovcnt = iovcnt;
175 	uio->uio_segflg = UIO_USERSPACE;
176 	uio->uio_offset = -1;
177 	uio->uio_resid = 0;
178 	for (i = 0; i < iovcnt; i++) {
179 		if (iov->iov_len > INT_MAX - uio->uio_resid) {
180 			free(uio, M_IOV);
181 			return (EINVAL);
182 		}
183 		uio->uio_resid += iov->iov_len;
184 		iov++;
185 	}
186 	*uiop = uio;
187 	return (0);
188 }
189 
190 int
191 linux32_copyiniov(struct l_iovec32 *iovp32, l_ulong iovcnt, struct iovec **iovp,
192     int error)
193 {
194 	struct l_iovec32 iov32;
195 	struct iovec *iov;
196 	uint32_t iovlen;
197 	int i;
198 
199 	*iovp = NULL;
200 	if (iovcnt > UIO_MAXIOV)
201 		return (error);
202 	iovlen = iovcnt * sizeof(struct iovec);
203 	iov = malloc(iovlen, M_IOV, M_WAITOK);
204 	for (i = 0; i < iovcnt; i++) {
205 		error = copyin(&iovp32[i], &iov32, sizeof(struct l_iovec32));
206 		if (error) {
207 			free(iov, M_IOV);
208 			return (error);
209 		}
210 		iov[i].iov_base = PTRIN(iov32.iov_base);
211 		iov[i].iov_len = iov32.iov_len;
212 	}
213 	*iovp = iov;
214 	return(0);
215 
216 }
217 
218 int
219 linux_readv(struct thread *td, struct linux_readv_args *uap)
220 {
221 	struct uio *auio;
222 	int error;
223 
224 	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
225 	if (error)
226 		return (error);
227 	error = kern_readv(td, uap->fd, auio);
228 	free(auio, M_IOV);
229 	return (error);
230 }
231 
232 int
233 linux_writev(struct thread *td, struct linux_writev_args *uap)
234 {
235 	struct uio *auio;
236 	int error;
237 
238 	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
239 	if (error)
240 		return (error);
241 	error = kern_writev(td, uap->fd, auio);
242 	free(auio, M_IOV);
243 	return (error);
244 }
245 
246 struct l_ipc_kludge {
247 	l_uintptr_t msgp;
248 	l_long msgtyp;
249 } __packed;
250 
251 int
252 linux_ipc(struct thread *td, struct linux_ipc_args *args)
253 {
254 
255 	switch (args->what & 0xFFFF) {
256 	case LINUX_SEMOP: {
257 		struct linux_semop_args a;
258 
259 		a.semid = args->arg1;
260 		a.tsops = args->ptr;
261 		a.nsops = args->arg2;
262 		return (linux_semop(td, &a));
263 	}
264 	case LINUX_SEMGET: {
265 		struct linux_semget_args a;
266 
267 		a.key = args->arg1;
268 		a.nsems = args->arg2;
269 		a.semflg = args->arg3;
270 		return (linux_semget(td, &a));
271 	}
272 	case LINUX_SEMCTL: {
273 		struct linux_semctl_args a;
274 		int error;
275 
276 		a.semid = args->arg1;
277 		a.semnum = args->arg2;
278 		a.cmd = args->arg3;
279 		error = copyin(args->ptr, &a.arg, sizeof(a.arg));
280 		if (error)
281 			return (error);
282 		return (linux_semctl(td, &a));
283 	}
284 	case LINUX_MSGSND: {
285 		struct linux_msgsnd_args a;
286 
287 		a.msqid = args->arg1;
288 		a.msgp = args->ptr;
289 		a.msgsz = args->arg2;
290 		a.msgflg = args->arg3;
291 		return (linux_msgsnd(td, &a));
292 	}
293 	case LINUX_MSGRCV: {
294 		struct linux_msgrcv_args a;
295 
296 		a.msqid = args->arg1;
297 		a.msgsz = args->arg2;
298 		a.msgflg = args->arg3;
299 		if ((args->what >> 16) == 0) {
300 			struct l_ipc_kludge tmp;
301 			int error;
302 
303 			if (args->ptr == 0)
304 				return (EINVAL);
305 			error = copyin(args->ptr, &tmp, sizeof(tmp));
306 			if (error)
307 				return (error);
308 			a.msgp = PTRIN(tmp.msgp);
309 			a.msgtyp = tmp.msgtyp;
310 		} else {
311 			a.msgp = args->ptr;
312 			a.msgtyp = args->arg5;
313 		}
314 		return (linux_msgrcv(td, &a));
315 	}
316 	case LINUX_MSGGET: {
317 		struct linux_msgget_args a;
318 
319 		a.key = args->arg1;
320 		a.msgflg = args->arg2;
321 		return (linux_msgget(td, &a));
322 	}
323 	case LINUX_MSGCTL: {
324 		struct linux_msgctl_args a;
325 
326 		a.msqid = args->arg1;
327 		a.cmd = args->arg2;
328 		a.buf = args->ptr;
329 		return (linux_msgctl(td, &a));
330 	}
331 	case LINUX_SHMAT: {
332 		struct linux_shmat_args a;
333 
334 		a.shmid = args->arg1;
335 		a.shmaddr = args->ptr;
336 		a.shmflg = args->arg2;
337 		a.raddr = PTRIN((l_uint)args->arg3);
338 		return (linux_shmat(td, &a));
339 	}
340 	case LINUX_SHMDT: {
341 		struct linux_shmdt_args a;
342 
343 		a.shmaddr = args->ptr;
344 		return (linux_shmdt(td, &a));
345 	}
346 	case LINUX_SHMGET: {
347 		struct linux_shmget_args a;
348 
349 		a.key = args->arg1;
350 		a.size = args->arg2;
351 		a.shmflg = args->arg3;
352 		return (linux_shmget(td, &a));
353 	}
354 	case LINUX_SHMCTL: {
355 		struct linux_shmctl_args a;
356 
357 		a.shmid = args->arg1;
358 		a.cmd = args->arg2;
359 		a.buf = args->ptr;
360 		return (linux_shmctl(td, &a));
361 	}
362 	default:
363 		break;
364 	}
365 
366 	return (EINVAL);
367 }
368 
369 int
370 linux_old_select(struct thread *td, struct linux_old_select_args *args)
371 {
372 	struct l_old_select_argv linux_args;
373 	struct linux_select_args newsel;
374 	int error;
375 
376 #ifdef DEBUG
377 	if (ldebug(old_select))
378 		printf(ARGS(old_select, "%p"), args->ptr);
379 #endif
380 
381 	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
382 	if (error)
383 		return (error);
384 
385 	newsel.nfds = linux_args.nfds;
386 	newsel.readfds = PTRIN(linux_args.readfds);
387 	newsel.writefds = PTRIN(linux_args.writefds);
388 	newsel.exceptfds = PTRIN(linux_args.exceptfds);
389 	newsel.timeout = PTRIN(linux_args.timeout);
390 	return (linux_select(td, &newsel));
391 }
392 
393 int
394 linux_set_cloned_tls(struct thread *td, void *desc)
395 {
396 	struct user_segment_descriptor sd;
397 	struct l_user_desc info;
398 	struct pcb *pcb;
399 	int error;
400 	int a[2];
401 
402 	error = copyin(desc, &info, sizeof(struct l_user_desc));
403 	if (error) {
404 		printf(LMSG("copyin failed!"));
405 	} else {
406 		/* We might copy out the entry_number as GUGS32_SEL. */
407 		info.entry_number = GUGS32_SEL;
408 		error = copyout(&info, desc, sizeof(struct l_user_desc));
409 		if (error)
410 			printf(LMSG("copyout failed!"));
411 
412 		a[0] = LINUX_LDT_entry_a(&info);
413 		a[1] = LINUX_LDT_entry_b(&info);
414 
415 		memcpy(&sd, &a, sizeof(a));
416 #ifdef DEBUG
417 		if (ldebug(clone))
418 			printf("Segment created in clone with "
419 			    "CLONE_SETTLS: lobase: %x, hibase: %x, "
420 			    "lolimit: %x, hilimit: %x, type: %i, "
421 			    "dpl: %i, p: %i, xx: %i, long: %i, "
422 			    "def32: %i, gran: %i\n", sd.sd_lobase,
423 			    sd.sd_hibase, sd.sd_lolimit, sd.sd_hilimit,
424 			    sd.sd_type, sd.sd_dpl, sd.sd_p, sd.sd_xx,
425 			    sd.sd_long, sd.sd_def32, sd.sd_gran);
426 #endif
427 		pcb = td->td_pcb;
428 		pcb->pcb_gsbase = (register_t)info.base_addr;
429 		td->td_frame->tf_gs = GSEL(GUGS32_SEL, SEL_UPL);
430 		set_pcb_flags(pcb, PCB_32BIT);
431 	}
432 
433 	return (error);
434 }
435 
436 int
437 linux_set_upcall_kse(struct thread *td, register_t stack)
438 {
439 
440 	if (stack)
441 		td->td_frame->tf_rsp = stack;
442 
443 	/*
444 	 * The newly created Linux thread returns
445 	 * to the user space by the same path that a parent do.
446 	 */
447 	td->td_frame->tf_rax = 0;
448 	return (0);
449 }
450 
451 #define STACK_SIZE  (2 * 1024 * 1024)
452 #define GUARD_SIZE  (4 * PAGE_SIZE)
453 
454 int
455 linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
456 {
457 
458 #ifdef DEBUG
459 	if (ldebug(mmap2))
460 		printf(ARGS(mmap2, "0x%08x, %d, %d, 0x%08x, %d, %d"),
461 		    args->addr, args->len, args->prot,
462 		    args->flags, args->fd, args->pgoff);
463 #endif
464 
465 	return (linux_mmap_common(td, PTROUT(args->addr), args->len, args->prot,
466 		args->flags, args->fd, (uint64_t)(uint32_t)args->pgoff *
467 		PAGE_SIZE));
468 }
469 
470 int
471 linux_mmap(struct thread *td, struct linux_mmap_args *args)
472 {
473 	int error;
474 	struct l_mmap_argv linux_args;
475 
476 	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
477 	if (error)
478 		return (error);
479 
480 #ifdef DEBUG
481 	if (ldebug(mmap))
482 		printf(ARGS(mmap, "0x%08x, %d, %d, 0x%08x, %d, %d"),
483 		    linux_args.addr, linux_args.len, linux_args.prot,
484 		    linux_args.flags, linux_args.fd, linux_args.pgoff);
485 #endif
486 
487 	return (linux_mmap_common(td, linux_args.addr, linux_args.len,
488 	    linux_args.prot, linux_args.flags, linux_args.fd,
489 	    (uint32_t)linux_args.pgoff));
490 }
491 
492 static int
493 linux_mmap_common(struct thread *td, l_uintptr_t addr, l_size_t len, l_int prot,
494     l_int flags, l_int fd, l_loff_t pos)
495 {
496 	struct proc *p = td->td_proc;
497 	struct mmap_args /* {
498 		caddr_t addr;
499 		size_t len;
500 		int prot;
501 		int flags;
502 		int fd;
503 		long pad;
504 		off_t pos;
505 	} */ bsd_args;
506 	int error;
507 	struct file *fp;
508 	cap_rights_t rights;
509 
510 	error = 0;
511 	bsd_args.flags = 0;
512 	fp = NULL;
513 
514 	/*
515 	 * Linux mmap(2):
516 	 * You must specify exactly one of MAP_SHARED and MAP_PRIVATE
517 	 */
518 	if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE)))
519 		return (EINVAL);
520 
521 	if (flags & LINUX_MAP_SHARED)
522 		bsd_args.flags |= MAP_SHARED;
523 	if (flags & LINUX_MAP_PRIVATE)
524 		bsd_args.flags |= MAP_PRIVATE;
525 	if (flags & LINUX_MAP_FIXED)
526 		bsd_args.flags |= MAP_FIXED;
527 	if (flags & LINUX_MAP_ANON) {
528 		/* Enforce pos to be on page boundary, then ignore. */
529 		if ((pos & PAGE_MASK) != 0)
530 			return (EINVAL);
531 		pos = 0;
532 		bsd_args.flags |= MAP_ANON;
533 	} else
534 		bsd_args.flags |= MAP_NOSYNC;
535 	if (flags & LINUX_MAP_GROWSDOWN)
536 		bsd_args.flags |= MAP_STACK;
537 
538 	/*
539 	 * PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC
540 	 * on Linux/i386. We do this to ensure maximum compatibility.
541 	 * Linux/ia64 does the same in i386 emulation mode.
542 	 */
543 	bsd_args.prot = prot;
544 	if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
545 		bsd_args.prot |= PROT_READ | PROT_EXEC;
546 
547 	/* Linux does not check file descriptor when MAP_ANONYMOUS is set. */
548 	bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : fd;
549 	if (bsd_args.fd != -1) {
550 		/*
551 		 * Linux follows Solaris mmap(2) description:
552 		 * The file descriptor fildes is opened with
553 		 * read permission, regardless of the
554 		 * protection options specified.
555 		 */
556 
557 		error = fget(td, bsd_args.fd,
558 		    cap_rights_init(&rights, CAP_MMAP), &fp);
559 		if (error != 0)
560 			return (error);
561 		if (fp->f_type != DTYPE_VNODE) {
562 			fdrop(fp, td);
563 			return (EINVAL);
564 		}
565 
566 		/* Linux mmap() just fails for O_WRONLY files */
567 		if (!(fp->f_flag & FREAD)) {
568 			fdrop(fp, td);
569 			return (EACCES);
570 		}
571 
572 		fdrop(fp, td);
573 	}
574 
575 	if (flags & LINUX_MAP_GROWSDOWN) {
576 		/*
577 		 * The Linux MAP_GROWSDOWN option does not limit auto
578 		 * growth of the region.  Linux mmap with this option
579 		 * takes as addr the initial BOS, and as len, the initial
580 		 * region size.  It can then grow down from addr without
581 		 * limit.  However, Linux threads has an implicit internal
582 		 * limit to stack size of STACK_SIZE.  Its just not
583 		 * enforced explicitly in Linux.  But, here we impose
584 		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
585 		 * region, since we can do this with our mmap.
586 		 *
587 		 * Our mmap with MAP_STACK takes addr as the maximum
588 		 * downsize limit on BOS, and as len the max size of
589 		 * the region.  It then maps the top SGROWSIZ bytes,
590 		 * and auto grows the region down, up to the limit
591 		 * in addr.
592 		 *
593 		 * If we don't use the MAP_STACK option, the effect
594 		 * of this code is to allocate a stack region of a
595 		 * fixed size of (STACK_SIZE - GUARD_SIZE).
596 		 */
597 
598 		if ((caddr_t)PTRIN(addr) + len > p->p_vmspace->vm_maxsaddr) {
599 			/*
600 			 * Some Linux apps will attempt to mmap
601 			 * thread stacks near the top of their
602 			 * address space.  If their TOS is greater
603 			 * than vm_maxsaddr, vm_map_growstack()
604 			 * will confuse the thread stack with the
605 			 * process stack and deliver a SEGV if they
606 			 * attempt to grow the thread stack past their
607 			 * current stacksize rlimit.  To avoid this,
608 			 * adjust vm_maxsaddr upwards to reflect
609 			 * the current stacksize rlimit rather
610 			 * than the maximum possible stacksize.
611 			 * It would be better to adjust the
612 			 * mmap'ed region, but some apps do not check
613 			 * mmap's return value.
614 			 */
615 			PROC_LOCK(p);
616 			p->p_vmspace->vm_maxsaddr = (char *)LINUX32_USRSTACK -
617 			    lim_cur_proc(p, RLIMIT_STACK);
618 			PROC_UNLOCK(p);
619 		}
620 
621 		/*
622 		 * This gives us our maximum stack size and a new BOS.
623 		 * If we're using VM_STACK, then mmap will just map
624 		 * the top SGROWSIZ bytes, and let the stack grow down
625 		 * to the limit at BOS.  If we're not using VM_STACK
626 		 * we map the full stack, since we don't have a way
627 		 * to autogrow it.
628 		 */
629 		if (len > STACK_SIZE - GUARD_SIZE) {
630 			bsd_args.addr = (caddr_t)PTRIN(addr);
631 			bsd_args.len = len;
632 		} else {
633 			bsd_args.addr = (caddr_t)PTRIN(addr) -
634 			    (STACK_SIZE - GUARD_SIZE - len);
635 			bsd_args.len = STACK_SIZE - GUARD_SIZE;
636 		}
637 	} else {
638 		bsd_args.addr = (caddr_t)PTRIN(addr);
639 		bsd_args.len  = len;
640 	}
641 	bsd_args.pos = pos;
642 
643 #ifdef DEBUG
644 	if (ldebug(mmap))
645 		printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
646 		    __func__,
647 		    (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot,
648 		    bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
649 #endif
650 	error = sys_mmap(td, &bsd_args);
651 #ifdef DEBUG
652 	if (ldebug(mmap))
653 		printf("-> %s() return: 0x%x (0x%08x)\n",
654 			__func__, error, (u_int)td->td_retval[0]);
655 #endif
656 	return (error);
657 }
658 
659 int
660 linux_mprotect(struct thread *td, struct linux_mprotect_args *uap)
661 {
662 	struct mprotect_args bsd_args;
663 
664 	bsd_args.addr = uap->addr;
665 	bsd_args.len = uap->len;
666 	bsd_args.prot = uap->prot;
667 	if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
668 		bsd_args.prot |= PROT_READ | PROT_EXEC;
669 	return (sys_mprotect(td, &bsd_args));
670 }
671 
672 int
673 linux_iopl(struct thread *td, struct linux_iopl_args *args)
674 {
675 	int error;
676 
677 	if (args->level < 0 || args->level > 3)
678 		return (EINVAL);
679 	if ((error = priv_check(td, PRIV_IO)) != 0)
680 		return (error);
681 	if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
682 		return (error);
683 	td->td_frame->tf_rflags = (td->td_frame->tf_rflags & ~PSL_IOPL) |
684 	    (args->level * (PSL_IOPL / 3));
685 
686 	return (0);
687 }
688 
689 int
690 linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
691 {
692 	l_osigaction_t osa;
693 	l_sigaction_t act, oact;
694 	int error;
695 
696 #ifdef DEBUG
697 	if (ldebug(sigaction))
698 		printf(ARGS(sigaction, "%d, %p, %p"),
699 		    args->sig, (void *)args->nsa, (void *)args->osa);
700 #endif
701 
702 	if (args->nsa != NULL) {
703 		error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
704 		if (error)
705 			return (error);
706 		act.lsa_handler = osa.lsa_handler;
707 		act.lsa_flags = osa.lsa_flags;
708 		act.lsa_restorer = osa.lsa_restorer;
709 		LINUX_SIGEMPTYSET(act.lsa_mask);
710 		act.lsa_mask.__mask = osa.lsa_mask;
711 	}
712 
713 	error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
714 	    args->osa ? &oact : NULL);
715 
716 	if (args->osa != NULL && !error) {
717 		osa.lsa_handler = oact.lsa_handler;
718 		osa.lsa_flags = oact.lsa_flags;
719 		osa.lsa_restorer = oact.lsa_restorer;
720 		osa.lsa_mask = oact.lsa_mask.__mask;
721 		error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
722 	}
723 
724 	return (error);
725 }
726 
727 /*
728  * Linux has two extra args, restart and oldmask.  We don't use these,
729  * but it seems that "restart" is actually a context pointer that
730  * enables the signal to happen with a different register set.
731  */
732 int
733 linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
734 {
735 	sigset_t sigmask;
736 	l_sigset_t mask;
737 
738 #ifdef DEBUG
739 	if (ldebug(sigsuspend))
740 		printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
741 #endif
742 
743 	LINUX_SIGEMPTYSET(mask);
744 	mask.__mask = args->mask;
745 	linux_to_bsd_sigset(&mask, &sigmask);
746 	return (kern_sigsuspend(td, sigmask));
747 }
748 
749 int
750 linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
751 {
752 	l_sigset_t lmask;
753 	sigset_t sigmask;
754 	int error;
755 
756 #ifdef DEBUG
757 	if (ldebug(rt_sigsuspend))
758 		printf(ARGS(rt_sigsuspend, "%p, %d"),
759 		    (void *)uap->newset, uap->sigsetsize);
760 #endif
761 
762 	if (uap->sigsetsize != sizeof(l_sigset_t))
763 		return (EINVAL);
764 
765 	error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
766 	if (error)
767 		return (error);
768 
769 	linux_to_bsd_sigset(&lmask, &sigmask);
770 	return (kern_sigsuspend(td, sigmask));
771 }
772 
773 int
774 linux_pause(struct thread *td, struct linux_pause_args *args)
775 {
776 	struct proc *p = td->td_proc;
777 	sigset_t sigmask;
778 
779 #ifdef DEBUG
780 	if (ldebug(pause))
781 		printf(ARGS(pause, ""));
782 #endif
783 
784 	PROC_LOCK(p);
785 	sigmask = td->td_sigmask;
786 	PROC_UNLOCK(p);
787 	return (kern_sigsuspend(td, sigmask));
788 }
789 
790 int
791 linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
792 {
793 	stack_t ss, oss;
794 	l_stack_t lss;
795 	int error;
796 
797 #ifdef DEBUG
798 	if (ldebug(sigaltstack))
799 		printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
800 #endif
801 
802 	if (uap->uss != NULL) {
803 		error = copyin(uap->uss, &lss, sizeof(l_stack_t));
804 		if (error)
805 			return (error);
806 
807 		ss.ss_sp = PTRIN(lss.ss_sp);
808 		ss.ss_size = lss.ss_size;
809 		ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
810 	}
811 	error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
812 	    (uap->uoss != NULL) ? &oss : NULL);
813 	if (!error && uap->uoss != NULL) {
814 		lss.ss_sp = PTROUT(oss.ss_sp);
815 		lss.ss_size = oss.ss_size;
816 		lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
817 		error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
818 	}
819 
820 	return (error);
821 }
822 
823 int
824 linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
825 {
826 	struct ftruncate_args sa;
827 
828 #ifdef DEBUG
829 	if (ldebug(ftruncate64))
830 		printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
831 		    (intmax_t)args->length);
832 #endif
833 
834 	sa.fd = args->fd;
835 	sa.length = args->length;
836 	return sys_ftruncate(td, &sa);
837 }
838 
839 int
840 linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap)
841 {
842 	struct timeval atv;
843 	l_timeval atv32;
844 	struct timezone rtz;
845 	int error = 0;
846 
847 	if (uap->tp) {
848 		microtime(&atv);
849 		atv32.tv_sec = atv.tv_sec;
850 		atv32.tv_usec = atv.tv_usec;
851 		error = copyout(&atv32, uap->tp, sizeof(atv32));
852 	}
853 	if (error == 0 && uap->tzp != NULL) {
854 		rtz.tz_minuteswest = tz_minuteswest;
855 		rtz.tz_dsttime = tz_dsttime;
856 		error = copyout(&rtz, uap->tzp, sizeof(rtz));
857 	}
858 	return (error);
859 }
860 
861 int
862 linux_settimeofday(struct thread *td, struct linux_settimeofday_args *uap)
863 {
864 	l_timeval atv32;
865 	struct timeval atv, *tvp;
866 	struct timezone atz, *tzp;
867 	int error;
868 
869 	if (uap->tp) {
870 		error = copyin(uap->tp, &atv32, sizeof(atv32));
871 		if (error)
872 			return (error);
873 		atv.tv_sec = atv32.tv_sec;
874 		atv.tv_usec = atv32.tv_usec;
875 		tvp = &atv;
876 	} else
877 		tvp = NULL;
878 	if (uap->tzp) {
879 		error = copyin(uap->tzp, &atz, sizeof(atz));
880 		if (error)
881 			return (error);
882 		tzp = &atz;
883 	} else
884 		tzp = NULL;
885 	return (kern_settimeofday(td, tvp, tzp));
886 }
887 
888 int
889 linux_getrusage(struct thread *td, struct linux_getrusage_args *uap)
890 {
891 	struct rusage s;
892 	int error;
893 
894 	error = kern_getrusage(td, uap->who, &s);
895 	if (error != 0)
896 		return (error);
897 	if (uap->rusage != NULL)
898 		error = linux_copyout_rusage(&s, uap->rusage);
899 	return (error);
900 }
901 
902 int
903 linux_set_thread_area(struct thread *td,
904     struct linux_set_thread_area_args *args)
905 {
906 	struct l_user_desc info;
907 	struct user_segment_descriptor sd;
908 	struct pcb *pcb;
909 	int a[2];
910 	int error;
911 
912 	error = copyin(args->desc, &info, sizeof(struct l_user_desc));
913 	if (error)
914 		return (error);
915 
916 #ifdef DEBUG
917 	if (ldebug(set_thread_area))
918 		printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, "
919 		    "%i, %i, %i"), info.entry_number, info.base_addr,
920 		    info.limit, info.seg_32bit, info.contents,
921 		    info.read_exec_only, info.limit_in_pages,
922 		    info.seg_not_present, info.useable);
923 #endif
924 
925 	/*
926 	 * Semantics of Linux version: every thread in the system has array
927 	 * of three TLS descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown.
928 	 * This syscall loads one of the selected TLS decriptors with a value
929 	 * and also loads GDT descriptors 6, 7 and 8 with the content of
930 	 * the per-thread descriptors.
931 	 *
932 	 * Semantics of FreeBSD version: I think we can ignore that Linux has
933 	 * three per-thread descriptors and use just the first one.
934 	 * The tls_array[] is used only in [gs]et_thread_area() syscalls and
935 	 * for loading the GDT descriptors. We use just one GDT descriptor
936 	 * for TLS, so we will load just one.
937 	 *
938 	 * XXX: This doesn't work when a user space process tries to use more
939 	 * than one TLS segment. Comment in the Linux source says wine might
940 	 * do this.
941 	 */
942 
943 	/*
944 	 * GLIBC reads current %gs and call set_thread_area() with it.
945 	 * We should let GUDATA_SEL and GUGS32_SEL proceed as well because
946 	 * we use these segments.
947 	 */
948 	switch (info.entry_number) {
949 	case GUGS32_SEL:
950 	case GUDATA_SEL:
951 	case 6:
952 	case -1:
953 		info.entry_number = GUGS32_SEL;
954 		break;
955 	default:
956 		return (EINVAL);
957 	}
958 
959 	/*
960 	 * We have to copy out the GDT entry we use.
961 	 *
962 	 * XXX: What if a user space program does not check the return value
963 	 * and tries to use 6, 7 or 8?
964 	 */
965 	error = copyout(&info, args->desc, sizeof(struct l_user_desc));
966 	if (error)
967 		return (error);
968 
969 	if (LINUX_LDT_empty(&info)) {
970 		a[0] = 0;
971 		a[1] = 0;
972 	} else {
973 		a[0] = LINUX_LDT_entry_a(&info);
974 		a[1] = LINUX_LDT_entry_b(&info);
975 	}
976 
977 	memcpy(&sd, &a, sizeof(a));
978 #ifdef DEBUG
979 	if (ldebug(set_thread_area))
980 		printf("Segment created in set_thread_area: "
981 		    "lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, "
982 		    "type: %i, dpl: %i, p: %i, xx: %i, long: %i, "
983 		    "def32: %i, gran: %i\n",
984 		    sd.sd_lobase,
985 		    sd.sd_hibase,
986 		    sd.sd_lolimit,
987 		    sd.sd_hilimit,
988 		    sd.sd_type,
989 		    sd.sd_dpl,
990 		    sd.sd_p,
991 		    sd.sd_xx,
992 		    sd.sd_long,
993 		    sd.sd_def32,
994 		    sd.sd_gran);
995 #endif
996 
997 	pcb = td->td_pcb;
998 	pcb->pcb_gsbase = (register_t)info.base_addr;
999 	set_pcb_flags(pcb, PCB_32BIT);
1000 	update_gdt_gsbase(td, info.base_addr);
1001 
1002 	return (0);
1003 }
1004