xref: /freebsd/sys/amd64/linux32/linux32_machdep.c (revision 5f0216bd883edee71bf81051e3c20505e4820903)
1 /*-
2  * Copyright (c) 2004 Tim J. Robbins
3  * Copyright (c) 2002 Doug Rabson
4  * Copyright (c) 2000 Marcel Moolenaar
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer
12  *    in this position and unchanged.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. The name of the author may not be used to endorse or promote products
17  *    derived from this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_compat.h"
35 
36 #include <sys/param.h>
37 #include <sys/kernel.h>
38 #include <sys/systm.h>
39 #include <sys/capsicum.h>
40 #include <sys/file.h>
41 #include <sys/fcntl.h>
42 #include <sys/clock.h>
43 #include <sys/imgact.h>
44 #include <sys/limits.h>
45 #include <sys/lock.h>
46 #include <sys/malloc.h>
47 #include <sys/mman.h>
48 #include <sys/mutex.h>
49 #include <sys/priv.h>
50 #include <sys/proc.h>
51 #include <sys/resource.h>
52 #include <sys/resourcevar.h>
53 #include <sys/syscallsubr.h>
54 #include <sys/sysproto.h>
55 #include <sys/unistd.h>
56 #include <sys/wait.h>
57 
58 #include <machine/frame.h>
59 #include <machine/pcb.h>
60 #include <machine/psl.h>
61 #include <machine/segments.h>
62 #include <machine/specialreg.h>
63 
64 #include <vm/vm.h>
65 #include <vm/pmap.h>
66 #include <vm/vm_map.h>
67 
68 #include <compat/freebsd32/freebsd32_util.h>
69 #include <amd64/linux32/linux.h>
70 #include <amd64/linux32/linux32_proto.h>
71 #include <compat/linux/linux_ipc.h>
72 #include <compat/linux/linux_misc.h>
73 #include <compat/linux/linux_signal.h>
74 #include <compat/linux/linux_util.h>
75 #include <compat/linux/linux_emul.h>
76 
/* Converts a native rusage into the Linux layout; defined below. */
static void	bsd_to_linux_rusage(struct rusage *ru, struct l_rusage *lru);

/*
 * Argument block for the legacy select entry point.  linux_old_select()
 * copies one of these in from user space and unpacks it; the four
 * l_uintptr_t members hold user addresses and are converted with PTRIN()
 * before being passed on.
 */
struct l_old_select_argv {
	l_int		nfds;
	l_uintptr_t	readfds;
	l_uintptr_t	writefds;
	l_uintptr_t	exceptfds;
	l_uintptr_t	timeout;
} __packed;

/* Shared back end of linux_mmap() and linux_mmap2(); defined below. */
static int	linux_mmap_common(struct thread *td, l_uintptr_t addr,
		    l_size_t len, l_int prot, l_int flags, l_int fd,
		    l_loff_t pos);
90 
91 static void
92 bsd_to_linux_rusage(struct rusage *ru, struct l_rusage *lru)
93 {
94 
95 	lru->ru_utime.tv_sec = ru->ru_utime.tv_sec;
96 	lru->ru_utime.tv_usec = ru->ru_utime.tv_usec;
97 	lru->ru_stime.tv_sec = ru->ru_stime.tv_sec;
98 	lru->ru_stime.tv_usec = ru->ru_stime.tv_usec;
99 	lru->ru_maxrss = ru->ru_maxrss;
100 	lru->ru_ixrss = ru->ru_ixrss;
101 	lru->ru_idrss = ru->ru_idrss;
102 	lru->ru_isrss = ru->ru_isrss;
103 	lru->ru_minflt = ru->ru_minflt;
104 	lru->ru_majflt = ru->ru_majflt;
105 	lru->ru_nswap = ru->ru_nswap;
106 	lru->ru_inblock = ru->ru_inblock;
107 	lru->ru_oublock = ru->ru_oublock;
108 	lru->ru_msgsnd = ru->ru_msgsnd;
109 	lru->ru_msgrcv = ru->ru_msgrcv;
110 	lru->ru_nsignals = ru->ru_nsignals;
111 	lru->ru_nvcsw = ru->ru_nvcsw;
112 	lru->ru_nivcsw = ru->ru_nivcsw;
113 }
114 
115 int
116 linux_copyout_rusage(struct rusage *ru, void *uaddr)
117 {
118 	struct l_rusage lru;
119 
120 	bsd_to_linux_rusage(ru, &lru);
121 
122 	return (copyout(&lru, uaddr, sizeof(struct l_rusage)));
123 }
124 
125 int
126 linux_execve(struct thread *td, struct linux_execve_args *args)
127 {
128 	struct image_args eargs;
129 	char *path;
130 	int error;
131 
132 	LCONVPATHEXIST(td, args->path, &path);
133 
134 #ifdef DEBUG
135 	if (ldebug(execve))
136 		printf(ARGS(execve, "%s"), path);
137 #endif
138 
139 	error = freebsd32_exec_copyin_args(&eargs, path, UIO_SYSSPACE,
140 	    args->argp, args->envp);
141 	free(path, M_TEMP);
142 	if (error == 0)
143 		error = linux_common_execve(td, &eargs);
144 	return (error);
145 }
146 
147 CTASSERT(sizeof(struct l_iovec32) == 8);
148 
149 static int
150 linux32_copyinuio(struct l_iovec32 *iovp, l_ulong iovcnt, struct uio **uiop)
151 {
152 	struct l_iovec32 iov32;
153 	struct iovec *iov;
154 	struct uio *uio;
155 	uint32_t iovlen;
156 	int error, i;
157 
158 	*uiop = NULL;
159 	if (iovcnt > UIO_MAXIOV)
160 		return (EINVAL);
161 	iovlen = iovcnt * sizeof(struct iovec);
162 	uio = malloc(iovlen + sizeof(*uio), M_IOV, M_WAITOK);
163 	iov = (struct iovec *)(uio + 1);
164 	for (i = 0; i < iovcnt; i++) {
165 		error = copyin(&iovp[i], &iov32, sizeof(struct l_iovec32));
166 		if (error) {
167 			free(uio, M_IOV);
168 			return (error);
169 		}
170 		iov[i].iov_base = PTRIN(iov32.iov_base);
171 		iov[i].iov_len = iov32.iov_len;
172 	}
173 	uio->uio_iov = iov;
174 	uio->uio_iovcnt = iovcnt;
175 	uio->uio_segflg = UIO_USERSPACE;
176 	uio->uio_offset = -1;
177 	uio->uio_resid = 0;
178 	for (i = 0; i < iovcnt; i++) {
179 		if (iov->iov_len > INT_MAX - uio->uio_resid) {
180 			free(uio, M_IOV);
181 			return (EINVAL);
182 		}
183 		uio->uio_resid += iov->iov_len;
184 		iov++;
185 	}
186 	*uiop = uio;
187 	return (0);
188 }
189 
190 int
191 linux32_copyiniov(struct l_iovec32 *iovp32, l_ulong iovcnt, struct iovec **iovp,
192     int error)
193 {
194 	struct l_iovec32 iov32;
195 	struct iovec *iov;
196 	uint32_t iovlen;
197 	int i;
198 
199 	*iovp = NULL;
200 	if (iovcnt > UIO_MAXIOV)
201 		return (error);
202 	iovlen = iovcnt * sizeof(struct iovec);
203 	iov = malloc(iovlen, M_IOV, M_WAITOK);
204 	for (i = 0; i < iovcnt; i++) {
205 		error = copyin(&iovp32[i], &iov32, sizeof(struct l_iovec32));
206 		if (error) {
207 			free(iov, M_IOV);
208 			return (error);
209 		}
210 		iov[i].iov_base = PTRIN(iov32.iov_base);
211 		iov[i].iov_len = iov32.iov_len;
212 	}
213 	*iovp = iov;
214 	return(0);
215 
216 }
217 
218 int
219 linux_readv(struct thread *td, struct linux_readv_args *uap)
220 {
221 	struct uio *auio;
222 	int error;
223 
224 	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
225 	if (error)
226 		return (error);
227 	error = kern_readv(td, uap->fd, auio);
228 	free(auio, M_IOV);
229 	return (error);
230 }
231 
232 int
233 linux_writev(struct thread *td, struct linux_writev_args *uap)
234 {
235 	struct uio *auio;
236 	int error;
237 
238 	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
239 	if (error)
240 		return (error);
241 	error = kern_writev(td, uap->fd, auio);
242 	free(auio, M_IOV);
243 	return (error);
244 }
245 
/*
 * Indirect argument block read by the LINUX_MSGRCV case of linux_ipc()
 * when the upper 16 bits of 'what' are zero: the user pointer then
 * refers to one of these instead of to the message buffer itself.
 */
struct l_ipc_kludge {
	l_uintptr_t msgp;	/* user address of the message buffer */
	l_long msgtyp;		/* requested message type */
} __packed;
250 
251 int
252 linux_ipc(struct thread *td, struct linux_ipc_args *args)
253 {
254 
255 	switch (args->what & 0xFFFF) {
256 	case LINUX_SEMOP: {
257 		struct linux_semop_args a;
258 
259 		a.semid = args->arg1;
260 		a.tsops = args->ptr;
261 		a.nsops = args->arg2;
262 		return (linux_semop(td, &a));
263 	}
264 	case LINUX_SEMGET: {
265 		struct linux_semget_args a;
266 
267 		a.key = args->arg1;
268 		a.nsems = args->arg2;
269 		a.semflg = args->arg3;
270 		return (linux_semget(td, &a));
271 	}
272 	case LINUX_SEMCTL: {
273 		struct linux_semctl_args a;
274 		int error;
275 
276 		a.semid = args->arg1;
277 		a.semnum = args->arg2;
278 		a.cmd = args->arg3;
279 		error = copyin(args->ptr, &a.arg, sizeof(a.arg));
280 		if (error)
281 			return (error);
282 		return (linux_semctl(td, &a));
283 	}
284 	case LINUX_MSGSND: {
285 		struct linux_msgsnd_args a;
286 
287 		a.msqid = args->arg1;
288 		a.msgp = args->ptr;
289 		a.msgsz = args->arg2;
290 		a.msgflg = args->arg3;
291 		return (linux_msgsnd(td, &a));
292 	}
293 	case LINUX_MSGRCV: {
294 		struct linux_msgrcv_args a;
295 
296 		a.msqid = args->arg1;
297 		a.msgsz = args->arg2;
298 		a.msgflg = args->arg3;
299 		if ((args->what >> 16) == 0) {
300 			struct l_ipc_kludge tmp;
301 			int error;
302 
303 			if (args->ptr == 0)
304 				return (EINVAL);
305 			error = copyin(args->ptr, &tmp, sizeof(tmp));
306 			if (error)
307 				return (error);
308 			a.msgp = PTRIN(tmp.msgp);
309 			a.msgtyp = tmp.msgtyp;
310 		} else {
311 			a.msgp = args->ptr;
312 			a.msgtyp = args->arg5;
313 		}
314 		return (linux_msgrcv(td, &a));
315 	}
316 	case LINUX_MSGGET: {
317 		struct linux_msgget_args a;
318 
319 		a.key = args->arg1;
320 		a.msgflg = args->arg2;
321 		return (linux_msgget(td, &a));
322 	}
323 	case LINUX_MSGCTL: {
324 		struct linux_msgctl_args a;
325 
326 		a.msqid = args->arg1;
327 		a.cmd = args->arg2;
328 		a.buf = args->ptr;
329 		return (linux_msgctl(td, &a));
330 	}
331 	case LINUX_SHMAT: {
332 		struct linux_shmat_args a;
333 
334 		a.shmid = args->arg1;
335 		a.shmaddr = args->ptr;
336 		a.shmflg = args->arg2;
337 		a.raddr = PTRIN((l_uint)args->arg3);
338 		return (linux_shmat(td, &a));
339 	}
340 	case LINUX_SHMDT: {
341 		struct linux_shmdt_args a;
342 
343 		a.shmaddr = args->ptr;
344 		return (linux_shmdt(td, &a));
345 	}
346 	case LINUX_SHMGET: {
347 		struct linux_shmget_args a;
348 
349 		a.key = args->arg1;
350 		a.size = args->arg2;
351 		a.shmflg = args->arg3;
352 		return (linux_shmget(td, &a));
353 	}
354 	case LINUX_SHMCTL: {
355 		struct linux_shmctl_args a;
356 
357 		a.shmid = args->arg1;
358 		a.cmd = args->arg2;
359 		a.buf = args->ptr;
360 		return (linux_shmctl(td, &a));
361 	}
362 	default:
363 		break;
364 	}
365 
366 	return (EINVAL);
367 }
368 
369 int
370 linux_old_select(struct thread *td, struct linux_old_select_args *args)
371 {
372 	struct l_old_select_argv linux_args;
373 	struct linux_select_args newsel;
374 	int error;
375 
376 #ifdef DEBUG
377 	if (ldebug(old_select))
378 		printf(ARGS(old_select, "%p"), args->ptr);
379 #endif
380 
381 	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
382 	if (error)
383 		return (error);
384 
385 	newsel.nfds = linux_args.nfds;
386 	newsel.readfds = PTRIN(linux_args.readfds);
387 	newsel.writefds = PTRIN(linux_args.writefds);
388 	newsel.exceptfds = PTRIN(linux_args.exceptfds);
389 	newsel.timeout = PTRIN(linux_args.timeout);
390 	return (linux_select(td, &newsel));
391 }
392 
393 int
394 linux_set_cloned_tls(struct thread *td, void *desc)
395 {
396 	struct user_segment_descriptor sd;
397 	struct l_user_desc info;
398 	struct pcb *pcb;
399 	int error;
400 	int a[2];
401 
402 	error = copyin(desc, &info, sizeof(struct l_user_desc));
403 	if (error) {
404 		printf(LMSG("copyin failed!"));
405 	} else {
406 		/* We might copy out the entry_number as GUGS32_SEL. */
407 		info.entry_number = GUGS32_SEL;
408 		error = copyout(&info, desc, sizeof(struct l_user_desc));
409 		if (error)
410 			printf(LMSG("copyout failed!"));
411 
412 		a[0] = LINUX_LDT_entry_a(&info);
413 		a[1] = LINUX_LDT_entry_b(&info);
414 
415 		memcpy(&sd, &a, sizeof(a));
416 #ifdef DEBUG
417 		if (ldebug(clone))
418 			printf("Segment created in clone with "
419 			    "CLONE_SETTLS: lobase: %x, hibase: %x, "
420 			    "lolimit: %x, hilimit: %x, type: %i, "
421 			    "dpl: %i, p: %i, xx: %i, long: %i, "
422 			    "def32: %i, gran: %i\n", sd.sd_lobase,
423 			    sd.sd_hibase, sd.sd_lolimit, sd.sd_hilimit,
424 			    sd.sd_type, sd.sd_dpl, sd.sd_p, sd.sd_xx,
425 			    sd.sd_long, sd.sd_def32, sd.sd_gran);
426 #endif
427 		pcb = td->td_pcb;
428 		pcb->pcb_gsbase = (register_t)info.base_addr;
429 /* XXXKIB	pcb->pcb_gs32sd = sd; */
430 		td->td_frame->tf_gs = GSEL(GUGS32_SEL, SEL_UPL);
431 		set_pcb_flags(pcb, PCB_32BIT);
432 	}
433 
434 	return (error);
435 }
436 
437 int
438 linux_set_upcall_kse(struct thread *td, register_t stack)
439 {
440 
441 	if (stack)
442 		td->td_frame->tf_rsp = stack;
443 
444 	/*
445 	 * The newly created Linux thread returns
446 	 * to the user space by the same path that a parent do.
447 	 */
448 	td->td_frame->tf_rax = 0;
449 	return (0);
450 }
451 
/*
 * Sizing constants used by linux_mmap_common() for MAP_GROWSDOWN
 * mappings: such a region is limited to (STACK_SIZE - GUARD_SIZE) bytes
 * unless the caller asks for more.
 */
#define STACK_SIZE  (2 * 1024 * 1024)
#define GUARD_SIZE  (4 * PAGE_SIZE)
454 
455 int
456 linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
457 {
458 
459 #ifdef DEBUG
460 	if (ldebug(mmap2))
461 		printf(ARGS(mmap2, "0x%08x, %d, %d, 0x%08x, %d, %d"),
462 		    args->addr, args->len, args->prot,
463 		    args->flags, args->fd, args->pgoff);
464 #endif
465 
466 	return (linux_mmap_common(td, PTROUT(args->addr), args->len, args->prot,
467 		args->flags, args->fd, (uint64_t)(uint32_t)args->pgoff *
468 		PAGE_SIZE));
469 }
470 
471 int
472 linux_mmap(struct thread *td, struct linux_mmap_args *args)
473 {
474 	int error;
475 	struct l_mmap_argv linux_args;
476 
477 	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
478 	if (error)
479 		return (error);
480 
481 #ifdef DEBUG
482 	if (ldebug(mmap))
483 		printf(ARGS(mmap, "0x%08x, %d, %d, 0x%08x, %d, %d"),
484 		    linux_args.addr, linux_args.len, linux_args.prot,
485 		    linux_args.flags, linux_args.fd, linux_args.pgoff);
486 #endif
487 
488 	return (linux_mmap_common(td, linux_args.addr, linux_args.len,
489 	    linux_args.prot, linux_args.flags, linux_args.fd,
490 	    (uint32_t)linux_args.pgoff));
491 }
492 
493 static int
494 linux_mmap_common(struct thread *td, l_uintptr_t addr, l_size_t len, l_int prot,
495     l_int flags, l_int fd, l_loff_t pos)
496 {
497 	struct proc *p = td->td_proc;
498 	struct mmap_args /* {
499 		caddr_t addr;
500 		size_t len;
501 		int prot;
502 		int flags;
503 		int fd;
504 		long pad;
505 		off_t pos;
506 	} */ bsd_args;
507 	int error;
508 	struct file *fp;
509 	cap_rights_t rights;
510 
511 	error = 0;
512 	bsd_args.flags = 0;
513 	fp = NULL;
514 
515 	/*
516 	 * Linux mmap(2):
517 	 * You must specify exactly one of MAP_SHARED and MAP_PRIVATE
518 	 */
519 	if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE)))
520 		return (EINVAL);
521 
522 	if (flags & LINUX_MAP_SHARED)
523 		bsd_args.flags |= MAP_SHARED;
524 	if (flags & LINUX_MAP_PRIVATE)
525 		bsd_args.flags |= MAP_PRIVATE;
526 	if (flags & LINUX_MAP_FIXED)
527 		bsd_args.flags |= MAP_FIXED;
528 	if (flags & LINUX_MAP_ANON) {
529 		/* Enforce pos to be on page boundary, then ignore. */
530 		if ((pos & PAGE_MASK) != 0)
531 			return (EINVAL);
532 		pos = 0;
533 		bsd_args.flags |= MAP_ANON;
534 	} else
535 		bsd_args.flags |= MAP_NOSYNC;
536 	if (flags & LINUX_MAP_GROWSDOWN)
537 		bsd_args.flags |= MAP_STACK;
538 
539 	/*
540 	 * PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC
541 	 * on Linux/i386. We do this to ensure maximum compatibility.
542 	 * Linux/ia64 does the same in i386 emulation mode.
543 	 */
544 	bsd_args.prot = prot;
545 	if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
546 		bsd_args.prot |= PROT_READ | PROT_EXEC;
547 
548 	/* Linux does not check file descriptor when MAP_ANONYMOUS is set. */
549 	bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : fd;
550 	if (bsd_args.fd != -1) {
551 		/*
552 		 * Linux follows Solaris mmap(2) description:
553 		 * The file descriptor fildes is opened with
554 		 * read permission, regardless of the
555 		 * protection options specified.
556 		 */
557 
558 		error = fget(td, bsd_args.fd,
559 		    cap_rights_init(&rights, CAP_MMAP), &fp);
560 		if (error != 0)
561 			return (error);
562 		if (fp->f_type != DTYPE_VNODE) {
563 			fdrop(fp, td);
564 			return (EINVAL);
565 		}
566 
567 		/* Linux mmap() just fails for O_WRONLY files */
568 		if (!(fp->f_flag & FREAD)) {
569 			fdrop(fp, td);
570 			return (EACCES);
571 		}
572 
573 		fdrop(fp, td);
574 	}
575 
576 	if (flags & LINUX_MAP_GROWSDOWN) {
577 		/*
578 		 * The Linux MAP_GROWSDOWN option does not limit auto
579 		 * growth of the region.  Linux mmap with this option
580 		 * takes as addr the inital BOS, and as len, the initial
581 		 * region size.  It can then grow down from addr without
582 		 * limit.  However, Linux threads has an implicit internal
583 		 * limit to stack size of STACK_SIZE.  Its just not
584 		 * enforced explicitly in Linux.  But, here we impose
585 		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
586 		 * region, since we can do this with our mmap.
587 		 *
588 		 * Our mmap with MAP_STACK takes addr as the maximum
589 		 * downsize limit on BOS, and as len the max size of
590 		 * the region.  It then maps the top SGROWSIZ bytes,
591 		 * and auto grows the region down, up to the limit
592 		 * in addr.
593 		 *
594 		 * If we don't use the MAP_STACK option, the effect
595 		 * of this code is to allocate a stack region of a
596 		 * fixed size of (STACK_SIZE - GUARD_SIZE).
597 		 */
598 
599 		if ((caddr_t)PTRIN(addr) + len > p->p_vmspace->vm_maxsaddr) {
600 			/*
601 			 * Some Linux apps will attempt to mmap
602 			 * thread stacks near the top of their
603 			 * address space.  If their TOS is greater
604 			 * than vm_maxsaddr, vm_map_growstack()
605 			 * will confuse the thread stack with the
606 			 * process stack and deliver a SEGV if they
607 			 * attempt to grow the thread stack past their
608 			 * current stacksize rlimit.  To avoid this,
609 			 * adjust vm_maxsaddr upwards to reflect
610 			 * the current stacksize rlimit rather
611 			 * than the maximum possible stacksize.
612 			 * It would be better to adjust the
613 			 * mmap'ed region, but some apps do not check
614 			 * mmap's return value.
615 			 */
616 			PROC_LOCK(p);
617 			p->p_vmspace->vm_maxsaddr = (char *)LINUX32_USRSTACK -
618 			    lim_cur_proc(p, RLIMIT_STACK);
619 			PROC_UNLOCK(p);
620 		}
621 
622 		/*
623 		 * This gives us our maximum stack size and a new BOS.
624 		 * If we're using VM_STACK, then mmap will just map
625 		 * the top SGROWSIZ bytes, and let the stack grow down
626 		 * to the limit at BOS.  If we're not using VM_STACK
627 		 * we map the full stack, since we don't have a way
628 		 * to autogrow it.
629 		 */
630 		if (len > STACK_SIZE - GUARD_SIZE) {
631 			bsd_args.addr = (caddr_t)PTRIN(addr);
632 			bsd_args.len = len;
633 		} else {
634 			bsd_args.addr = (caddr_t)PTRIN(addr) -
635 			    (STACK_SIZE - GUARD_SIZE - len);
636 			bsd_args.len = STACK_SIZE - GUARD_SIZE;
637 		}
638 	} else {
639 		bsd_args.addr = (caddr_t)PTRIN(addr);
640 		bsd_args.len  = len;
641 	}
642 	bsd_args.pos = pos;
643 
644 #ifdef DEBUG
645 	if (ldebug(mmap))
646 		printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
647 		    __func__,
648 		    (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot,
649 		    bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
650 #endif
651 	error = sys_mmap(td, &bsd_args);
652 #ifdef DEBUG
653 	if (ldebug(mmap))
654 		printf("-> %s() return: 0x%x (0x%08x)\n",
655 			__func__, error, (u_int)td->td_retval[0]);
656 #endif
657 	return (error);
658 }
659 
660 int
661 linux_mprotect(struct thread *td, struct linux_mprotect_args *uap)
662 {
663 	struct mprotect_args bsd_args;
664 
665 	bsd_args.addr = uap->addr;
666 	bsd_args.len = uap->len;
667 	bsd_args.prot = uap->prot;
668 	if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
669 		bsd_args.prot |= PROT_READ | PROT_EXEC;
670 	return (sys_mprotect(td, &bsd_args));
671 }
672 
673 int
674 linux_iopl(struct thread *td, struct linux_iopl_args *args)
675 {
676 	int error;
677 
678 	if (args->level < 0 || args->level > 3)
679 		return (EINVAL);
680 	if ((error = priv_check(td, PRIV_IO)) != 0)
681 		return (error);
682 	if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
683 		return (error);
684 	td->td_frame->tf_rflags = (td->td_frame->tf_rflags & ~PSL_IOPL) |
685 	    (args->level * (PSL_IOPL / 3));
686 
687 	return (0);
688 }
689 
690 int
691 linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
692 {
693 	l_osigaction_t osa;
694 	l_sigaction_t act, oact;
695 	int error;
696 
697 #ifdef DEBUG
698 	if (ldebug(sigaction))
699 		printf(ARGS(sigaction, "%d, %p, %p"),
700 		    args->sig, (void *)args->nsa, (void *)args->osa);
701 #endif
702 
703 	if (args->nsa != NULL) {
704 		error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
705 		if (error)
706 			return (error);
707 		act.lsa_handler = osa.lsa_handler;
708 		act.lsa_flags = osa.lsa_flags;
709 		act.lsa_restorer = osa.lsa_restorer;
710 		LINUX_SIGEMPTYSET(act.lsa_mask);
711 		act.lsa_mask.__mask = osa.lsa_mask;
712 	}
713 
714 	error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
715 	    args->osa ? &oact : NULL);
716 
717 	if (args->osa != NULL && !error) {
718 		osa.lsa_handler = oact.lsa_handler;
719 		osa.lsa_flags = oact.lsa_flags;
720 		osa.lsa_restorer = oact.lsa_restorer;
721 		osa.lsa_mask = oact.lsa_mask.__mask;
722 		error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
723 	}
724 
725 	return (error);
726 }
727 
728 /*
729  * Linux has two extra args, restart and oldmask.  We don't use these,
730  * but it seems that "restart" is actually a context pointer that
731  * enables the signal to happen with a different register set.
732  */
733 int
734 linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
735 {
736 	sigset_t sigmask;
737 	l_sigset_t mask;
738 
739 #ifdef DEBUG
740 	if (ldebug(sigsuspend))
741 		printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
742 #endif
743 
744 	LINUX_SIGEMPTYSET(mask);
745 	mask.__mask = args->mask;
746 	linux_to_bsd_sigset(&mask, &sigmask);
747 	return (kern_sigsuspend(td, sigmask));
748 }
749 
750 int
751 linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
752 {
753 	l_sigset_t lmask;
754 	sigset_t sigmask;
755 	int error;
756 
757 #ifdef DEBUG
758 	if (ldebug(rt_sigsuspend))
759 		printf(ARGS(rt_sigsuspend, "%p, %d"),
760 		    (void *)uap->newset, uap->sigsetsize);
761 #endif
762 
763 	if (uap->sigsetsize != sizeof(l_sigset_t))
764 		return (EINVAL);
765 
766 	error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
767 	if (error)
768 		return (error);
769 
770 	linux_to_bsd_sigset(&lmask, &sigmask);
771 	return (kern_sigsuspend(td, sigmask));
772 }
773 
774 int
775 linux_pause(struct thread *td, struct linux_pause_args *args)
776 {
777 	struct proc *p = td->td_proc;
778 	sigset_t sigmask;
779 
780 #ifdef DEBUG
781 	if (ldebug(pause))
782 		printf(ARGS(pause, ""));
783 #endif
784 
785 	PROC_LOCK(p);
786 	sigmask = td->td_sigmask;
787 	PROC_UNLOCK(p);
788 	return (kern_sigsuspend(td, sigmask));
789 }
790 
791 int
792 linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
793 {
794 	stack_t ss, oss;
795 	l_stack_t lss;
796 	int error;
797 
798 #ifdef DEBUG
799 	if (ldebug(sigaltstack))
800 		printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
801 #endif
802 
803 	if (uap->uss != NULL) {
804 		error = copyin(uap->uss, &lss, sizeof(l_stack_t));
805 		if (error)
806 			return (error);
807 
808 		ss.ss_sp = PTRIN(lss.ss_sp);
809 		ss.ss_size = lss.ss_size;
810 		ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
811 	}
812 	error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
813 	    (uap->uoss != NULL) ? &oss : NULL);
814 	if (!error && uap->uoss != NULL) {
815 		lss.ss_sp = PTROUT(oss.ss_sp);
816 		lss.ss_size = oss.ss_size;
817 		lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
818 		error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
819 	}
820 
821 	return (error);
822 }
823 
824 int
825 linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
826 {
827 	struct ftruncate_args sa;
828 
829 #ifdef DEBUG
830 	if (ldebug(ftruncate64))
831 		printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
832 		    (intmax_t)args->length);
833 #endif
834 
835 	sa.fd = args->fd;
836 	sa.length = args->length;
837 	return sys_ftruncate(td, &sa);
838 }
839 
840 int
841 linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap)
842 {
843 	struct timeval atv;
844 	l_timeval atv32;
845 	struct timezone rtz;
846 	int error = 0;
847 
848 	if (uap->tp) {
849 		microtime(&atv);
850 		atv32.tv_sec = atv.tv_sec;
851 		atv32.tv_usec = atv.tv_usec;
852 		error = copyout(&atv32, uap->tp, sizeof(atv32));
853 	}
854 	if (error == 0 && uap->tzp != NULL) {
855 		rtz.tz_minuteswest = tz_minuteswest;
856 		rtz.tz_dsttime = tz_dsttime;
857 		error = copyout(&rtz, uap->tzp, sizeof(rtz));
858 	}
859 	return (error);
860 }
861 
862 int
863 linux_settimeofday(struct thread *td, struct linux_settimeofday_args *uap)
864 {
865 	l_timeval atv32;
866 	struct timeval atv, *tvp;
867 	struct timezone atz, *tzp;
868 	int error;
869 
870 	if (uap->tp) {
871 		error = copyin(uap->tp, &atv32, sizeof(atv32));
872 		if (error)
873 			return (error);
874 		atv.tv_sec = atv32.tv_sec;
875 		atv.tv_usec = atv32.tv_usec;
876 		tvp = &atv;
877 	} else
878 		tvp = NULL;
879 	if (uap->tzp) {
880 		error = copyin(uap->tzp, &atz, sizeof(atz));
881 		if (error)
882 			return (error);
883 		tzp = &atz;
884 	} else
885 		tzp = NULL;
886 	return (kern_settimeofday(td, tvp, tzp));
887 }
888 
889 int
890 linux_getrusage(struct thread *td, struct linux_getrusage_args *uap)
891 {
892 	struct rusage s;
893 	int error;
894 
895 	error = kern_getrusage(td, uap->who, &s);
896 	if (error != 0)
897 		return (error);
898 	if (uap->rusage != NULL)
899 		error = linux_copyout_rusage(&s, uap->rusage);
900 	return (error);
901 }
902 
903 int
904 linux_set_thread_area(struct thread *td,
905     struct linux_set_thread_area_args *args)
906 {
907 	struct l_user_desc info;
908 	struct user_segment_descriptor sd;
909 	struct pcb *pcb;
910 	int a[2];
911 	int error;
912 
913 	error = copyin(args->desc, &info, sizeof(struct l_user_desc));
914 	if (error)
915 		return (error);
916 
917 #ifdef DEBUG
918 	if (ldebug(set_thread_area))
919 		printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, "
920 		    "%i, %i, %i"), info.entry_number, info.base_addr,
921 		    info.limit, info.seg_32bit, info.contents,
922 		    info.read_exec_only, info.limit_in_pages,
923 		    info.seg_not_present, info.useable);
924 #endif
925 
926 	/*
927 	 * Semantics of Linux version: every thread in the system has array
928 	 * of three TLS descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown.
929 	 * This syscall loads one of the selected TLS decriptors with a value
930 	 * and also loads GDT descriptors 6, 7 and 8 with the content of
931 	 * the per-thread descriptors.
932 	 *
933 	 * Semantics of FreeBSD version: I think we can ignore that Linux has
934 	 * three per-thread descriptors and use just the first one.
935 	 * The tls_array[] is used only in [gs]et_thread_area() syscalls and
936 	 * for loading the GDT descriptors. We use just one GDT descriptor
937 	 * for TLS, so we will load just one.
938 	 *
939 	 * XXX: This doesn't work when a user space process tries to use more
940 	 * than one TLS segment. Comment in the Linux source says wine might
941 	 * do this.
942 	 */
943 
944 	/*
945 	 * GLIBC reads current %gs and call set_thread_area() with it.
946 	 * We should let GUDATA_SEL and GUGS32_SEL proceed as well because
947 	 * we use these segments.
948 	 */
949 	switch (info.entry_number) {
950 	case GUGS32_SEL:
951 	case GUDATA_SEL:
952 	case 6:
953 	case -1:
954 		info.entry_number = GUGS32_SEL;
955 		break;
956 	default:
957 		return (EINVAL);
958 	}
959 
960 	/*
961 	 * We have to copy out the GDT entry we use.
962 	 *
963 	 * XXX: What if a user space program does not check the return value
964 	 * and tries to use 6, 7 or 8?
965 	 */
966 	error = copyout(&info, args->desc, sizeof(struct l_user_desc));
967 	if (error)
968 		return (error);
969 
970 	if (LINUX_LDT_empty(&info)) {
971 		a[0] = 0;
972 		a[1] = 0;
973 	} else {
974 		a[0] = LINUX_LDT_entry_a(&info);
975 		a[1] = LINUX_LDT_entry_b(&info);
976 	}
977 
978 	memcpy(&sd, &a, sizeof(a));
979 #ifdef DEBUG
980 	if (ldebug(set_thread_area))
981 		printf("Segment created in set_thread_area: "
982 		    "lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, "
983 		    "type: %i, dpl: %i, p: %i, xx: %i, long: %i, "
984 		    "def32: %i, gran: %i\n",
985 		    sd.sd_lobase,
986 		    sd.sd_hibase,
987 		    sd.sd_lolimit,
988 		    sd.sd_hilimit,
989 		    sd.sd_type,
990 		    sd.sd_dpl,
991 		    sd.sd_p,
992 		    sd.sd_xx,
993 		    sd.sd_long,
994 		    sd.sd_def32,
995 		    sd.sd_gran);
996 #endif
997 
998 	pcb = td->td_pcb;
999 	pcb->pcb_gsbase = (register_t)info.base_addr;
1000 	set_pcb_flags(pcb, PCB_32BIT);
1001 	update_gdt_gsbase(td, info.base_addr);
1002 
1003 	return (0);
1004 }
1005