xref: /freebsd/sys/i386/linux/linux_machdep.c (revision f856af0466c076beef4ea9b15d088e1119a945b8)
1 /*-
2  * Copyright (c) 2000 Marcel Moolenaar
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer
10  *    in this position and unchanged.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. The name of the author may not be used to endorse or promote products
15  *    derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/file.h>
35 #include <sys/fcntl.h>
36 #include <sys/imgact.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mman.h>
40 #include <sys/mutex.h>
41 #include <sys/sx.h>
42 #include <sys/priv.h>
43 #include <sys/proc.h>
44 #include <sys/queue.h>
45 #include <sys/resource.h>
46 #include <sys/resourcevar.h>
47 #include <sys/signalvar.h>
48 #include <sys/syscallsubr.h>
49 #include <sys/sysproto.h>
50 #include <sys/unistd.h>
51 #include <sys/wait.h>
52 
53 #include <machine/frame.h>
54 #include <machine/psl.h>
55 #include <machine/segments.h>
56 #include <machine/sysarch.h>
57 
58 #include <vm/vm.h>
59 #include <vm/pmap.h>
60 #include <vm/vm_map.h>
61 
62 #include <i386/linux/linux.h>
63 #include <i386/linux/linux_proto.h>
64 #include <compat/linux/linux_ipc.h>
65 #include <compat/linux/linux_signal.h>
66 #include <compat/linux/linux_util.h>
67 #include <compat/linux/linux_emul.h>
68 
69 #include <i386/include/pcb.h>			/* needed for pcb definition in linux_set_thread_area */
70 
71 #include "opt_posix.h"
72 
73 extern struct sysentvec elf32_freebsd_sysvec;	/* defined in i386/i386/elf_machdep.c */
74 
/*
 * LDT entry description in the layout copied in from user space by
 * linux_modify_ldt() for the write_ldt functions.
 */
struct l_descriptor {
	l_uint		entry_number;		/* LDT slot to write (becomes ldt.start) */
	l_ulong		base_addr;		/* segment base, split into sd_lobase/sd_hibase */
	l_uint		limit;			/* segment limit, split into sd_lolimit/sd_hilimit */
	l_uint		seg_32bit:1;		/* copied to sd_def32 */
	l_uint		contents:2;		/* folded into sd_type (shifted left by 2) */
	l_uint		read_exec_only:1;	/* inverted to form the sd_type bit 1 */
	l_uint		limit_in_pages:1;	/* copied to sd_gran */
	l_uint		seg_not_present:1;	/* inverted to form sd_p */
	l_uint		useable:1;		/* not consumed by linux_modify_ldt() */
};
86 
/*
 * Argument block that linux_old_select() copies in from user space;
 * its members are forwarded one-to-one to linux_select().
 */
struct l_old_select_argv {
	l_int		nfds;
	l_fd_set	*readfds;
	l_fd_set	*writefds;
	l_fd_set	*exceptfds;
	struct l_timeval	*timeout;
};
94 
95 int
96 linux_to_bsd_sigaltstack(int lsa)
97 {
98 	int bsa = 0;
99 
100 	if (lsa & LINUX_SS_DISABLE)
101 		bsa |= SS_DISABLE;
102 	if (lsa & LINUX_SS_ONSTACK)
103 		bsa |= SS_ONSTACK;
104 	return (bsa);
105 }
106 
107 int
108 bsd_to_linux_sigaltstack(int bsa)
109 {
110 	int lsa = 0;
111 
112 	if (bsa & SS_DISABLE)
113 		lsa |= LINUX_SS_DISABLE;
114 	if (bsa & SS_ONSTACK)
115 		lsa |= LINUX_SS_ONSTACK;
116 	return (lsa);
117 }
118 
119 int
120 linux_execve(struct thread *td, struct linux_execve_args *args)
121 {
122 	int error;
123 	char *newpath;
124 	struct image_args eargs;
125 
126 	LCONVPATHEXIST(td, args->path, &newpath);
127 
128 #ifdef DEBUG
129 	if (ldebug(execve))
130 		printf(ARGS(execve, "%s"), newpath);
131 #endif
132 
133 	error = exec_copyin_args(&eargs, newpath, UIO_SYSSPACE,
134 	    args->argp, args->envp);
135 	free(newpath, M_TEMP);
136 	if (error == 0)
137 		error = kern_execve(td, &eargs, NULL);
138 	if (error == 0)
139 	   	/* linux process can exec fbsd one, dont attempt
140 		 * to create emuldata for such process using
141 		 * linux_proc_init, this leads to a panic on KASSERT
142 		 * because such process has p->p_emuldata == NULL
143 		 */
144 	   	if (td->td_proc->p_sysent == &elf_linux_sysvec)
145    		   	error = linux_proc_init(td, 0, 0);
146 	return (error);
147 }
148 
/*
 * Indirect argument block for the old-style LINUX_MSGRCV call: when the
 * upper 16 bits of the ipc "what" argument are zero, linux_ipc() copies
 * this structure in from args->ptr to obtain the message buffer and type.
 */
struct l_ipc_kludge {
	struct l_msgbuf *msgp;
	l_long msgtyp;
};
153 
154 int
155 linux_ipc(struct thread *td, struct linux_ipc_args *args)
156 {
157 
158 	switch (args->what & 0xFFFF) {
159 	case LINUX_SEMOP: {
160 		struct linux_semop_args a;
161 
162 		a.semid = args->arg1;
163 		a.tsops = args->ptr;
164 		a.nsops = args->arg2;
165 		return (linux_semop(td, &a));
166 	}
167 	case LINUX_SEMGET: {
168 		struct linux_semget_args a;
169 
170 		a.key = args->arg1;
171 		a.nsems = args->arg2;
172 		a.semflg = args->arg3;
173 		return (linux_semget(td, &a));
174 	}
175 	case LINUX_SEMCTL: {
176 		struct linux_semctl_args a;
177 		int error;
178 
179 		a.semid = args->arg1;
180 		a.semnum = args->arg2;
181 		a.cmd = args->arg3;
182 		error = copyin(args->ptr, &a.arg, sizeof(a.arg));
183 		if (error)
184 			return (error);
185 		return (linux_semctl(td, &a));
186 	}
187 	case LINUX_MSGSND: {
188 		struct linux_msgsnd_args a;
189 
190 		a.msqid = args->arg1;
191 		a.msgp = args->ptr;
192 		a.msgsz = args->arg2;
193 		a.msgflg = args->arg3;
194 		return (linux_msgsnd(td, &a));
195 	}
196 	case LINUX_MSGRCV: {
197 		struct linux_msgrcv_args a;
198 
199 		a.msqid = args->arg1;
200 		a.msgsz = args->arg2;
201 		a.msgflg = args->arg3;
202 		if ((args->what >> 16) == 0) {
203 			struct l_ipc_kludge tmp;
204 			int error;
205 
206 			if (args->ptr == NULL)
207 				return (EINVAL);
208 			error = copyin(args->ptr, &tmp, sizeof(tmp));
209 			if (error)
210 				return (error);
211 			a.msgp = tmp.msgp;
212 			a.msgtyp = tmp.msgtyp;
213 		} else {
214 			a.msgp = args->ptr;
215 			a.msgtyp = args->arg5;
216 		}
217 		return (linux_msgrcv(td, &a));
218 	}
219 	case LINUX_MSGGET: {
220 		struct linux_msgget_args a;
221 
222 		a.key = args->arg1;
223 		a.msgflg = args->arg2;
224 		return (linux_msgget(td, &a));
225 	}
226 	case LINUX_MSGCTL: {
227 		struct linux_msgctl_args a;
228 
229 		a.msqid = args->arg1;
230 		a.cmd = args->arg2;
231 		a.buf = args->ptr;
232 		return (linux_msgctl(td, &a));
233 	}
234 	case LINUX_SHMAT: {
235 		struct linux_shmat_args a;
236 
237 		a.shmid = args->arg1;
238 		a.shmaddr = args->ptr;
239 		a.shmflg = args->arg2;
240 		a.raddr = (l_ulong *)args->arg3;
241 		return (linux_shmat(td, &a));
242 	}
243 	case LINUX_SHMDT: {
244 		struct linux_shmdt_args a;
245 
246 		a.shmaddr = args->ptr;
247 		return (linux_shmdt(td, &a));
248 	}
249 	case LINUX_SHMGET: {
250 		struct linux_shmget_args a;
251 
252 		a.key = args->arg1;
253 		a.size = args->arg2;
254 		a.shmflg = args->arg3;
255 		return (linux_shmget(td, &a));
256 	}
257 	case LINUX_SHMCTL: {
258 		struct linux_shmctl_args a;
259 
260 		a.shmid = args->arg1;
261 		a.cmd = args->arg2;
262 		a.buf = args->ptr;
263 		return (linux_shmctl(td, &a));
264 	}
265 	default:
266 		break;
267 	}
268 
269 	return (EINVAL);
270 }
271 
272 int
273 linux_old_select(struct thread *td, struct linux_old_select_args *args)
274 {
275 	struct l_old_select_argv linux_args;
276 	struct linux_select_args newsel;
277 	int error;
278 
279 #ifdef DEBUG
280 	if (ldebug(old_select))
281 		printf(ARGS(old_select, "%p"), args->ptr);
282 #endif
283 
284 	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
285 	if (error)
286 		return (error);
287 
288 	newsel.nfds = linux_args.nfds;
289 	newsel.readfds = linux_args.readfds;
290 	newsel.writefds = linux_args.writefds;
291 	newsel.exceptfds = linux_args.exceptfds;
292 	newsel.timeout = linux_args.timeout;
293 	return (linux_select(td, &newsel));
294 }
295 
296 int
297 linux_fork(struct thread *td, struct linux_fork_args *args)
298 {
299 	int error;
300 
301 #ifdef DEBUG
302 	if (ldebug(fork))
303 		printf(ARGS(fork, ""));
304 #endif
305 
306 	if ((error = fork(td, (struct fork_args *)args)) != 0)
307 		return (error);
308 
309 	if (td->td_retval[1] == 1)
310 		td->td_retval[0] = 0;
311 	error = linux_proc_init(td, td->td_retval[0], 0);
312 	if (error)
313 		return (error);
314 
315 	return (0);
316 }
317 
318 int
319 linux_vfork(struct thread *td, struct linux_vfork_args *args)
320 {
321 	int error;
322 	struct proc *p2;
323 
324 #ifdef DEBUG
325 	if (ldebug(vfork))
326 		printf(ARGS(vfork, ""));
327 #endif
328 
329 	/* exclude RFPPWAIT */
330 	if ((error = fork1(td, RFFDG | RFPROC | RFMEM, 0, &p2)) != 0)
331 		return (error);
332 	if (error == 0) {
333 	   	td->td_retval[0] = p2->p_pid;
334 		td->td_retval[1] = 0;
335 	}
336 	/* Are we the child? */
337 	if (td->td_retval[1] == 1)
338 		td->td_retval[0] = 0;
339 	error = linux_proc_init(td, td->td_retval[0], 0);
340 	if (error)
341 		return (error);
342 	/* wait for the children to exit, ie. emulate vfork */
343 	PROC_LOCK(p2);
344 	while (p2->p_flag & P_PPWAIT)
345 	   	msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0);
346 	PROC_UNLOCK(p2);
347 
348 	return (0);
349 }
350 
351 int
352 linux_clone(struct thread *td, struct linux_clone_args *args)
353 {
354 	int error, ff = RFPROC | RFSTOPPED;
355 	struct proc *p2;
356 	struct thread *td2;
357 	int exit_signal;
358 	struct linux_emuldata *em;
359 
360 #ifdef DEBUG
361 	if (ldebug(clone)) {
362    	   	printf(ARGS(clone, "flags %x, stack %x, parent tid: %x, child tid: %x"),
363 		    (unsigned int)args->flags, (unsigned int)args->stack,
364 		    (unsigned int)args->parent_tidptr, (unsigned int)args->child_tidptr);
365 	}
366 #endif
367 
368 	exit_signal = args->flags & 0x000000ff;
369 	if (!LINUX_SIG_VALID(exit_signal) && exit_signal != 0)
370 		return (EINVAL);
371 
372 	if (exit_signal <= LINUX_SIGTBLSZ)
373 		exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)];
374 
375 	if (args->flags & CLONE_VM)
376 		ff |= RFMEM;
377 	if (args->flags & CLONE_SIGHAND)
378 		ff |= RFSIGSHARE;
379 	/*
380 	 * XXX: in linux sharing of fs info (chroot/cwd/umask)
381 	 * and open files is independant. in fbsd its in one
382 	 * structure but in reality it doesnt make any problems
383 	 * because both this flags are set at once usually.
384 	 */
385 	if (!(args->flags & (CLONE_FILES | CLONE_FS)))
386 		ff |= RFFDG;
387 
388 	/*
389 	 * Attempt to detect when linux_clone(2) is used for creating
390 	 * kernel threads. Unfortunately despite the existence of the
391 	 * CLONE_THREAD flag, version of linuxthreads package used in
392 	 * most popular distros as of beginning of 2005 doesn't make
393 	 * any use of it. Therefore, this detection relay fully on
394 	 * empirical observation that linuxthreads sets certain
395 	 * combination of flags, so that we can make more or less
396 	 * precise detection and notify the FreeBSD kernel that several
397 	 * processes are in fact part of the same threading group, so
398 	 * that special treatment is necessary for signal delivery
399 	 * between those processes and fd locking.
400 	 */
401 	if ((args->flags & 0xffffff00) == THREADING_FLAGS)
402 		ff |= RFTHREAD;
403 
404 	error = fork1(td, ff, 0, &p2);
405 	if (error)
406 		return (error);
407 
408 	/* create the emuldata */
409 	error = linux_proc_init(td, p2->p_pid, args->flags);
410 	/* reference it - no need to check this */
411 	em = em_find(p2, EMUL_UNLOCKED);
412 	KASSERT(em != NULL, ("clone: emuldata not found.\n"));
413 	/* and adjust it */
414 	if (args->flags & CLONE_PARENT_SETTID) {
415 	   	if (args->parent_tidptr == NULL) {
416 		   	EMUL_UNLOCK(&emul_lock);
417 			return (EINVAL);
418 		}
419 		error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid));
420 		if (error) {
421 		   	EMUL_UNLOCK(&emul_lock);
422 			return (error);
423 		}
424 	}
425 
426 	if (args->flags & (CLONE_PARENT|CLONE_THREAD)) {
427 	   	sx_xlock(&proctree_lock);
428 		PROC_LOCK(p2);
429 		proc_reparent(p2, td->td_proc->p_pptr);
430 		PROC_UNLOCK(p2);
431 		sx_xunlock(&proctree_lock);
432 	}
433 
434 	if (args->flags & CLONE_THREAD) {
435 	   	/* XXX: linux mangles pgrp and pptr somehow
436 		 * I think it might be this but I am not sure.
437 		 */
438 #ifdef notyet
439 	   	PROC_LOCK(p2);
440 	   	p2->p_pgrp = td->td_proc->p_pgrp;
441 	   	PROC_UNLOCK(p2);
442 #endif
443 	 	exit_signal = 0;
444 	}
445 
446 	if (args->flags & CLONE_CHILD_SETTID)
447 		em->child_set_tid = args->child_tidptr;
448 	else
449 	   	em->child_set_tid = NULL;
450 
451 	if (args->flags & CLONE_CHILD_CLEARTID)
452 		em->child_clear_tid = args->child_tidptr;
453 	else
454 	   	em->child_clear_tid = NULL;
455 
456 	EMUL_UNLOCK(&emul_lock);
457 
458 	PROC_LOCK(p2);
459 	p2->p_sigparent = exit_signal;
460 	PROC_UNLOCK(p2);
461 	td2 = FIRST_THREAD_IN_PROC(p2);
462 	/*
463 	 * in a case of stack = NULL we are supposed to COW calling process stack
464 	 * this is what normal fork() does so we just keep the tf_esp arg intact
465 	 */
466 	if (args->stack)
467    	   	td2->td_frame->tf_esp = (unsigned int)args->stack;
468 
469 	if (args->flags & CLONE_SETTLS) {
470    	   	struct l_user_desc info;
471    	   	int idx;
472 	   	int a[2];
473 		struct segment_descriptor sd;
474 
475 	   	error = copyin((void *)td->td_frame->tf_esi, &info, sizeof(struct l_user_desc));
476 		if (error)
477    		   	return (error);
478 
479 		idx = info.entry_number;
480 
481 		/*
482 		 * looks like we're getting the idx we returned
483 		 * in the set_thread_area() syscall
484 		 */
485 		if (idx != 6 && idx != 3)
486 			return (EINVAL);
487 
488 		/* this doesnt happen in practice */
489 		if (idx == 6) {
490 		   	/* we might copy out the entry_number as 3 */
491 		   	info.entry_number = 3;
492 			error = copyout(&info, (void *) td->td_frame->tf_esi, sizeof(struct l_user_desc));
493 			if (error)
494 	   		   	return (error);
495 		}
496 
497 		a[0] = LDT_entry_a(&info);
498 		a[1] = LDT_entry_b(&info);
499 
500 		memcpy(&sd, &a, sizeof(a));
501 #ifdef DEBUG
502 	if (ldebug(clone))
503 	   	printf("Segment created in clone with CLONE_SETTLS: lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, type: %i, dpl: %i, p: %i, xx: %i, def32: %i, gran: %i\n", sd.sd_lobase,
504 			sd.sd_hibase,
505 			sd.sd_lolimit,
506 			sd.sd_hilimit,
507 			sd.sd_type,
508 			sd.sd_dpl,
509 			sd.sd_p,
510 			sd.sd_xx,
511 			sd.sd_def32,
512 			sd.sd_gran);
513 #endif
514 
515 		/* set %gs */
516 		td2->td_pcb->pcb_gsd = sd;
517 		td2->td_pcb->pcb_gs = GSEL(GUGS_SEL, SEL_UPL);
518 	}
519 
520 #ifdef DEBUG
521 	if (ldebug(clone))
522 		printf(LMSG("clone: successful rfork to %ld, stack %p sig = %d"),
523 		    (long)p2->p_pid, args->stack, exit_signal);
524 #endif
525 
526 	/*
527 	 * Make this runnable after we are finished with it.
528 	 */
529 	mtx_lock_spin(&sched_lock);
530 	TD_SET_CAN_RUN(td2);
531 	setrunqueue(td2, SRQ_BORING);
532 	mtx_unlock_spin(&sched_lock);
533 
534 	td->td_retval[0] = p2->p_pid;
535 	td->td_retval[1] = 0;
536 
537 	if (args->flags & CLONE_VFORK) {
538    	   	/* wait for the children to exit, ie. emulate vfork */
539    	   	PROC_LOCK(p2);
540 		p2->p_flag |= P_PPWAIT;
541 		while (p2->p_flag & P_PPWAIT)
542    		   	msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0);
543 		PROC_UNLOCK(p2);
544 	}
545 
546 	return (0);
547 }
548 
549 /* XXX move */
/*
 * Linux mmap arguments in packed form.  linux_mmap() copies this block
 * in from user space; linux_mmap2() fills it from its own arguments.
 * Both then pass it to linux_mmap_common().
 */
struct l_mmap_argv {
	l_caddr_t	addr;
	l_int		len;
	l_int		prot;
	l_int		flags;
	l_int		fd;
	l_int		pos;	/* linux_mmap2() stores pgoff * PAGE_SIZE here */
};
558 
559 #define STACK_SIZE  (2 * 1024 * 1024)
560 #define GUARD_SIZE  (4 * PAGE_SIZE)
561 
562 static int linux_mmap_common(struct thread *, struct l_mmap_argv *);
563 
564 int
565 linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
566 {
567 	struct l_mmap_argv linux_args;
568 
569 #ifdef DEBUG
570 	if (ldebug(mmap2))
571 		printf(ARGS(mmap2, "%p, %d, %d, 0x%08x, %d, %d"),
572 		    (void *)args->addr, args->len, args->prot,
573 		    args->flags, args->fd, args->pgoff);
574 #endif
575 
576 	linux_args.addr = (l_caddr_t)args->addr;
577 	linux_args.len = args->len;
578 	linux_args.prot = args->prot;
579 	linux_args.flags = args->flags;
580 	linux_args.fd = args->fd;
581 	linux_args.pos = args->pgoff * PAGE_SIZE;
582 
583 	return (linux_mmap_common(td, &linux_args));
584 }
585 
586 int
587 linux_mmap(struct thread *td, struct linux_mmap_args *args)
588 {
589 	int error;
590 	struct l_mmap_argv linux_args;
591 
592 	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
593 	if (error)
594 		return (error);
595 
596 #ifdef DEBUG
597 	if (ldebug(mmap))
598 		printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"),
599 		    (void *)linux_args.addr, linux_args.len, linux_args.prot,
600 		    linux_args.flags, linux_args.fd, linux_args.pos);
601 #endif
602 
603 	return (linux_mmap_common(td, &linux_args));
604 }
605 
606 static int
607 linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args)
608 {
609 	struct proc *p = td->td_proc;
610 	struct mmap_args /* {
611 		caddr_t addr;
612 		size_t len;
613 		int prot;
614 		int flags;
615 		int fd;
616 		long pad;
617 		off_t pos;
618 	} */ bsd_args;
619 	int error;
620 	struct file *fp;
621 
622 	error = 0;
623 	bsd_args.flags = 0;
624 	fp = NULL;
625 
626 	/*
627 	 * Linux mmap(2):
628 	 * You must specify exactly one of MAP_SHARED and MAP_PRIVATE
629 	 */
630 	if (! ((linux_args->flags & LINUX_MAP_SHARED) ^
631 	    (linux_args->flags & LINUX_MAP_PRIVATE)))
632 		return (EINVAL);
633 
634 	if (linux_args->flags & LINUX_MAP_SHARED)
635 		bsd_args.flags |= MAP_SHARED;
636 	if (linux_args->flags & LINUX_MAP_PRIVATE)
637 		bsd_args.flags |= MAP_PRIVATE;
638 	if (linux_args->flags & LINUX_MAP_FIXED)
639 		bsd_args.flags |= MAP_FIXED;
640 	if (linux_args->flags & LINUX_MAP_ANON)
641 		bsd_args.flags |= MAP_ANON;
642 	else
643 		bsd_args.flags |= MAP_NOSYNC;
644 	if (linux_args->flags & LINUX_MAP_GROWSDOWN) {
645 		bsd_args.flags |= MAP_STACK;
646 
647 		/*
648 		 * The linux MAP_GROWSDOWN option does not limit auto
649 		 * growth of the region.  Linux mmap with this option
650 		 * takes as addr the inital BOS, and as len, the initial
651 		 * region size.  It can then grow down from addr without
652 		 * limit.  However, linux threads has an implicit internal
653 		 * limit to stack size of STACK_SIZE.  Its just not
654 		 * enforced explicitly in linux.  But, here we impose
655 		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
656 		 * region, since we can do this with our mmap.
657 		 *
658 		 * Our mmap with MAP_STACK takes addr as the maximum
659 		 * downsize limit on BOS, and as len the max size of
660 		 * the region.  It them maps the top SGROWSIZ bytes,
661 		 * and autgrows the region down, up to the limit
662 		 * in addr.
663 		 *
664 		 * If we don't use the MAP_STACK option, the effect
665 		 * of this code is to allocate a stack region of a
666 		 * fixed size of (STACK_SIZE - GUARD_SIZE).
667 		 */
668 
669 		/* This gives us TOS */
670 		bsd_args.addr = linux_args->addr + linux_args->len;
671 
672 		if (bsd_args.addr > p->p_vmspace->vm_maxsaddr) {
673 			/*
674 			 * Some linux apps will attempt to mmap
675 			 * thread stacks near the top of their
676 			 * address space.  If their TOS is greater
677 			 * than vm_maxsaddr, vm_map_growstack()
678 			 * will confuse the thread stack with the
679 			 * process stack and deliver a SEGV if they
680 			 * attempt to grow the thread stack past their
681 			 * current stacksize rlimit.  To avoid this,
682 			 * adjust vm_maxsaddr upwards to reflect
683 			 * the current stacksize rlimit rather
684 			 * than the maximum possible stacksize.
685 			 * It would be better to adjust the
686 			 * mmap'ed region, but some apps do not check
687 			 * mmap's return value.
688 			 */
689 			PROC_LOCK(p);
690 			p->p_vmspace->vm_maxsaddr = (char *)USRSTACK -
691 			    lim_cur(p, RLIMIT_STACK);
692 			PROC_UNLOCK(p);
693 		}
694 
695 		/* This gives us our maximum stack size */
696 		if (linux_args->len > STACK_SIZE - GUARD_SIZE)
697 			bsd_args.len = linux_args->len;
698 		else
699 			bsd_args.len  = STACK_SIZE - GUARD_SIZE;
700 
701 		/*
702 		 * This gives us a new BOS.  If we're using VM_STACK, then
703 		 * mmap will just map the top SGROWSIZ bytes, and let
704 		 * the stack grow down to the limit at BOS.  If we're
705 		 * not using VM_STACK we map the full stack, since we
706 		 * don't have a way to autogrow it.
707 		 */
708 		bsd_args.addr -= bsd_args.len;
709 	} else {
710 		bsd_args.addr = linux_args->addr;
711 		bsd_args.len  = linux_args->len;
712 	}
713 
714 	bsd_args.prot = linux_args->prot;
715 	if (linux_args->flags & LINUX_MAP_ANON)
716 		bsd_args.fd = -1;
717 	else {
718 		/*
719 		 * Linux follows Solaris mmap(2) description:
720 		 * The file descriptor fildes is opened with
721 		 * read permission, regardless of the
722 		 * protection options specified.
723 		 * If PROT_WRITE is specified, the application
724 		 * must have opened the file descriptor
725 		 * fildes with write permission unless
726 		 * MAP_PRIVATE is specified in the flag
727 		 * argument as described below.
728 		 */
729 
730 		if ((error = fget(td, linux_args->fd, &fp)) != 0)
731 			return (error);
732 		if (fp->f_type != DTYPE_VNODE) {
733 			fdrop(fp, td);
734 			return (EINVAL);
735 		}
736 
737 		/* Linux mmap() just fails for O_WRONLY files */
738 		if (! (fp->f_flag & FREAD)) {
739 			fdrop(fp, td);
740 			return (EACCES);
741 		}
742 
743 		bsd_args.fd = linux_args->fd;
744 		fdrop(fp, td);
745 	}
746 	bsd_args.pos = linux_args->pos;
747 	bsd_args.pad = 0;
748 
749 #ifdef DEBUG
750 	if (ldebug(mmap))
751 		printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
752 		    __func__,
753 		    (void *)bsd_args.addr, bsd_args.len, bsd_args.prot,
754 		    bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
755 #endif
756 	error = mmap(td, &bsd_args);
757 #ifdef DEBUG
758 	if (ldebug(mmap))
759 		printf("-> %s() return: 0x%x (0x%08x)\n",
760 			__func__, error, (u_int)td->td_retval[0]);
761 #endif
762 	return (error);
763 }
764 
765 int
766 linux_pipe(struct thread *td, struct linux_pipe_args *args)
767 {
768 	int error;
769 	int reg_edx;
770 
771 #ifdef DEBUG
772 	if (ldebug(pipe))
773 		printf(ARGS(pipe, "*"));
774 #endif
775 
776 	reg_edx = td->td_retval[1];
777 	error = pipe(td, 0);
778 	if (error) {
779 		td->td_retval[1] = reg_edx;
780 		return (error);
781 	}
782 
783 	error = copyout(td->td_retval, args->pipefds, 2*sizeof(int));
784 	if (error) {
785 		td->td_retval[1] = reg_edx;
786 		return (error);
787 	}
788 
789 	td->td_retval[1] = reg_edx;
790 	td->td_retval[0] = 0;
791 	return (0);
792 }
793 
794 int
795 linux_ioperm(struct thread *td, struct linux_ioperm_args *args)
796 {
797 	int error;
798 	struct i386_ioperm_args iia;
799 
800 	iia.start = args->start;
801 	iia.length = args->length;
802 	iia.enable = args->enable;
803 	mtx_lock(&Giant);
804 	error = i386_set_ioperm(td, &iia);
805 	mtx_unlock(&Giant);
806 	return (error);
807 }
808 
809 int
810 linux_iopl(struct thread *td, struct linux_iopl_args *args)
811 {
812 	int error;
813 
814 	if (args->level < 0 || args->level > 3)
815 		return (EINVAL);
816 	if ((error = priv_check(td, PRIV_IO)) != 0)
817 		return (error);
818 	if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
819 		return (error);
820 	td->td_frame->tf_eflags = (td->td_frame->tf_eflags & ~PSL_IOPL) |
821 	    (args->level * (PSL_IOPL / 3));
822 	return (0);
823 }
824 
825 int
826 linux_modify_ldt(struct thread *td, struct linux_modify_ldt_args *uap)
827 {
828 	int error;
829 	struct i386_ldt_args ldt;
830 	struct l_descriptor ld;
831 	union descriptor desc;
832 
833 	if (uap->ptr == NULL)
834 		return (EINVAL);
835 
836 	switch (uap->func) {
837 	case 0x00: /* read_ldt */
838 		ldt.start = 0;
839 		ldt.descs = uap->ptr;
840 		ldt.num = uap->bytecount / sizeof(union descriptor);
841 		mtx_lock(&Giant);
842 		error = i386_get_ldt(td, &ldt);
843 		td->td_retval[0] *= sizeof(union descriptor);
844 		mtx_unlock(&Giant);
845 		break;
846 	case 0x01: /* write_ldt */
847 	case 0x11: /* write_ldt */
848 		if (uap->bytecount != sizeof(ld))
849 			return (EINVAL);
850 
851 		error = copyin(uap->ptr, &ld, sizeof(ld));
852 		if (error)
853 			return (error);
854 
855 		ldt.start = ld.entry_number;
856 		ldt.descs = &desc;
857 		ldt.num = 1;
858 		desc.sd.sd_lolimit = (ld.limit & 0x0000ffff);
859 		desc.sd.sd_hilimit = (ld.limit & 0x000f0000) >> 16;
860 		desc.sd.sd_lobase = (ld.base_addr & 0x00ffffff);
861 		desc.sd.sd_hibase = (ld.base_addr & 0xff000000) >> 24;
862 		desc.sd.sd_type = SDT_MEMRO | ((ld.read_exec_only ^ 1) << 1) |
863 			(ld.contents << 2);
864 		desc.sd.sd_dpl = 3;
865 		desc.sd.sd_p = (ld.seg_not_present ^ 1);
866 		desc.sd.sd_xx = 0;
867 		desc.sd.sd_def32 = ld.seg_32bit;
868 		desc.sd.sd_gran = ld.limit_in_pages;
869 		mtx_lock(&Giant);
870 		error = i386_set_ldt(td, &ldt, &desc);
871 		mtx_unlock(&Giant);
872 		break;
873 	default:
874 		error = EINVAL;
875 		break;
876 	}
877 
878 	if (error == EOPNOTSUPP) {
879 		printf("linux: modify_ldt needs kernel option USER_LDT\n");
880 		error = ENOSYS;
881 	}
882 
883 	return (error);
884 }
885 
886 int
887 linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
888 {
889 	l_osigaction_t osa;
890 	l_sigaction_t act, oact;
891 	int error;
892 
893 #ifdef DEBUG
894 	if (ldebug(sigaction))
895 		printf(ARGS(sigaction, "%d, %p, %p"),
896 		    args->sig, (void *)args->nsa, (void *)args->osa);
897 #endif
898 
899 	if (args->nsa != NULL) {
900 		error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
901 		if (error)
902 			return (error);
903 		act.lsa_handler = osa.lsa_handler;
904 		act.lsa_flags = osa.lsa_flags;
905 		act.lsa_restorer = osa.lsa_restorer;
906 		LINUX_SIGEMPTYSET(act.lsa_mask);
907 		act.lsa_mask.__bits[0] = osa.lsa_mask;
908 	}
909 
910 	error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
911 	    args->osa ? &oact : NULL);
912 
913 	if (args->osa != NULL && !error) {
914 		osa.lsa_handler = oact.lsa_handler;
915 		osa.lsa_flags = oact.lsa_flags;
916 		osa.lsa_restorer = oact.lsa_restorer;
917 		osa.lsa_mask = oact.lsa_mask.__bits[0];
918 		error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
919 	}
920 
921 	return (error);
922 }
923 
924 /*
925  * Linux has two extra args, restart and oldmask.  We dont use these,
926  * but it seems that "restart" is actually a context pointer that
927  * enables the signal to happen with a different register set.
928  */
929 int
930 linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
931 {
932 	sigset_t sigmask;
933 	l_sigset_t mask;
934 
935 #ifdef DEBUG
936 	if (ldebug(sigsuspend))
937 		printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
938 #endif
939 
940 	LINUX_SIGEMPTYSET(mask);
941 	mask.__bits[0] = args->mask;
942 	linux_to_bsd_sigset(&mask, &sigmask);
943 	return (kern_sigsuspend(td, sigmask));
944 }
945 
946 int
947 linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
948 {
949 	l_sigset_t lmask;
950 	sigset_t sigmask;
951 	int error;
952 
953 #ifdef DEBUG
954 	if (ldebug(rt_sigsuspend))
955 		printf(ARGS(rt_sigsuspend, "%p, %d"),
956 		    (void *)uap->newset, uap->sigsetsize);
957 #endif
958 
959 	if (uap->sigsetsize != sizeof(l_sigset_t))
960 		return (EINVAL);
961 
962 	error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
963 	if (error)
964 		return (error);
965 
966 	linux_to_bsd_sigset(&lmask, &sigmask);
967 	return (kern_sigsuspend(td, sigmask));
968 }
969 
970 int
971 linux_pause(struct thread *td, struct linux_pause_args *args)
972 {
973 	struct proc *p = td->td_proc;
974 	sigset_t sigmask;
975 
976 #ifdef DEBUG
977 	if (ldebug(pause))
978 		printf(ARGS(pause, ""));
979 #endif
980 
981 	PROC_LOCK(p);
982 	sigmask = td->td_sigmask;
983 	PROC_UNLOCK(p);
984 	return (kern_sigsuspend(td, sigmask));
985 }
986 
987 int
988 linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
989 {
990 	stack_t ss, oss;
991 	l_stack_t lss;
992 	int error;
993 
994 #ifdef DEBUG
995 	if (ldebug(sigaltstack))
996 		printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
997 #endif
998 
999 	if (uap->uss != NULL) {
1000 		error = copyin(uap->uss, &lss, sizeof(l_stack_t));
1001 		if (error)
1002 			return (error);
1003 
1004 		ss.ss_sp = lss.ss_sp;
1005 		ss.ss_size = lss.ss_size;
1006 		ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
1007 	}
1008 	error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
1009 	    (uap->uoss != NULL) ? &oss : NULL);
1010 	if (!error && uap->uoss != NULL) {
1011 		lss.ss_sp = oss.ss_sp;
1012 		lss.ss_size = oss.ss_size;
1013 		lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
1014 		error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
1015 	}
1016 
1017 	return (error);
1018 }
1019 
1020 int
1021 linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
1022 {
1023 	struct ftruncate_args sa;
1024 
1025 #ifdef DEBUG
1026 	if (ldebug(ftruncate64))
1027 		printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
1028 		    (intmax_t)args->length);
1029 #endif
1030 
1031 	sa.fd = args->fd;
1032 	sa.pad = 0;
1033 	sa.length = args->length;
1034 	return ftruncate(td, &sa);
1035 }
1036 
1037 int
1038 linux_set_thread_area(struct thread *td, struct linux_set_thread_area_args *args)
1039 {
1040 	struct l_user_desc info;
1041 	int error;
1042 	int idx;
1043 	int a[2];
1044 	struct segment_descriptor sd;
1045 
1046 	error = copyin(args->desc, &info, sizeof(struct l_user_desc));
1047 	if (error)
1048 		return (error);
1049 
1050 #ifdef DEBUG
1051 	if (ldebug(set_thread_area))
1052 	   	printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, %i, %i, %i\n"),
1053 		      info.entry_number,
1054       		      info.base_addr,
1055       		      info.limit,
1056       		      info.seg_32bit,
1057 		      info.contents,
1058       		      info.read_exec_only,
1059       		      info.limit_in_pages,
1060       		      info.seg_not_present,
1061       		      info.useable);
1062 #endif
1063 
1064 	idx = info.entry_number;
1065 	/*
1066 	 * Semantics of linux version: every thread in the system has array
1067 	 * of 3 tls descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown. This
1068 	 * syscall loads one of the selected tls decriptors with a value
1069 	 * and also loads GDT descriptors 6, 7 and 8 with the content of the per-thread
1070 	 * descriptors.
1071 	 *
1072 	 * Semantics of fbsd version: I think we can ignore that linux has 3 per-thread
1073 	 * descriptors and use just the 1st one. The tls_array[] is used only in
1074 	 * set/get-thread_area() syscalls and for loading the GDT descriptors. In fbsd
1075 	 * we use just one GDT descriptor for TLS so we will load just one.
1076 	 * XXX: this doesnt work when user-space process tries to use more then 1 TLS segment
1077 	 * comment in the linux sources says wine might do that.
1078 	 */
1079 
1080 	/*
1081 	 * we support just GLIBC TLS now
1082 	 * we should let 3 proceed as well because we use this segment so
1083 	 * if code does two subsequent calls it should succeed
1084 	 */
1085 	if (idx != 6 && idx != -1 && idx != 3)
1086 		return (EINVAL);
1087 
1088 	/*
1089 	 * we have to copy out the GDT entry we use
1090 	 * FreeBSD uses GDT entry #3 for storing %gs so load that
1091 	 * XXX: what if userspace program doesnt check this value and tries
1092 	 * to use 6, 7 or 8?
1093 	 */
1094 	idx = info.entry_number = 3;
1095 	error = copyout(&info, args->desc, sizeof(struct l_user_desc));
1096 	if (error)
1097 		return (error);
1098 
1099 	if (LDT_empty(&info)) {
1100 		a[0] = 0;
1101 		a[1] = 0;
1102 	} else {
1103 		a[0] = LDT_entry_a(&info);
1104 		a[1] = LDT_entry_b(&info);
1105 	}
1106 
1107 	memcpy(&sd, &a, sizeof(a));
1108 #ifdef DEBUG
1109 	if (ldebug(set_thread_area))
1110 	   	printf("Segment created in set_thread_area: lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, type: %i, dpl: %i, p: %i, xx: %i, def32: %i, gran: %i\n", sd.sd_lobase,
1111 			sd.sd_hibase,
1112 			sd.sd_lolimit,
1113 			sd.sd_hilimit,
1114 			sd.sd_type,
1115 			sd.sd_dpl,
1116 			sd.sd_p,
1117 			sd.sd_xx,
1118 			sd.sd_def32,
1119 			sd.sd_gran);
1120 #endif
1121 
1122 	/* this is taken from i386 version of cpu_set_user_tls() */
1123 	critical_enter();
1124 	/* set %gs */
1125 	td->td_pcb->pcb_gsd = sd;
1126 	PCPU_GET(fsgs_gdt)[1] = sd;
1127 	load_gs(GSEL(GUGS_SEL, SEL_UPL));
1128 	critical_exit();
1129 
1130 	return (0);
1131 }
1132 
1133 int
1134 linux_get_thread_area(struct thread *td, struct linux_get_thread_area_args *args)
1135 {
1136 
1137 	struct l_user_desc info;
1138 	int error;
1139 	int idx;
1140 	struct l_desc_struct desc;
1141 	struct segment_descriptor sd;
1142 
1143 #ifdef DEBUG
1144 	if (ldebug(get_thread_area))
1145 		printf(ARGS(get_thread_area, "%p"), args->desc);
1146 #endif
1147 
1148 	error = copyin(args->desc, &info, sizeof(struct l_user_desc));
1149 	if (error)
1150 		return (error);
1151 
1152 	idx = info.entry_number;
1153 	/* XXX: I am not sure if we want 3 to be allowed too. */
1154 	if (idx != 6 && idx != 3)
1155 		return (EINVAL);
1156 
1157 	idx = 3;
1158 
1159 	memset(&info, 0, sizeof(info));
1160 
1161 	sd = PCPU_GET(fsgs_gdt)[1];
1162 
1163 	memcpy(&desc, &sd, sizeof(desc));
1164 
1165 	info.entry_number = idx;
1166 	info.base_addr = GET_BASE(&desc);
1167 	info.limit = GET_LIMIT(&desc);
1168 	info.seg_32bit = GET_32BIT(&desc);
1169 	info.contents = GET_CONTENTS(&desc);
1170 	info.read_exec_only = !GET_WRITABLE(&desc);
1171 	info.limit_in_pages = GET_LIMIT_PAGES(&desc);
1172 	info.seg_not_present = !GET_PRESENT(&desc);
1173 	info.useable = GET_USEABLE(&desc);
1174 
1175 	error = copyout(&info, args->desc, sizeof(struct l_user_desc));
1176 	if (error)
1177 	   	return (EFAULT);
1178 
1179 	return (0);
1180 }
1181 
/*
 * Copied from kern/kern_time.c.
 *
 * Linux timer_create(2): forwards the call unchanged to ktimer_create(),
 * treating the Linux argument structure as the native one.
 * NOTE(review): relies on the two argument layouts being compatible --
 * confirm against the syscall definitions.
 */
int
linux_timer_create(struct thread *td, struct linux_timer_create_args *args)
{
	struct ktimer_create_args *native;

	native = (struct ktimer_create_args *)args;
	return (ktimer_create(td, native));
}
1188 
/*
 * Linux timer_settime(2): forwards the call unchanged to ktimer_settime(),
 * treating the Linux argument structure as the native one.
 * NOTE(review): relies on the two argument layouts being compatible --
 * confirm against the syscall definitions.
 */
int
linux_timer_settime(struct thread *td, struct linux_timer_settime_args *args)
{
	struct ktimer_settime_args *native;

	native = (struct ktimer_settime_args *)args;
	return (ktimer_settime(td, native));
}
1194 
/*
 * Linux timer_gettime(2): forwards the call unchanged to ktimer_gettime(),
 * treating the Linux argument structure as the native one.
 * NOTE(review): relies on the two argument layouts being compatible --
 * confirm against the syscall definitions.
 */
int
linux_timer_gettime(struct thread *td, struct linux_timer_gettime_args *args)
{
	struct ktimer_gettime_args *native;

	native = (struct ktimer_gettime_args *)args;
	return (ktimer_gettime(td, native));
}
1200 
/*
 * Linux timer_getoverrun(2): forwards the call unchanged to
 * ktimer_getoverrun(), treating the Linux argument structure as the
 * native one.
 * NOTE(review): relies on the two argument layouts being compatible --
 * confirm against the syscall definitions.
 */
int
linux_timer_getoverrun(struct thread *td, struct linux_timer_getoverrun_args *args)
{
	struct ktimer_getoverrun_args *native;

	native = (struct ktimer_getoverrun_args *)args;
	return (ktimer_getoverrun(td, native));
}
1206 
/*
 * Linux timer_delete(2): forwards the call unchanged to ktimer_delete(),
 * treating the Linux argument structure as the native one.
 * NOTE(review): relies on the two argument layouts being compatible --
 * confirm against the syscall definitions.
 */
int
linux_timer_delete(struct thread *td, struct linux_timer_delete_args *args)
{
	struct ktimer_delete_args *native;

	native = (struct ktimer_delete_args *)args;
	return (ktimer_delete(td, native));
}
1212 
/*
 * XXX: this won't work with module - convert it
 *
 * Linux mq_open(2): when the kernel is built with P1003_1B_MQUEUE the call
 * is forwarded to kmq_open() with the argument structure reinterpreted as
 * the native one; otherwise the syscall reports ENOSYS.
 */
int
linux_mq_open(struct thread *td, struct linux_mq_open_args *args)
{
#ifdef P1003_1B_MQUEUE
	struct kmq_open_args *native;

	native = (struct kmq_open_args *)args;
	return (kmq_open(td, native));
#else
	return (ENOSYS);
#endif
}
1223 
/*
 * Linux mq_unlink(2): when the kernel is built with P1003_1B_MQUEUE the
 * call is forwarded to kmq_unlink() with the argument structure
 * reinterpreted as the native one; otherwise the syscall reports ENOSYS.
 */
int
linux_mq_unlink(struct thread *td, struct linux_mq_unlink_args *args)
{
#ifdef P1003_1B_MQUEUE
	struct kmq_unlink_args *native;

	native = (struct kmq_unlink_args *)args;
	return (kmq_unlink(td, native));
#else
	return (ENOSYS);
#endif
}
1233 
/*
 * Linux mq_timedsend(2): when the kernel is built with P1003_1B_MQUEUE the
 * call is forwarded to kmq_timedsend() with the argument structure
 * reinterpreted as the native one; otherwise the syscall reports ENOSYS.
 */
int
linux_mq_timedsend(struct thread *td, struct linux_mq_timedsend_args *args)
{
#ifdef P1003_1B_MQUEUE
	struct kmq_timedsend_args *native;

	native = (struct kmq_timedsend_args *)args;
	return (kmq_timedsend(td, native));
#else
	return (ENOSYS);
#endif
}
1243 
/*
 * Linux mq_timedreceive(2): when the kernel is built with P1003_1B_MQUEUE
 * the call is forwarded to kmq_timedreceive() with the argument structure
 * reinterpreted as the native one; otherwise the syscall reports ENOSYS.
 */
int
linux_mq_timedreceive(struct thread *td, struct linux_mq_timedreceive_args *args)
{
#ifdef P1003_1B_MQUEUE
	struct kmq_timedreceive_args *native;

	native = (struct kmq_timedreceive_args *)args;
	return (kmq_timedreceive(td, native));
#else
	return (ENOSYS);
#endif
}
1253 
/*
 * Linux mq_notify(2): when the kernel is built with P1003_1B_MQUEUE the
 * call is forwarded to kmq_notify() with the argument structure
 * reinterpreted as the native one; otherwise the syscall reports ENOSYS.
 */
int
linux_mq_notify(struct thread *td, struct linux_mq_notify_args *args)
{
#ifdef P1003_1B_MQUEUE
	struct kmq_notify_args *native;

	native = (struct kmq_notify_args *)args;
	return (kmq_notify(td, native));
#else
	return (ENOSYS);
#endif
}
1263 
/*
 * Linux mq_getsetattr(2): when the kernel is built with P1003_1B_MQUEUE the
 * call is forwarded to kmq_setattr() with the argument structure
 * reinterpreted as the native one; otherwise the syscall reports ENOSYS.
 * NOTE(review): presumably kmq_setattr() covers both the "get" and the
 * "set" half of the Linux call -- confirm against its implementation.
 */
int
linux_mq_getsetattr(struct thread *td, struct linux_mq_getsetattr_args *args)
{
#ifdef P1003_1B_MQUEUE
	struct kmq_setattr_args *native;

	native = (struct kmq_setattr_args *)args;
	return (kmq_setattr(td, native));
#else
	return (ENOSYS);
#endif
}
1273 
1274