xref: /freebsd/sys/compat/linux/linux_misc.c (revision 2da199da53835ee2d9228a60717fd2d0fccf9e50)
1 /*-
 * Copyright (c) 1994-1995 Søren Schmidt
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer
10  *    in this position and unchanged.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. The name of the author may not be used to endorse or promote products
15  *    derived from this software withough specific prior written permission
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  *  $Id: linux_misc.c,v 1.51 1999/01/06 23:05:38 julian Exp $
29  */
30 
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/sysproto.h>
34 #include <sys/kernel.h>
35 #include <sys/mman.h>
36 #include <sys/proc.h>
37 #include <sys/fcntl.h>
38 #include <sys/imgact_aout.h>
39 #include <sys/mount.h>
40 #include <sys/namei.h>
41 #include <sys/resourcevar.h>
42 #include <sys/stat.h>
43 #include <sys/sysctl.h>
44 #include <sys/unistd.h>
45 #include <sys/vnode.h>
46 #include <sys/wait.h>
47 #include <sys/time.h>
48 
49 #include <vm/vm.h>
50 #include <vm/pmap.h>
51 #include <vm/vm_kern.h>
52 #include <vm/vm_prot.h>
53 #include <vm/vm_map.h>
54 #include <vm/vm_extern.h>
55 
56 #include <machine/frame.h>
57 #include <machine/psl.h>
58 
59 #include <i386/linux/linux.h>
60 #include <i386/linux/linux_proto.h>
61 #include <i386/linux/linux_util.h>
62 
63 int
64 linux_alarm(struct proc *p, struct linux_alarm_args *args)
65 {
66     struct itimerval it, old_it;
67     struct timeval tv;
68     int s;
69 
70 #ifdef DEBUG
71     printf("Linux-emul(%ld): alarm(%u)\n", (long)p->p_pid, args->secs);
72 #endif
73     if (args->secs > 100000000)
74 	return EINVAL;
75     it.it_value.tv_sec = (long)args->secs;
76     it.it_value.tv_usec = 0;
77     it.it_interval.tv_sec = 0;
78     it.it_interval.tv_usec = 0;
79     s = splsoftclock();
80     old_it = p->p_realtimer;
81     getmicrouptime(&tv);
82     if (timevalisset(&old_it.it_value))
83 	untimeout(realitexpire, (caddr_t)p, p->p_ithandle);
84     if (it.it_value.tv_sec != 0) {
85 	p->p_ithandle = timeout(realitexpire, (caddr_t)p, tvtohz(&it.it_value));
86 	timevaladd(&it.it_value, &tv);
87     }
88     p->p_realtimer = it;
89     splx(s);
90     if (timevalcmp(&old_it.it_value, &tv, >)) {
91 	timevalsub(&old_it.it_value, &tv);
92 	if (old_it.it_value.tv_usec != 0)
93 	    old_it.it_value.tv_sec++;
94 	p->p_retval[0] = old_it.it_value.tv_sec;
95     }
96     return 0;
97 }
98 
99 int
100 linux_brk(struct proc *p, struct linux_brk_args *args)
101 {
102 #if 0
103     struct vmspace *vm = p->p_vmspace;
104     vm_offset_t new, old;
105     int error;
106 
107     if ((vm_offset_t)args->dsend < (vm_offset_t)vm->vm_daddr)
108 	return EINVAL;
109     if (((caddr_t)args->dsend - (caddr_t)vm->vm_daddr)
110 	> p->p_rlimit[RLIMIT_DATA].rlim_cur)
111 	return ENOMEM;
112 
113     old = round_page((vm_offset_t)vm->vm_daddr) + ctob(vm->vm_dsize);
114     new = round_page((vm_offset_t)args->dsend);
115     p->p_retval[0] = old;
116     if ((new-old) > 0) {
117 	if (swap_pager_full)
118 	    return ENOMEM;
119 	error = vm_map_find(&vm->vm_map, NULL, 0, &old, (new-old), FALSE,
120 			VM_PROT_ALL, VM_PROT_ALL, 0);
121 	if (error)
122 	    return error;
123 	vm->vm_dsize += btoc((new-old));
124 	p->p_retval[0] = (int)(vm->vm_daddr + ctob(vm->vm_dsize));
125     }
126     return 0;
127 #else
128     struct vmspace *vm = p->p_vmspace;
129     vm_offset_t new, old;
130     struct obreak_args /* {
131 	char * nsize;
132     } */ tmp;
133 
134 #ifdef DEBUG
135     printf("Linux-emul(%ld): brk(%p)\n", (long)p->p_pid, (void *)args->dsend);
136 #endif
137     old = (vm_offset_t)vm->vm_daddr + ctob(vm->vm_dsize);
138     new = (vm_offset_t)args->dsend;
139     tmp.nsize = (char *) new;
140     if (((caddr_t)new > vm->vm_daddr) && !obreak(p, &tmp))
141 	p->p_retval[0] = (int)new;
142     else
143 	p->p_retval[0] = (int)old;
144 
145     return 0;
146 #endif
147 }
148 
149 int
150 linux_uselib(struct proc *p, struct linux_uselib_args *args)
151 {
152     struct nameidata ni;
153     struct vnode *vp;
154     struct exec *a_out;
155     struct vattr attr;
156     vm_offset_t vmaddr;
157     unsigned long file_offset;
158     vm_offset_t buffer;
159     unsigned long bss_size;
160     int error;
161     caddr_t sg;
162     int locked;
163 
164     sg = stackgap_init();
165     CHECKALTEXIST(p, &sg, args->library);
166 
167 #ifdef DEBUG
168     printf("Linux-emul(%d): uselib(%s)\n", p->p_pid, args->library);
169 #endif
170 
171     a_out = NULL;
172     locked = 0;
173     vp = NULL;
174 
175     NDINIT(&ni, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, args->library, p);
176     if (error = namei(&ni))
177 	goto cleanup;
178 
179     vp = ni.ni_vp;
180     if (vp == NULL) {
181 	error = ENOEXEC;	/* ?? */
182 	goto cleanup;
183     }
184 
185     /*
186      * From here on down, we have a locked vnode that must be unlocked.
187      */
188     locked++;
189 
190     /*
191      * Writable?
192      */
193     if (vp->v_writecount) {
194 	error = ETXTBSY;
195 	goto cleanup;
196     }
197 
198     /*
199      * Executable?
200      */
201     if (error = VOP_GETATTR(vp, &attr, p->p_ucred, p))
202 	goto cleanup;
203 
204     if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
205 	((attr.va_mode & 0111) == 0) ||
206 	(attr.va_type != VREG)) {
207 	    error = ENOEXEC;
208 	    goto cleanup;
209     }
210 
211     /*
212      * Sensible size?
213      */
214     if (attr.va_size == 0) {
215 	error = ENOEXEC;
216 	goto cleanup;
217     }
218 
219     /*
220      * Can we access it?
221      */
222     if (error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p))
223 	goto cleanup;
224 
225     if (error = VOP_OPEN(vp, FREAD, p->p_ucred, p))
226 	goto cleanup;
227 
228     /*
229      * Lock no longer needed
230      */
231     VOP_UNLOCK(vp, 0, p);
232     locked = 0;
233 
234     /*
235      * Pull in executable header into kernel_map
236      */
237     error = vm_mmap(kernel_map, (vm_offset_t *)&a_out, PAGE_SIZE,
238 	    	    VM_PROT_READ, VM_PROT_READ, 0, (caddr_t)vp, 0);
239     if (error)
240 	goto cleanup;
241 
242     /*
243      * Is it a Linux binary ?
244      */
245     if (((a_out->a_magic >> 16) & 0xff) != 0x64) {
246 	error = ENOEXEC;
247 	goto cleanup;
248     }
249 
250     /* While we are here, we should REALLY do some more checks */
251 
252     /*
253      * Set file/virtual offset based on a.out variant.
254      */
255     switch ((int)(a_out->a_magic & 0xffff)) {
256     case 0413:	/* ZMAGIC */
257 	file_offset = 1024;
258 	break;
259     case 0314:	/* QMAGIC */
260 	file_offset = 0;
261 	break;
262     default:
263 	error = ENOEXEC;
264 	goto cleanup;
265     }
266 
267     bss_size = round_page(a_out->a_bss);
268 
269     /*
270      * Check various fields in header for validity/bounds.
271      */
272     if (a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) {
273 	error = ENOEXEC;
274 	goto cleanup;
275     }
276 
277     /* text + data can't exceed file size */
278     if (a_out->a_data + a_out->a_text > attr.va_size) {
279 	error = EFAULT;
280 	goto cleanup;
281     }
282 
283     /*
284      * text/data/bss must not exceed limits
285      * XXX: this is not complete. it should check current usage PLUS
286      * the resources needed by this library.
287      */
288     if (a_out->a_text > MAXTSIZ ||
289 	a_out->a_data + bss_size > p->p_rlimit[RLIMIT_DATA].rlim_cur) {
290 	error = ENOMEM;
291 	goto cleanup;
292     }
293 
294     /*
295      * prevent more writers
296      */
297     vp->v_flag |= VTEXT;
298 
299     /*
300      * Check if file_offset page aligned,.
301      * Currently we cannot handle misalinged file offsets,
302      * and so we read in the entire image (what a waste).
303      */
304     if (file_offset & PAGE_MASK) {
305 #ifdef DEBUG
306 printf("uselib: Non page aligned binary %lu\n", file_offset);
307 #endif
308 	/*
309 	 * Map text+data read/write/execute
310 	 */
311 
312 	/* a_entry is the load address and is page aligned */
313 	vmaddr = trunc_page(a_out->a_entry);
314 
315 	/* get anon user mapping, read+write+execute */
316 	error = vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &vmaddr,
317 		    	    a_out->a_text + a_out->a_data, FALSE,
318 			    VM_PROT_ALL, VM_PROT_ALL, 0);
319 	if (error)
320 	    goto cleanup;
321 
322 	/* map file into kernel_map */
323 	error = vm_mmap(kernel_map, &buffer,
324 			round_page(a_out->a_text + a_out->a_data + file_offset),
325 		   	VM_PROT_READ, VM_PROT_READ, 0,
326 			(caddr_t)vp, trunc_page(file_offset));
327 	if (error)
328 	    goto cleanup;
329 
330 	/* copy from kernel VM space to user space */
331 	error = copyout((caddr_t)(void *)(uintptr_t)(buffer + file_offset),
332 			(caddr_t)vmaddr, a_out->a_text + a_out->a_data);
333 
334 	/* release temporary kernel space */
335 	vm_map_remove(kernel_map, buffer,
336 		      buffer + round_page(a_out->a_text + a_out->a_data + file_offset));
337 
338 	if (error)
339 	    goto cleanup;
340     }
341     else {
342 #ifdef DEBUG
343 printf("uselib: Page aligned binary %lu\n", file_offset);
344 #endif
345 	/*
346 	 * for QMAGIC, a_entry is 20 bytes beyond the load address
347 	 * to skip the executable header
348 	 */
349 	vmaddr = trunc_page(a_out->a_entry);
350 
351 	/*
352 	 * Map it all into the process's space as a single copy-on-write
353 	 * "data" segment.
354 	 */
355 	error = vm_mmap(&p->p_vmspace->vm_map, &vmaddr,
356 		   	a_out->a_text + a_out->a_data,
357 			VM_PROT_ALL, VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED,
358 			(caddr_t)vp, file_offset);
359 	if (error)
360 	    goto cleanup;
361     }
362 #ifdef DEBUG
363 printf("mem=%08x = %08x %08x\n", vmaddr, ((int*)vmaddr)[0], ((int*)vmaddr)[1]);
364 #endif
365     if (bss_size != 0) {
366         /*
367 	 * Calculate BSS start address
368 	 */
369 	vmaddr = trunc_page(a_out->a_entry) + a_out->a_text + a_out->a_data;
370 
371 	/*
372 	 * allocate some 'anon' space
373 	 */
374 	error = vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &vmaddr,
375 			    bss_size, FALSE,
376 			    VM_PROT_ALL, VM_PROT_ALL, 0);
377 	if (error)
378 	    goto cleanup;
379     }
380 
381 cleanup:
382     /*
383      * Unlock vnode if needed
384      */
385     if (locked)
386 	VOP_UNLOCK(vp, 0, p);
387 
388     /*
389      * Release the kernel mapping.
390      */
391     if (a_out)
392 	vm_map_remove(kernel_map, (vm_offset_t)a_out, (vm_offset_t)a_out + PAGE_SIZE);
393 
394     return error;
395 }
396 
397 /* XXX move */
398 struct linux_select_argv {
399 	int nfds;
400 	fd_set *readfds;
401 	fd_set *writefds;
402 	fd_set *exceptfds;
403 	struct timeval *timeout;
404 };
405 
406 int
407 linux_select(struct proc *p, struct linux_select_args *args)
408 {
409     struct linux_select_argv linux_args;
410     struct linux_newselect_args newsel;
411     int error;
412 
413 #ifdef SELECT_DEBUG
414     printf("Linux-emul(%d): select(%x)\n",
415 	   p->p_pid, args->ptr);
416 #endif
417     if ((error = copyin((caddr_t)args->ptr, (caddr_t)&linux_args,
418 			sizeof(linux_args))))
419 	return error;
420 
421     newsel.nfds = linux_args.nfds;
422     newsel.readfds = linux_args.readfds;
423     newsel.writefds = linux_args.writefds;
424     newsel.exceptfds = linux_args.exceptfds;
425     newsel.timeout = linux_args.timeout;
426 
427     return linux_newselect(p, &newsel);
428 }
429 
430 int
431 linux_newselect(struct proc *p, struct linux_newselect_args *args)
432 {
433     struct select_args bsa;
434     struct timeval tv0, tv1, utv, *tvp;
435     caddr_t sg;
436     int error;
437 
438 #ifdef DEBUG
439     printf("Linux-emul(%ld): newselect(%d, %p, %p, %p, %p)\n",
440   	(long)p->p_pid, args->nfds, (void *)args->readfds,
441 	(void *)args->writefds, (void *)args->exceptfds,
442 	(void *)args->timeout);
443 #endif
444     error = 0;
445     bsa.nd = args->nfds;
446     bsa.in = args->readfds;
447     bsa.ou = args->writefds;
448     bsa.ex = args->exceptfds;
449     bsa.tv = args->timeout;
450 
451     /*
452      * Store current time for computation of the amount of
453      * time left.
454      */
455     if (args->timeout) {
456 	if ((error = copyin(args->timeout, &utv, sizeof(utv))))
457 	    goto select_out;
458 #ifdef DEBUG
459 	printf("Linux-emul(%ld): incoming timeout (%ld/%ld)\n",
460 	    (long)p->p_pid, utv.tv_sec, utv.tv_usec);
461 #endif
462 	if (itimerfix(&utv)) {
463 	    /*
464 	     * The timeval was invalid.  Convert it to something
465 	     * valid that will act as it does under Linux.
466 	     */
467 	    sg = stackgap_init();
468 	    tvp = stackgap_alloc(&sg, sizeof(utv));
469 	    utv.tv_sec += utv.tv_usec / 1000000;
470 	    utv.tv_usec %= 1000000;
471 	    if (utv.tv_usec < 0) {
472 		utv.tv_sec -= 1;
473 		utv.tv_usec += 1000000;
474 	    }
475 	    if (utv.tv_sec < 0)
476 		timevalclear(&utv);
477 	    if ((error = copyout(&utv, tvp, sizeof(utv))))
478 		goto select_out;
479 	    bsa.tv = tvp;
480 	}
481 	microtime(&tv0);
482     }
483 
484     error = select(p, &bsa);
485 #ifdef DEBUG
486     printf("Linux-emul(%d): real select returns %d\n",
487 	       p->p_pid, error);
488 #endif
489 
490     if (error) {
491 	/*
492 	 * See fs/select.c in the Linux kernel.  Without this,
493 	 * Maelstrom doesn't work.
494 	 */
495 	if (error == ERESTART)
496 	    error = EINTR;
497 	goto select_out;
498     }
499 
500     if (args->timeout) {
501 	if (p->p_retval[0]) {
502 	    /*
503 	     * Compute how much time was left of the timeout,
504 	     * by subtracting the current time and the time
505 	     * before we started the call, and subtracting
506 	     * that result from the user-supplied value.
507 	     */
508 	    microtime(&tv1);
509 	    timevalsub(&tv1, &tv0);
510 	    timevalsub(&utv, &tv1);
511 	    if (utv.tv_sec < 0)
512 		timevalclear(&utv);
513 	} else
514 	    timevalclear(&utv);
515 #ifdef DEBUG
516 	printf("Linux-emul(%ld): outgoing timeout (%ld/%ld)\n",
517 	    (long)p->p_pid, utv.tv_sec, utv.tv_usec);
518 #endif
519 	if ((error = copyout(&utv, args->timeout, sizeof(utv))))
520 	    goto select_out;
521     }
522 
523 select_out:
524 #ifdef DEBUG
525     printf("Linux-emul(%d): newselect_out -> %d\n",
526 	       p->p_pid, error);
527 #endif
528     return error;
529 }
530 
531 int
532 linux_getpgid(struct proc *p, struct linux_getpgid_args *args)
533 {
534     struct proc *curproc;
535 
536 #ifdef DEBUG
537     printf("Linux-emul(%d): getpgid(%d)\n", p->p_pid, args->pid);
538 #endif
539     if (args->pid != p->p_pid) {
540 	if (!(curproc = pfind(args->pid)))
541 	    return ESRCH;
542     }
543     else
544 	curproc = p;
545     p->p_retval[0] = curproc->p_pgid;
546     return 0;
547 }
548 
549 int
550 linux_fork(struct proc *p, struct linux_fork_args *args)
551 {
552     int error;
553 
554 #ifdef DEBUG
555     printf("Linux-emul(%d): fork()\n", p->p_pid);
556 #endif
557     if (error = fork(p, (struct fork_args *)args))
558 	return error;
559     if (p->p_retval[1] == 1)
560 	p->p_retval[0] = 0;
561     return 0;
562 }
563 
564 #define CLONE_VM	0x100
565 #define CLONE_FS	0x200
566 #define CLONE_FILES	0x400
567 #define CLONE_SIGHAND	0x800
568 #define CLONE_PID	0x1000
569 
570 int
571 linux_clone(struct proc *p, struct linux_clone_args *args)
572 {
573     int error, ff = RFPROC;
574     struct proc *p2;
575     int            exit_signal;
576     vm_offset_t    start;
577     struct rfork_args rf_args;
578 
579 #ifdef SMP
580     printf("linux_clone(%d): does not work with SMP yet\n", p->p_pid);
581     return (EOPNOTSUPP);
582 #endif
583 #ifdef DEBUG
584     if (args->flags & CLONE_PID)
585 	printf("linux_clone(%d): CLONE_PID not yet supported\n", p->p_pid);
586     printf ("linux_clone(%d): invoked with flags %x and stack %x\n", p->p_pid,
587 	     (unsigned int)args->flags, (unsigned int)args->stack);
588 #endif
589 
590     if (!args->stack)
591         return (EINVAL);
592     exit_signal = args->flags & 0x000000ff;
593     if (exit_signal >= LINUX_NSIG)
594 	return EINVAL;
595     exit_signal = linux_to_bsd_signal[exit_signal];
596 
597     /* RFTHREAD probably not necessary here, but it shouldn't hurt either */
598     ff |= RFTHREAD;
599 
600     if (args->flags & CLONE_VM)
601 	ff |= RFMEM;
602     if (args->flags & CLONE_SIGHAND)
603 	ff |= RFSIGSHARE;
604     if (!(args->flags & CLONE_FILES))
605 	ff |= RFFDG;
606 
607     error = 0;
608     start = 0;
609 
610     rf_args.flags = ff;
611     if (error = rfork(p, &rf_args))
612 	return error;
613 
614     p2 = pfind(p->p_retval[0]);
615     if (p2 == 0)
616  	return ESRCH;
617 
618     p2->p_sigparent = exit_signal;
619     p2->p_md.md_regs->tf_esp = (unsigned int)args->stack;
620 
621 #ifdef DEBUG
622     printf ("linux_clone(%d): successful rfork to %d\n", p->p_pid, p2->p_pid);
623 #endif
624     return 0;
625 }
626 
627 /* XXX move */
628 struct linux_mmap_argv {
629 	linux_caddr_t addr;
630 	int len;
631 	int prot;
632 	int flags;
633 	int fd;
634 	int pos;
635 };
636 
637 #define STACK_SIZE  (2 * 1024 * 1024)
638 #define GUARD_SIZE  (4 * PAGE_SIZE)
639 int
640 linux_mmap(struct proc *p, struct linux_mmap_args *args)
641 {
642     struct mmap_args /* {
643 	caddr_t addr;
644 	size_t len;
645 	int prot;
646 	int flags;
647 	int fd;
648 	long pad;
649 	off_t pos;
650     } */ bsd_args;
651     int error;
652     struct linux_mmap_argv linux_args;
653 
654     if ((error = copyin((caddr_t)args->ptr, (caddr_t)&linux_args,
655 			sizeof(linux_args))))
656 	return error;
657 #ifdef DEBUG
658     printf("Linux-emul(%ld): mmap(%p, %d, %d, %08x, %d, %d)\n",
659 	(long)p->p_pid, (void *)linux_args.addr, linux_args.len,
660 	linux_args.prot, linux_args.flags, linux_args.fd, linux_args.pos);
661 #endif
662     bsd_args.flags = 0;
663     if (linux_args.flags & LINUX_MAP_SHARED)
664 	bsd_args.flags |= MAP_SHARED;
665     if (linux_args.flags & LINUX_MAP_PRIVATE)
666 	bsd_args.flags |= MAP_PRIVATE;
667     if (linux_args.flags & LINUX_MAP_FIXED)
668 	bsd_args.flags |= MAP_FIXED;
669     if (linux_args.flags & LINUX_MAP_ANON)
670 	bsd_args.flags |= MAP_ANON;
671 
672 #ifndef VM_STACK
673     /* Linux Threads will map into the proc stack space, unless
674      * we prevent it.  This causes problems if we're not using
675      * our VM_STACK options.
676      */
677     if ((unsigned int)linux_args.addr + linux_args.len > (USRSTACK - MAXSSIZ))
678 	return (EINVAL);
679 #endif
680 
681     if (linux_args.flags & LINUX_MAP_GROWSDOWN) {
682 
683 #ifdef VM_STACK
684 	bsd_args.flags |= MAP_STACK;
685 #endif
686 
687 	/* The linux MAP_GROWSDOWN option does not limit auto
688 	 * growth of the region.  Linux mmap with this option
689 	 * takes as addr the inital BOS, and as len, the initial
690 	 * region size.  It can then grow down from addr without
691 	 * limit.  However, linux threads has an implicit internal
692 	 * limit to stack size of STACK_SIZE.  Its just not
693 	 * enforced explicitly in linux.  But, here we impose
694 	 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
695 	 * region, since we can do this with our mmap.
696 	 *
697 	 * Our mmap with MAP_STACK takes addr as the maximum
698 	 * downsize limit on BOS, and as len the max size of
699 	 * the region.  It them maps the top SGROWSIZ bytes,
700 	 * and autgrows the region down, up to the limit
701 	 * in addr.
702 	 *
703 	 * If we don't use the MAP_STACK option, the effect
704 	 * of this code is to allocate a stack region of a
705 	 * fixed size of (STACK_SIZE - GUARD_SIZE).
706 	 */
707 
708 	/* This gives us TOS */
709 	bsd_args.addr = linux_args.addr + linux_args.len;
710 
711 	/* This gives us our maximum stack size */
712 	if (linux_args.len > STACK_SIZE - GUARD_SIZE)
713 	    bsd_args.len = linux_args.len;
714 	else
715 	    bsd_args.len  = STACK_SIZE - GUARD_SIZE;
716 
717 	/* This gives us a new BOS.  If we're using VM_STACK, then
718 	 * mmap will just map the top SGROWSIZ bytes, and let
719 	 * the stack grow down to the limit at BOS.  If we're
720 	 * not using VM_STACK we map the full stack, since we
721 	 * don't have a way to autogrow it.
722 	 */
723 	bsd_args.addr -= bsd_args.len;
724 
725     } else {
726 	bsd_args.addr = linux_args.addr;
727 	bsd_args.len  = linux_args.len;
728     }
729 
730     bsd_args.prot = linux_args.prot | PROT_READ;	/* always required */
731     bsd_args.fd = linux_args.fd;
732     bsd_args.pos = linux_args.pos;
733     bsd_args.pad = 0;
734     return mmap(p, &bsd_args);
735 }
736 
737 int
738 linux_mremap(struct proc *p, struct linux_mremap_args *args)
739 {
740 	struct munmap_args /* {
741 		void *addr;
742 		size_t len;
743 	} */ bsd_args;
744 	int error = 0;
745 
746 #ifdef DEBUG
747 	printf("Linux-emul(%ld): mremap(%p, %08x, %08x, %08x)\n",
748 	    (long)p->p_pid, (void *)args->addr, args->old_len, args->new_len,
749 	    args->flags);
750 #endif
751 	args->new_len = round_page(args->new_len);
752 	args->old_len = round_page(args->old_len);
753 
754 	if (args->new_len > args->old_len) {
755 		p->p_retval[0] = 0;
756 		return ENOMEM;
757 	}
758 
759 	if (args->new_len < args->old_len) {
760 		bsd_args.addr = args->addr + args->new_len;
761 		bsd_args.len = args->old_len - args->new_len;
762 		error = munmap(p, &bsd_args);
763 	}
764 
765 	p->p_retval[0] = error ? 0 : (int)args->addr;
766 	return error;
767 }
768 
769 int
770 linux_msync(struct proc *p, struct linux_msync_args *args)
771 {
772 	struct msync_args bsd_args;
773 
774 	bsd_args.addr = args->addr;
775 	bsd_args.len = args->len;
776 	bsd_args.flags = 0;	/* XXX ignore */
777 
778 	return msync(p, &bsd_args);
779 }
780 
781 int
782 linux_pipe(struct proc *p, struct linux_pipe_args *args)
783 {
784     int error;
785     int reg_edx;
786 
787 #ifdef DEBUG
788     printf("Linux-emul(%d): pipe(*)\n", p->p_pid);
789 #endif
790     reg_edx = p->p_retval[1];
791     if (error = pipe(p, 0)) {
792 	p->p_retval[1] = reg_edx;
793 	return error;
794     }
795 
796     if (error = copyout(p->p_retval, args->pipefds, 2*sizeof(int))) {
797 	p->p_retval[1] = reg_edx;
798 	return error;
799     }
800 
801     p->p_retval[1] = reg_edx;
802     p->p_retval[0] = 0;
803     return 0;
804 }
805 
806 int
807 linux_time(struct proc *p, struct linux_time_args *args)
808 {
809     struct timeval tv;
810     linux_time_t tm;
811     int error;
812 
813 #ifdef DEBUG
814     printf("Linux-emul(%d): time(*)\n", p->p_pid);
815 #endif
816     microtime(&tv);
817     tm = tv.tv_sec;
818     if (args->tm && (error = copyout(&tm, args->tm, sizeof(linux_time_t))))
819 	return error;
820     p->p_retval[0] = tm;
821     return 0;
822 }
823 
/*
 * Buffer layout handed back by linux_times(); every member is a tick
 * count produced by CONVTCK().
 */
struct linux_times_argv {
    long    tms_utime;
    long    tms_stime;
    long    tms_cutime;
    long    tms_cstime;
};

#define CLK_TCK 100	/* Linux uses 100 */

/*
 * Convert a struct timeval (or anything with tv_sec/tv_usec members) to
 * CLK_TCK ticks.  The argument and the whole expansion are parenthesized
 * so that the macro can be used with arbitrary expressions and inside
 * larger expressions.
 */
#define CONVTCK(r)	((r).tv_sec * CLK_TCK + (r).tv_usec / (1000000 / CLK_TCK))
833 
834 int
835 linux_times(struct proc *p, struct linux_times_args *args)
836 {
837     struct timeval tv;
838     struct linux_times_argv tms;
839     struct rusage ru;
840     int error;
841 
842 #ifdef DEBUG
843     printf("Linux-emul(%d): times(*)\n", p->p_pid);
844 #endif
845     calcru(p, &ru.ru_utime, &ru.ru_stime, NULL);
846 
847     tms.tms_utime = CONVTCK(ru.ru_utime);
848     tms.tms_stime = CONVTCK(ru.ru_stime);
849 
850     tms.tms_cutime = CONVTCK(p->p_stats->p_cru.ru_utime);
851     tms.tms_cstime = CONVTCK(p->p_stats->p_cru.ru_stime);
852 
853     if ((error = copyout((caddr_t)&tms, (caddr_t)args->buf,
854 	    	    sizeof(struct linux_times_argv))))
855 	return error;
856 
857     microuptime(&tv);
858     p->p_retval[0] = (int)CONVTCK(tv);
859     return 0;
860 }
861 
862 /* XXX move */
863 struct linux_newuname_t {
864     char sysname[65];
865     char nodename[65];
866     char release[65];
867     char version[65];
868     char machine[65];
869     char domainname[65];
870 };
871 
872 int
873 linux_newuname(struct proc *p, struct linux_newuname_args *args)
874 {
875     struct linux_newuname_t linux_newuname;
876 
877 #ifdef DEBUG
878     printf("Linux-emul(%d): newuname(*)\n", p->p_pid);
879 #endif
880     bzero(&linux_newuname, sizeof(struct linux_newuname_t));
881     strncpy(linux_newuname.sysname, ostype,
882 	sizeof(linux_newuname.sysname) - 1);
883     strncpy(linux_newuname.nodename, hostname,
884 	sizeof(linux_newuname.nodename) - 1);
885     strncpy(linux_newuname.release, osrelease,
886 	sizeof(linux_newuname.release) - 1);
887     strncpy(linux_newuname.version, version,
888 	sizeof(linux_newuname.version) - 1);
889     strncpy(linux_newuname.machine, machine,
890 	sizeof(linux_newuname.machine) - 1);
891     strncpy(linux_newuname.domainname, domainname,
892 	sizeof(linux_newuname.domainname) - 1);
893     return (copyout((caddr_t)&linux_newuname, (caddr_t)args->buf,
894 	    	    sizeof(struct linux_newuname_t)));
895 }
896 
897 struct linux_utimbuf {
898 	linux_time_t l_actime;
899 	linux_time_t l_modtime;
900 };
901 
902 int
903 linux_utime(struct proc *p, struct linux_utime_args *args)
904 {
905     struct utimes_args /* {
906 	char	*path;
907 	struct	timeval *tptr;
908     } */ bsdutimes;
909     struct timeval tv[2], *tvp;
910     struct linux_utimbuf lut;
911     int error;
912     caddr_t sg;
913 
914     sg = stackgap_init();
915     CHECKALTEXIST(p, &sg, args->fname);
916 
917 #ifdef DEBUG
918     printf("Linux-emul(%d): utime(%s, *)\n", p->p_pid, args->fname);
919 #endif
920     if (args->times) {
921 	if ((error = copyin(args->times, &lut, sizeof lut)))
922 	    return error;
923 	tv[0].tv_sec = lut.l_actime;
924 	tv[0].tv_usec = 0;
925 	tv[1].tv_sec = lut.l_modtime;
926 	tv[1].tv_usec = 0;
927 	/* so that utimes can copyin */
928 	tvp = (struct timeval *)stackgap_alloc(&sg, sizeof(tv));
929 	if ((error = copyout(tv, tvp, sizeof(tv))))
930 	    return error;
931 	bsdutimes.tptr = tvp;
932     } else
933 	bsdutimes.tptr = NULL;
934 
935     bsdutimes.path = args->fname;
936     return utimes(p, &bsdutimes);
937 }
938 
939 int
940 linux_waitpid(struct proc *p, struct linux_waitpid_args *args)
941 {
942     struct wait_args /* {
943 	int pid;
944 	int *status;
945 	int options;
946 	struct	rusage *rusage;
947     } */ tmp;
948     int error, tmpstat;
949 
950 #ifdef DEBUG
951     printf("Linux-emul(%ld): waitpid(%d, %p, %d)\n",
952 	(long)p->p_pid, args->pid, (void *)args->status, args->options);
953 #endif
954     tmp.pid = args->pid;
955     tmp.status = args->status;
956     /* This filters out the linux option _WCLONE.  I don't
957      * think we need it, but I could be wrong.  If we need
958      * it, we need to fix wait4, since it will give us an
959      * error return of EINVAL if we pass in _WCLONE, and
960      * of course, it won't do anything with it.
961      */
962     tmp.options = (args->options & (WNOHANG | WUNTRACED));
963     tmp.rusage = NULL;
964 
965     if (error = wait4(p, &tmp))
966 	return error;
967 
968     if (args->status) {
969 	if (error = copyin(args->status, &tmpstat, sizeof(int)))
970 	    return error;
971 	if (WIFSIGNALED(tmpstat))
972 	    tmpstat = (tmpstat & 0xffffff80) |
973 		      bsd_to_linux_signal[WTERMSIG(tmpstat)];
974 	else if (WIFSTOPPED(tmpstat))
975 	    tmpstat = (tmpstat & 0xffff00ff) |
976 		      (bsd_to_linux_signal[WSTOPSIG(tmpstat)]<<8);
977 	return copyout(&tmpstat, args->status, sizeof(int));
978     } else
979 	return 0;
980 }
981 
982 int
983 linux_wait4(struct proc *p, struct linux_wait4_args *args)
984 {
985     struct wait_args /* {
986 	int pid;
987 	int *status;
988 	int options;
989 	struct	rusage *rusage;
990     } */ tmp;
991     int error, tmpstat;
992 
993 #ifdef DEBUG
994     printf("Linux-emul(%ld): wait4(%d, %p, %d, %p)\n",
995 	(long)p->p_pid, args->pid, (void *)args->status, args->options,
996 	(void *)args->rusage);
997 #endif
998     tmp.pid = args->pid;
999     tmp.status = args->status;
1000     /* This filters out the linux option _WCLONE.  I don't
1001      * think we need it, but I could be wrong.  If we need
1002      * it, we need to fix wait4, since it will give us an
1003      * error return of EINVAL if we pass in _WCLONE, and
1004      * of course, it won't do anything with it.
1005      */
1006     tmp.options = (args->options & (WNOHANG | WUNTRACED));
1007     tmp.rusage = args->rusage;
1008 
1009     if (error = wait4(p, &tmp))
1010 	return error;
1011 
1012     p->p_siglist &= ~sigmask(SIGCHLD);
1013 
1014     if (args->status) {
1015 	if (error = copyin(args->status, &tmpstat, sizeof(int)))
1016 	    return error;
1017 	if (WIFSIGNALED(tmpstat))
1018 	    tmpstat = (tmpstat & 0xffffff80) |
1019 		  bsd_to_linux_signal[WTERMSIG(tmpstat)];
1020 	else if (WIFSTOPPED(tmpstat))
1021 	    tmpstat = (tmpstat & 0xffff00ff) |
1022 		  (bsd_to_linux_signal[WSTOPSIG(tmpstat)]<<8);
1023 	return copyout(&tmpstat, args->status, sizeof(int));
1024     } else
1025 	return 0;
1026 }
1027 
1028 int
1029 linux_mknod(struct proc *p, struct linux_mknod_args *args)
1030 {
1031 	caddr_t sg;
1032 	struct mknod_args bsd_mknod;
1033 	struct mkfifo_args bsd_mkfifo;
1034 
1035 	sg = stackgap_init();
1036 
1037 	CHECKALTCREAT(p, &sg, args->path);
1038 
1039 #ifdef DEBUG
1040 	printf("Linux-emul(%d): mknod(%s, %d, %d)\n",
1041 	   p->p_pid, args->path, args->mode, args->dev);
1042 #endif
1043 
1044 	if (args->mode & S_IFIFO) {
1045 		bsd_mkfifo.path = args->path;
1046 		bsd_mkfifo.mode = args->mode;
1047 		return mkfifo(p, &bsd_mkfifo);
1048 	} else {
1049 		bsd_mknod.path = args->path;
1050 		bsd_mknod.mode = args->mode;
1051 		bsd_mknod.dev = args->dev;
1052 		return mknod(p, &bsd_mknod);
1053 	}
1054 }
1055 
1056 /*
1057  * UGH! This is just about the dumbest idea I've ever heard!!
1058  */
1059 int
1060 linux_personality(struct proc *p, struct linux_personality_args *args)
1061 {
1062 #ifdef DEBUG
1063 	printf("Linux-emul(%d): personality(%d)\n",
1064 	   p->p_pid, args->per);
1065 #endif
1066 	if (args->per != 0)
1067 		return EINVAL;
1068 
1069 	/* Yes Jim, it's still a Linux... */
1070 	p->p_retval[0] = 0;
1071 	return 0;
1072 }
1073 
1074 /*
1075  * Wrappers for get/setitimer for debugging..
1076  */
1077 int
1078 linux_setitimer(struct proc *p, struct linux_setitimer_args *args)
1079 {
1080 	struct setitimer_args bsa;
1081 	struct itimerval foo;
1082 	int error;
1083 
1084 #ifdef DEBUG
1085 	printf("Linux-emul(%ld): setitimer(%p, %p)\n",
1086 	    (long)p->p_pid, (void *)args->itv, (void *)args->oitv);
1087 #endif
1088 	bsa.which = args->which;
1089 	bsa.itv = args->itv;
1090 	bsa.oitv = args->oitv;
1091 	if (args->itv) {
1092 	    if ((error = copyin((caddr_t)args->itv, (caddr_t)&foo,
1093 			sizeof(foo))))
1094 		return error;
1095 #ifdef DEBUG
1096 	    printf("setitimer: value: sec: %ld, usec: %ld\n",
1097 		foo.it_value.tv_sec, foo.it_value.tv_usec);
1098 	    printf("setitimer: interval: sec: %ld, usec: %ld\n",
1099 		foo.it_interval.tv_sec, foo.it_interval.tv_usec);
1100 #endif
1101 	}
1102 	return setitimer(p, &bsa);
1103 }
1104 
1105 int
1106 linux_getitimer(struct proc *p, struct linux_getitimer_args *args)
1107 {
1108 	struct getitimer_args bsa;
1109 #ifdef DEBUG
1110 	printf("Linux-emul(%ld): getitimer(%p)\n",
1111 	    (long)p->p_pid, (void *)args->itv);
1112 #endif
1113 	bsa.which = args->which;
1114 	bsa.itv = args->itv;
1115 	return getitimer(p, &bsa);
1116 }
1117 
1118 int
1119 linux_iopl(struct proc *p, struct linux_iopl_args *args)
1120 {
1121 	int error;
1122 
1123 	error = suser(p->p_ucred, &p->p_acflag);
1124 	if (error != 0)
1125 		return error;
1126 	if (securelevel > 0)
1127 		return EPERM;
1128 	p->p_md.md_regs->tf_eflags |= PSL_IOPL;
1129 	return 0;
1130 }
1131 
1132 int
1133 linux_nice(struct proc *p, struct linux_nice_args *args)
1134 {
1135 	struct setpriority_args	bsd_args;
1136 
1137 	bsd_args.which = PRIO_PROCESS;
1138 	bsd_args.who = 0;	/* current process */
1139 	bsd_args.prio = args->inc;
1140 	return setpriority(p, &bsd_args);
1141 }
1142 
1143 int
1144 linux_setgroups(p, uap)
1145      struct proc *p;
1146      struct linux_setgroups_args *uap;
1147 {
1148   struct pcred *pc = p->p_cred;
1149   linux_gid_t linux_gidset[NGROUPS];
1150   gid_t *bsd_gidset;
1151   int ngrp, error;
1152 
1153   if ((error = suser(pc->pc_ucred, &p->p_acflag)))
1154     return error;
1155 
1156   if (uap->gidsetsize > NGROUPS)
1157     return EINVAL;
1158 
1159   ngrp = uap->gidsetsize;
1160   pc->pc_ucred = crcopy(pc->pc_ucred);
1161   if (ngrp >= 1) {
1162     if ((error = copyin((caddr_t)uap->gidset,
1163                       (caddr_t)linux_gidset,
1164                         ngrp * sizeof(linux_gid_t))))
1165       return error;
1166 
1167     pc->pc_ucred->cr_ngroups = ngrp;
1168 
1169     bsd_gidset = pc->pc_ucred->cr_groups;
1170     ngrp--;
1171     while (ngrp >= 0) {
1172       bsd_gidset[ngrp] = linux_gidset[ngrp];
1173       ngrp--;
1174     }
1175   }
1176   else
1177     pc->pc_ucred->cr_ngroups = 1;
1178 
1179   setsugid(p);
1180   return 0;
1181 }
1182 
1183 int
1184 linux_getgroups(p, uap)
1185      struct proc *p;
1186      struct linux_getgroups_args *uap;
1187 {
1188   struct pcred *pc = p->p_cred;
1189   linux_gid_t linux_gidset[NGROUPS];
1190   gid_t *bsd_gidset;
1191   int ngrp, error;
1192 
1193   if ((ngrp = uap->gidsetsize) == 0) {
1194     p->p_retval[0] = pc->pc_ucred->cr_ngroups;
1195     return 0;
1196   }
1197 
1198   if (ngrp < pc->pc_ucred->cr_ngroups)
1199     return EINVAL;
1200 
1201   ngrp = 0;
1202   bsd_gidset = pc->pc_ucred->cr_groups;
1203   while (ngrp < pc->pc_ucred->cr_ngroups) {
1204     linux_gidset[ngrp] = bsd_gidset[ngrp];
1205     ngrp++;
1206   }
1207 
1208   if ((error = copyout((caddr_t)linux_gidset, (caddr_t)uap->gidset,
1209                        ngrp * sizeof(linux_gid_t))))
1210     return error;
1211 
1212   p->p_retval[0] = ngrp;
1213   return (0);
1214 }
1215