xref: /freebsd/sys/compat/linux/linux_misc.c (revision 601752d5a7bef087e755da5a2b158fa35cb51ccb)
1 /*-
2  * Copyright (c) 1994-1995 Søren Schmidt
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer
10  *    in this position and unchanged.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. The name of the author may not be used to endorse or promote products
15  *    derived from this software without specific prior written permission
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  *  $Id: linux_misc.c,v 1.52 1999/01/26 02:38:10 julian Exp $
29  */
30 
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/sysproto.h>
34 #include <sys/kernel.h>
35 #include <sys/mman.h>
36 #include <sys/proc.h>
37 #include <sys/fcntl.h>
38 #include <sys/imgact_aout.h>
39 #include <sys/mount.h>
40 #include <sys/namei.h>
41 #include <sys/resourcevar.h>
42 #include <sys/stat.h>
43 #include <sys/sysctl.h>
44 #include <sys/unistd.h>
45 #include <sys/vnode.h>
46 #include <sys/wait.h>
47 #include <sys/time.h>
48 
49 #include <vm/vm.h>
50 #include <vm/pmap.h>
51 #include <vm/vm_kern.h>
52 #include <vm/vm_prot.h>
53 #include <vm/vm_map.h>
54 #include <vm/vm_extern.h>
55 
56 #include <machine/frame.h>
57 #include <machine/psl.h>
58 
59 #include <i386/linux/linux.h>
60 #include <i386/linux/linux_proto.h>
61 #include <i386/linux/linux_util.h>
62 
63 int
64 linux_alarm(struct proc *p, struct linux_alarm_args *args)
65 {
66     struct itimerval it, old_it;
67     struct timeval tv;
68     int s;
69 
70 #ifdef DEBUG
71     printf("Linux-emul(%ld): alarm(%u)\n", (long)p->p_pid, args->secs);
72 #endif
73     if (args->secs > 100000000)
74 	return EINVAL;
75     it.it_value.tv_sec = (long)args->secs;
76     it.it_value.tv_usec = 0;
77     it.it_interval.tv_sec = 0;
78     it.it_interval.tv_usec = 0;
79     s = splsoftclock();
80     old_it = p->p_realtimer;
81     getmicrouptime(&tv);
82     if (timevalisset(&old_it.it_value))
83 	untimeout(realitexpire, (caddr_t)p, p->p_ithandle);
84     if (it.it_value.tv_sec != 0) {
85 	p->p_ithandle = timeout(realitexpire, (caddr_t)p, tvtohz(&it.it_value));
86 	timevaladd(&it.it_value, &tv);
87     }
88     p->p_realtimer = it;
89     splx(s);
90     if (timevalcmp(&old_it.it_value, &tv, >)) {
91 	timevalsub(&old_it.it_value, &tv);
92 	if (old_it.it_value.tv_usec != 0)
93 	    old_it.it_value.tv_sec++;
94 	p->p_retval[0] = old_it.it_value.tv_sec;
95     }
96     return 0;
97 }
98 
99 int
100 linux_brk(struct proc *p, struct linux_brk_args *args)
101 {
102 #if 0
103     struct vmspace *vm = p->p_vmspace;
104     vm_offset_t new, old;
105     int error;
106 
107     if ((vm_offset_t)args->dsend < (vm_offset_t)vm->vm_daddr)
108 	return EINVAL;
109     if (((caddr_t)args->dsend - (caddr_t)vm->vm_daddr)
110 	> p->p_rlimit[RLIMIT_DATA].rlim_cur)
111 	return ENOMEM;
112 
113     old = round_page((vm_offset_t)vm->vm_daddr) + ctob(vm->vm_dsize);
114     new = round_page((vm_offset_t)args->dsend);
115     p->p_retval[0] = old;
116     if ((new-old) > 0) {
117 	if (swap_pager_full)
118 	    return ENOMEM;
119 	error = vm_map_find(&vm->vm_map, NULL, 0, &old, (new-old), FALSE,
120 			VM_PROT_ALL, VM_PROT_ALL, 0);
121 	if (error)
122 	    return error;
123 	vm->vm_dsize += btoc((new-old));
124 	p->p_retval[0] = (int)(vm->vm_daddr + ctob(vm->vm_dsize));
125     }
126     return 0;
127 #else
128     struct vmspace *vm = p->p_vmspace;
129     vm_offset_t new, old;
130     struct obreak_args /* {
131 	char * nsize;
132     } */ tmp;
133 
134 #ifdef DEBUG
135     printf("Linux-emul(%ld): brk(%p)\n", (long)p->p_pid, (void *)args->dsend);
136 #endif
137     old = (vm_offset_t)vm->vm_daddr + ctob(vm->vm_dsize);
138     new = (vm_offset_t)args->dsend;
139     tmp.nsize = (char *) new;
140     if (((caddr_t)new > vm->vm_daddr) && !obreak(p, &tmp))
141 	p->p_retval[0] = (int)new;
142     else
143 	p->p_retval[0] = (int)old;
144 
145     return 0;
146 #endif
147 }
148 
149 int
150 linux_uselib(struct proc *p, struct linux_uselib_args *args)
151 {
152     struct nameidata ni;
153     struct vnode *vp;
154     struct exec *a_out;
155     struct vattr attr;
156     vm_offset_t vmaddr;
157     unsigned long file_offset;
158     vm_offset_t buffer;
159     unsigned long bss_size;
160     int error;
161     caddr_t sg;
162     int locked;
163 
164     sg = stackgap_init();
165     CHECKALTEXIST(p, &sg, args->library);
166 
167 #ifdef DEBUG
168     printf("Linux-emul(%d): uselib(%s)\n", p->p_pid, args->library);
169 #endif
170 
171     a_out = NULL;
172     locked = 0;
173     vp = NULL;
174 
175     NDINIT(&ni, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, args->library, p);
176     if (error = namei(&ni))
177 	goto cleanup;
178 
179     vp = ni.ni_vp;
180     if (vp == NULL) {
181 	error = ENOEXEC;	/* ?? */
182 	goto cleanup;
183     }
184 
185     /*
186      * From here on down, we have a locked vnode that must be unlocked.
187      */
188     locked++;
189 
190     /*
191      * Writable?
192      */
193     if (vp->v_writecount) {
194 	error = ETXTBSY;
195 	goto cleanup;
196     }
197 
198     /*
199      * Executable?
200      */
201     if (error = VOP_GETATTR(vp, &attr, p->p_ucred, p))
202 	goto cleanup;
203 
204     if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
205 	((attr.va_mode & 0111) == 0) ||
206 	(attr.va_type != VREG)) {
207 	    error = ENOEXEC;
208 	    goto cleanup;
209     }
210 
211     /*
212      * Sensible size?
213      */
214     if (attr.va_size == 0) {
215 	error = ENOEXEC;
216 	goto cleanup;
217     }
218 
219     /*
220      * Can we access it?
221      */
222     if (error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p))
223 	goto cleanup;
224 
225     if (error = VOP_OPEN(vp, FREAD, p->p_ucred, p))
226 	goto cleanup;
227 
228     /*
229      * Lock no longer needed
230      */
231     VOP_UNLOCK(vp, 0, p);
232     locked = 0;
233 
234     /*
235      * Pull in executable header into kernel_map
236      */
237     error = vm_mmap(kernel_map, (vm_offset_t *)&a_out, PAGE_SIZE,
238 	    	    VM_PROT_READ, VM_PROT_READ, 0, (caddr_t)vp, 0);
239     if (error)
240 	goto cleanup;
241 
242     /*
243      * Is it a Linux binary ?
244      */
245     if (((a_out->a_magic >> 16) & 0xff) != 0x64) {
246 	error = ENOEXEC;
247 	goto cleanup;
248     }
249 
250     /* While we are here, we should REALLY do some more checks */
251 
252     /*
253      * Set file/virtual offset based on a.out variant.
254      */
255     switch ((int)(a_out->a_magic & 0xffff)) {
256     case 0413:	/* ZMAGIC */
257 	file_offset = 1024;
258 	break;
259     case 0314:	/* QMAGIC */
260 	file_offset = 0;
261 	break;
262     default:
263 	error = ENOEXEC;
264 	goto cleanup;
265     }
266 
267     bss_size = round_page(a_out->a_bss);
268 
269     /*
270      * Check various fields in header for validity/bounds.
271      */
272     if (a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) {
273 	error = ENOEXEC;
274 	goto cleanup;
275     }
276 
277     /* text + data can't exceed file size */
278     if (a_out->a_data + a_out->a_text > attr.va_size) {
279 	error = EFAULT;
280 	goto cleanup;
281     }
282 
283     /*
284      * text/data/bss must not exceed limits
285      * XXX: this is not complete. it should check current usage PLUS
286      * the resources needed by this library.
287      */
288     if (a_out->a_text > MAXTSIZ ||
289 	a_out->a_data + bss_size > p->p_rlimit[RLIMIT_DATA].rlim_cur) {
290 	error = ENOMEM;
291 	goto cleanup;
292     }
293 
294     /*
295      * prevent more writers
296      */
297     vp->v_flag |= VTEXT;
298 
299     /*
300      * Check if file_offset page aligned,.
301      * Currently we cannot handle misalinged file offsets,
302      * and so we read in the entire image (what a waste).
303      */
304     if (file_offset & PAGE_MASK) {
305 #ifdef DEBUG
306 printf("uselib: Non page aligned binary %lu\n", file_offset);
307 #endif
308 	/*
309 	 * Map text+data read/write/execute
310 	 */
311 
312 	/* a_entry is the load address and is page aligned */
313 	vmaddr = trunc_page(a_out->a_entry);
314 
315 	/* get anon user mapping, read+write+execute */
316 	error = vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &vmaddr,
317 		    	    a_out->a_text + a_out->a_data, FALSE,
318 			    VM_PROT_ALL, VM_PROT_ALL, 0);
319 	if (error)
320 	    goto cleanup;
321 
322 	/* map file into kernel_map */
323 	error = vm_mmap(kernel_map, &buffer,
324 			round_page(a_out->a_text + a_out->a_data + file_offset),
325 		   	VM_PROT_READ, VM_PROT_READ, 0,
326 			(caddr_t)vp, trunc_page(file_offset));
327 	if (error)
328 	    goto cleanup;
329 
330 	/* copy from kernel VM space to user space */
331 	error = copyout((caddr_t)(void *)(uintptr_t)(buffer + file_offset),
332 			(caddr_t)vmaddr, a_out->a_text + a_out->a_data);
333 
334 	/* release temporary kernel space */
335 	vm_map_remove(kernel_map, buffer,
336 		      buffer + round_page(a_out->a_text + a_out->a_data + file_offset));
337 
338 	if (error)
339 	    goto cleanup;
340     }
341     else {
342 #ifdef DEBUG
343 printf("uselib: Page aligned binary %lu\n", file_offset);
344 #endif
345 	/*
346 	 * for QMAGIC, a_entry is 20 bytes beyond the load address
347 	 * to skip the executable header
348 	 */
349 	vmaddr = trunc_page(a_out->a_entry);
350 
351 	/*
352 	 * Map it all into the process's space as a single copy-on-write
353 	 * "data" segment.
354 	 */
355 	error = vm_mmap(&p->p_vmspace->vm_map, &vmaddr,
356 		   	a_out->a_text + a_out->a_data,
357 			VM_PROT_ALL, VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED,
358 			(caddr_t)vp, file_offset);
359 	if (error)
360 	    goto cleanup;
361     }
362 #ifdef DEBUG
363 printf("mem=%08x = %08x %08x\n", vmaddr, ((int*)vmaddr)[0], ((int*)vmaddr)[1]);
364 #endif
365     if (bss_size != 0) {
366         /*
367 	 * Calculate BSS start address
368 	 */
369 	vmaddr = trunc_page(a_out->a_entry) + a_out->a_text + a_out->a_data;
370 
371 	/*
372 	 * allocate some 'anon' space
373 	 */
374 	error = vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &vmaddr,
375 			    bss_size, FALSE,
376 			    VM_PROT_ALL, VM_PROT_ALL, 0);
377 	if (error)
378 	    goto cleanup;
379     }
380 
381 cleanup:
382     /*
383      * Unlock vnode if needed
384      */
385     if (locked)
386 	VOP_UNLOCK(vp, 0, p);
387 
388     /*
389      * Release the kernel mapping.
390      */
391     if (a_out)
392 	vm_map_remove(kernel_map, (vm_offset_t)a_out, (vm_offset_t)a_out + PAGE_SIZE);
393 
394     return error;
395 }
396 
397 /* XXX move */
398 struct linux_select_argv {
399 	int nfds;
400 	fd_set *readfds;
401 	fd_set *writefds;
402 	fd_set *exceptfds;
403 	struct timeval *timeout;
404 };
405 
406 int
407 linux_select(struct proc *p, struct linux_select_args *args)
408 {
409     struct linux_select_argv linux_args;
410     struct linux_newselect_args newsel;
411     int error;
412 
413 #ifdef SELECT_DEBUG
414     printf("Linux-emul(%d): select(%x)\n",
415 	   p->p_pid, args->ptr);
416 #endif
417     if ((error = copyin((caddr_t)args->ptr, (caddr_t)&linux_args,
418 			sizeof(linux_args))))
419 	return error;
420 
421     newsel.nfds = linux_args.nfds;
422     newsel.readfds = linux_args.readfds;
423     newsel.writefds = linux_args.writefds;
424     newsel.exceptfds = linux_args.exceptfds;
425     newsel.timeout = linux_args.timeout;
426 
427     return linux_newselect(p, &newsel);
428 }
429 
430 int
431 linux_newselect(struct proc *p, struct linux_newselect_args *args)
432 {
433     struct select_args bsa;
434     struct timeval tv0, tv1, utv, *tvp;
435     caddr_t sg;
436     int error;
437 
438 #ifdef DEBUG
439     printf("Linux-emul(%ld): newselect(%d, %p, %p, %p, %p)\n",
440   	(long)p->p_pid, args->nfds, (void *)args->readfds,
441 	(void *)args->writefds, (void *)args->exceptfds,
442 	(void *)args->timeout);
443 #endif
444     error = 0;
445     bsa.nd = args->nfds;
446     bsa.in = args->readfds;
447     bsa.ou = args->writefds;
448     bsa.ex = args->exceptfds;
449     bsa.tv = args->timeout;
450 
451     /*
452      * Store current time for computation of the amount of
453      * time left.
454      */
455     if (args->timeout) {
456 	if ((error = copyin(args->timeout, &utv, sizeof(utv))))
457 	    goto select_out;
458 #ifdef DEBUG
459 	printf("Linux-emul(%ld): incoming timeout (%ld/%ld)\n",
460 	    (long)p->p_pid, utv.tv_sec, utv.tv_usec);
461 #endif
462 	if (itimerfix(&utv)) {
463 	    /*
464 	     * The timeval was invalid.  Convert it to something
465 	     * valid that will act as it does under Linux.
466 	     */
467 	    sg = stackgap_init();
468 	    tvp = stackgap_alloc(&sg, sizeof(utv));
469 	    utv.tv_sec += utv.tv_usec / 1000000;
470 	    utv.tv_usec %= 1000000;
471 	    if (utv.tv_usec < 0) {
472 		utv.tv_sec -= 1;
473 		utv.tv_usec += 1000000;
474 	    }
475 	    if (utv.tv_sec < 0)
476 		timevalclear(&utv);
477 	    if ((error = copyout(&utv, tvp, sizeof(utv))))
478 		goto select_out;
479 	    bsa.tv = tvp;
480 	}
481 	microtime(&tv0);
482     }
483 
484     error = select(p, &bsa);
485 #ifdef DEBUG
486     printf("Linux-emul(%d): real select returns %d\n",
487 	       p->p_pid, error);
488 #endif
489 
490     if (error) {
491 	/*
492 	 * See fs/select.c in the Linux kernel.  Without this,
493 	 * Maelstrom doesn't work.
494 	 */
495 	if (error == ERESTART)
496 	    error = EINTR;
497 	goto select_out;
498     }
499 
500     if (args->timeout) {
501 	if (p->p_retval[0]) {
502 	    /*
503 	     * Compute how much time was left of the timeout,
504 	     * by subtracting the current time and the time
505 	     * before we started the call, and subtracting
506 	     * that result from the user-supplied value.
507 	     */
508 	    microtime(&tv1);
509 	    timevalsub(&tv1, &tv0);
510 	    timevalsub(&utv, &tv1);
511 	    if (utv.tv_sec < 0)
512 		timevalclear(&utv);
513 	} else
514 	    timevalclear(&utv);
515 #ifdef DEBUG
516 	printf("Linux-emul(%ld): outgoing timeout (%ld/%ld)\n",
517 	    (long)p->p_pid, utv.tv_sec, utv.tv_usec);
518 #endif
519 	if ((error = copyout(&utv, args->timeout, sizeof(utv))))
520 	    goto select_out;
521     }
522 
523 select_out:
524 #ifdef DEBUG
525     printf("Linux-emul(%d): newselect_out -> %d\n",
526 	       p->p_pid, error);
527 #endif
528     return error;
529 }
530 
531 int
532 linux_getpgid(struct proc *p, struct linux_getpgid_args *args)
533 {
534     struct proc *curproc;
535 
536 #ifdef DEBUG
537     printf("Linux-emul(%d): getpgid(%d)\n", p->p_pid, args->pid);
538 #endif
539     if (args->pid != p->p_pid) {
540 	if (!(curproc = pfind(args->pid)))
541 	    return ESRCH;
542     }
543     else
544 	curproc = p;
545     p->p_retval[0] = curproc->p_pgid;
546     return 0;
547 }
548 
549 int
550 linux_fork(struct proc *p, struct linux_fork_args *args)
551 {
552     int error;
553 
554 #ifdef DEBUG
555     printf("Linux-emul(%d): fork()\n", p->p_pid);
556 #endif
557     if ((error = fork(p, (struct fork_args *)args)) != 0)
558 	return error;
559     if (p->p_retval[1] == 1)
560 	p->p_retval[0] = 0;
561     return 0;
562 }
563 
564 #define CLONE_VM	0x100
565 #define CLONE_FS	0x200
566 #define CLONE_FILES	0x400
567 #define CLONE_SIGHAND	0x800
568 #define CLONE_PID	0x1000
569 
570 int
571 linux_clone(struct proc *p, struct linux_clone_args *args)
572 {
573     int error, ff = RFPROC;
574     struct proc *p2;
575     int            exit_signal;
576     vm_offset_t    start;
577     struct rfork_args rf_args;
578 
579 #ifdef SMP
580     printf("linux_clone(%d): does not work with SMP yet\n", p->p_pid);
581     return (EOPNOTSUPP);
582 #endif
583 #ifdef DEBUG
584     if (args->flags & CLONE_PID)
585 	printf("linux_clone(%d): CLONE_PID not yet supported\n", p->p_pid);
586     printf ("linux_clone(%d): invoked with flags %x and stack %x\n", p->p_pid,
587 	     (unsigned int)args->flags, (unsigned int)args->stack);
588 #endif
589 
590     if (!args->stack)
591         return (EINVAL);
592 
593     exit_signal = args->flags & 0x000000ff;
594     if (exit_signal >= LINUX_NSIG)
595 	return EINVAL;
596     exit_signal = linux_to_bsd_signal[exit_signal];
597 
598     /* RFTHREAD probably not necessary here, but it shouldn't hurt either */
599     ff |= RFTHREAD;
600 
601     if (args->flags & CLONE_VM)
602 	ff |= RFMEM;
603     if (args->flags & CLONE_SIGHAND)
604 	ff |= RFSIGSHARE;
605     if (!(args->flags & CLONE_FILES))
606 	ff |= RFFDG;
607 
608     error = 0;
609     start = 0;
610 
611     rf_args.flags = ff;
612     if ((error = rfork(p, &rf_args)) != 0)
613 	return error;
614 
615     p2 = pfind(p->p_retval[0]);
616     if (p2 == 0)
617  	return ESRCH;
618 
619     p2->p_sigparent = exit_signal;
620     p2->p_md.md_regs->tf_esp = (unsigned int)args->stack;
621 
622 #ifdef DEBUG
623     printf ("linux_clone(%d): successful rfork to %d\n", p->p_pid, p2->p_pid);
624 #endif
625     return 0;
626 }
627 
628 /* XXX move */
629 struct linux_mmap_argv {
630 	linux_caddr_t addr;
631 	int len;
632 	int prot;
633 	int flags;
634 	int fd;
635 	int pos;
636 };
637 
638 #define STACK_SIZE  (2 * 1024 * 1024)
639 #define GUARD_SIZE  (4 * PAGE_SIZE)
640 int
641 linux_mmap(struct proc *p, struct linux_mmap_args *args)
642 {
643     struct mmap_args /* {
644 	caddr_t addr;
645 	size_t len;
646 	int prot;
647 	int flags;
648 	int fd;
649 	long pad;
650 	off_t pos;
651     } */ bsd_args;
652     int error;
653     struct linux_mmap_argv linux_args;
654 
655     if ((error = copyin((caddr_t)args->ptr, (caddr_t)&linux_args,
656 			sizeof(linux_args))))
657 	return error;
658 #ifdef DEBUG
659     printf("Linux-emul(%ld): mmap(%p, %d, %d, %08x, %d, %d)\n",
660 	(long)p->p_pid, (void *)linux_args.addr, linux_args.len,
661 	linux_args.prot, linux_args.flags, linux_args.fd, linux_args.pos);
662 #endif
663     bsd_args.flags = 0;
664     if (linux_args.flags & LINUX_MAP_SHARED)
665 	bsd_args.flags |= MAP_SHARED;
666     if (linux_args.flags & LINUX_MAP_PRIVATE)
667 	bsd_args.flags |= MAP_PRIVATE;
668     if (linux_args.flags & LINUX_MAP_FIXED)
669 	bsd_args.flags |= MAP_FIXED;
670     if (linux_args.flags & LINUX_MAP_ANON)
671 	bsd_args.flags |= MAP_ANON;
672 
673 #ifndef VM_STACK
674     /* Linux Threads will map into the proc stack space, unless
675      * we prevent it.  This causes problems if we're not using
676      * our VM_STACK options.
677      */
678     if ((unsigned int)linux_args.addr + linux_args.len > (USRSTACK - MAXSSIZ))
679 	return (EINVAL);
680 #endif
681 
682     if (linux_args.flags & LINUX_MAP_GROWSDOWN) {
683 
684 #ifdef VM_STACK
685 	bsd_args.flags |= MAP_STACK;
686 #endif
687 
688 	/* The linux MAP_GROWSDOWN option does not limit auto
689 	 * growth of the region.  Linux mmap with this option
690 	 * takes as addr the inital BOS, and as len, the initial
691 	 * region size.  It can then grow down from addr without
692 	 * limit.  However, linux threads has an implicit internal
693 	 * limit to stack size of STACK_SIZE.  Its just not
694 	 * enforced explicitly in linux.  But, here we impose
695 	 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
696 	 * region, since we can do this with our mmap.
697 	 *
698 	 * Our mmap with MAP_STACK takes addr as the maximum
699 	 * downsize limit on BOS, and as len the max size of
700 	 * the region.  It them maps the top SGROWSIZ bytes,
701 	 * and autgrows the region down, up to the limit
702 	 * in addr.
703 	 *
704 	 * If we don't use the MAP_STACK option, the effect
705 	 * of this code is to allocate a stack region of a
706 	 * fixed size of (STACK_SIZE - GUARD_SIZE).
707 	 */
708 
709 	/* This gives us TOS */
710 	bsd_args.addr = linux_args.addr + linux_args.len;
711 
712 	/* This gives us our maximum stack size */
713 	if (linux_args.len > STACK_SIZE - GUARD_SIZE)
714 	    bsd_args.len = linux_args.len;
715 	else
716 	    bsd_args.len  = STACK_SIZE - GUARD_SIZE;
717 
718 	/* This gives us a new BOS.  If we're using VM_STACK, then
719 	 * mmap will just map the top SGROWSIZ bytes, and let
720 	 * the stack grow down to the limit at BOS.  If we're
721 	 * not using VM_STACK we map the full stack, since we
722 	 * don't have a way to autogrow it.
723 	 */
724 	bsd_args.addr -= bsd_args.len;
725 
726     } else {
727 	bsd_args.addr = linux_args.addr;
728 	bsd_args.len  = linux_args.len;
729     }
730 
731     bsd_args.prot = linux_args.prot | PROT_READ;	/* always required */
732     bsd_args.fd = linux_args.fd;
733     bsd_args.pos = linux_args.pos;
734     bsd_args.pad = 0;
735     return mmap(p, &bsd_args);
736 }
737 
738 int
739 linux_mremap(struct proc *p, struct linux_mremap_args *args)
740 {
741 	struct munmap_args /* {
742 		void *addr;
743 		size_t len;
744 	} */ bsd_args;
745 	int error = 0;
746 
747 #ifdef DEBUG
748 	printf("Linux-emul(%ld): mremap(%p, %08x, %08x, %08x)\n",
749 	    (long)p->p_pid, (void *)args->addr, args->old_len, args->new_len,
750 	    args->flags);
751 #endif
752 	args->new_len = round_page(args->new_len);
753 	args->old_len = round_page(args->old_len);
754 
755 	if (args->new_len > args->old_len) {
756 		p->p_retval[0] = 0;
757 		return ENOMEM;
758 	}
759 
760 	if (args->new_len < args->old_len) {
761 		bsd_args.addr = args->addr + args->new_len;
762 		bsd_args.len = args->old_len - args->new_len;
763 		error = munmap(p, &bsd_args);
764 	}
765 
766 	p->p_retval[0] = error ? 0 : (int)args->addr;
767 	return error;
768 }
769 
770 int
771 linux_msync(struct proc *p, struct linux_msync_args *args)
772 {
773 	struct msync_args bsd_args;
774 
775 	bsd_args.addr = args->addr;
776 	bsd_args.len = args->len;
777 	bsd_args.flags = 0;	/* XXX ignore */
778 
779 	return msync(p, &bsd_args);
780 }
781 
782 int
783 linux_pipe(struct proc *p, struct linux_pipe_args *args)
784 {
785     int error;
786     int reg_edx;
787 
788 #ifdef DEBUG
789     printf("Linux-emul(%d): pipe(*)\n", p->p_pid);
790 #endif
791     reg_edx = p->p_retval[1];
792     if (error = pipe(p, 0)) {
793 	p->p_retval[1] = reg_edx;
794 	return error;
795     }
796 
797     if (error = copyout(p->p_retval, args->pipefds, 2*sizeof(int))) {
798 	p->p_retval[1] = reg_edx;
799 	return error;
800     }
801 
802     p->p_retval[1] = reg_edx;
803     p->p_retval[0] = 0;
804     return 0;
805 }
806 
807 int
808 linux_time(struct proc *p, struct linux_time_args *args)
809 {
810     struct timeval tv;
811     linux_time_t tm;
812     int error;
813 
814 #ifdef DEBUG
815     printf("Linux-emul(%d): time(*)\n", p->p_pid);
816 #endif
817     microtime(&tv);
818     tm = tv.tv_sec;
819     if (args->tm && (error = copyout(&tm, args->tm, sizeof(linux_time_t))))
820 	return error;
821     p->p_retval[0] = tm;
822     return 0;
823 }
824 
/* Result buffer of linux_times(); all values are in clock ticks. */
struct linux_times_argv {
    long    tms_utime;
    long    tms_stime;
    long    tms_cutime;
    long    tms_cstime;
};

#define CLK_TCK 100	/* Linux uses 100 */

/*
 * Convert a struct timeval (or anything with tv_sec/tv_usec members)
 * to clock ticks of 1/CLK_TCK second, truncating.  The argument is
 * parenthesized so that expressions such as *ptr can be passed.
 */
#define CONVTCK(r)	((r).tv_sec * CLK_TCK + (r).tv_usec / (1000000 / CLK_TCK))
834 
835 int
836 linux_times(struct proc *p, struct linux_times_args *args)
837 {
838     struct timeval tv;
839     struct linux_times_argv tms;
840     struct rusage ru;
841     int error;
842 
843 #ifdef DEBUG
844     printf("Linux-emul(%d): times(*)\n", p->p_pid);
845 #endif
846     calcru(p, &ru.ru_utime, &ru.ru_stime, NULL);
847 
848     tms.tms_utime = CONVTCK(ru.ru_utime);
849     tms.tms_stime = CONVTCK(ru.ru_stime);
850 
851     tms.tms_cutime = CONVTCK(p->p_stats->p_cru.ru_utime);
852     tms.tms_cstime = CONVTCK(p->p_stats->p_cru.ru_stime);
853 
854     if ((error = copyout((caddr_t)&tms, (caddr_t)args->buf,
855 	    	    sizeof(struct linux_times_argv))))
856 	return error;
857 
858     microuptime(&tv);
859     p->p_retval[0] = (int)CONVTCK(tv);
860     return 0;
861 }
862 
863 /* XXX move */
864 struct linux_newuname_t {
865     char sysname[65];
866     char nodename[65];
867     char release[65];
868     char version[65];
869     char machine[65];
870     char domainname[65];
871 };
872 
873 int
874 linux_newuname(struct proc *p, struct linux_newuname_args *args)
875 {
876     struct linux_newuname_t linux_newuname;
877 
878 #ifdef DEBUG
879     printf("Linux-emul(%d): newuname(*)\n", p->p_pid);
880 #endif
881     bzero(&linux_newuname, sizeof(struct linux_newuname_t));
882     strncpy(linux_newuname.sysname, ostype,
883 	sizeof(linux_newuname.sysname) - 1);
884     strncpy(linux_newuname.nodename, hostname,
885 	sizeof(linux_newuname.nodename) - 1);
886     strncpy(linux_newuname.release, osrelease,
887 	sizeof(linux_newuname.release) - 1);
888     strncpy(linux_newuname.version, version,
889 	sizeof(linux_newuname.version) - 1);
890     strncpy(linux_newuname.machine, machine,
891 	sizeof(linux_newuname.machine) - 1);
892     strncpy(linux_newuname.domainname, domainname,
893 	sizeof(linux_newuname.domainname) - 1);
894     return (copyout((caddr_t)&linux_newuname, (caddr_t)args->buf,
895 	    	    sizeof(struct linux_newuname_t)));
896 }
897 
898 struct linux_utimbuf {
899 	linux_time_t l_actime;
900 	linux_time_t l_modtime;
901 };
902 
903 int
904 linux_utime(struct proc *p, struct linux_utime_args *args)
905 {
906     struct utimes_args /* {
907 	char	*path;
908 	struct	timeval *tptr;
909     } */ bsdutimes;
910     struct timeval tv[2], *tvp;
911     struct linux_utimbuf lut;
912     int error;
913     caddr_t sg;
914 
915     sg = stackgap_init();
916     CHECKALTEXIST(p, &sg, args->fname);
917 
918 #ifdef DEBUG
919     printf("Linux-emul(%d): utime(%s, *)\n", p->p_pid, args->fname);
920 #endif
921     if (args->times) {
922 	if ((error = copyin(args->times, &lut, sizeof lut)))
923 	    return error;
924 	tv[0].tv_sec = lut.l_actime;
925 	tv[0].tv_usec = 0;
926 	tv[1].tv_sec = lut.l_modtime;
927 	tv[1].tv_usec = 0;
928 	/* so that utimes can copyin */
929 	tvp = (struct timeval *)stackgap_alloc(&sg, sizeof(tv));
930 	if ((error = copyout(tv, tvp, sizeof(tv))))
931 	    return error;
932 	bsdutimes.tptr = tvp;
933     } else
934 	bsdutimes.tptr = NULL;
935 
936     bsdutimes.path = args->fname;
937     return utimes(p, &bsdutimes);
938 }
939 
940 #define __WCLONE 0x80000000
941 
942 int
943 linux_waitpid(struct proc *p, struct linux_waitpid_args *args)
944 {
945     struct wait_args /* {
946 	int pid;
947 	int *status;
948 	int options;
949 	struct	rusage *rusage;
950     } */ tmp;
951     int error, tmpstat;
952 
953 #ifdef DEBUG
954     printf("Linux-emul(%ld): waitpid(%d, %p, %d)\n",
955 	(long)p->p_pid, args->pid, (void *)args->status, args->options);
956 #endif
957     tmp.pid = args->pid;
958     tmp.status = args->status;
959     tmp.options = (args->options & (WNOHANG | WUNTRACED));
960     /* WLINUXCLONE should be equal to __WCLONE, but we make sure */
961     if (args->options & __WCLONE)
962 	tmp.options |= WLINUXCLONE;
963     tmp.rusage = NULL;
964 
965     if ((error = wait4(p, &tmp)) != 0)
966 	return error;
967 
968     if (args->status) {
969 	if ((error = copyin(args->status, &tmpstat, sizeof(int))) != 0)
970 	    return error;
971 	if (WIFSIGNALED(tmpstat))
972 	    tmpstat = (tmpstat & 0xffffff80) |
973 		      bsd_to_linux_signal[WTERMSIG(tmpstat)];
974 	else if (WIFSTOPPED(tmpstat))
975 	    tmpstat = (tmpstat & 0xffff00ff) |
976 		      (bsd_to_linux_signal[WSTOPSIG(tmpstat)]<<8);
977 	return copyout(&tmpstat, args->status, sizeof(int));
978     } else
979 	return 0;
980 }
981 
982 int
983 linux_wait4(struct proc *p, struct linux_wait4_args *args)
984 {
985     struct wait_args /* {
986 	int pid;
987 	int *status;
988 	int options;
989 	struct	rusage *rusage;
990     } */ tmp;
991     int error, tmpstat;
992 
993 #ifdef DEBUG
994     printf("Linux-emul(%ld): wait4(%d, %p, %d, %p)\n",
995 	(long)p->p_pid, args->pid, (void *)args->status, args->options,
996 	(void *)args->rusage);
997 #endif
998     tmp.pid = args->pid;
999     tmp.status = args->status;
1000     tmp.options = (args->options & (WNOHANG | WUNTRACED));
1001     /* WLINUXCLONE should be equal to __WCLONE, but we make sure */
1002     if (args->options & __WCLONE)
1003 	tmp.options |= WLINUXCLONE;
1004     tmp.rusage = args->rusage;
1005 
1006     if ((error = wait4(p, &tmp)) != 0)
1007 	return error;
1008 
1009     p->p_siglist &= ~sigmask(SIGCHLD);
1010 
1011     if (args->status) {
1012 	if ((error = copyin(args->status, &tmpstat, sizeof(int))) != 0)
1013 	    return error;
1014 	if (WIFSIGNALED(tmpstat))
1015 	    tmpstat = (tmpstat & 0xffffff80) |
1016 		  bsd_to_linux_signal[WTERMSIG(tmpstat)];
1017 	else if (WIFSTOPPED(tmpstat))
1018 	    tmpstat = (tmpstat & 0xffff00ff) |
1019 		  (bsd_to_linux_signal[WSTOPSIG(tmpstat)]<<8);
1020 	return copyout(&tmpstat, args->status, sizeof(int));
1021     } else
1022 	return 0;
1023 }
1024 
1025 int
1026 linux_mknod(struct proc *p, struct linux_mknod_args *args)
1027 {
1028 	caddr_t sg;
1029 	struct mknod_args bsd_mknod;
1030 	struct mkfifo_args bsd_mkfifo;
1031 
1032 	sg = stackgap_init();
1033 
1034 	CHECKALTCREAT(p, &sg, args->path);
1035 
1036 #ifdef DEBUG
1037 	printf("Linux-emul(%d): mknod(%s, %d, %d)\n",
1038 	   p->p_pid, args->path, args->mode, args->dev);
1039 #endif
1040 
1041 	if (args->mode & S_IFIFO) {
1042 		bsd_mkfifo.path = args->path;
1043 		bsd_mkfifo.mode = args->mode;
1044 		return mkfifo(p, &bsd_mkfifo);
1045 	} else {
1046 		bsd_mknod.path = args->path;
1047 		bsd_mknod.mode = args->mode;
1048 		bsd_mknod.dev = args->dev;
1049 		return mknod(p, &bsd_mknod);
1050 	}
1051 }
1052 
1053 /*
1054  * UGH! This is just about the dumbest idea I've ever heard!!
1055  */
1056 int
1057 linux_personality(struct proc *p, struct linux_personality_args *args)
1058 {
1059 #ifdef DEBUG
1060 	printf("Linux-emul(%d): personality(%d)\n",
1061 	   p->p_pid, args->per);
1062 #endif
1063 	if (args->per != 0)
1064 		return EINVAL;
1065 
1066 	/* Yes Jim, it's still a Linux... */
1067 	p->p_retval[0] = 0;
1068 	return 0;
1069 }
1070 
1071 /*
1072  * Wrappers for get/setitimer for debugging..
1073  */
1074 int
1075 linux_setitimer(struct proc *p, struct linux_setitimer_args *args)
1076 {
1077 	struct setitimer_args bsa;
1078 	struct itimerval foo;
1079 	int error;
1080 
1081 #ifdef DEBUG
1082 	printf("Linux-emul(%ld): setitimer(%p, %p)\n",
1083 	    (long)p->p_pid, (void *)args->itv, (void *)args->oitv);
1084 #endif
1085 	bsa.which = args->which;
1086 	bsa.itv = args->itv;
1087 	bsa.oitv = args->oitv;
1088 	if (args->itv) {
1089 	    if ((error = copyin((caddr_t)args->itv, (caddr_t)&foo,
1090 			sizeof(foo))))
1091 		return error;
1092 #ifdef DEBUG
1093 	    printf("setitimer: value: sec: %ld, usec: %ld\n",
1094 		foo.it_value.tv_sec, foo.it_value.tv_usec);
1095 	    printf("setitimer: interval: sec: %ld, usec: %ld\n",
1096 		foo.it_interval.tv_sec, foo.it_interval.tv_usec);
1097 #endif
1098 	}
1099 	return setitimer(p, &bsa);
1100 }
1101 
1102 int
1103 linux_getitimer(struct proc *p, struct linux_getitimer_args *args)
1104 {
1105 	struct getitimer_args bsa;
1106 #ifdef DEBUG
1107 	printf("Linux-emul(%ld): getitimer(%p)\n",
1108 	    (long)p->p_pid, (void *)args->itv);
1109 #endif
1110 	bsa.which = args->which;
1111 	bsa.itv = args->itv;
1112 	return getitimer(p, &bsa);
1113 }
1114 
1115 int
1116 linux_iopl(struct proc *p, struct linux_iopl_args *args)
1117 {
1118 	int error;
1119 
1120 	error = suser(p->p_ucred, &p->p_acflag);
1121 	if (error != 0)
1122 		return error;
1123 	if (securelevel > 0)
1124 		return EPERM;
1125 	p->p_md.md_regs->tf_eflags |= PSL_IOPL;
1126 	return 0;
1127 }
1128 
1129 int
1130 linux_nice(struct proc *p, struct linux_nice_args *args)
1131 {
1132 	struct setpriority_args	bsd_args;
1133 
1134 	bsd_args.which = PRIO_PROCESS;
1135 	bsd_args.who = 0;	/* current process */
1136 	bsd_args.prio = args->inc;
1137 	return setpriority(p, &bsd_args);
1138 }
1139 
1140 int
1141 linux_setgroups(p, uap)
1142      struct proc *p;
1143      struct linux_setgroups_args *uap;
1144 {
1145   struct pcred *pc = p->p_cred;
1146   linux_gid_t linux_gidset[NGROUPS];
1147   gid_t *bsd_gidset;
1148   int ngrp, error;
1149 
1150   if ((error = suser(pc->pc_ucred, &p->p_acflag)))
1151     return error;
1152 
1153   if (uap->gidsetsize > NGROUPS)
1154     return EINVAL;
1155 
1156   ngrp = uap->gidsetsize;
1157   pc->pc_ucred = crcopy(pc->pc_ucred);
1158   if (ngrp >= 1) {
1159     if ((error = copyin((caddr_t)uap->gidset,
1160                       (caddr_t)linux_gidset,
1161                         ngrp * sizeof(linux_gid_t))))
1162       return error;
1163 
1164     pc->pc_ucred->cr_ngroups = ngrp;
1165 
1166     bsd_gidset = pc->pc_ucred->cr_groups;
1167     ngrp--;
1168     while (ngrp >= 0) {
1169       bsd_gidset[ngrp] = linux_gidset[ngrp];
1170       ngrp--;
1171     }
1172   }
1173   else
1174     pc->pc_ucred->cr_ngroups = 1;
1175 
1176   setsugid(p);
1177   return 0;
1178 }
1179 
1180 int
1181 linux_getgroups(p, uap)
1182      struct proc *p;
1183      struct linux_getgroups_args *uap;
1184 {
1185   struct pcred *pc = p->p_cred;
1186   linux_gid_t linux_gidset[NGROUPS];
1187   gid_t *bsd_gidset;
1188   int ngrp, error;
1189 
1190   if ((ngrp = uap->gidsetsize) == 0) {
1191     p->p_retval[0] = pc->pc_ucred->cr_ngroups;
1192     return 0;
1193   }
1194 
1195   if (ngrp < pc->pc_ucred->cr_ngroups)
1196     return EINVAL;
1197 
1198   ngrp = 0;
1199   bsd_gidset = pc->pc_ucred->cr_groups;
1200   while (ngrp < pc->pc_ucred->cr_ngroups) {
1201     linux_gidset[ngrp] = bsd_gidset[ngrp];
1202     ngrp++;
1203   }
1204 
1205   if ((error = copyout((caddr_t)linux_gidset, (caddr_t)uap->gidset,
1206                        ngrp * sizeof(linux_gid_t))))
1207     return error;
1208 
1209   p->p_retval[0] = ngrp;
1210   return (0);
1211 }
1212