xref: /freebsd/sys/kern/sys_generic.c (revision ce4946daa5ce852d28008dac492029500ab2ee95)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $FreeBSD$
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/signalvar.h>
53 #include <sys/socketvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/resourcevar.h>
59 #include <sys/selinfo.h>
60 #include <sys/sysctl.h>
61 #include <sys/sysent.h>
62 #include <sys/bio.h>
63 #include <sys/buf.h>
64 #ifdef KTRACE
65 #include <sys/ktrace.h>
66 #endif
67 #include <vm/vm.h>
68 #include <vm/vm_page.h>
69 
70 #include <machine/limits.h>
71 
72 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
73 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
74 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
75 
76 static int	pollscan __P((struct proc *, struct pollfd *, u_int));
77 static int	selscan __P((struct proc *, fd_mask **, fd_mask **, int));
78 static int	dofileread __P((struct proc *, struct file *, int, void *,
79 		    size_t, off_t, int));
80 static int	dofilewrite __P((struct proc *, struct file *, int,
81 		    const void *, size_t, off_t, int));
82 
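/*
 * holdfp() looks up a descriptor in the given filedesc table and returns
 * the file pointer with an extra reference (fhold()), or NULL if the
 * descriptor is out of range, closed, or was not opened with the access
 * mode requested in 'flag' (FREAD/FWRITE).  The caller must release the
 * reference with fdrop() when it is done with the file.
 */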
83 struct file*
84 holdfp(fdp, fd, flag)
85 	struct filedesc* fdp;
86 	int fd, flag;
87 {
88 	struct file* fp;
89 
90 	if (((u_int)fd) >= fdp->fd_nfiles ||
91 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
92 	    (fp->f_flag & flag) == 0) {
93 		return (NULL);
94 	}
95 	fhold(fp);
96 	return (fp);
97 }
98 
99 /*
100  * Read system call.
101  */
102 #ifndef _SYS_SYSPROTO_H_
103 struct read_args {
104 	int	fd;
105 	void	*buf;
106 	size_t	nbyte;
107 };
108 #endif
109 int
110 read(p, uap)
111 	struct proc *p;
112 	register struct read_args *uap;
113 {
114 	register struct file *fp;
115 	int error;
116 
117 	if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL)
118 		return (EBADF);
119 	error = dofileread(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0);
120 	fdrop(fp, p);
121 	return(error);
122 }
123 
124 /*
125  * Pread system call
126  */
127 #ifndef _SYS_SYSPROTO_H_
128 struct pread_args {
129 	int	fd;
130 	void	*buf;
131 	size_t	nbyte;
132 	int	pad;
133 	off_t	offset;
134 };
135 #endif
136 int
137 pread(p, uap)
138 	struct proc *p;
139 	register struct pread_args *uap;
140 {
141 	register struct file *fp;
142 	int error;
143 
144 	if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL)
145 		return (EBADF);
146 	if (fp->f_type != DTYPE_VNODE) {
147 		error = ESPIPE;
148 	} else {
149 	    error = dofileread(p, fp, uap->fd, uap->buf, uap->nbyte,
150 		uap->offset, FOF_OFFSET);
151 	}
152 	fdrop(fp, p);
153 	return(error);
154 }
155 
156 /*
157  * Code common for read and pread
158  */
159 static int
160 dofileread(p, fp, fd, buf, nbyte, offset, flags)
161 	struct proc *p;
162 	struct file *fp;
163 	int fd, flags;
164 	void *buf;
165 	size_t nbyte;
166 	off_t offset;
167 {
168 	struct uio auio;
169 	struct iovec aiov;
170 	long cnt, error = 0;
171 #ifdef KTRACE
172 	struct iovec ktriov;
173 	struct uio ktruio;
174 	int didktr = 0;
175 #endif
176 
177 	aiov.iov_base = (caddr_t)buf;
178 	aiov.iov_len = nbyte;
179 	auio.uio_iov = &aiov;
180 	auio.uio_iovcnt = 1;
181 	auio.uio_offset = offset;
182 	if (nbyte > INT_MAX)
183 		return (EINVAL);
184 	auio.uio_resid = nbyte;
185 	auio.uio_rw = UIO_READ;
186 	auio.uio_segflg = UIO_USERSPACE;
187 	auio.uio_procp = p;
188 #ifdef KTRACE
189 	/*
190 	 * if tracing, save a copy of iovec
191 	 */
192 	if (KTRPOINT(p, KTR_GENIO)) {
193 		ktriov = aiov;
194 		ktruio = auio;
195 		didktr = 1;
196 	}
197 #endif
198 	cnt = nbyte;
199 
200 	if ((error = fo_read(fp, &auio, fp->f_cred, flags, p))) {
201 		if (auio.uio_resid != cnt && (error == ERESTART ||
202 		    error == EINTR || error == EWOULDBLOCK))
203 			error = 0;
204 	}
205 	cnt -= auio.uio_resid;
206 #ifdef KTRACE
207 	if (didktr && error == 0) {
208 		ktruio.uio_iov = &ktriov;
209 		ktruio.uio_resid = cnt;
210 		ktrgenio(p->p_tracep, fd, UIO_READ, &ktruio, error);
211 	}
212 #endif
213 	p->p_retval[0] = cnt;
214 	return (error);
215 }
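/*
 * Usage note (illustrative only, not part of the original file): because
 * dofileread() returns success with a short count when a partially
 * completed transfer is interrupted (ERESTART/EINTR/EWOULDBLOCK with
 * uio_resid != cnt), userland callers of read(2) usually loop until the
 * requested amount has arrived or a real error/EOF is seen.  A minimal
 * sketch of such a loop:
 *
 *	#include <errno.h>
 *	#include <unistd.h>
 *
 *	ssize_t
 *	read_full(int fd, void *buf, size_t len)
 *	{
 *		size_t off = 0;
 *		ssize_t n;
 *
 *		while (off < len) {
 *			n = read(fd, (char *)buf + off, len - off);
 *			if (n == 0)
 *				break;		/* EOF */
 *			if (n < 0) {
 *				if (errno == EINTR)
 *					continue;
 *				return (-1);
 *			}
 *			off += n;
 *		}
 *		return ((ssize_t)off);
 *	}
 */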
216 
217 /*
218  * Scatter read system call.
219  */
220 #ifndef _SYS_SYSPROTO_H_
221 struct readv_args {
222 	int	fd;
223 	struct	iovec *iovp;
224 	u_int	iovcnt;
225 };
226 #endif
227 int
228 readv(p, uap)
229 	struct proc *p;
230 	register struct readv_args *uap;
231 {
232 	register struct file *fp;
233 	register struct filedesc *fdp = p->p_fd;
234 	struct uio auio;
235 	register struct iovec *iov;
236 	struct iovec *needfree;
237 	struct iovec aiov[UIO_SMALLIOV];
238 	long i, cnt, error = 0;
239 	u_int iovlen;
240 #ifdef KTRACE
241 	struct iovec *ktriov = NULL;
242 	struct uio ktruio;
243 #endif
244 
245 	if ((fp = holdfp(fdp, uap->fd, FREAD)) == NULL)
246 		return (EBADF);
247 	/* note: can't use iovlen until iovcnt is validated */
248 	iovlen = uap->iovcnt * sizeof (struct iovec);
249 	if (uap->iovcnt > UIO_SMALLIOV) {
250 		if (uap->iovcnt > UIO_MAXIOV) {
			needfree = NULL;
			error = EINVAL;
			goto done;
		}
252 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
253 		needfree = iov;
254 	} else {
255 		iov = aiov;
256 		needfree = NULL;
257 	}
258 	auio.uio_iov = iov;
259 	auio.uio_iovcnt = uap->iovcnt;
260 	auio.uio_rw = UIO_READ;
261 	auio.uio_segflg = UIO_USERSPACE;
262 	auio.uio_procp = p;
263 	auio.uio_offset = -1;
264 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
265 		goto done;
266 	auio.uio_resid = 0;
267 	for (i = 0; i < uap->iovcnt; i++) {
268 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
269 			error = EINVAL;
270 			goto done;
271 		}
272 		auio.uio_resid += iov->iov_len;
273 		iov++;
274 	}
275 #ifdef KTRACE
276 	/*
277 	 * if tracing, save a copy of iovec
278 	 */
279 	if (KTRPOINT(p, KTR_GENIO))  {
280 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
281 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
282 		ktruio = auio;
283 	}
284 #endif
285 	cnt = auio.uio_resid;
286 	if ((error = fo_read(fp, &auio, fp->f_cred, 0, p))) {
287 		if (auio.uio_resid != cnt && (error == ERESTART ||
288 		    error == EINTR || error == EWOULDBLOCK))
289 			error = 0;
290 	}
291 	cnt -= auio.uio_resid;
292 #ifdef KTRACE
293 	if (ktriov != NULL) {
294 		if (error == 0) {
295 			ktruio.uio_iov = ktriov;
296 			ktruio.uio_resid = cnt;
297 			ktrgenio(p->p_tracep, uap->fd, UIO_READ, &ktruio,
298 			    error);
299 		}
300 		FREE(ktriov, M_TEMP);
301 	}
302 #endif
303 	p->p_retval[0] = cnt;
304 done:
305 	fdrop(fp, p);
306 	if (needfree)
307 		FREE(needfree, M_IOV);
308 	return (error);
309 }
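/*
 * Illustrative sketch (not from the original file): readv(2) as seen from
 * userland.  The per-iovec lengths are summed above and the request is
 * rejected with EINVAL if the total would exceed INT_MAX.
 *
 *	#include <sys/uio.h>
 *
 *	char hdr[16], body[4096];
 *	struct iovec iov[2];
 *	ssize_t n;
 *
 *	iov[0].iov_base = hdr;  iov[0].iov_len = sizeof(hdr);
 *	iov[1].iov_base = body; iov[1].iov_len = sizeof(body);
 *	n = readv(fd, iov, 2);		/* fd assumed open for reading */
 */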
310 
311 /*
312  * Write system call
313  */
314 #ifndef _SYS_SYSPROTO_H_
315 struct write_args {
316 	int	fd;
317 	const void *buf;
318 	size_t	nbyte;
319 };
320 #endif
321 int
322 write(p, uap)
323 	struct proc *p;
324 	register struct write_args *uap;
325 {
326 	register struct file *fp;
327 	int error;
328 
329 	if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL)
330 		return (EBADF);
331 	error = dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0);
332 	fdrop(fp, p);
333 	return(error);
334 }
335 
336 /*
337  * Pwrite system call
338  */
339 #ifndef _SYS_SYSPROTO_H_
340 struct pwrite_args {
341 	int	fd;
342 	const void *buf;
343 	size_t	nbyte;
344 	int	pad;
345 	off_t	offset;
346 };
347 #endif
348 int
349 pwrite(p, uap)
350 	struct proc *p;
351 	register struct pwrite_args *uap;
352 {
353 	register struct file *fp;
354 	int error;
355 
356 	if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL)
357 		return (EBADF);
358 	if (fp->f_type != DTYPE_VNODE) {
359 		error = ESPIPE;
360 	} else {
361 	    error = dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte,
362 		uap->offset, FOF_OFFSET);
363 	}
364 	fdrop(fp, p);
365 	return(error);
366 }
367 
368 static int
369 dofilewrite(p, fp, fd, buf, nbyte, offset, flags)
370 	struct proc *p;
371 	struct file *fp;
372 	int fd, flags;
373 	const void *buf;
374 	size_t nbyte;
375 	off_t offset;
376 {
377 	struct uio auio;
378 	struct iovec aiov;
379 	long cnt, error = 0;
380 #ifdef KTRACE
381 	struct iovec ktriov;
382 	struct uio ktruio;
383 	int didktr = 0;
384 #endif
385 
386 	aiov.iov_base = (void *)(uintptr_t)buf;
387 	aiov.iov_len = nbyte;
388 	auio.uio_iov = &aiov;
389 	auio.uio_iovcnt = 1;
390 	auio.uio_offset = offset;
391 	if (nbyte > INT_MAX)
392 		return (EINVAL);
393 	auio.uio_resid = nbyte;
394 	auio.uio_rw = UIO_WRITE;
395 	auio.uio_segflg = UIO_USERSPACE;
396 	auio.uio_procp = p;
397 #ifdef KTRACE
398 	/*
399 	 * if tracing, save a copy of iovec and uio
400 	 */
401 	if (KTRPOINT(p, KTR_GENIO)) {
402 		ktriov = aiov;
403 		ktruio = auio;
404 		didktr = 1;
405 	}
406 #endif
407 	cnt = nbyte;
408 	if (fp->f_type == DTYPE_VNODE)
409 		bwillwrite();
410 	if ((error = fo_write(fp, &auio, fp->f_cred, flags, p))) {
411 		if (auio.uio_resid != cnt && (error == ERESTART ||
412 		    error == EINTR || error == EWOULDBLOCK))
413 			error = 0;
414 		if (error == EPIPE) {
415 			PROC_LOCK(p);
416 			psignal(p, SIGPIPE);
417 			PROC_UNLOCK(p);
418 		}
419 	}
420 	cnt -= auio.uio_resid;
421 #ifdef KTRACE
422 	if (didktr && error == 0) {
423 		ktruio.uio_iov = &ktriov;
424 		ktruio.uio_resid = cnt;
425 		ktrgenio(p->p_tracep, fd, UIO_WRITE, &ktruio, error);
426 	}
427 #endif
428 	p->p_retval[0] = cnt;
429 	return (error);
430 }
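/*
 * Usage note (illustrative only, not part of the original file):
 * dofilewrite() turns EPIPE from the underlying object into a SIGPIPE
 * sent to the writing process (psignal() above), so userland code that
 * prefers to handle the error ignores the signal first.  For vnode-backed
 * files, bwillwrite() above may also block the caller briefly when the
 * buffer cache is heavily dirtied.
 *
 *	#include <signal.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	signal(SIGPIPE, SIG_IGN);
 *	if (write(fd, buf, len) == -1)
 *		perror("write");	/* EPIPE instead of sudden death */
 */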
431 
432 /*
433  * Gather write system call
434  */
435 #ifndef _SYS_SYSPROTO_H_
436 struct writev_args {
437 	int	fd;
438 	struct	iovec *iovp;
439 	u_int	iovcnt;
440 };
441 #endif
442 int
443 writev(p, uap)
444 	struct proc *p;
445 	register struct writev_args *uap;
446 {
447 	register struct file *fp;
448 	register struct filedesc *fdp = p->p_fd;
449 	struct uio auio;
450 	register struct iovec *iov;
451 	struct iovec *needfree;
452 	struct iovec aiov[UIO_SMALLIOV];
453 	long i, cnt, error = 0;
454 	u_int iovlen;
455 #ifdef KTRACE
456 	struct iovec *ktriov = NULL;
457 	struct uio ktruio;
458 #endif
459 
460 	if ((fp = holdfp(fdp, uap->fd, FWRITE)) == NULL)
461 		return (EBADF);
462 	/* note: can't use iovlen until iovcnt is validated */
463 	iovlen = uap->iovcnt * sizeof (struct iovec);
464 	if (uap->iovcnt > UIO_SMALLIOV) {
465 		if (uap->iovcnt > UIO_MAXIOV) {
466 			needfree = NULL;
467 			error = EINVAL;
468 			goto done;
469 		}
470 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
471 		needfree = iov;
472 	} else {
473 		iov = aiov;
474 		needfree = NULL;
475 	}
476 	auio.uio_iov = iov;
477 	auio.uio_iovcnt = uap->iovcnt;
478 	auio.uio_rw = UIO_WRITE;
479 	auio.uio_segflg = UIO_USERSPACE;
480 	auio.uio_procp = p;
481 	auio.uio_offset = -1;
482 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
483 		goto done;
484 	auio.uio_resid = 0;
485 	for (i = 0; i < uap->iovcnt; i++) {
486 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
487 			error = EINVAL;
488 			goto done;
489 		}
490 		auio.uio_resid += iov->iov_len;
491 		iov++;
492 	}
493 #ifdef KTRACE
494 	/*
495 	 * if tracing, save a copy of iovec and uio
496 	 */
497 	if (KTRPOINT(p, KTR_GENIO))  {
498 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
499 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
500 		ktruio = auio;
501 	}
502 #endif
503 	cnt = auio.uio_resid;
504 	if (fp->f_type == DTYPE_VNODE)
505 		bwillwrite();
506 	if ((error = fo_write(fp, &auio, fp->f_cred, 0, p))) {
507 		if (auio.uio_resid != cnt && (error == ERESTART ||
508 		    error == EINTR || error == EWOULDBLOCK))
509 			error = 0;
510 		if (error == EPIPE) {
511 			PROC_LOCK(p);
512 			psignal(p, SIGPIPE);
513 			PROC_UNLOCK(p);
514 		}
515 	}
516 	cnt -= auio.uio_resid;
517 #ifdef KTRACE
518 	if (ktriov != NULL) {
519 		if (error == 0) {
520 			ktruio.uio_iov = ktriov;
521 			ktruio.uio_resid = cnt;
522 			ktrgenio(p->p_tracep, uap->fd, UIO_WRITE, &ktruio,
523 			    error);
524 		}
525 		FREE(ktriov, M_TEMP);
526 	}
527 #endif
528 	p->p_retval[0] = cnt;
529 done:
530 	fdrop(fp, p);
531 	if (needfree)
532 		FREE(needfree, M_IOV);
533 	return (error);
534 }
535 
536 /*
537  * Ioctl system call
538  */
539 #ifndef _SYS_SYSPROTO_H_
540 struct ioctl_args {
541 	int	fd;
542 	u_long	com;
543 	caddr_t	data;
544 };
545 #endif
546 /* ARGSUSED */
547 int
548 ioctl(p, uap)
549 	struct proc *p;
550 	register struct ioctl_args *uap;
551 {
552 	register struct file *fp;
553 	register struct filedesc *fdp;
554 	register u_long com;
555 	int error;
556 	register u_int size;
557 	caddr_t data, memp;
558 	int tmp;
559 #define STK_PARAMS	128
560 	union {
561 	    char stkbuf[STK_PARAMS];
562 	    long align;
563 	} ubuf;
564 
565 	fdp = p->p_fd;
566 	if ((u_int)uap->fd >= fdp->fd_nfiles ||
567 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
568 		return (EBADF);
569 
570 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
571 		return (EBADF);
572 
573 	switch (com = uap->com) {
574 	case FIONCLEX:
575 		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
576 		return (0);
577 	case FIOCLEX:
578 		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
579 		return (0);
580 	}
581 
582 	/*
583 	 * Interpret high order word to find amount of data to be
584 	 * copied to/from the user's address space.
585 	 */
586 	size = IOCPARM_LEN(com);
587 	if (size > IOCPARM_MAX)
588 		return (ENOTTY);
589 
590 	fhold(fp);
591 
592 	memp = NULL;
593 	if (size > sizeof (ubuf.stkbuf)) {
594 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
595 		data = memp;
596 	} else {
597 		data = ubuf.stkbuf;
598 	}
599 	if (com&IOC_IN) {
600 		if (size) {
601 			error = copyin(uap->data, data, (u_int)size);
602 			if (error) {
603 				if (memp)
604 					free(memp, M_IOCTLOPS);
605 				fdrop(fp, p);
606 				return (error);
607 			}
608 		} else {
609 			*(caddr_t *)data = uap->data;
610 		}
611 	} else if ((com&IOC_OUT) && size) {
612 		/*
613 		 * Zero the buffer so the user always
614 		 * gets back something deterministic.
615 		 */
616 		bzero(data, size);
617 	} else if (com&IOC_VOID) {
618 		*(caddr_t *)data = uap->data;
619 	}
620 
621 	switch (com) {
622 
623 	case FIONBIO:
624 		if ((tmp = *(int *)data))
625 			fp->f_flag |= FNONBLOCK;
626 		else
627 			fp->f_flag &= ~FNONBLOCK;
628 		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, p);
629 		break;
630 
631 	case FIOASYNC:
632 		if ((tmp = *(int *)data))
633 			fp->f_flag |= FASYNC;
634 		else
635 			fp->f_flag &= ~FASYNC;
636 		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, p);
637 		break;
638 
639 	default:
640 		error = fo_ioctl(fp, com, data, p);
641 		/*
642 		 * Copy any data to user, size was
643 		 * already set and checked above.
644 		 */
645 		if (error == 0 && (com&IOC_OUT) && size)
646 			error = copyout(data, uap->data, (u_int)size);
647 		break;
648 	}
649 	if (memp)
650 		free(memp, M_IOCTLOPS);
651 	fdrop(fp, p);
652 	return (error);
653 }
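/*
 * Illustrative note (not part of the original file): the IOC_IN/IOC_OUT/
 * IOC_VOID direction bits and the IOCPARM_LEN() size decoded above come
 * from the _IO/_IOR/_IOW/_IOWR command macros in <sys/ioccom.h>.  For
 * example, FIONREAD is _IOR('f', 127, int), so the kernel zeroes a stack
 * buffer, calls the file's ioctl routine, and copies one int back out:
 *
 *	#include <sys/filio.h>
 *	#include <sys/ioctl.h>
 *
 *	int nread, on = 1;
 *
 *	ioctl(fd, FIONBIO, &on);	/* IOC_IN: int copied in */
 *	ioctl(fd, FIONREAD, &nread);	/* IOC_OUT: int copied out */
 */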
654 
655 static int	nselcoll;	/* Select collisions since boot */
656 int	selwait;
657 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
658 
659 /*
660  * Select system call.
661  */
662 #ifndef _SYS_SYSPROTO_H_
663 struct select_args {
664 	int	nd;
665 	fd_set	*in, *ou, *ex;
666 	struct	timeval *tv;
667 };
668 #endif
669 int
670 select(p, uap)
671 	register struct proc *p;
672 	register struct select_args *uap;
673 {
674 	/*
675 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
676 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
677 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
678 	 * of 256.
679 	 */
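	/*
	 * Worked example of the sizing: s_selbits holds 2048 bits, i.e.
	 * 256 bytes.  A single non-null set with nd == 1024 needs
	 * 2 * (1024 / NBBY) = 256 bytes (input half plus output half),
	 * which just fits; with the old FD_SETSIZE of 256, all three
	 * sets need 3 * 2 * (256 / NBBY) = 192 bytes.
	 */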
680 	fd_mask s_selbits[howmany(2048, NFDBITS)];
681 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
682 	struct timeval atv, rtv, ttv;
683 	int s, ncoll, error, timo;
684 	u_int nbufbytes, ncpbytes, nfdbits;
685 
686 	if (uap->nd < 0)
687 		return (EINVAL);
688 	if (uap->nd > p->p_fd->fd_nfiles)
689 		uap->nd = p->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
690 
691 	/*
692 	 * Allocate just enough bits for the non-null fd_sets.  Use the
693 	 * preallocated auto buffer if possible.
694 	 */
695 	nfdbits = roundup(uap->nd, NFDBITS);
696 	ncpbytes = nfdbits / NBBY;
697 	nbufbytes = 0;
698 	if (uap->in != NULL)
699 		nbufbytes += 2 * ncpbytes;
700 	if (uap->ou != NULL)
701 		nbufbytes += 2 * ncpbytes;
702 	if (uap->ex != NULL)
703 		nbufbytes += 2 * ncpbytes;
704 	if (nbufbytes <= sizeof s_selbits)
705 		selbits = &s_selbits[0];
706 	else
707 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
708 
709 	/*
710 	 * Assign pointers into the bit buffers and fetch the input bits.
711 	 * Put the output buffers together so that they can be bzeroed
712 	 * together.
713 	 */
714 	sbp = selbits;
715 #define	getbits(name, x) \
716 	do {								\
717 		if (uap->name == NULL)					\
718 			ibits[x] = NULL;				\
719 		else {							\
720 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
721 			obits[x] = sbp;					\
722 			sbp += ncpbytes / sizeof *sbp;			\
723 			error = copyin(uap->name, ibits[x], ncpbytes);	\
724 			if (error != 0)	{				\
725 				PROC_LOCK(p);				\
726 				goto done;				\
727 			}						\
728 		}							\
729 	} while (0)
730 	getbits(in, 0);
731 	getbits(ou, 1);
732 	getbits(ex, 2);
733 #undef	getbits
734 	if (nbufbytes != 0)
735 		bzero(selbits, nbufbytes / 2);
736 
737 	if (uap->tv) {
738 		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
739 			sizeof (atv));
740 		if (error) {
741 			PROC_LOCK(p);
742 			goto done;
743 		}
744 		if (itimerfix(&atv)) {
745 			error = EINVAL;
746 			PROC_LOCK(p);
747 			goto done;
748 		}
749 		getmicrouptime(&rtv);
750 		timevaladd(&atv, &rtv);
751 	} else {
752 		atv.tv_sec = 0;
753 		atv.tv_usec = 0;
754 	}
755 	timo = 0;
756 	PROC_LOCK(p);
757 retry:
758 	ncoll = nselcoll;
759 	p->p_flag |= P_SELECT;
760 	PROC_UNLOCK(p);
761 	error = selscan(p, ibits, obits, uap->nd);
762 	PROC_LOCK(p);
763 	if (error || p->p_retval[0])
764 		goto done;
765 	if (atv.tv_sec || atv.tv_usec) {
766 		getmicrouptime(&rtv);
767 		if (timevalcmp(&rtv, &atv, >=))
768 			goto done;
769 		ttv = atv;
770 		timevalsub(&ttv, &rtv);
771 		timo = ttv.tv_sec > 24 * 60 * 60 ?
772 		    24 * 60 * 60 * hz : tvtohz(&ttv);
773 	}
774 	s = splhigh();
775 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
776 		splx(s);
777 		goto retry;
778 	}
779 	p->p_flag &= ~P_SELECT;
780 
781 	error = msleep((caddr_t)&selwait, &p->p_mtx, PSOCK | PCATCH, "select",
782 	    timo);
783 
784 	splx(s);
785 	if (error == 0)
786 		goto retry;
787 done:
788 	p->p_flag &= ~P_SELECT;
789 	PROC_UNLOCK(p);
790 	/* select is not restarted after signals... */
791 	if (error == ERESTART)
792 		error = EINTR;
793 	if (error == EWOULDBLOCK)
794 		error = 0;
795 #define	putbits(name, x) \
796 	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
797 		error = error2;
798 	if (error == 0) {
799 		int error2;
800 
801 		putbits(in, 0);
802 		putbits(ou, 1);
803 		putbits(ex, 2);
804 #undef putbits
805 	}
806 	if (selbits != &s_selbits[0])
807 		free(selbits, M_SELECT);
808 	return (error);
809 }
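/*
 * Illustrative sketch (not from the original file): a typical userland
 * caller of select(2).  The timeout is relative; the code above converts
 * it to an absolute uptime deadline before sleeping.
 *
 *	#include <sys/types.h>
 *	#include <sys/time.h>
 *	#include <unistd.h>
 *
 *	fd_set rfds;
 *	struct timeval tv;
 *	int n;
 *
 *	FD_ZERO(&rfds);
 *	FD_SET(sock, &rfds);		/* 'sock' is an open descriptor */
 *	tv.tv_sec = 5;
 *	tv.tv_usec = 0;
 *	n = select(sock + 1, &rfds, NULL, NULL, &tv);
 *	if (n > 0 && FD_ISSET(sock, &rfds))
 *		;			/* sock is readable */
 */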
810 
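/*
 * Scan the requested descriptors.  For each bit set in a non-null input
 * set, poll the file for the corresponding event class (POLLRDNORM,
 * POLLWRNORM or POLLRDBAND) and set the matching bit in the output set if
 * the descriptor is ready.  The count of ready descriptors is returned in
 * p->p_retval[0]; a set bit that names a closed descriptor yields EBADF.
 */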
811 static int
812 selscan(p, ibits, obits, nfd)
813 	struct proc *p;
814 	fd_mask **ibits, **obits;
815 	int nfd;
816 {
817 	struct filedesc *fdp = p->p_fd;
818 	int msk, i, fd;
819 	fd_mask bits;
820 	struct file *fp;
821 	int n = 0;
822 	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
823 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
824 
825 	for (msk = 0; msk < 3; msk++) {
826 		if (ibits[msk] == NULL)
827 			continue;
828 		for (i = 0; i < nfd; i += NFDBITS) {
829 			bits = ibits[msk][i/NFDBITS];
830 			/* ffs(int mask) not portable, fd_mask is long */
831 			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
832 				if (!(bits & 1))
833 					continue;
834 				fp = fdp->fd_ofiles[fd];
835 				if (fp == NULL)
836 					return (EBADF);
837 				if (fo_poll(fp, flag[msk], fp->f_cred, p)) {
838 					obits[msk][(fd)/NFDBITS] |=
839 					    ((fd_mask)1 << ((fd) % NFDBITS));
840 					n++;
841 				}
842 			}
843 		}
844 	}
845 	p->p_retval[0] = n;
846 	return (0);
847 }
848 
849 /*
850  * Poll system call.
851  */
852 #ifndef _SYS_SYSPROTO_H_
853 struct poll_args {
854 	struct pollfd *fds;
855 	u_int	nfds;
856 	int	timeout;
857 };
858 #endif
859 int
860 poll(p, uap)
861 	struct proc *p;
862 	struct poll_args *uap;
863 {
864 	caddr_t bits;
865 	char smallbits[32 * sizeof(struct pollfd)];
866 	struct timeval atv, rtv, ttv;
867 	int s, ncoll, error = 0, timo;
868 	u_int nfds;
869 	size_t ni;
870 
871 	nfds = SCARG(uap, nfds);
872 	/*
873 	 * This is kinda bogus.  We have fd limits, but that is not
874 	 * really related to the size of the pollfd array.  Make sure
875 	 * we let the process use at least FD_SETSIZE entries and at
876 	 * least enough for the current limits.  We want to be reasonably
877 	 * safe, but not overly restrictive.
878 	 */
879 	if (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE)
880 		return (EINVAL);
881 	ni = nfds * sizeof(struct pollfd);
882 	if (ni > sizeof(smallbits))
883 		bits = malloc(ni, M_TEMP, M_WAITOK);
884 	else
885 		bits = smallbits;
886 	error = copyin(SCARG(uap, fds), bits, ni);
887 	PROC_LOCK(p);
888 	if (error)
889 		goto done;
890 	if (SCARG(uap, timeout) != INFTIM) {
891 		atv.tv_sec = SCARG(uap, timeout) / 1000;
892 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
893 		if (itimerfix(&atv)) {
894 			error = EINVAL;
895 			goto done;
896 		}
897 		getmicrouptime(&rtv);
898 		timevaladd(&atv, &rtv);
899 	} else {
900 		atv.tv_sec = 0;
901 		atv.tv_usec = 0;
902 	}
903 	timo = 0;
904 retry:
905 	ncoll = nselcoll;
906 	p->p_flag |= P_SELECT;
907 	PROC_UNLOCK(p);
908 	error = pollscan(p, (struct pollfd *)bits, nfds);
909 	PROC_LOCK(p);
910 	if (error || p->p_retval[0])
911 		goto done;
912 	if (atv.tv_sec || atv.tv_usec) {
913 		getmicrouptime(&rtv);
914 		if (timevalcmp(&rtv, &atv, >=))
915 			goto done;
916 		ttv = atv;
917 		timevalsub(&ttv, &rtv);
918 		timo = ttv.tv_sec > 24 * 60 * 60 ?
919 		    24 * 60 * 60 * hz : tvtohz(&ttv);
920 	}
921 	s = splhigh();
922 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
923 		splx(s);
924 		goto retry;
925 	}
926 	p->p_flag &= ~P_SELECT;
927 	error = msleep((caddr_t)&selwait, &p->p_mtx, PSOCK | PCATCH, "poll",
928 	    timo);
929 	splx(s);
930 	if (error == 0)
931 		goto retry;
932 done:
933 	p->p_flag &= ~P_SELECT;
934 	PROC_UNLOCK(p);
935 	/* poll is not restarted after signals... */
936 	if (error == ERESTART)
937 		error = EINTR;
938 	if (error == EWOULDBLOCK)
939 		error = 0;
940 	if (error == 0) {
941 		error = copyout(bits, SCARG(uap, fds), ni);
942 		if (error)
943 			goto out;
944 	}
945 out:
946 	if (ni > sizeof(smallbits))
947 		free(bits, M_TEMP);
948 	return (error);
949 }
950 
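/*
 * Scan the pollfd array.  Out-of-range or closed descriptors get
 * POLLNVAL, negative descriptors are ignored (revents = 0), and every
 * other entry is handed to the file's poll routine.  The number of
 * entries with non-zero revents is returned in p->p_retval[0].
 */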
951 static int
952 pollscan(p, fds, nfd)
953 	struct proc *p;
954 	struct pollfd *fds;
955 	u_int nfd;
956 {
957 	register struct filedesc *fdp = p->p_fd;
958 	int i;
959 	struct file *fp;
960 	int n = 0;
961 
962 	for (i = 0; i < nfd; i++, fds++) {
963 		if (fds->fd >= fdp->fd_nfiles) {
964 			fds->revents = POLLNVAL;
965 			n++;
966 		} else if (fds->fd < 0) {
967 			fds->revents = 0;
968 		} else {
969 			fp = fdp->fd_ofiles[fds->fd];
970 			if (fp == NULL) {
971 				fds->revents = POLLNVAL;
972 				n++;
973 			} else {
974 				/*
975 				 * Note: backend also returns POLLHUP and
976 				 * POLLERR if appropriate.
977 				 */
978 				fds->revents = fo_poll(fp, fds->events,
979 				    fp->f_cred, p);
980 				if (fds->revents != 0)
981 					n++;
982 			}
983 		}
984 	}
985 	p->p_retval[0] = n;
986 	return (0);
987 }
988 
989 /*
990  * OpenBSD poll system call.
991  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
992  */
993 #ifndef _SYS_SYSPROTO_H_
994 struct openbsd_poll_args {
995 	struct pollfd *fds;
996 	u_int	nfds;
997 	int	timeout;
998 };
999 #endif
1000 int
1001 openbsd_poll(p, uap)
1002 	register struct proc *p;
1003 	register struct openbsd_poll_args *uap;
1004 {
1005 	return (poll(p, (struct poll_args *)uap));
1006 }
1007 
1008 /*ARGSUSED*/
1009 int
1010 seltrue(dev, events, p)
1011 	dev_t dev;
1012 	int events;
1013 	struct proc *p;
1014 {
1015 
1016 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1017 }
1018 
1019 /*
1020  * Record a select request.
1021  */
1022 void
1023 selrecord(selector, sip)
1024 	struct proc *selector;
1025 	struct selinfo *sip;
1026 {
1027 	struct proc *p;
1028 	pid_t mypid;
1029 
1030 	mypid = selector->p_pid;
1031 	if (sip->si_pid == mypid)
1032 		return;
1033 	if (sip->si_pid && (p = pfind(sip->si_pid))) {
1034 		mtx_lock_spin(&sched_lock);
1035 		if (p->p_wchan == (caddr_t)&selwait) {
1036 			mtx_unlock_spin(&sched_lock);
1037 			PROC_UNLOCK(p);
1038 			sip->si_flags |= SI_COLL;
1039 			return;
1040 		}
1041 		mtx_unlock_spin(&sched_lock);
1042 		PROC_UNLOCK(p);
1043 	}
1044 	sip->si_pid = mypid;
1045 }
1046 
1047 /*
1048  * Do a wakeup when a selectable event occurs.
1049  */
1050 void
1051 selwakeup(sip)
1052 	register struct selinfo *sip;
1053 {
1054 	register struct proc *p;
1055 
1056 	if (sip->si_pid == 0)
1057 		return;
1058 	if (sip->si_flags & SI_COLL) {
1059 		nselcoll++;
1060 		sip->si_flags &= ~SI_COLL;
1061 		wakeup((caddr_t)&selwait);
1062 	}
1063 	p = pfind(sip->si_pid);
1064 	sip->si_pid = 0;
1065 	if (p != NULL) {
1066 		mtx_lock_spin(&sched_lock);
1067 		if (p->p_wchan == (caddr_t)&selwait) {
1068 			if (p->p_stat == SSLEEP)
1069 				setrunnable(p);
1070 			else
1071 				unsleep(p);
1072 		} else
1073 			p->p_flag &= ~P_SELECT;
1074 		mtx_unlock_spin(&sched_lock);
1075 		PROC_UNLOCK(p);
1076 	}
1077 }
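/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * the usual pattern for a device poll routine is to record the selecting
 * process with selrecord() when the device is not ready, and to call
 * selwakeup() from the code path that makes it ready.  The foo_* names
 * and softc layout below are invented for the example.
 *
 *	static int
 *	foo_poll(dev_t dev, int events, struct proc *p)
 *	{
 *		struct foo_softc *sc = dev->si_drv1;
 *		int revents = 0;
 *
 *		if (events & (POLLIN | POLLRDNORM)) {
 *			if (sc->sc_count > 0)
 *				revents |= events & (POLLIN | POLLRDNORM);
 *			else
 *				selrecord(p, &sc->sc_rsel);
 *		}
 *		return (revents);
 *	}
 *
 * ...and later, when input arrives (e.g. in the interrupt handler):
 *
 *		sc->sc_count += n;
 *		selwakeup(&sc->sc_rsel);
 */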
1078