xref: /freebsd/sys/kern/sys_generic.c (revision 2ad872c5794e4c26fdf6ed219ad3f09ca0d5304a)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $Id: sys_generic.c,v 1.42 1998/11/11 10:03:55 truckman Exp $
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/ttycom.h>
50 #include <sys/fcntl.h>
51 #include <sys/file.h>
52 #include <sys/proc.h>
53 #include <sys/signalvar.h>
54 #include <sys/socketvar.h>
55 #include <sys/uio.h>
56 #include <sys/kernel.h>
57 #include <sys/malloc.h>
58 #include <sys/poll.h>
59 #include <sys/sysent.h>
60 #ifdef KTRACE
61 #include <sys/ktrace.h>
62 #endif
63 
64 #include <machine/limits.h>
65 
/*
 * Malloc types used by this file:
 *   M_IOCTLOPS - ioctl() argument buffers too large for its on-stack buffer.
 *   M_SELECT   - select() descriptor bit buffers too large for its on-stack
 *                buffer.
 *   M_IOV      - readv()/writev() iovec arrays with more than UIO_SMALLIOV
 *                entries.  Not declared static, unlike the other two.
 */
66 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
67 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
68 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
69 
/*
 * Forward declarations of the file-local scanners that poll() and
 * select() call to test each requested descriptor.
 */
70 static int	pollscan __P((struct proc *, struct pollfd *, int));
71 static int	selscan __P((struct proc *, fd_mask **, fd_mask **, int));
72 
73 /*
74  * Read system call.
75  */
76 #ifndef _SYS_SYSPROTO_H_
77 struct read_args {
78 	int	fd;
79 	void	*buf;
80 	size_t	nbyte;
81 };
82 #endif
83 /* ARGSUSED */
84 int
85 read(p, uap)
86 	struct proc *p;
87 	register struct read_args *uap;
88 {
89 	register struct file *fp;
90 	register struct filedesc *fdp = p->p_fd;
91 	struct uio auio;
92 	struct iovec aiov;
93 	long cnt, error = 0;
94 #ifdef KTRACE
95 	struct iovec ktriov;
96 #endif
97 
98 	if (((u_int)uap->fd) >= fdp->fd_nfiles ||
99 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
100 	    (fp->f_flag & FREAD) == 0)
101 		return (EBADF);
102 	aiov.iov_base = (caddr_t)uap->buf;
103 	aiov.iov_len = uap->nbyte;
104 	auio.uio_iov = &aiov;
105 	auio.uio_iovcnt = 1;
106 	auio.uio_offset = -1;
107 	if (uap->nbyte > INT_MAX)
108 		return (EINVAL);
109 	auio.uio_resid = uap->nbyte;
110 	auio.uio_rw = UIO_READ;
111 	auio.uio_segflg = UIO_USERSPACE;
112 	auio.uio_procp = p;
113 #ifdef KTRACE
114 	/*
115 	 * if tracing, save a copy of iovec
116 	 */
117 	if (KTRPOINT(p, KTR_GENIO))
118 		ktriov = aiov;
119 #endif
120 	cnt = uap->nbyte;
121 	if ((error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred)))
122 		if (auio.uio_resid != cnt && (error == ERESTART ||
123 		    error == EINTR || error == EWOULDBLOCK))
124 			error = 0;
125 	cnt -= auio.uio_resid;
126 #ifdef KTRACE
127 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
128 		ktrgenio(p->p_tracep, uap->fd, UIO_READ, &ktriov, cnt, error);
129 #endif
130 	p->p_retval[0] = cnt;
131 	return (error);
132 }
133 
134 /*
135  * Scatter read system call.
136  */
137 #ifndef _SYS_SYSPROTO_H_
138 struct readv_args {
139 	int	fd;
140 	struct	iovec *iovp;
141 	u_int	iovcnt;
142 };
143 #endif
144 int
145 readv(p, uap)
146 	struct proc *p;
147 	register struct readv_args *uap;
148 {
149 	register struct file *fp;
150 	register struct filedesc *fdp = p->p_fd;
151 	struct uio auio;
152 	register struct iovec *iov;
153 	struct iovec *needfree;
154 	struct iovec aiov[UIO_SMALLIOV];
155 	long i, cnt, error = 0;
156 	u_int iovlen;
157 #ifdef KTRACE
158 	struct iovec *ktriov = NULL;
159 #endif
160 
161 	if (((u_int)uap->fd) >= fdp->fd_nfiles ||
162 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
163 	    (fp->f_flag & FREAD) == 0)
164 		return (EBADF);
165 	/* note: can't use iovlen until iovcnt is validated */
166 	iovlen = uap->iovcnt * sizeof (struct iovec);
167 	if (uap->iovcnt > UIO_SMALLIOV) {
168 		if (uap->iovcnt > UIO_MAXIOV)
169 			return (EINVAL);
170 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
171 		needfree = iov;
172 	} else {
173 		iov = aiov;
174 		needfree = NULL;
175 	}
176 	auio.uio_iov = iov;
177 	auio.uio_iovcnt = uap->iovcnt;
178 	auio.uio_rw = UIO_READ;
179 	auio.uio_segflg = UIO_USERSPACE;
180 	auio.uio_procp = p;
181 	auio.uio_offset = -1;
182 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
183 		goto done;
184 	auio.uio_resid = 0;
185 	for (i = 0; i < uap->iovcnt; i++) {
186 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
187 			error = EINVAL;
188 			goto done;
189 		}
190 		auio.uio_resid += iov->iov_len;
191 		iov++;
192 	}
193 #ifdef KTRACE
194 	/*
195 	 * if tracing, save a copy of iovec
196 	 */
197 	if (KTRPOINT(p, KTR_GENIO))  {
198 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
199 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
200 	}
201 #endif
202 	cnt = auio.uio_resid;
203 	if ((error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred)))
204 		if (auio.uio_resid != cnt && (error == ERESTART ||
205 		    error == EINTR || error == EWOULDBLOCK))
206 			error = 0;
207 	cnt -= auio.uio_resid;
208 #ifdef KTRACE
209 	if (ktriov != NULL) {
210 		if (error == 0)
211 			ktrgenio(p->p_tracep, uap->fd, UIO_READ, ktriov,
212 			    cnt, error);
213 		FREE(ktriov, M_TEMP);
214 	}
215 #endif
216 	p->p_retval[0] = cnt;
217 done:
218 	if (needfree)
219 		FREE(needfree, M_IOV);
220 	return (error);
221 }
222 
223 /*
224  * Write system call
225  */
226 #ifndef _SYS_SYSPROTO_H_
227 struct write_args {
228 	int	fd;
229 	const void *buf;
230 	size_t	nbyte;
231 };
232 #endif
233 int
234 write(p, uap)
235 	struct proc *p;
236 	register struct write_args *uap;
237 {
238 	register struct file *fp;
239 	register struct filedesc *fdp = p->p_fd;
240 	struct uio auio;
241 	struct iovec aiov;
242 	long cnt, error = 0;
243 #ifdef KTRACE
244 	struct iovec ktriov;
245 #endif
246 
247 	if (((u_int)uap->fd) >= fdp->fd_nfiles ||
248 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
249 	    (fp->f_flag & FWRITE) == 0)
250 		return (EBADF);
251 	aiov.iov_base = (caddr_t)uap->buf;
252 	aiov.iov_len = uap->nbyte;
253 	auio.uio_iov = &aiov;
254 	auio.uio_iovcnt = 1;
255 	auio.uio_offset = -1;
256 	if (uap->nbyte > INT_MAX)
257 		return (EINVAL);
258 	auio.uio_resid = uap->nbyte;
259 	auio.uio_rw = UIO_WRITE;
260 	auio.uio_segflg = UIO_USERSPACE;
261 	auio.uio_procp = p;
262 #ifdef KTRACE
263 	/*
264 	 * if tracing, save a copy of iovec
265 	 */
266 	if (KTRPOINT(p, KTR_GENIO))
267 		ktriov = aiov;
268 #endif
269 	cnt = uap->nbyte;
270 	if ((error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred))) {
271 		if (auio.uio_resid != cnt && (error == ERESTART ||
272 		    error == EINTR || error == EWOULDBLOCK))
273 			error = 0;
274 		if (error == EPIPE)
275 			psignal(p, SIGPIPE);
276 	}
277 	cnt -= auio.uio_resid;
278 #ifdef KTRACE
279 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
280 		ktrgenio(p->p_tracep, uap->fd, UIO_WRITE,
281 		    &ktriov, cnt, error);
282 #endif
283 	p->p_retval[0] = cnt;
284 	return (error);
285 }
286 
287 /*
288  * Gather write system call
289  */
290 #ifndef _SYS_SYSPROTO_H_
291 struct writev_args {
292 	int	fd;
293 	struct	iovec *iovp;
294 	u_int	iovcnt;
295 };
296 #endif
297 int
298 writev(p, uap)
299 	struct proc *p;
300 	register struct writev_args *uap;
301 {
302 	register struct file *fp;
303 	register struct filedesc *fdp = p->p_fd;
304 	struct uio auio;
305 	register struct iovec *iov;
306 	struct iovec *needfree;
307 	struct iovec aiov[UIO_SMALLIOV];
308 	long i, cnt, error = 0;
309 	u_int iovlen;
310 #ifdef KTRACE
311 	struct iovec *ktriov = NULL;
312 #endif
313 
314 	if (((u_int)uap->fd) >= fdp->fd_nfiles ||
315 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
316 	    (fp->f_flag & FWRITE) == 0)
317 		return (EBADF);
318 	/* note: can't use iovlen until iovcnt is validated */
319 	iovlen = uap->iovcnt * sizeof (struct iovec);
320 	if (uap->iovcnt > UIO_SMALLIOV) {
321 		if (uap->iovcnt > UIO_MAXIOV)
322 			return (EINVAL);
323 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
324 		needfree = iov;
325 	} else {
326 		iov = aiov;
327 		needfree = NULL;
328 	}
329 	auio.uio_iov = iov;
330 	auio.uio_iovcnt = uap->iovcnt;
331 	auio.uio_rw = UIO_WRITE;
332 	auio.uio_segflg = UIO_USERSPACE;
333 	auio.uio_procp = p;
334 	auio.uio_offset = -1;
335 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
336 		goto done;
337 	auio.uio_resid = 0;
338 	for (i = 0; i < uap->iovcnt; i++) {
339 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
340 			error = EINVAL;
341 			goto done;
342 		}
343 		auio.uio_resid += iov->iov_len;
344 		iov++;
345 	}
346 #ifdef KTRACE
347 	/*
348 	 * if tracing, save a copy of iovec
349 	 */
350 	if (KTRPOINT(p, KTR_GENIO))  {
351 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
352 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
353 	}
354 #endif
355 	cnt = auio.uio_resid;
356 	if ((error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred))) {
357 		if (auio.uio_resid != cnt && (error == ERESTART ||
358 		    error == EINTR || error == EWOULDBLOCK))
359 			error = 0;
360 		if (error == EPIPE)
361 			psignal(p, SIGPIPE);
362 	}
363 	cnt -= auio.uio_resid;
364 #ifdef KTRACE
365 	if (ktriov != NULL) {
366 		if (error == 0)
367 			ktrgenio(p->p_tracep, uap->fd, UIO_WRITE,
368 				ktriov, cnt, error);
369 		FREE(ktriov, M_TEMP);
370 	}
371 #endif
372 	p->p_retval[0] = cnt;
373 done:
374 	if (needfree)
375 		FREE(needfree, M_IOV);
376 	return (error);
377 }
378 
379 /*
380  * Ioctl system call
381  */
382 #ifndef _SYS_SYSPROTO_H_
383 struct ioctl_args {
384 	int	fd;
385 	u_long	com;
386 	caddr_t	data;
387 };
388 #endif
389 /* ARGSUSED */
390 int
391 ioctl(p, uap)
392 	struct proc *p;
393 	register struct ioctl_args *uap;
394 {
395 	register struct file *fp;
396 	register struct filedesc *fdp;
397 	register u_long com;
398 	int error;
399 	register u_int size;
400 	caddr_t data, memp;
401 	int tmp;
402 #define STK_PARAMS	128
403 	char stkbuf[STK_PARAMS];
404 
405 	fdp = p->p_fd;
406 	if ((u_int)uap->fd >= fdp->fd_nfiles ||
407 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
408 		return (EBADF);
409 
410 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
411 		return (EBADF);
412 
413 	switch (com = uap->com) {
414 	case FIONCLEX:
415 		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
416 		return (0);
417 	case FIOCLEX:
418 		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
419 		return (0);
420 	}
421 
422 	/*
423 	 * Interpret high order word to find amount of data to be
424 	 * copied to/from the user's address space.
425 	 */
426 	size = IOCPARM_LEN(com);
427 	if (size > IOCPARM_MAX)
428 		return (ENOTTY);
429 	memp = NULL;
430 	if (size > sizeof (stkbuf)) {
431 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
432 		data = memp;
433 	} else
434 		data = stkbuf;
435 	if (com&IOC_IN) {
436 		if (size) {
437 			error = copyin(uap->data, data, (u_int)size);
438 			if (error) {
439 				if (memp)
440 					free(memp, M_IOCTLOPS);
441 				return (error);
442 			}
443 		} else
444 			*(caddr_t *)data = uap->data;
445 	} else if ((com&IOC_OUT) && size)
446 		/*
447 		 * Zero the buffer so the user always
448 		 * gets back something deterministic.
449 		 */
450 		bzero(data, size);
451 	else if (com&IOC_VOID)
452 		*(caddr_t *)data = uap->data;
453 
454 	switch (com) {
455 
456 	case FIONBIO:
457 		if ((tmp = *(int *)data))
458 			fp->f_flag |= FNONBLOCK;
459 		else
460 			fp->f_flag &= ~FNONBLOCK;
461 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
462 		break;
463 
464 	case FIOASYNC:
465 		if ((tmp = *(int *)data))
466 			fp->f_flag |= FASYNC;
467 		else
468 			fp->f_flag &= ~FASYNC;
469 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
470 		break;
471 
472 	default:
473 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
474 		/*
475 		 * Copy any data to user, size was
476 		 * already set and checked above.
477 		 */
478 		if (error == 0 && (com&IOC_OUT) && size)
479 			error = copyout(data, uap->data, (u_int)size);
480 		break;
481 	}
482 	if (memp)
483 		free(memp, M_IOCTLOPS);
484 	return (error);
485 }
486 
/*
 * nselcoll is incremented by selwakeup() whenever it finds SI_COLL set
 * on a selinfo; select() and poll() sample it before scanning and
 * rescan instead of sleeping if it has changed.
 * The address of selwait is the wait channel select() and poll() sleep
 * on and selwakeup() wakes; its value is not read in this file.
 */
487 static int	nselcoll;
488 int	selwait;
489 
490 /*
491  * Select system call.
492  */
493 #ifndef _SYS_SYSPROTO_H_
494 struct select_args {
495 	int	nd;
496 	fd_set	*in, *ou, *ex;
497 	struct	timeval *tv;
498 };
499 #endif
500 int
501 select(p, uap)
502 	register struct proc *p;
503 	register struct select_args *uap;
504 {
505 	/*
506 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
507 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
508 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
509 	 * of 256.
510 	 */
511 	fd_mask s_selbits[howmany(2048, NFDBITS)];
512 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
513 	struct timeval atv, rtv, ttv;
514 	int s, ncoll, error, timo;
515 	u_int nbufbytes, ncpbytes, nfdbits;
516 
517 	if (uap->nd < 0)
518 		return (EINVAL);
519 	if (uap->nd > p->p_fd->fd_nfiles)
520 		uap->nd = p->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
521 
522 	/*
523 	 * Allocate just enough bits for the non-null fd_sets.  Use the
524 	 * preallocated auto buffer if possible.
525 	 */
526 	nfdbits = roundup(uap->nd, NFDBITS);
527 	ncpbytes = nfdbits / NBBY;
528 	nbufbytes = 0;
529 	if (uap->in != NULL)
530 		nbufbytes += 2 * ncpbytes;
531 	if (uap->ou != NULL)
532 		nbufbytes += 2 * ncpbytes;
533 	if (uap->ex != NULL)
534 		nbufbytes += 2 * ncpbytes;
535 	if (nbufbytes <= sizeof s_selbits)
536 		selbits = &s_selbits[0];
537 	else
538 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
539 
540 	/*
541 	 * Assign pointers into the bit buffers and fetch the input bits.
542 	 * Put the output buffers together so that they can be bzeroed
543 	 * together.
544 	 */
545 	sbp = selbits;
546 #define	getbits(name, x) \
547 	do {								\
548 		if (uap->name == NULL)					\
549 			ibits[x] = NULL;				\
550 		else {							\
551 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
552 			obits[x] = sbp;					\
553 			sbp += ncpbytes / sizeof *sbp;			\
554 			error = copyin(uap->name, ibits[x], ncpbytes);	\
555 			if (error != 0)					\
556 				goto done;				\
557 		}							\
558 	} while (0)
559 	getbits(in, 0);
560 	getbits(ou, 1);
561 	getbits(ex, 2);
562 #undef	getbits
563 	if (nbufbytes != 0)
564 		bzero(selbits, nbufbytes / 2);
565 
566 	if (uap->tv) {
567 		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
568 			sizeof (atv));
569 		if (error)
570 			goto done;
571 		if (itimerfix(&atv)) {
572 			error = EINVAL;
573 			goto done;
574 		}
575 		getmicrouptime(&rtv);
576 		timevaladd(&atv, &rtv);
577 	} else
578 		atv.tv_sec = 0;
579 	timo = 0;
580 retry:
581 	ncoll = nselcoll;
582 	p->p_flag |= P_SELECT;
583 	error = selscan(p, ibits, obits, uap->nd);
584 	if (error || p->p_retval[0])
585 		goto done;
586 	if (atv.tv_sec) {
587 		getmicrouptime(&rtv);
588 		if (timevalcmp(&rtv, &atv, >=))
589 			goto done;
590 		ttv = atv;
591 		timevalsub(&ttv, &rtv);
592 		timo = ttv.tv_sec > 24 * 60 * 60 ?
593 		    24 * 60 * 60 * hz : tvtohz(&ttv);
594 	}
595 	s = splhigh();
596 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
597 		splx(s);
598 		goto retry;
599 	}
600 	p->p_flag &= ~P_SELECT;
601 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
602 	splx(s);
603 	if (error == 0)
604 		goto retry;
605 done:
606 	p->p_flag &= ~P_SELECT;
607 	/* select is not restarted after signals... */
608 	if (error == ERESTART)
609 		error = EINTR;
610 	if (error == EWOULDBLOCK)
611 		error = 0;
612 #define	putbits(name, x) \
613 	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
614 		error = error2;
615 	if (error == 0) {
616 		int error2;
617 
618 		putbits(in, 0);
619 		putbits(ou, 1);
620 		putbits(ex, 2);
621 #undef putbits
622 	}
623 	if (selbits != &s_selbits[0])
624 		free(selbits, M_SELECT);
625 	return (error);
626 }
627 
628 static int
629 selscan(p, ibits, obits, nfd)
630 	struct proc *p;
631 	fd_mask **ibits, **obits;
632 	int nfd;
633 {
634 	register struct filedesc *fdp = p->p_fd;
635 	register int msk, i, j, fd;
636 	register fd_mask bits;
637 	struct file *fp;
638 	int n = 0;
639 	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
640 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
641 
642 	for (msk = 0; msk < 3; msk++) {
643 		if (ibits[msk] == NULL)
644 			continue;
645 		for (i = 0; i < nfd; i += NFDBITS) {
646 			bits = ibits[msk][i/NFDBITS];
647 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
648 				bits &= ~(1 << j);
649 				fp = fdp->fd_ofiles[fd];
650 				if (fp == NULL)
651 					return (EBADF);
652 				if ((*fp->f_ops->fo_poll)(fp, flag[msk],
653 				    fp->f_cred, p)) {
654 					obits[msk][(fd)/NFDBITS] |=
655 						(1 << ((fd) % NFDBITS));
656 					n++;
657 				}
658 			}
659 		}
660 	}
661 	p->p_retval[0] = n;
662 	return (0);
663 }
664 
665 /*
666  * Poll system call.
667  */
668 #ifndef _SYS_SYSPROTO_H_
669 struct poll_args {
670 	struct pollfd *fds;
671 	u_int	nfds;
672 	int	timeout;
673 };
674 #endif
675 int
676 poll(p, uap)
677 	register struct proc *p;
678 	register struct poll_args *uap;
679 {
680 	caddr_t bits;
681 	char smallbits[32 * sizeof(struct pollfd)];
682 	struct timeval atv, rtv, ttv;
683 	int s, ncoll, error = 0, timo;
684 	size_t ni;
685 
686 	if (SCARG(uap, nfds) > p->p_fd->fd_nfiles) {
687 		/* forgiving; slightly wrong */
688 		SCARG(uap, nfds) = p->p_fd->fd_nfiles;
689 	}
690 	ni = SCARG(uap, nfds) * sizeof(struct pollfd);
691 	if (ni > sizeof(smallbits))
692 		bits = malloc(ni, M_TEMP, M_WAITOK);
693 	else
694 		bits = smallbits;
695 	error = copyin(SCARG(uap, fds), bits, ni);
696 	if (error)
697 		goto done;
698 	if (SCARG(uap, timeout) != INFTIM) {
699 		atv.tv_sec = SCARG(uap, timeout) / 1000;
700 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
701 		if (itimerfix(&atv)) {
702 			error = EINVAL;
703 			goto done;
704 		}
705 		getmicrouptime(&rtv);
706 		timevaladd(&atv, &rtv);
707 	} else
708 		atv.tv_sec = 0;
709 	timo = 0;
710 retry:
711 	ncoll = nselcoll;
712 	p->p_flag |= P_SELECT;
713 	error = pollscan(p, (struct pollfd *)bits, SCARG(uap, nfds));
714 	if (error || p->p_retval[0])
715 		goto done;
716 	if (atv.tv_sec) {
717 		getmicrouptime(&rtv);
718 		if (timevalcmp(&rtv, &atv, >=))
719 			goto done;
720 		ttv = atv;
721 		timevalsub(&ttv, &rtv);
722 		timo = ttv.tv_sec > 24 * 60 * 60 ?
723 		    24 * 60 * 60 * hz : tvtohz(&ttv);
724 	}
725 	s = splhigh();
726 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
727 		splx(s);
728 		goto retry;
729 	}
730 	p->p_flag &= ~P_SELECT;
731 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
732 	splx(s);
733 	if (error == 0)
734 		goto retry;
735 done:
736 	p->p_flag &= ~P_SELECT;
737 	/* poll is not restarted after signals... */
738 	if (error == ERESTART)
739 		error = EINTR;
740 	if (error == EWOULDBLOCK)
741 		error = 0;
742 	if (error == 0) {
743 		error = copyout(bits, SCARG(uap, fds), ni);
744 		if (error)
745 			goto out;
746 	}
747 out:
748 	if (ni > sizeof(smallbits))
749 		free(bits, M_TEMP);
750 	return (error);
751 }
752 
753 static int
754 pollscan(p, fds, nfd)
755 	struct proc *p;
756 	struct pollfd *fds;
757 	int nfd;
758 {
759 	register struct filedesc *fdp = p->p_fd;
760 	int i;
761 	struct file *fp;
762 	int n = 0;
763 
764 	for (i = 0; i < nfd; i++, fds++) {
765 		if (fds->fd >= fdp->fd_nfiles) {
766 			fds->revents = POLLNVAL;
767 			n++;
768 		} else if (fds->fd < 0) {
769 			fds->revents = 0;
770 		} else {
771 			fp = fdp->fd_ofiles[fds->fd];
772 			if (fp == 0) {
773 				fds->revents = POLLNVAL;
774 				n++;
775 			} else {
776 				/*
777 				 * Note: backend also returns POLLHUP and
778 				 * POLLERR if appropriate.
779 				 */
780 				fds->revents = (*fp->f_ops->fo_poll)(fp,
781 				    fds->events, fp->f_cred, p);
782 				if (fds->revents != 0)
783 					n++;
784 			}
785 		}
786 	}
787 	p->p_retval[0] = n;
788 	return (0);
789 }
790 
791 /*
792  * OpenBSD poll system call.
793  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
794  */
795 #ifndef _SYS_SYSPROTO_H_
796 struct openbsd_poll_args {
797 	struct pollfd *fds;
798 	u_int	nfds;
799 	int	timeout;
800 };
801 #endif
802 int
803 openbsd_poll(p, uap)
804 	register struct proc *p;
805 	register struct openbsd_poll_args *uap;
806 {
807 	return (poll(p, (struct poll_args *)uap));
808 }
809 
810 /*ARGSUSED*/
811 int
812 seltrue(dev, events, p)
813 	dev_t dev;
814 	int events;
815 	struct proc *p;
816 {
817 
818 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
819 }
820 
821 /*
822  * Record a select request.
823  */
824 void
825 selrecord(selector, sip)
826 	struct proc *selector;
827 	struct selinfo *sip;
828 {
829 	struct proc *p;
830 	pid_t mypid;
831 
832 	mypid = selector->p_pid;
833 	if (sip->si_pid == mypid)
834 		return;
835 	if (sip->si_pid && (p = pfind(sip->si_pid)) &&
836 	    p->p_wchan == (caddr_t)&selwait)
837 		sip->si_flags |= SI_COLL;
838 	else
839 		sip->si_pid = mypid;
840 }
841 
842 /*
843  * Do a wakeup when a selectable event occurs.
844  */
845 void
846 selwakeup(sip)
847 	register struct selinfo *sip;
848 {
849 	register struct proc *p;
850 	int s;
851 
852 	if (sip->si_pid == 0)
853 		return;
854 	if (sip->si_flags & SI_COLL) {
855 		nselcoll++;
856 		sip->si_flags &= ~SI_COLL;
857 		wakeup((caddr_t)&selwait);
858 	}
859 	p = pfind(sip->si_pid);
860 	sip->si_pid = 0;
861 	if (p != NULL) {
862 		s = splhigh();
863 		if (p->p_wchan == (caddr_t)&selwait) {
864 			if (p->p_stat == SSLEEP)
865 				setrunnable(p);
866 			else
867 				unsleep(p);
868 		} else if (p->p_flag & P_SELECT)
869 			p->p_flag &= ~P_SELECT;
870 		splx(s);
871 	}
872 }
873