xref: /freebsd/sys/kern/sys_generic.c (revision 33b77e2decd50e53798014b70bf7ca3bdc4c0c7e)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $Id: sys_generic.c,v 1.32 1997/11/06 19:29:20 phk Exp $
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/ttycom.h>
50 #include <sys/fcntl.h>
51 #include <sys/file.h>
52 #include <sys/proc.h>
53 #include <sys/signalvar.h>
54 #include <sys/socketvar.h>
55 #include <sys/uio.h>
56 #include <sys/kernel.h>
57 #include <sys/malloc.h>
58 #include <sys/poll.h>
59 #include <sys/sysent.h>
60 #ifdef KTRACE
61 #include <sys/ktrace.h>
62 #endif
63 
/*
 * Malloc types used in this file: M_IOCTLOPS backs ioctl() argument
 * buffers too large for the on-stack buffer, M_SELECT backs select() bit
 * buffers too large for the on-stack array, and M_IOV (the only one not
 * declared static) backs readv()/writev() iovec arrays with more than
 * UIO_SMALLIOV entries.
 */
64 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
65 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
66 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
67 
/*
 * Forward declarations of the descriptor-scanning helpers called by
 * poll() and select() respectively; both are defined later in this file.
 */
68 static int	pollscan __P((struct proc *, struct pollfd *, int));
69 static int	selscan __P((struct proc *, fd_mask **, fd_mask **, int));
70 
71 /*
72  * Read system call.
73  */
74 #ifndef _SYS_SYSPROTO_H_
75 struct read_args {
76 	int	fd;
77 	char	*buf;
78 	u_int	nbyte;
79 };
80 #endif
81 /* ARGSUSED */
82 int
83 read(p, uap)
84 	struct proc *p;
85 	register struct read_args *uap;
86 {
87 	register struct file *fp;
88 	register struct filedesc *fdp = p->p_fd;
89 	struct uio auio;
90 	struct iovec aiov;
91 	long cnt, error = 0;
92 #ifdef KTRACE
93 	struct iovec ktriov;
94 #endif
95 
96 	if (((u_int)uap->fd) >= fdp->fd_nfiles ||
97 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
98 	    (fp->f_flag & FREAD) == 0)
99 		return (EBADF);
100 	aiov.iov_base = (caddr_t)uap->buf;
101 	aiov.iov_len = uap->nbyte;
102 	auio.uio_iov = &aiov;
103 	auio.uio_iovcnt = 1;
104 	auio.uio_offset = -1;
105 
106 	auio.uio_resid = uap->nbyte;
107 	if (auio.uio_resid < 0)
108 		return (EINVAL);
109 
110 	auio.uio_rw = UIO_READ;
111 	auio.uio_segflg = UIO_USERSPACE;
112 	auio.uio_procp = p;
113 #ifdef KTRACE
114 	/*
115 	 * if tracing, save a copy of iovec
116 	 */
117 	if (KTRPOINT(p, KTR_GENIO))
118 		ktriov = aiov;
119 #endif
120 	cnt = uap->nbyte;
121 	if ((error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred)))
122 		if (auio.uio_resid != cnt && (error == ERESTART ||
123 		    error == EINTR || error == EWOULDBLOCK))
124 			error = 0;
125 	cnt -= auio.uio_resid;
126 #ifdef KTRACE
127 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
128 		ktrgenio(p->p_tracep, uap->fd, UIO_READ, &ktriov, cnt, error);
129 #endif
130 	p->p_retval[0] = cnt;
131 	return (error);
132 }
133 
134 /*
135  * Scatter read system call.
136  */
137 #ifndef _SYS_SYSPROTO_H_
138 struct readv_args {
139 	int	fd;
140 	struct	iovec *iovp;
141 	u_int	iovcnt;
142 };
143 #endif
144 int
145 readv(p, uap)
146 	struct proc *p;
147 	register struct readv_args *uap;
148 {
149 	register struct file *fp;
150 	register struct filedesc *fdp = p->p_fd;
151 	struct uio auio;
152 	register struct iovec *iov;
153 	struct iovec *needfree;
154 	struct iovec aiov[UIO_SMALLIOV];
155 	long i, cnt, error = 0;
156 	u_int iovlen;
157 #ifdef KTRACE
158 	struct iovec *ktriov = NULL;
159 #endif
160 
161 	if (((u_int)uap->fd) >= fdp->fd_nfiles ||
162 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
163 	    (fp->f_flag & FREAD) == 0)
164 		return (EBADF);
165 	/* note: can't use iovlen until iovcnt is validated */
166 	iovlen = uap->iovcnt * sizeof (struct iovec);
167 	if (uap->iovcnt > UIO_SMALLIOV) {
168 		if (uap->iovcnt > UIO_MAXIOV)
169 			return (EINVAL);
170 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
171 		needfree = iov;
172 	} else {
173 		iov = aiov;
174 		needfree = NULL;
175 	}
176 	auio.uio_iov = iov;
177 	auio.uio_iovcnt = uap->iovcnt;
178 	auio.uio_rw = UIO_READ;
179 	auio.uio_segflg = UIO_USERSPACE;
180 	auio.uio_procp = p;
181 	auio.uio_offset = -1;
182 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
183 		goto done;
184 	auio.uio_resid = 0;
185 	for (i = 0; i < uap->iovcnt; i++) {
186 		auio.uio_resid += iov->iov_len;
187 		if (auio.uio_resid < 0) {
188 			error = EINVAL;
189 			goto done;
190 		}
191 		iov++;
192 	}
193 #ifdef KTRACE
194 	/*
195 	 * if tracing, save a copy of iovec
196 	 */
197 	if (KTRPOINT(p, KTR_GENIO))  {
198 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
199 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
200 	}
201 #endif
202 	cnt = auio.uio_resid;
203 	if ((error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred)))
204 		if (auio.uio_resid != cnt && (error == ERESTART ||
205 		    error == EINTR || error == EWOULDBLOCK))
206 			error = 0;
207 	cnt -= auio.uio_resid;
208 #ifdef KTRACE
209 	if (ktriov != NULL) {
210 		if (error == 0)
211 			ktrgenio(p->p_tracep, uap->fd, UIO_READ, ktriov,
212 			    cnt, error);
213 		FREE(ktriov, M_TEMP);
214 	}
215 #endif
216 	p->p_retval[0] = cnt;
217 done:
218 	if (needfree)
219 		FREE(needfree, M_IOV);
220 	return (error);
221 }
222 
223 /*
224  * Write system call
225  */
226 #ifndef _SYS_SYSPROTO_H_
227 struct write_args {
228 	int	fd;
229 	char	*buf;
230 	u_int	nbyte;
231 };
232 #endif
233 int
234 write(p, uap)
235 	struct proc *p;
236 	register struct write_args *uap;
237 {
238 	register struct file *fp;
239 	register struct filedesc *fdp = p->p_fd;
240 	struct uio auio;
241 	struct iovec aiov;
242 	long cnt, error = 0;
243 #ifdef KTRACE
244 	struct iovec ktriov;
245 #endif
246 
247 	if (((u_int)uap->fd) >= fdp->fd_nfiles ||
248 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
249 	    (fp->f_flag & FWRITE) == 0)
250 		return (EBADF);
251 	aiov.iov_base = (caddr_t)uap->buf;
252 	aiov.iov_len = uap->nbyte;
253 	auio.uio_iov = &aiov;
254 	auio.uio_iovcnt = 1;
255 	auio.uio_offset = -1;
256 	auio.uio_resid = uap->nbyte;
257 	auio.uio_rw = UIO_WRITE;
258 	auio.uio_segflg = UIO_USERSPACE;
259 	auio.uio_procp = p;
260 #ifdef KTRACE
261 	/*
262 	 * if tracing, save a copy of iovec
263 	 */
264 	if (KTRPOINT(p, KTR_GENIO))
265 		ktriov = aiov;
266 #endif
267 	cnt = uap->nbyte;
268 	if ((error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred))) {
269 		if (auio.uio_resid != cnt && (error == ERESTART ||
270 		    error == EINTR || error == EWOULDBLOCK))
271 			error = 0;
272 		if (error == EPIPE)
273 			psignal(p, SIGPIPE);
274 	}
275 	cnt -= auio.uio_resid;
276 #ifdef KTRACE
277 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
278 		ktrgenio(p->p_tracep, uap->fd, UIO_WRITE,
279 		    &ktriov, cnt, error);
280 #endif
281 	p->p_retval[0] = cnt;
282 	return (error);
283 }
284 
285 /*
286  * Gather write system call
287  */
288 #ifndef _SYS_SYSPROTO_H_
289 struct writev_args {
290 	int	fd;
291 	struct	iovec *iovp;
292 	u_int	iovcnt;
293 };
294 #endif
295 int
296 writev(p, uap)
297 	struct proc *p;
298 	register struct writev_args *uap;
299 {
300 	register struct file *fp;
301 	register struct filedesc *fdp = p->p_fd;
302 	struct uio auio;
303 	register struct iovec *iov;
304 	struct iovec *needfree;
305 	struct iovec aiov[UIO_SMALLIOV];
306 	long i, cnt, error = 0;
307 	u_int iovlen;
308 #ifdef KTRACE
309 	struct iovec *ktriov = NULL;
310 #endif
311 
312 	if (((u_int)uap->fd) >= fdp->fd_nfiles ||
313 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
314 	    (fp->f_flag & FWRITE) == 0)
315 		return (EBADF);
316 	/* note: can't use iovlen until iovcnt is validated */
317 	iovlen = uap->iovcnt * sizeof (struct iovec);
318 	if (uap->iovcnt > UIO_SMALLIOV) {
319 		if (uap->iovcnt > UIO_MAXIOV)
320 			return (EINVAL);
321 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
322 		needfree = iov;
323 	} else {
324 		iov = aiov;
325 		needfree = NULL;
326 	}
327 	auio.uio_iov = iov;
328 	auio.uio_iovcnt = uap->iovcnt;
329 	auio.uio_rw = UIO_WRITE;
330 	auio.uio_segflg = UIO_USERSPACE;
331 	auio.uio_procp = p;
332 	auio.uio_offset = -1;
333 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
334 		goto done;
335 	auio.uio_resid = 0;
336 	for (i = 0; i < uap->iovcnt; i++) {
337 		auio.uio_resid += iov->iov_len;
338 		if (auio.uio_resid < 0) {
339 			error = EINVAL;
340 			goto done;
341 		}
342 		iov++;
343 	}
344 #ifdef KTRACE
345 	/*
346 	 * if tracing, save a copy of iovec
347 	 */
348 	if (KTRPOINT(p, KTR_GENIO))  {
349 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
350 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
351 	}
352 #endif
353 	cnt = auio.uio_resid;
354 	if ((error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred))) {
355 		if (auio.uio_resid != cnt && (error == ERESTART ||
356 		    error == EINTR || error == EWOULDBLOCK))
357 			error = 0;
358 		if (error == EPIPE)
359 			psignal(p, SIGPIPE);
360 	}
361 	cnt -= auio.uio_resid;
362 #ifdef KTRACE
363 	if (ktriov != NULL) {
364 		if (error == 0)
365 			ktrgenio(p->p_tracep, uap->fd, UIO_WRITE,
366 				ktriov, cnt, error);
367 		FREE(ktriov, M_TEMP);
368 	}
369 #endif
370 	p->p_retval[0] = cnt;
371 done:
372 	if (needfree)
373 		FREE(needfree, M_IOV);
374 	return (error);
375 }
376 
377 /*
378  * Ioctl system call
379  */
380 #ifndef _SYS_SYSPROTO_H_
381 struct ioctl_args {
382 	int	fd;
383 	int	com;
384 	caddr_t	data;
385 };
386 #endif
387 /* ARGSUSED */
388 int
389 ioctl(p, uap)
390 	struct proc *p;
391 	register struct ioctl_args *uap;
392 {
393 	register struct file *fp;
394 	register struct filedesc *fdp;
395 	register int com, error;
396 	register u_int size;
397 	caddr_t data, memp;
398 	int tmp;
399 #define STK_PARAMS	128
400 	char stkbuf[STK_PARAMS];
401 
402 	fdp = p->p_fd;
403 	if ((u_int)uap->fd >= fdp->fd_nfiles ||
404 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
405 		return (EBADF);
406 
407 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
408 		return (EBADF);
409 
410 	switch (com = uap->com) {
411 	case FIONCLEX:
412 		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
413 		return (0);
414 	case FIOCLEX:
415 		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
416 		return (0);
417 	}
418 
419 	/*
420 	 * Interpret high order word to find amount of data to be
421 	 * copied to/from the user's address space.
422 	 */
423 	size = IOCPARM_LEN(com);
424 	if (size > IOCPARM_MAX)
425 		return (ENOTTY);
426 	memp = NULL;
427 	if (size > sizeof (stkbuf)) {
428 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
429 		data = memp;
430 	} else
431 		data = stkbuf;
432 	if (com&IOC_IN) {
433 		if (size) {
434 			error = copyin(uap->data, data, (u_int)size);
435 			if (error) {
436 				if (memp)
437 					free(memp, M_IOCTLOPS);
438 				return (error);
439 			}
440 		} else
441 			*(caddr_t *)data = uap->data;
442 	} else if ((com&IOC_OUT) && size)
443 		/*
444 		 * Zero the buffer so the user always
445 		 * gets back something deterministic.
446 		 */
447 		bzero(data, size);
448 	else if (com&IOC_VOID)
449 		*(caddr_t *)data = uap->data;
450 
451 	switch (com) {
452 
453 	case FIONBIO:
454 		if ((tmp = *(int *)data))
455 			fp->f_flag |= FNONBLOCK;
456 		else
457 			fp->f_flag &= ~FNONBLOCK;
458 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
459 		break;
460 
461 	case FIOASYNC:
462 		if ((tmp = *(int *)data))
463 			fp->f_flag |= FASYNC;
464 		else
465 			fp->f_flag &= ~FASYNC;
466 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
467 		break;
468 
469 	case FIOSETOWN:
470 		tmp = *(int *)data;
471 		if (fp->f_type == DTYPE_SOCKET) {
472 			((struct socket *)fp->f_data)->so_pgid = tmp;
473 			error = 0;
474 			break;
475 		}
476 		if (tmp <= 0) {
477 			tmp = -tmp;
478 		} else {
479 			struct proc *p1 = pfind(tmp);
480 			if (p1 == 0) {
481 				error = ESRCH;
482 				break;
483 			}
484 			tmp = p1->p_pgrp->pg_id;
485 		}
486 		error = (*fp->f_ops->fo_ioctl)
487 			(fp, (int)TIOCSPGRP, (caddr_t)&tmp, p);
488 		break;
489 
490 	case FIOGETOWN:
491 		if (fp->f_type == DTYPE_SOCKET) {
492 			error = 0;
493 			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
494 			break;
495 		}
496 		error = (*fp->f_ops->fo_ioctl)(fp, (int)TIOCGPGRP, data, p);
497 		*(int *)data = -*(int *)data;
498 		break;
499 
500 	default:
501 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
502 		/*
503 		 * Copy any data to user, size was
504 		 * already set and checked above.
505 		 */
506 		if (error == 0 && (com&IOC_OUT) && size)
507 			error = copyout(data, uap->data, (u_int)size);
508 		break;
509 	}
510 	if (memp)
511 		free(memp, M_IOCTLOPS);
512 	return (error);
513 }
514 
515 static int	nselcoll;
516 int	selwait;
517 
518 /*
519  * Select system call.
520  */
521 #ifndef _SYS_SYSPROTO_H_
522 struct select_args {
523 	int	nd;
524 	fd_set	*in, *ou, *ex;
525 	struct	timeval *tv;
526 };
527 #endif
528 int
529 select(p, uap)
530 	register struct proc *p;
531 	register struct select_args *uap;
532 {
533 	/*
534 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
535 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
536 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
537 	 * of 256.
538 	 */
539 	fd_mask s_selbits[howmany(2048, NFDBITS)];
540 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
541 	struct timeval atv;
542 	int s, ncoll, error, timo;
543 	u_int nbufbytes, ncpbytes, nfdbits;
544 
545 	if (uap->nd < 0)
546 		return (EINVAL);
547 	if (uap->nd > p->p_fd->fd_nfiles)
548 		uap->nd = p->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
549 
550 	/*
551 	 * Allocate just enough bits for the non-null fd_sets.  Use the
552 	 * preallocated auto buffer if possible.
553 	 */
554 	nfdbits = roundup(uap->nd, NFDBITS);
555 	ncpbytes = nfdbits / NBBY;
556 	nbufbytes = 0;
557 	if (uap->in != NULL)
558 		nbufbytes += 2 * ncpbytes;
559 	if (uap->ou != NULL)
560 		nbufbytes += 2 * ncpbytes;
561 	if (uap->ex != NULL)
562 		nbufbytes += 2 * ncpbytes;
563 	if (nbufbytes <= sizeof s_selbits)
564 		selbits = &s_selbits[0];
565 	else
566 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
567 
568 	/*
569 	 * Assign pointers into the bit buffers and fetch the input bits.
570 	 * Put the output buffers together so that they can be bzeroed
571 	 * together.
572 	 */
573 	sbp = selbits;
574 #define	getbits(name, x) \
575 	do {								\
576 		if (uap->name == NULL)					\
577 			ibits[x] = NULL;				\
578 		else {							\
579 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
580 			obits[x] = sbp;					\
581 			sbp += ncpbytes / sizeof *sbp;			\
582 			error = copyin(uap->name, ibits[x], ncpbytes);	\
583 			if (error != 0)					\
584 				goto done;				\
585 		}							\
586 	} while (0)
587 	getbits(in, 0);
588 	getbits(ou, 1);
589 	getbits(ex, 2);
590 #undef	getbits
591 	if (nbufbytes != 0)
592 		bzero(selbits, nbufbytes / 2);
593 
594 	if (uap->tv) {
595 		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
596 			sizeof (atv));
597 		if (error)
598 			goto done;
599 		if (itimerfix(&atv)) {
600 			error = EINVAL;
601 			goto done;
602 		}
603 		s = splclock();
604 		timevaladd(&atv, &time);
605 		timo = hzto(&atv);
606 		splx(s);
607 	} else
608 		timo = 0;
609 retry:
610 	ncoll = nselcoll;
611 	p->p_flag |= P_SELECT;
612 	error = selscan(p, ibits, obits, uap->nd);
613 	if (error || p->p_retval[0])
614 		goto done;
615 	s = splhigh();
616 	/* this should be timercmp(&time, &atv, >=) */
617 	if (uap->tv && (time.tv_sec > atv.tv_sec ||
618 	    (time.tv_sec == atv.tv_sec && time.tv_usec >= atv.tv_usec))) {
619 		splx(s);
620 		goto done;
621 	}
622 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
623 		splx(s);
624 		goto retry;
625 	}
626 	p->p_flag &= ~P_SELECT;
627 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
628 	splx(s);
629 	if (error == 0)
630 		goto retry;
631 done:
632 	p->p_flag &= ~P_SELECT;
633 	/* select is not restarted after signals... */
634 	if (error == ERESTART)
635 		error = EINTR;
636 	if (error == EWOULDBLOCK)
637 		error = 0;
638 #define	putbits(name, x) \
639 	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
640 		error = error2;
641 	if (error == 0) {
642 		int error2;
643 
644 		putbits(in, 0);
645 		putbits(ou, 1);
646 		putbits(ex, 2);
647 #undef putbits
648 	}
649 	if (selbits != &s_selbits[0])
650 		free(selbits, M_SELECT);
651 	return (error);
652 }
653 
654 static int
655 selscan(p, ibits, obits, nfd)
656 	struct proc *p;
657 	fd_mask **ibits, **obits;
658 	int nfd;
659 {
660 	register struct filedesc *fdp = p->p_fd;
661 	register int msk, i, j, fd;
662 	register fd_mask bits;
663 	struct file *fp;
664 	int n = 0;
665 	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
666 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
667 
668 	for (msk = 0; msk < 3; msk++) {
669 		if (ibits[msk] == NULL)
670 			continue;
671 		for (i = 0; i < nfd; i += NFDBITS) {
672 			bits = ibits[msk][i/NFDBITS];
673 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
674 				bits &= ~(1 << j);
675 				fp = fdp->fd_ofiles[fd];
676 				if (fp == NULL)
677 					return (EBADF);
678 				if ((*fp->f_ops->fo_poll)(fp, flag[msk],
679 				    fp->f_cred, p)) {
680 					obits[msk][(fd)/NFDBITS] |=
681 						(1 << ((fd) % NFDBITS));
682 					n++;
683 				}
684 			}
685 		}
686 	}
687 	p->p_retval[0] = n;
688 	return (0);
689 }
690 
691 /*
692  * Poll system call.
693  */
694 #ifndef _SYS_SYSPROTO_H_
695 struct poll_args {
696 	struct pollfd *fds;
697 	u_int	nfds;
698 	int	timeout;
699 };
700 #endif
701 int
702 poll(p, uap)
703 	register struct proc *p;
704 	register struct poll_args *uap;
705 {
706 	caddr_t bits;
707 	char smallbits[32 * sizeof(struct pollfd)];
708 	struct timeval atv;
709 	int s, ncoll, error = 0, timo;
710 	size_t ni;
711 
712 	if (SCARG(uap, nfds) > p->p_fd->fd_nfiles) {
713 		/* forgiving; slightly wrong */
714 		SCARG(uap, nfds) = p->p_fd->fd_nfiles;
715 	}
716 	ni = SCARG(uap, nfds) * sizeof(struct pollfd);
717 	if (ni > sizeof(smallbits))
718 		bits = malloc(ni, M_TEMP, M_WAITOK);
719 	else
720 		bits = smallbits;
721 	error = copyin(SCARG(uap, fds), bits, ni);
722 	if (error)
723 		goto done;
724 	if (SCARG(uap, timeout) != INFTIM) {
725 		atv.tv_sec = SCARG(uap, timeout) / 1000;
726 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
727 		if (itimerfix(&atv)) {
728 			error = EINVAL;
729 			goto done;
730 		}
731 		s = splclock();
732 		timevaladd(&atv, &time);
733 		timo = hzto(&atv);
734 		splx(s);
735 	} else
736 		timo = 0;
737 retry:
738 	ncoll = nselcoll;
739 	p->p_flag |= P_SELECT;
740 	error = pollscan(p, (struct pollfd *)bits, SCARG(uap, nfds));
741 	if (error || p->p_retval[0])
742 		goto done;
743 	s = splhigh();
744 	if (timo && timercmp(&time, &atv, >=)) {
745 		splx(s);
746 		goto done;
747 	}
748 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
749 		splx(s);
750 		goto retry;
751 	}
752 	p->p_flag &= ~P_SELECT;
753 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
754 	splx(s);
755 	if (error == 0)
756 		goto retry;
757 done:
758 	p->p_flag &= ~P_SELECT;
759 	/* poll is not restarted after signals... */
760 	if (error == ERESTART)
761 		error = EINTR;
762 	if (error == EWOULDBLOCK)
763 		error = 0;
764 	if (error == 0) {
765 		error = copyout(bits, SCARG(uap, fds), ni);
766 		if (error)
767 			goto out;
768 	}
769 out:
770 	if (ni > sizeof(smallbits))
771 		free(bits, M_TEMP);
772 	return (error);
773 }
774 
775 static int
776 pollscan(p, fds, nfd)
777 	struct proc *p;
778 	struct pollfd *fds;
779 	int nfd;
780 {
781 	register struct filedesc *fdp = p->p_fd;
782 	int i;
783 	struct file *fp;
784 	int n = 0;
785 
786 	for (i = 0; i < nfd; i++, fds++) {
787 		if ((u_int)fds->fd >= fdp->fd_nfiles) {
788 			fds->revents = POLLNVAL;
789 			n++;
790 		} else {
791 			fp = fdp->fd_ofiles[fds->fd];
792 			if (fp == 0) {
793 				fds->revents = POLLNVAL;
794 				n++;
795 			} else {
796 				/*
797 				 * Note: backend also returns POLLHUP and
798 				 * POLLERR if appropriate.
799 				 */
800 				fds->revents = (*fp->f_ops->fo_poll)(fp,
801 				    fds->events, fp->f_cred, p);
802 				if (fds->revents != 0)
803 					n++;
804 			}
805 		}
806 	}
807 	p->p_retval[0] = n;
808 	return (0);
809 }
810 
811 /*
812  * OpenBSD poll system call.
813  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
814  */
815 #ifndef _SYS_SYSPROTO_H_
816 struct openbsd_poll_args {
817 	struct pollfd *fds;
818 	u_int	nfds;
819 	int	timeout;
820 };
821 #endif
822 int
823 openbsd_poll(p, uap)
824 	register struct proc *p;
825 	register struct openbsd_poll_args *uap;
826 {
827 	return (poll(p, (struct poll_args *)uap));
828 }
829 
/*
 * Poll routine that reports normal reads and writes as always possible.
 *
 * Returns the subset of the requested events that lies within POLLIN,
 * POLLOUT, POLLRDNORM and POLLWRNORM; dev and p are not used.
 */
/*ARGSUSED*/
int
seltrue(dev, events, p)
	dev_t dev;
	int events;
	struct proc *p;
{
	int always_ready;

	always_ready = POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM;
	return (events & always_ready);
}
840 
841 /*
842  * Record a select request.
843  */
844 void
845 selrecord(selector, sip)
846 	struct proc *selector;
847 	struct selinfo *sip;
848 {
849 	struct proc *p;
850 	pid_t mypid;
851 
852 	mypid = selector->p_pid;
853 	if (sip->si_pid == mypid)
854 		return;
855 	if (sip->si_pid && (p = pfind(sip->si_pid)) &&
856 	    p->p_wchan == (caddr_t)&selwait)
857 		sip->si_flags |= SI_COLL;
858 	else
859 		sip->si_pid = mypid;
860 }
861 
862 /*
863  * Do a wakeup when a selectable event occurs.
864  */
865 void
866 selwakeup(sip)
867 	register struct selinfo *sip;
868 {
869 	register struct proc *p;
870 	int s;
871 
872 	if (sip->si_pid == 0)
873 		return;
874 	if (sip->si_flags & SI_COLL) {
875 		nselcoll++;
876 		sip->si_flags &= ~SI_COLL;
877 		wakeup((caddr_t)&selwait);
878 	}
879 	p = pfind(sip->si_pid);
880 	sip->si_pid = 0;
881 	if (p != NULL) {
882 		s = splhigh();
883 		if (p->p_wchan == (caddr_t)&selwait) {
884 			if (p->p_stat == SSLEEP)
885 				setrunnable(p);
886 			else
887 				unsleep(p);
888 		} else if (p->p_flag & P_SELECT)
889 			p->p_flag &= ~P_SELECT;
890 		splx(s);
891 	}
892 }
893