xref: /freebsd/sys/kern/sys_generic.c (revision 02f2e93b60c2b91feac8f45c4c889a5a8e40d8a2)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $Id: sys_generic.c,v 1.30 1997/10/11 18:31:24 phk Exp $
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/ttycom.h>
50 #include <sys/fcntl.h>
51 #include <sys/file.h>
52 #include <sys/proc.h>
53 #include <sys/signalvar.h>
54 #include <sys/socketvar.h>
55 #include <sys/uio.h>
56 #include <sys/kernel.h>
57 #include <sys/malloc.h>
58 #include <sys/poll.h>
59 #include <sys/sysent.h>
60 #ifdef KTRACE
61 #include <sys/ktrace.h>
62 #endif
63 
64 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
65 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
66 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
67 
68 static int	selscan __P((struct proc *, fd_mask **, fd_mask **, int, int *));
69 static int	pollscan __P((struct proc *, struct pollfd *, int, int *));
70 
71 /*
72  * Read system call.
73  */
74 #ifndef _SYS_SYSPROTO_H_
75 struct read_args {
76 	int	fd;
77 	char	*buf;
78 	u_int	nbyte;
79 };
80 #endif
81 /* ARGSUSED */
82 int
83 read(p, uap, retval)
84 	struct proc *p;
85 	register struct read_args *uap;
86 	int *retval;
87 {
88 	register struct file *fp;
89 	register struct filedesc *fdp = p->p_fd;
90 	struct uio auio;
91 	struct iovec aiov;
92 	long cnt, error = 0;
93 #ifdef KTRACE
94 	struct iovec ktriov;
95 #endif
96 
97 	if (((u_int)uap->fd) >= fdp->fd_nfiles ||
98 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
99 	    (fp->f_flag & FREAD) == 0)
100 		return (EBADF);
101 	aiov.iov_base = (caddr_t)uap->buf;
102 	aiov.iov_len = uap->nbyte;
103 	auio.uio_iov = &aiov;
104 	auio.uio_iovcnt = 1;
105 	auio.uio_offset = -1;
106 
107 	auio.uio_resid = uap->nbyte;
108 	if (auio.uio_resid < 0)
109 		return (EINVAL);
110 
111 	auio.uio_rw = UIO_READ;
112 	auio.uio_segflg = UIO_USERSPACE;
113 	auio.uio_procp = p;
114 #ifdef KTRACE
115 	/*
116 	 * if tracing, save a copy of iovec
117 	 */
118 	if (KTRPOINT(p, KTR_GENIO))
119 		ktriov = aiov;
120 #endif
121 	cnt = uap->nbyte;
122 	if ((error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred)))
123 		if (auio.uio_resid != cnt && (error == ERESTART ||
124 		    error == EINTR || error == EWOULDBLOCK))
125 			error = 0;
126 	cnt -= auio.uio_resid;
127 #ifdef KTRACE
128 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
129 		ktrgenio(p->p_tracep, uap->fd, UIO_READ, &ktriov, cnt, error);
130 #endif
131 	*retval = cnt;
132 	return (error);
133 }
134 
135 /*
136  * Scatter read system call.
137  */
138 #ifndef _SYS_SYSPROTO_H_
139 struct readv_args {
140 	int	fd;
141 	struct	iovec *iovp;
142 	u_int	iovcnt;
143 };
144 #endif
145 int
146 readv(p, uap, retval)
147 	struct proc *p;
148 	register struct readv_args *uap;
149 	int *retval;
150 {
151 	register struct file *fp;
152 	register struct filedesc *fdp = p->p_fd;
153 	struct uio auio;
154 	register struct iovec *iov;
155 	struct iovec *needfree;
156 	struct iovec aiov[UIO_SMALLIOV];
157 	long i, cnt, error = 0;
158 	u_int iovlen;
159 #ifdef KTRACE
160 	struct iovec *ktriov = NULL;
161 #endif
162 
163 	if (((u_int)uap->fd) >= fdp->fd_nfiles ||
164 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
165 	    (fp->f_flag & FREAD) == 0)
166 		return (EBADF);
167 	/* note: can't use iovlen until iovcnt is validated */
168 	iovlen = uap->iovcnt * sizeof (struct iovec);
169 	if (uap->iovcnt > UIO_SMALLIOV) {
170 		if (uap->iovcnt > UIO_MAXIOV)
171 			return (EINVAL);
172 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
173 		needfree = iov;
174 	} else {
175 		iov = aiov;
176 		needfree = NULL;
177 	}
178 	auio.uio_iov = iov;
179 	auio.uio_iovcnt = uap->iovcnt;
180 	auio.uio_rw = UIO_READ;
181 	auio.uio_segflg = UIO_USERSPACE;
182 	auio.uio_procp = p;
183 	auio.uio_offset = -1;
184 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
185 		goto done;
186 	auio.uio_resid = 0;
187 	for (i = 0; i < uap->iovcnt; i++) {
188 		auio.uio_resid += iov->iov_len;
189 		if (auio.uio_resid < 0) {
190 			error = EINVAL;
191 			goto done;
192 		}
193 		iov++;
194 	}
195 #ifdef KTRACE
196 	/*
197 	 * if tracing, save a copy of iovec
198 	 */
199 	if (KTRPOINT(p, KTR_GENIO))  {
200 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
201 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
202 	}
203 #endif
204 	cnt = auio.uio_resid;
205 	if ((error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred)))
206 		if (auio.uio_resid != cnt && (error == ERESTART ||
207 		    error == EINTR || error == EWOULDBLOCK))
208 			error = 0;
209 	cnt -= auio.uio_resid;
210 #ifdef KTRACE
211 	if (ktriov != NULL) {
212 		if (error == 0)
213 			ktrgenio(p->p_tracep, uap->fd, UIO_READ, ktriov,
214 			    cnt, error);
215 		FREE(ktriov, M_TEMP);
216 	}
217 #endif
218 	*retval = cnt;
219 done:
220 	if (needfree)
221 		FREE(needfree, M_IOV);
222 	return (error);
223 }
224 
225 /*
226  * Write system call
227  */
228 #ifndef _SYS_SYSPROTO_H_
229 struct write_args {
230 	int	fd;
231 	char	*buf;
232 	u_int	nbyte;
233 };
234 #endif
235 int
236 write(p, uap, retval)
237 	struct proc *p;
238 	register struct write_args *uap;
239 	int *retval;
240 {
241 	register struct file *fp;
242 	register struct filedesc *fdp = p->p_fd;
243 	struct uio auio;
244 	struct iovec aiov;
245 	long cnt, error = 0;
246 #ifdef KTRACE
247 	struct iovec ktriov;
248 #endif
249 
250 	if (((u_int)uap->fd) >= fdp->fd_nfiles ||
251 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
252 	    (fp->f_flag & FWRITE) == 0)
253 		return (EBADF);
254 	aiov.iov_base = (caddr_t)uap->buf;
255 	aiov.iov_len = uap->nbyte;
256 	auio.uio_iov = &aiov;
257 	auio.uio_iovcnt = 1;
258 	auio.uio_offset = -1;
259 	auio.uio_resid = uap->nbyte;
260 	auio.uio_rw = UIO_WRITE;
261 	auio.uio_segflg = UIO_USERSPACE;
262 	auio.uio_procp = p;
263 #ifdef KTRACE
264 	/*
265 	 * if tracing, save a copy of iovec
266 	 */
267 	if (KTRPOINT(p, KTR_GENIO))
268 		ktriov = aiov;
269 #endif
270 	cnt = uap->nbyte;
271 	if ((error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred))) {
272 		if (auio.uio_resid != cnt && (error == ERESTART ||
273 		    error == EINTR || error == EWOULDBLOCK))
274 			error = 0;
275 		if (error == EPIPE)
276 			psignal(p, SIGPIPE);
277 	}
278 	cnt -= auio.uio_resid;
279 #ifdef KTRACE
280 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
281 		ktrgenio(p->p_tracep, uap->fd, UIO_WRITE,
282 		    &ktriov, cnt, error);
283 #endif
284 	*retval = cnt;
285 	return (error);
286 }
287 
288 /*
289  * Gather write system call
290  */
291 #ifndef _SYS_SYSPROTO_H_
292 struct writev_args {
293 	int	fd;
294 	struct	iovec *iovp;
295 	u_int	iovcnt;
296 };
297 #endif
298 int
299 writev(p, uap, retval)
300 	struct proc *p;
301 	register struct writev_args *uap;
302 	int *retval;
303 {
304 	register struct file *fp;
305 	register struct filedesc *fdp = p->p_fd;
306 	struct uio auio;
307 	register struct iovec *iov;
308 	struct iovec *needfree;
309 	struct iovec aiov[UIO_SMALLIOV];
310 	long i, cnt, error = 0;
311 	u_int iovlen;
312 #ifdef KTRACE
313 	struct iovec *ktriov = NULL;
314 #endif
315 
316 	if (((u_int)uap->fd) >= fdp->fd_nfiles ||
317 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
318 	    (fp->f_flag & FWRITE) == 0)
319 		return (EBADF);
320 	/* note: can't use iovlen until iovcnt is validated */
321 	iovlen = uap->iovcnt * sizeof (struct iovec);
322 	if (uap->iovcnt > UIO_SMALLIOV) {
323 		if (uap->iovcnt > UIO_MAXIOV)
324 			return (EINVAL);
325 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
326 		needfree = iov;
327 	} else {
328 		iov = aiov;
329 		needfree = NULL;
330 	}
331 	auio.uio_iov = iov;
332 	auio.uio_iovcnt = uap->iovcnt;
333 	auio.uio_rw = UIO_WRITE;
334 	auio.uio_segflg = UIO_USERSPACE;
335 	auio.uio_procp = p;
336 	auio.uio_offset = -1;
337 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
338 		goto done;
339 	auio.uio_resid = 0;
340 	for (i = 0; i < uap->iovcnt; i++) {
341 		auio.uio_resid += iov->iov_len;
342 		if (auio.uio_resid < 0) {
343 			error = EINVAL;
344 			goto done;
345 		}
346 		iov++;
347 	}
348 #ifdef KTRACE
349 	/*
350 	 * if tracing, save a copy of iovec
351 	 */
352 	if (KTRPOINT(p, KTR_GENIO))  {
353 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
354 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
355 	}
356 #endif
357 	cnt = auio.uio_resid;
358 	if ((error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred))) {
359 		if (auio.uio_resid != cnt && (error == ERESTART ||
360 		    error == EINTR || error == EWOULDBLOCK))
361 			error = 0;
362 		if (error == EPIPE)
363 			psignal(p, SIGPIPE);
364 	}
365 	cnt -= auio.uio_resid;
366 #ifdef KTRACE
367 	if (ktriov != NULL) {
368 		if (error == 0)
369 			ktrgenio(p->p_tracep, uap->fd, UIO_WRITE,
370 				ktriov, cnt, error);
371 		FREE(ktriov, M_TEMP);
372 	}
373 #endif
374 	*retval = cnt;
375 done:
376 	if (needfree)
377 		FREE(needfree, M_IOV);
378 	return (error);
379 }
380 
381 /*
382  * Ioctl system call
383  */
384 #ifndef _SYS_SYSPROTO_H_
385 struct ioctl_args {
386 	int	fd;
387 	int	com;
388 	caddr_t	data;
389 };
390 #endif
391 /* ARGSUSED */
392 int
393 ioctl(p, uap, retval)
394 	struct proc *p;
395 	register struct ioctl_args *uap;
396 	int *retval;
397 {
398 	register struct file *fp;
399 	register struct filedesc *fdp;
400 	register int com, error;
401 	register u_int size;
402 	caddr_t data, memp;
403 	int tmp;
404 #define STK_PARAMS	128
405 	char stkbuf[STK_PARAMS];
406 
407 	fdp = p->p_fd;
408 	if ((u_int)uap->fd >= fdp->fd_nfiles ||
409 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
410 		return (EBADF);
411 
412 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
413 		return (EBADF);
414 
415 	switch (com = uap->com) {
416 	case FIONCLEX:
417 		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
418 		return (0);
419 	case FIOCLEX:
420 		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
421 		return (0);
422 	}
423 
424 	/*
425 	 * Interpret high order word to find amount of data to be
426 	 * copied to/from the user's address space.
427 	 */
428 	size = IOCPARM_LEN(com);
429 	if (size > IOCPARM_MAX)
430 		return (ENOTTY);
431 	memp = NULL;
432 	if (size > sizeof (stkbuf)) {
433 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
434 		data = memp;
435 	} else
436 		data = stkbuf;
437 	if (com&IOC_IN) {
438 		if (size) {
439 			error = copyin(uap->data, data, (u_int)size);
440 			if (error) {
441 				if (memp)
442 					free(memp, M_IOCTLOPS);
443 				return (error);
444 			}
445 		} else
446 			*(caddr_t *)data = uap->data;
447 	} else if ((com&IOC_OUT) && size)
448 		/*
449 		 * Zero the buffer so the user always
450 		 * gets back something deterministic.
451 		 */
452 		bzero(data, size);
453 	else if (com&IOC_VOID)
454 		*(caddr_t *)data = uap->data;
455 
456 	switch (com) {
457 
458 	case FIONBIO:
459 		if ((tmp = *(int *)data))
460 			fp->f_flag |= FNONBLOCK;
461 		else
462 			fp->f_flag &= ~FNONBLOCK;
463 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
464 		break;
465 
466 	case FIOASYNC:
467 		if ((tmp = *(int *)data))
468 			fp->f_flag |= FASYNC;
469 		else
470 			fp->f_flag &= ~FASYNC;
471 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
472 		break;
473 
474 	case FIOSETOWN:
475 		tmp = *(int *)data;
476 		if (fp->f_type == DTYPE_SOCKET) {
477 			((struct socket *)fp->f_data)->so_pgid = tmp;
478 			error = 0;
479 			break;
480 		}
481 		if (tmp <= 0) {
482 			tmp = -tmp;
483 		} else {
484 			struct proc *p1 = pfind(tmp);
485 			if (p1 == 0) {
486 				error = ESRCH;
487 				break;
488 			}
489 			tmp = p1->p_pgrp->pg_id;
490 		}
491 		error = (*fp->f_ops->fo_ioctl)
492 			(fp, (int)TIOCSPGRP, (caddr_t)&tmp, p);
493 		break;
494 
495 	case FIOGETOWN:
496 		if (fp->f_type == DTYPE_SOCKET) {
497 			error = 0;
498 			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
499 			break;
500 		}
501 		error = (*fp->f_ops->fo_ioctl)(fp, (int)TIOCGPGRP, data, p);
502 		*(int *)data = -*(int *)data;
503 		break;
504 
505 	default:
506 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
507 		/*
508 		 * Copy any data to user, size was
509 		 * already set and checked above.
510 		 */
511 		if (error == 0 && (com&IOC_OUT) && size)
512 			error = copyout(data, uap->data, (u_int)size);
513 		break;
514 	}
515 	if (memp)
516 		free(memp, M_IOCTLOPS);
517 	return (error);
518 }
519 
/* Incremented by selwakeup() whenever it finds SI_COLL set on a selinfo. */
static int	nselcoll;
/* Only the address is used: it is the channel select()/poll() sleep on. */
int	selwait;
522 
523 /*
524  * Select system call.
525  */
526 #ifndef _SYS_SYSPROTO_H_
527 struct select_args {
528 	int	nd;
529 	fd_set	*in, *ou, *ex;
530 	struct	timeval *tv;
531 };
532 #endif
533 int
534 select(p, uap, retval)
535 	register struct proc *p;
536 	register struct select_args *uap;
537 	int *retval;
538 {
539 	/*
540 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
541 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
542 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
543 	 * of 256.
544 	 */
545 	fd_mask s_selbits[howmany(2048, NFDBITS)];
546 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
547 	struct timeval atv;
548 	int s, ncoll, error, timo;
549 	u_int nbufbytes, ncpbytes, nfdbits;
550 
551 	if (uap->nd < 0)
552 		return (EINVAL);
553 	if (uap->nd > p->p_fd->fd_nfiles)
554 		uap->nd = p->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
555 
556 	/*
557 	 * Allocate just enough bits for the non-null fd_sets.  Use the
558 	 * preallocated auto buffer if possible.
559 	 */
560 	nfdbits = roundup(uap->nd, NFDBITS);
561 	ncpbytes = nfdbits / NBBY;
562 	nbufbytes = 0;
563 	if (uap->in != NULL)
564 		nbufbytes += 2 * ncpbytes;
565 	if (uap->ou != NULL)
566 		nbufbytes += 2 * ncpbytes;
567 	if (uap->ex != NULL)
568 		nbufbytes += 2 * ncpbytes;
569 	if (nbufbytes <= sizeof s_selbits)
570 		selbits = &s_selbits[0];
571 	else
572 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
573 
574 	/*
575 	 * Assign pointers into the bit buffers and fetch the input bits.
576 	 * Put the output buffers together so that they can be bzeroed
577 	 * together.
578 	 */
579 	sbp = selbits;
580 #define	getbits(name, x) \
581 	do {								\
582 		if (uap->name == NULL)					\
583 			ibits[x] = NULL;				\
584 		else {							\
585 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
586 			obits[x] = sbp;					\
587 			sbp += ncpbytes / sizeof *sbp;			\
588 			error = copyin(uap->name, ibits[x], ncpbytes);	\
589 			if (error != 0)					\
590 				goto done;				\
591 		}							\
592 	} while (0)
593 	getbits(in, 0);
594 	getbits(ou, 1);
595 	getbits(ex, 2);
596 #undef	getbits
597 	if (nbufbytes != 0)
598 		bzero(selbits, nbufbytes / 2);
599 
600 	if (uap->tv) {
601 		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
602 			sizeof (atv));
603 		if (error)
604 			goto done;
605 		if (itimerfix(&atv)) {
606 			error = EINVAL;
607 			goto done;
608 		}
609 		s = splclock();
610 		timevaladd(&atv, &time);
611 		timo = hzto(&atv);
612 		/*
613 		 * Avoid inadvertently sleeping forever.
614 		 */
615 		if (timo == 0)
616 			timo = 1;
617 		splx(s);
618 	} else
619 		timo = 0;
620 retry:
621 	ncoll = nselcoll;
622 	p->p_flag |= P_SELECT;
623 	error = selscan(p, ibits, obits, uap->nd, retval);
624 	if (error || *retval)
625 		goto done;
626 	s = splhigh();
627 	/* this should be timercmp(&time, &atv, >=) */
628 	if (uap->tv && (time.tv_sec > atv.tv_sec ||
629 	    (time.tv_sec == atv.tv_sec && time.tv_usec >= atv.tv_usec))) {
630 		splx(s);
631 		goto done;
632 	}
633 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
634 		splx(s);
635 		goto retry;
636 	}
637 	p->p_flag &= ~P_SELECT;
638 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
639 	splx(s);
640 	if (error == 0)
641 		goto retry;
642 done:
643 	p->p_flag &= ~P_SELECT;
644 	/* select is not restarted after signals... */
645 	if (error == ERESTART)
646 		error = EINTR;
647 	if (error == EWOULDBLOCK)
648 		error = 0;
649 #define	putbits(name, x) \
650 	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
651 		error = error2;
652 	if (error == 0) {
653 		int error2;
654 
655 		putbits(in, 0);
656 		putbits(ou, 1);
657 		putbits(ex, 2);
658 #undef putbits
659 	}
660 	if (selbits != &s_selbits[0])
661 		free(selbits, M_SELECT);
662 	return (error);
663 }
664 
665 static int
666 selscan(p, ibits, obits, nfd, retval)
667 	struct proc *p;
668 	fd_mask **ibits, **obits;
669 	int nfd, *retval;
670 {
671 	register struct filedesc *fdp = p->p_fd;
672 	register int msk, i, j, fd;
673 	register fd_mask bits;
674 	struct file *fp;
675 	int n = 0;
676 	/* Note; backend also returns POLLHUP/POLLERR if appropriate */
677 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND};
678 
679 	for (msk = 0; msk < 3; msk++) {
680 		if (ibits[msk] == NULL)
681 			continue;
682 		for (i = 0; i < nfd; i += NFDBITS) {
683 			bits = ibits[msk][i/NFDBITS];
684 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
685 				bits &= ~(1 << j);
686 				fp = fdp->fd_ofiles[fd];
687 				if (fp == NULL)
688 					return (EBADF);
689 				if ((*fp->f_ops->fo_poll)(fp, flag[msk],
690 				    fp->f_cred, p)) {
691 					obits[msk][(fd)/NFDBITS] |=
692 						(1 << ((fd) % NFDBITS));
693 					n++;
694 				}
695 			}
696 		}
697 	}
698 	*retval = n;
699 	return (0);
700 }
701 
702 /*
703  * Poll system call.
704  */
705 #ifndef _SYS_SYSPROTO_H_
706 struct poll_args {
707 	struct pollfd *fds;
708 	u_int	nfds;
709 	int	timeout;
710 };
711 #endif
712 int
713 poll(p, uap, retval)
714 	register struct proc *p;
715 	register struct poll_args *uap;
716 	register_t *retval;
717 {
718 	caddr_t bits;
719 	char smallbits[32 * sizeof(struct pollfd)];
720 	struct timeval atv;
721 	int s, ncoll, error = 0, timo;
722 	size_t ni;
723 
724 	if (SCARG(uap, nfds) > p->p_fd->fd_nfiles) {
725 		/* forgiving; slightly wrong */
726 		SCARG(uap, nfds) = p->p_fd->fd_nfiles;
727 	}
728 	ni = SCARG(uap, nfds) * sizeof(struct pollfd);
729 	if (ni > sizeof(smallbits))
730 		bits = malloc(ni, M_TEMP, M_WAITOK);
731 	else
732 		bits = smallbits;
733 
734 	error = copyin(SCARG(uap, fds), bits, ni);
735 	if (error)
736 		goto done;
737 
738 	if (SCARG(uap, timeout) != INFTIM) {
739 		atv.tv_sec = SCARG(uap, timeout) / 1000;
740 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
741 		if (itimerfix(&atv)) {
742 			error = EINVAL;
743 			goto done;
744 		}
745 		s = splclock();
746 		timevaladd(&atv, &time);
747 		timo = hzto(&atv);
748 		/*
749 		 * Avoid inadvertently sleeping forever.
750 		 */
751 		if (timo == 0)
752 			timo = 1;
753 		splx(s);
754 	} else
755 		timo = 0;
756 retry:
757 	ncoll = nselcoll;
758 	p->p_flag |= P_SELECT;
759 	error = pollscan(p, (struct pollfd *)bits, SCARG(uap, nfds), retval);
760 	if (error || *retval)
761 		goto done;
762 	s = splhigh();
763 	if (timo && timercmp(&time, &atv, >=)) {
764 		splx(s);
765 		goto done;
766 	}
767 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
768 		splx(s);
769 		goto retry;
770 	}
771 	p->p_flag &= ~P_SELECT;
772 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
773 	splx(s);
774 	if (error == 0)
775 		goto retry;
776 done:
777 	p->p_flag &= ~P_SELECT;
778 	/* poll is not restarted after signals... */
779 	if (error == ERESTART)
780 		error = EINTR;
781 	if (error == EWOULDBLOCK)
782 		error = 0;
783 	if (error == 0) {
784 		error = copyout(bits, SCARG(uap, fds), ni);
785 		if (error)
786 			goto out;
787 	}
788 out:
789 	if (ni > sizeof(smallbits))
790 		free(bits, M_TEMP);
791 	return (error);
792 }
793 
794 static int
795 pollscan(p, fds, nfd, retval)
796 	struct proc *p;
797 	struct pollfd *fds;
798 	int nfd;
799 	register_t *retval;
800 {
801 	register struct filedesc *fdp = p->p_fd;
802 	int i;
803 	struct file *fp;
804 	int n = 0;
805 
806 	for (i = 0; i < nfd; i++, fds++) {
807 		if ((u_int)fds->fd >= fdp->fd_nfiles) {
808 			fds->revents = POLLNVAL;
809 			n++;
810 		} else {
811 			fp = fdp->fd_ofiles[fds->fd];
812 			if (fp == 0) {
813 				fds->revents = POLLNVAL;
814 				n++;
815 			} else {
816 				/* Note: backend also returns POLLHUP and
817 				 * POLLERR if appropriate */
818 				fds->revents = (*fp->f_ops->fo_poll)(fp,
819 				    fds->events, fp->f_cred, p);
820 				if (fds->revents != 0)
821 					n++;
822 			}
823 		}
824 	}
825 	*retval = n;
826 	return (0);
827 }
828 
829 /*
830  * OpenBSD poll system call.
831  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
832  */
833 #ifndef _SYS_SYSPROTO_H_
834 struct openbsd_poll_args {
835 	struct pollfd *fds;
836 	u_int	nfds;
837 	int	timeout;
838 };
839 #endif
840 int
841 openbsd_poll(p, uap, retval)
842 	register struct proc *p;
843 	register struct openbsd_poll_args *uap;
844 	register_t *retval;
845 {
846 	return (poll(p, (struct poll_args *)uap, retval));
847 }
848 
/*
 * Poll routine that reports every ordinary read or write event the caller
 * asked about as ready.  Returns the subset of events made up of POLLIN,
 * POLLOUT, POLLRDNORM and POLLWRNORM; dev and p are not used.
 */
/*ARGSUSED*/
int
seltrue(dev, events, p)
	dev_t dev;
	int events;
	struct proc *p;
{
	int always_ready;

	always_ready = POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM;
	return (events & always_ready);
}
859 
860 /*
861  * Record a select request.
862  */
863 void
864 selrecord(selector, sip)
865 	struct proc *selector;
866 	struct selinfo *sip;
867 {
868 	struct proc *p;
869 	pid_t mypid;
870 
871 	mypid = selector->p_pid;
872 	if (sip->si_pid == mypid)
873 		return;
874 	if (sip->si_pid && (p = pfind(sip->si_pid)) &&
875 	    p->p_wchan == (caddr_t)&selwait)
876 		sip->si_flags |= SI_COLL;
877 	else
878 		sip->si_pid = mypid;
879 }
880 
881 /*
882  * Do a wakeup when a selectable event occurs.
883  */
884 void
885 selwakeup(sip)
886 	register struct selinfo *sip;
887 {
888 	register struct proc *p;
889 	int s;
890 
891 	if (sip->si_pid == 0)
892 		return;
893 	if (sip->si_flags & SI_COLL) {
894 		nselcoll++;
895 		sip->si_flags &= ~SI_COLL;
896 		wakeup((caddr_t)&selwait);
897 	}
898 	p = pfind(sip->si_pid);
899 	sip->si_pid = 0;
900 	if (p != NULL) {
901 		s = splhigh();
902 		if (p->p_wchan == (caddr_t)&selwait) {
903 			if (p->p_stat == SSLEEP)
904 				setrunnable(p);
905 			else
906 				unsleep(p);
907 		} else if (p->p_flag & P_SELECT)
908 			p->p_flag &= ~P_SELECT;
909 		splx(s);
910 	}
911 }
912