xref: /freebsd/sys/kern/sys_generic.c (revision 64db83a8ab2d1f72a9b2174b39d2ef42b5b0580c)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $FreeBSD$
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/signalvar.h>
53 #include <sys/socketvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/sysctl.h>
59 #include <sys/sysent.h>
60 #ifdef KTRACE
61 #include <sys/ktrace.h>
62 #endif
63 
64 #include <machine/limits.h>
65 
66 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
67 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
68 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
69 
70 static int	pollscan __P((struct proc *, struct pollfd *, int));
71 static int	selscan __P((struct proc *, fd_mask **, fd_mask **, int));
72 static struct file* getfp __P((struct filedesc *, int, int));
73 static int	dofileread __P((struct proc *, struct file *, int, void *,
74 		    size_t, off_t, int));
75 static int	dofilewrite __P((struct proc *, struct file *, int,
76 		    const void *, size_t, off_t, int));
77 
78 static struct file*
79 getfp(fdp, fd, flag)
80 	struct filedesc* fdp;
81 	int fd, flag;
82 {
83 	struct file* fp;
84 
85 	if (((u_int)fd) >= fdp->fd_nfiles ||
86 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
87 	    (fp->f_flag & flag) == 0)
88 		return (NULL);
89 	return (fp);
90 }
91 
92 /*
93  * Read system call.
94  */
95 #ifndef _SYS_SYSPROTO_H_
96 struct read_args {
97 	int	fd;
98 	void	*buf;
99 	size_t	nbyte;
100 };
101 #endif
102 int
103 read(p, uap)
104 	struct proc *p;
105 	register struct read_args *uap;
106 {
107 	register struct file *fp;
108 
109 	if ((fp = getfp(p->p_fd, uap->fd, FREAD)) == NULL)
110 		return (EBADF);
111 	return (dofileread(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0));
112 }
113 
114 /*
115  * Pread system call
116  */
117 #ifndef _SYS_SYSPROTO_H_
118 struct pread_args {
119 	int	fd;
120 	void	*buf;
121 	size_t	nbyte;
122 	int	pad;
123 	off_t	offset;
124 };
125 #endif
126 int
127 pread(p, uap)
128 	struct proc *p;
129 	register struct pread_args *uap;
130 {
131 	register struct file *fp;
132 
133 	if ((fp = getfp(p->p_fd, uap->fd, FREAD)) == NULL)
134 		return (EBADF);
135 	if (fp->f_type != DTYPE_VNODE)
136 		return (ESPIPE);
137 	return (dofileread(p, fp, uap->fd, uap->buf, uap->nbyte, uap->offset,
138 	    FOF_OFFSET));
139 }
140 
141 /*
142  * Code common for read and pread
143  */
144 int
145 dofileread(p, fp, fd, buf, nbyte, offset, flags)
146 	struct proc *p;
147 	struct file *fp;
148 	int fd, flags;
149 	void *buf;
150 	size_t nbyte;
151 	off_t offset;
152 {
153 	struct uio auio;
154 	struct iovec aiov;
155 	long cnt, error = 0;
156 #ifdef KTRACE
157 	struct iovec ktriov;
158 #endif
159 
160 	aiov.iov_base = (caddr_t)buf;
161 	aiov.iov_len = nbyte;
162 	auio.uio_iov = &aiov;
163 	auio.uio_iovcnt = 1;
164 	auio.uio_offset = offset;
165 	if (nbyte > INT_MAX)
166 		return (EINVAL);
167 	auio.uio_resid = nbyte;
168 	auio.uio_rw = UIO_READ;
169 	auio.uio_segflg = UIO_USERSPACE;
170 	auio.uio_procp = p;
171 #ifdef KTRACE
172 	/*
173 	 * if tracing, save a copy of iovec
174 	 */
175 	if (KTRPOINT(p, KTR_GENIO))
176 		ktriov = aiov;
177 #endif
178 	cnt = nbyte;
179 	if ((error = fo_read(fp, &auio, fp->f_cred, flags, p)))
180 		if (auio.uio_resid != cnt && (error == ERESTART ||
181 		    error == EINTR || error == EWOULDBLOCK))
182 			error = 0;
183 	cnt -= auio.uio_resid;
184 #ifdef KTRACE
185 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
186 		ktrgenio(p->p_tracep, fd, UIO_READ, &ktriov, cnt, error);
187 #endif
188 	p->p_retval[0] = cnt;
189 	return (error);
190 }
191 
192 /*
193  * Scatter read system call.
194  */
195 #ifndef _SYS_SYSPROTO_H_
196 struct readv_args {
197 	int	fd;
198 	struct	iovec *iovp;
199 	u_int	iovcnt;
200 };
201 #endif
202 int
203 readv(p, uap)
204 	struct proc *p;
205 	register struct readv_args *uap;
206 {
207 	register struct file *fp;
208 	register struct filedesc *fdp = p->p_fd;
209 	struct uio auio;
210 	register struct iovec *iov;
211 	struct iovec *needfree;
212 	struct iovec aiov[UIO_SMALLIOV];
213 	long i, cnt, error = 0;
214 	u_int iovlen;
215 #ifdef KTRACE
216 	struct iovec *ktriov = NULL;
217 #endif
218 
219 	if ((fp = getfp(fdp, uap->fd, FREAD)) == NULL)
220 		return (EBADF);
221 	/* note: can't use iovlen until iovcnt is validated */
222 	iovlen = uap->iovcnt * sizeof (struct iovec);
223 	if (uap->iovcnt > UIO_SMALLIOV) {
224 		if (uap->iovcnt > UIO_MAXIOV)
225 			return (EINVAL);
226 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
227 		needfree = iov;
228 	} else {
229 		iov = aiov;
230 		needfree = NULL;
231 	}
232 	auio.uio_iov = iov;
233 	auio.uio_iovcnt = uap->iovcnt;
234 	auio.uio_rw = UIO_READ;
235 	auio.uio_segflg = UIO_USERSPACE;
236 	auio.uio_procp = p;
237 	auio.uio_offset = -1;
238 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
239 		goto done;
240 	auio.uio_resid = 0;
241 	for (i = 0; i < uap->iovcnt; i++) {
242 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
243 			error = EINVAL;
244 			goto done;
245 		}
246 		auio.uio_resid += iov->iov_len;
247 		iov++;
248 	}
249 #ifdef KTRACE
250 	/*
251 	 * if tracing, save a copy of iovec
252 	 */
253 	if (KTRPOINT(p, KTR_GENIO))  {
254 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
255 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
256 	}
257 #endif
258 	cnt = auio.uio_resid;
259 	if ((error = fo_read(fp, &auio, fp->f_cred, 0, p)))
260 		if (auio.uio_resid != cnt && (error == ERESTART ||
261 		    error == EINTR || error == EWOULDBLOCK))
262 			error = 0;
263 	cnt -= auio.uio_resid;
264 #ifdef KTRACE
265 	if (ktriov != NULL) {
266 		if (error == 0)
267 			ktrgenio(p->p_tracep, uap->fd, UIO_READ, ktriov,
268 			    cnt, error);
269 		FREE(ktriov, M_TEMP);
270 	}
271 #endif
272 	p->p_retval[0] = cnt;
273 done:
274 	if (needfree)
275 		FREE(needfree, M_IOV);
276 	return (error);
277 }
278 
279 /*
280  * Write system call
281  */
282 #ifndef _SYS_SYSPROTO_H_
283 struct write_args {
284 	int	fd;
285 	const void *buf;
286 	size_t	nbyte;
287 };
288 #endif
289 int
290 write(p, uap)
291 	struct proc *p;
292 	register struct write_args *uap;
293 {
294 	register struct file *fp;
295 
296 	if ((fp = getfp(p->p_fd, uap->fd, FWRITE)) == NULL)
297 		return (EBADF);
298 	return (dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0));
299 }
300 
301 /*
302  * Pwrite system call
303  */
304 #ifndef _SYS_SYSPROTO_H_
305 struct pwrite_args {
306 	int	fd;
307 	const void *buf;
308 	size_t	nbyte;
309 	int	pad;
310 	off_t	offset;
311 };
312 #endif
313 int
314 pwrite(p, uap)
315 	struct proc *p;
316 	register struct pwrite_args *uap;
317 {
318 	register struct file *fp;
319 
320 	if ((fp = getfp(p->p_fd, uap->fd, FWRITE)) == NULL)
321 		return (EBADF);
322 	if (fp->f_type != DTYPE_VNODE)
323 		return (ESPIPE);
324 	return (dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte, uap->offset,
325 	    FOF_OFFSET));
326 }
327 
328 static int
329 dofilewrite(p, fp, fd, buf, nbyte, offset, flags)
330 	struct proc *p;
331 	struct file *fp;
332 	int fd, flags;
333 	const void *buf;
334 	size_t nbyte;
335 	off_t offset;
336 {
337 	struct uio auio;
338 	struct iovec aiov;
339 	long cnt, error = 0;
340 #ifdef KTRACE
341 	struct iovec ktriov;
342 #endif
343 
344 	aiov.iov_base = (void *)buf;
345 	aiov.iov_len = nbyte;
346 	auio.uio_iov = &aiov;
347 	auio.uio_iovcnt = 1;
348 	auio.uio_offset = offset;
349 	if (nbyte > INT_MAX)
350 		return (EINVAL);
351 	auio.uio_resid = nbyte;
352 	auio.uio_rw = UIO_WRITE;
353 	auio.uio_segflg = UIO_USERSPACE;
354 	auio.uio_procp = p;
355 #ifdef KTRACE
356 	/*
357 	 * if tracing, save a copy of iovec
358 	 */
359 	if (KTRPOINT(p, KTR_GENIO))
360 		ktriov = aiov;
361 #endif
362 	cnt = nbyte;
363 	if ((error = fo_write(fp, &auio, fp->f_cred, flags, p))) {
364 		if (auio.uio_resid != cnt && (error == ERESTART ||
365 		    error == EINTR || error == EWOULDBLOCK))
366 			error = 0;
367 		if (error == EPIPE)
368 			psignal(p, SIGPIPE);
369 	}
370 	cnt -= auio.uio_resid;
371 #ifdef KTRACE
372 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
373 		ktrgenio(p->p_tracep, fd, UIO_WRITE,
374 		    &ktriov, cnt, error);
375 #endif
376 	p->p_retval[0] = cnt;
377 	return (error);
378 }
379 
380 /*
381  * Gather write system call
382  */
383 #ifndef _SYS_SYSPROTO_H_
384 struct writev_args {
385 	int	fd;
386 	struct	iovec *iovp;
387 	u_int	iovcnt;
388 };
389 #endif
390 int
391 writev(p, uap)
392 	struct proc *p;
393 	register struct writev_args *uap;
394 {
395 	register struct file *fp;
396 	register struct filedesc *fdp = p->p_fd;
397 	struct uio auio;
398 	register struct iovec *iov;
399 	struct iovec *needfree;
400 	struct iovec aiov[UIO_SMALLIOV];
401 	long i, cnt, error = 0;
402 	u_int iovlen;
403 #ifdef KTRACE
404 	struct iovec *ktriov = NULL;
405 #endif
406 
407 	if ((fp = getfp(fdp, uap->fd, FWRITE)) == NULL)
408 		return (EBADF);
409 	fhold(fp);
410 	/* note: can't use iovlen until iovcnt is validated */
411 	iovlen = uap->iovcnt * sizeof (struct iovec);
412 	if (uap->iovcnt > UIO_SMALLIOV) {
413 		if (uap->iovcnt > UIO_MAXIOV) {
414 			needfree = NULL;
415 			error = EINVAL;
416 			goto done;
417 		}
418 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
419 		needfree = iov;
420 	} else {
421 		iov = aiov;
422 		needfree = NULL;
423 	}
424 	auio.uio_iov = iov;
425 	auio.uio_iovcnt = uap->iovcnt;
426 	auio.uio_rw = UIO_WRITE;
427 	auio.uio_segflg = UIO_USERSPACE;
428 	auio.uio_procp = p;
429 	auio.uio_offset = -1;
430 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
431 		goto done;
432 	auio.uio_resid = 0;
433 	for (i = 0; i < uap->iovcnt; i++) {
434 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
435 			error = EINVAL;
436 			goto done;
437 		}
438 		auio.uio_resid += iov->iov_len;
439 		iov++;
440 	}
441 #ifdef KTRACE
442 	/*
443 	 * if tracing, save a copy of iovec
444 	 */
445 	if (KTRPOINT(p, KTR_GENIO))  {
446 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
447 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
448 	}
449 #endif
450 	cnt = auio.uio_resid;
451 	if ((error = fo_write(fp, &auio, fp->f_cred, 0, p))) {
452 		if (auio.uio_resid != cnt && (error == ERESTART ||
453 		    error == EINTR || error == EWOULDBLOCK))
454 			error = 0;
455 		if (error == EPIPE)
456 			psignal(p, SIGPIPE);
457 	}
458 	cnt -= auio.uio_resid;
459 #ifdef KTRACE
460 	if (ktriov != NULL) {
461 		if (error == 0)
462 			ktrgenio(p->p_tracep, uap->fd, UIO_WRITE,
463 				ktriov, cnt, error);
464 		FREE(ktriov, M_TEMP);
465 	}
466 #endif
467 	p->p_retval[0] = cnt;
468 done:
469 	fdrop(fp, p);
470 	if (needfree)
471 		FREE(needfree, M_IOV);
472 	return (error);
473 }
474 
475 /*
476  * Ioctl system call
477  */
478 #ifndef _SYS_SYSPROTO_H_
479 struct ioctl_args {
480 	int	fd;
481 	u_long	com;
482 	caddr_t	data;
483 };
484 #endif
485 /* ARGSUSED */
486 int
487 ioctl(p, uap)
488 	struct proc *p;
489 	register struct ioctl_args *uap;
490 {
491 	register struct file *fp;
492 	register struct filedesc *fdp;
493 	register u_long com;
494 	int error;
495 	register u_int size;
496 	caddr_t data, memp;
497 	int tmp;
498 #define STK_PARAMS	128
499 	union {
500 	    char stkbuf[STK_PARAMS];
501 	    long align;
502 	} ubuf;
503 
504 	fdp = p->p_fd;
505 	if ((u_int)uap->fd >= fdp->fd_nfiles ||
506 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
507 		return (EBADF);
508 
509 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
510 		return (EBADF);
511 
512 	switch (com = uap->com) {
513 	case FIONCLEX:
514 		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
515 		return (0);
516 	case FIOCLEX:
517 		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
518 		return (0);
519 	}
520 
521 	/*
522 	 * Interpret high order word to find amount of data to be
523 	 * copied to/from the user's address space.
524 	 */
525 	size = IOCPARM_LEN(com);
526 	if (size > IOCPARM_MAX)
527 		return (ENOTTY);
528 	memp = NULL;
529 	if (size > sizeof (ubuf.stkbuf)) {
530 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
531 		data = memp;
532 	} else
533 		data = ubuf.stkbuf;
534 	if (com&IOC_IN) {
535 		if (size) {
536 			error = copyin(uap->data, data, (u_int)size);
537 			if (error) {
538 				if (memp)
539 					free(memp, M_IOCTLOPS);
540 				return (error);
541 			}
542 		} else
543 			*(caddr_t *)data = uap->data;
544 	} else if ((com&IOC_OUT) && size)
545 		/*
546 		 * Zero the buffer so the user always
547 		 * gets back something deterministic.
548 		 */
549 		bzero(data, size);
550 	else if (com&IOC_VOID)
551 		*(caddr_t *)data = uap->data;
552 
553 	switch (com) {
554 
555 	case FIONBIO:
556 		if ((tmp = *(int *)data))
557 			fp->f_flag |= FNONBLOCK;
558 		else
559 			fp->f_flag &= ~FNONBLOCK;
560 		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, p);
561 		break;
562 
563 	case FIOASYNC:
564 		if ((tmp = *(int *)data))
565 			fp->f_flag |= FASYNC;
566 		else
567 			fp->f_flag &= ~FASYNC;
568 		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, p);
569 		break;
570 
571 	default:
572 		error = fo_ioctl(fp, com, data, p);
573 		/*
574 		 * Copy any data to user, size was
575 		 * already set and checked above.
576 		 */
577 		if (error == 0 && (com&IOC_OUT) && size)
578 			error = copyout(data, uap->data, (u_int)size);
579 		break;
580 	}
581 	if (memp)
582 		free(memp, M_IOCTLOPS);
583 	return (error);
584 }
585 
586 static int	nselcoll;	/* Select collisions since boot */
587 int	selwait;
588 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
589 
590 /*
591  * Select system call.
592  */
593 #ifndef _SYS_SYSPROTO_H_
594 struct select_args {
595 	int	nd;
596 	fd_set	*in, *ou, *ex;
597 	struct	timeval *tv;
598 };
599 #endif
600 int
601 select(p, uap)
602 	register struct proc *p;
603 	register struct select_args *uap;
604 {
605 	/*
606 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
607 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
608 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
609 	 * of 256.
610 	 */
611 	fd_mask s_selbits[howmany(2048, NFDBITS)];
612 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
613 	struct timeval atv, rtv, ttv;
614 	int s, ncoll, error, timo;
615 	u_int nbufbytes, ncpbytes, nfdbits;
616 
617 	if (uap->nd < 0)
618 		return (EINVAL);
619 	if (uap->nd > p->p_fd->fd_nfiles)
620 		uap->nd = p->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
621 
622 	/*
623 	 * Allocate just enough bits for the non-null fd_sets.  Use the
624 	 * preallocated auto buffer if possible.
625 	 */
626 	nfdbits = roundup(uap->nd, NFDBITS);
627 	ncpbytes = nfdbits / NBBY;
628 	nbufbytes = 0;
629 	if (uap->in != NULL)
630 		nbufbytes += 2 * ncpbytes;
631 	if (uap->ou != NULL)
632 		nbufbytes += 2 * ncpbytes;
633 	if (uap->ex != NULL)
634 		nbufbytes += 2 * ncpbytes;
635 	if (nbufbytes <= sizeof s_selbits)
636 		selbits = &s_selbits[0];
637 	else
638 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
639 
640 	/*
641 	 * Assign pointers into the bit buffers and fetch the input bits.
642 	 * Put the output buffers together so that they can be bzeroed
643 	 * together.
644 	 */
645 	sbp = selbits;
646 #define	getbits(name, x) \
647 	do {								\
648 		if (uap->name == NULL)					\
649 			ibits[x] = NULL;				\
650 		else {							\
651 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
652 			obits[x] = sbp;					\
653 			sbp += ncpbytes / sizeof *sbp;			\
654 			error = copyin(uap->name, ibits[x], ncpbytes);	\
655 			if (error != 0)					\
656 				goto done;				\
657 		}							\
658 	} while (0)
659 	getbits(in, 0);
660 	getbits(ou, 1);
661 	getbits(ex, 2);
662 #undef	getbits
663 	if (nbufbytes != 0)
664 		bzero(selbits, nbufbytes / 2);
665 
666 	if (uap->tv) {
667 		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
668 			sizeof (atv));
669 		if (error)
670 			goto done;
671 		if (itimerfix(&atv)) {
672 			error = EINVAL;
673 			goto done;
674 		}
675 		getmicrouptime(&rtv);
676 		timevaladd(&atv, &rtv);
677 	} else
678 		atv.tv_sec = 0;
679 	timo = 0;
680 retry:
681 	ncoll = nselcoll;
682 	p->p_flag |= P_SELECT;
683 	error = selscan(p, ibits, obits, uap->nd);
684 	if (error || p->p_retval[0])
685 		goto done;
686 	if (atv.tv_sec) {
687 		getmicrouptime(&rtv);
688 		if (timevalcmp(&rtv, &atv, >=))
689 			goto done;
690 		ttv = atv;
691 		timevalsub(&ttv, &rtv);
692 		timo = ttv.tv_sec > 24 * 60 * 60 ?
693 		    24 * 60 * 60 * hz : tvtohz(&ttv);
694 	}
695 	s = splhigh();
696 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
697 		splx(s);
698 		goto retry;
699 	}
700 	p->p_flag &= ~P_SELECT;
701 
702 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
703 
704 	splx(s);
705 	if (error == 0)
706 		goto retry;
707 done:
708 	p->p_flag &= ~P_SELECT;
709 	/* select is not restarted after signals... */
710 	if (error == ERESTART)
711 		error = EINTR;
712 	if (error == EWOULDBLOCK)
713 		error = 0;
714 #define	putbits(name, x) \
715 	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
716 		error = error2;
717 	if (error == 0) {
718 		int error2;
719 
720 		putbits(in, 0);
721 		putbits(ou, 1);
722 		putbits(ex, 2);
723 #undef putbits
724 	}
725 	if (selbits != &s_selbits[0])
726 		free(selbits, M_SELECT);
727 	return (error);
728 }
729 
730 static int
731 selscan(p, ibits, obits, nfd)
732 	struct proc *p;
733 	fd_mask **ibits, **obits;
734 	int nfd;
735 {
736 	struct filedesc *fdp = p->p_fd;
737 	int msk, i, fd;
738 	fd_mask bits;
739 	struct file *fp;
740 	int n = 0;
741 	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
742 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
743 
744 	for (msk = 0; msk < 3; msk++) {
745 		if (ibits[msk] == NULL)
746 			continue;
747 		for (i = 0; i < nfd; i += NFDBITS) {
748 			bits = ibits[msk][i/NFDBITS];
749 			/* ffs(int mask) not portable, fd_mask is long */
750 			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
751 				if (!(bits & 1))
752 					continue;
753 				fp = fdp->fd_ofiles[fd];
754 				if (fp == NULL)
755 					return (EBADF);
756 				if (fo_poll(fp, flag[msk], fp->f_cred, p)) {
757 					obits[msk][(fd)/NFDBITS] |=
758 					    ((fd_mask)1 << ((fd) % NFDBITS));
759 					n++;
760 				}
761 			}
762 		}
763 	}
764 	p->p_retval[0] = n;
765 	return (0);
766 }
767 
768 /*
769  * Poll system call.
770  */
771 #ifndef _SYS_SYSPROTO_H_
772 struct poll_args {
773 	struct pollfd *fds;
774 	u_int	nfds;
775 	int	timeout;
776 };
777 #endif
778 int
779 poll(p, uap)
780 	register struct proc *p;
781 	register struct poll_args *uap;
782 {
783 	caddr_t bits;
784 	char smallbits[32 * sizeof(struct pollfd)];
785 	struct timeval atv, rtv, ttv;
786 	int s, ncoll, error = 0, timo;
787 	size_t ni;
788 
789 	if (SCARG(uap, nfds) > p->p_fd->fd_nfiles) {
790 		/* forgiving; slightly wrong */
791 		SCARG(uap, nfds) = p->p_fd->fd_nfiles;
792 	}
793 	ni = SCARG(uap, nfds) * sizeof(struct pollfd);
794 	if (ni > sizeof(smallbits))
795 		bits = malloc(ni, M_TEMP, M_WAITOK);
796 	else
797 		bits = smallbits;
798 	error = copyin(SCARG(uap, fds), bits, ni);
799 	if (error)
800 		goto done;
801 	if (SCARG(uap, timeout) != INFTIM) {
802 		atv.tv_sec = SCARG(uap, timeout) / 1000;
803 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
804 		if (itimerfix(&atv)) {
805 			error = EINVAL;
806 			goto done;
807 		}
808 		getmicrouptime(&rtv);
809 		timevaladd(&atv, &rtv);
810 	} else
811 		atv.tv_sec = 0;
812 	timo = 0;
813 retry:
814 	ncoll = nselcoll;
815 	p->p_flag |= P_SELECT;
816 	error = pollscan(p, (struct pollfd *)bits, SCARG(uap, nfds));
817 	if (error || p->p_retval[0])
818 		goto done;
819 	if (atv.tv_sec) {
820 		getmicrouptime(&rtv);
821 		if (timevalcmp(&rtv, &atv, >=))
822 			goto done;
823 		ttv = atv;
824 		timevalsub(&ttv, &rtv);
825 		timo = ttv.tv_sec > 24 * 60 * 60 ?
826 		    24 * 60 * 60 * hz : tvtohz(&ttv);
827 	}
828 	s = splhigh();
829 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
830 		splx(s);
831 		goto retry;
832 	}
833 	p->p_flag &= ~P_SELECT;
834 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
835 	splx(s);
836 	if (error == 0)
837 		goto retry;
838 done:
839 	p->p_flag &= ~P_SELECT;
840 	/* poll is not restarted after signals... */
841 	if (error == ERESTART)
842 		error = EINTR;
843 	if (error == EWOULDBLOCK)
844 		error = 0;
845 	if (error == 0) {
846 		error = copyout(bits, SCARG(uap, fds), ni);
847 		if (error)
848 			goto out;
849 	}
850 out:
851 	if (ni > sizeof(smallbits))
852 		free(bits, M_TEMP);
853 	return (error);
854 }
855 
856 static int
857 pollscan(p, fds, nfd)
858 	struct proc *p;
859 	struct pollfd *fds;
860 	int nfd;
861 {
862 	register struct filedesc *fdp = p->p_fd;
863 	int i;
864 	struct file *fp;
865 	int n = 0;
866 
867 	for (i = 0; i < nfd; i++, fds++) {
868 		if (fds->fd >= fdp->fd_nfiles) {
869 			fds->revents = POLLNVAL;
870 			n++;
871 		} else if (fds->fd < 0) {
872 			fds->revents = 0;
873 		} else {
874 			fp = fdp->fd_ofiles[fds->fd];
875 			if (fp == 0) {
876 				fds->revents = POLLNVAL;
877 				n++;
878 			} else {
879 				/*
880 				 * Note: backend also returns POLLHUP and
881 				 * POLLERR if appropriate.
882 				 */
883 				fds->revents = fo_poll(fp, fds->events,
884 				    fp->f_cred, p);
885 				if (fds->revents != 0)
886 					n++;
887 			}
888 		}
889 	}
890 	p->p_retval[0] = n;
891 	return (0);
892 }
893 
894 /*
895  * OpenBSD poll system call.
896  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
897  */
898 #ifndef _SYS_SYSPROTO_H_
899 struct openbsd_poll_args {
900 	struct pollfd *fds;
901 	u_int	nfds;
902 	int	timeout;
903 };
904 #endif
905 int
906 openbsd_poll(p, uap)
907 	register struct proc *p;
908 	register struct openbsd_poll_args *uap;
909 {
910 	return (poll(p, (struct poll_args *)uap));
911 }
912 
913 /*ARGSUSED*/
914 int
915 seltrue(dev, events, p)
916 	dev_t dev;
917 	int events;
918 	struct proc *p;
919 {
920 
921 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
922 }
923 
924 /*
925  * Record a select request.
926  */
927 void
928 selrecord(selector, sip)
929 	struct proc *selector;
930 	struct selinfo *sip;
931 {
932 	struct proc *p;
933 	pid_t mypid;
934 
935 	mypid = selector->p_pid;
936 	if (sip->si_pid == mypid)
937 		return;
938 	if (sip->si_pid && (p = pfind(sip->si_pid)) &&
939 	    p->p_wchan == (caddr_t)&selwait)
940 		sip->si_flags |= SI_COLL;
941 	else
942 		sip->si_pid = mypid;
943 }
944 
945 /*
946  * Do a wakeup when a selectable event occurs.
947  */
948 void
949 selwakeup(sip)
950 	register struct selinfo *sip;
951 {
952 	register struct proc *p;
953 	int s;
954 
955 	if (sip->si_pid == 0)
956 		return;
957 	if (sip->si_flags & SI_COLL) {
958 		nselcoll++;
959 		sip->si_flags &= ~SI_COLL;
960 		wakeup((caddr_t)&selwait);
961 	}
962 	p = pfind(sip->si_pid);
963 	sip->si_pid = 0;
964 	if (p != NULL) {
965 		s = splhigh();
966 		if (p->p_wchan == (caddr_t)&selwait) {
967 			if (p->p_stat == SSLEEP)
968 				setrunnable(p);
969 			else
970 				unsleep(p);
971 		} else if (p->p_flag & P_SELECT)
972 			p->p_flag &= ~P_SELECT;
973 		splx(s);
974 	}
975 }
976