xref: /freebsd/sys/kern/sys_generic.c (revision 4cf49a43559ed9fdad601bdcccd2c55963008675)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $FreeBSD$
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/signalvar.h>
53 #include <sys/socketvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/sysent.h>
59 #ifdef KTRACE
60 #include <sys/ktrace.h>
61 #endif
62 
63 #include <machine/limits.h>
64 
65 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
66 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
67 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
68 
69 static int	pollscan __P((struct proc *, struct pollfd *, int));
70 static int	selscan __P((struct proc *, fd_mask **, fd_mask **, int));
71 static struct file* getfp __P((struct filedesc *, int, int));
72 static int	dofileread __P((struct proc *, struct file *, int, void *,
73 		    size_t, off_t, int));
74 static int	dofilewrite __P((struct proc *, struct file *, int,
75 		    const void *, size_t, off_t, int));
76 
77 static struct file*
78 getfp(fdp, fd, flag)
79 	struct filedesc* fdp;
80 	int fd, flag;
81 {
82 	struct file* fp;
83 
84 	if (((u_int)fd) >= fdp->fd_nfiles ||
85 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
86 	    (fp->f_flag & flag) == 0)
87 		return (NULL);
88 	return (fp);
89 }
90 
91 /*
92  * Read system call.
93  */
94 #ifndef _SYS_SYSPROTO_H_
95 struct read_args {
96 	int	fd;
97 	void	*buf;
98 	size_t	nbyte;
99 };
100 #endif
101 int
102 read(p, uap)
103 	struct proc *p;
104 	register struct read_args *uap;
105 {
106 	register struct file *fp;
107 
108 	if ((fp = getfp(p->p_fd, uap->fd, FREAD)) == NULL)
109 		return (EBADF);
110 	return (dofileread(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0));
111 }
112 
113 /*
114  * Pread system call
115  */
116 #ifndef _SYS_SYSPROTO_H_
117 struct pread_args {
118 	int	fd;
119 	void	*buf;
120 	size_t	nbyte;
121 	int	pad;
122 	off_t	offset;
123 };
124 #endif
125 int
126 pread(p, uap)
127 	struct proc *p;
128 	register struct pread_args *uap;
129 {
130 	register struct file *fp;
131 
132 	if ((fp = getfp(p->p_fd, uap->fd, FREAD)) == NULL)
133 		return (EBADF);
134 	if (fp->f_type != DTYPE_VNODE)
135 		return (ESPIPE);
136 	return (dofileread(p, fp, uap->fd, uap->buf, uap->nbyte, uap->offset,
137 	    FOF_OFFSET));
138 }
139 
140 /*
141  * Code common for read and pread
142  */
143 int
144 dofileread(p, fp, fd, buf, nbyte, offset, flags)
145 	struct proc *p;
146 	struct file *fp;
147 	int fd, flags;
148 	void *buf;
149 	size_t nbyte;
150 	off_t offset;
151 {
152 	struct uio auio;
153 	struct iovec aiov;
154 	long cnt, error = 0;
155 #ifdef KTRACE
156 	struct iovec ktriov;
157 #endif
158 
159 	aiov.iov_base = (caddr_t)buf;
160 	aiov.iov_len = nbyte;
161 	auio.uio_iov = &aiov;
162 	auio.uio_iovcnt = 1;
163 	auio.uio_offset = offset;
164 	if (nbyte > INT_MAX)
165 		return (EINVAL);
166 	auio.uio_resid = nbyte;
167 	auio.uio_rw = UIO_READ;
168 	auio.uio_segflg = UIO_USERSPACE;
169 	auio.uio_procp = p;
170 #ifdef KTRACE
171 	/*
172 	 * if tracing, save a copy of iovec
173 	 */
174 	if (KTRPOINT(p, KTR_GENIO))
175 		ktriov = aiov;
176 #endif
177 	cnt = nbyte;
178 	if ((error = fo_read(fp, &auio, fp->f_cred, flags, p)))
179 		if (auio.uio_resid != cnt && (error == ERESTART ||
180 		    error == EINTR || error == EWOULDBLOCK))
181 			error = 0;
182 	cnt -= auio.uio_resid;
183 #ifdef KTRACE
184 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
185 		ktrgenio(p->p_tracep, fd, UIO_READ, &ktriov, cnt, error);
186 #endif
187 	p->p_retval[0] = cnt;
188 	return (error);
189 }
190 
191 /*
192  * Scatter read system call.
193  */
194 #ifndef _SYS_SYSPROTO_H_
195 struct readv_args {
196 	int	fd;
197 	struct	iovec *iovp;
198 	u_int	iovcnt;
199 };
200 #endif
201 int
202 readv(p, uap)
203 	struct proc *p;
204 	register struct readv_args *uap;
205 {
206 	register struct file *fp;
207 	register struct filedesc *fdp = p->p_fd;
208 	struct uio auio;
209 	register struct iovec *iov;
210 	struct iovec *needfree;
211 	struct iovec aiov[UIO_SMALLIOV];
212 	long i, cnt, error = 0;
213 	u_int iovlen;
214 #ifdef KTRACE
215 	struct iovec *ktriov = NULL;
216 #endif
217 
218 	if ((fp = getfp(fdp, uap->fd, FREAD)) == NULL)
219 		return (EBADF);
220 	/* note: can't use iovlen until iovcnt is validated */
221 	iovlen = uap->iovcnt * sizeof (struct iovec);
222 	if (uap->iovcnt > UIO_SMALLIOV) {
223 		if (uap->iovcnt > UIO_MAXIOV)
224 			return (EINVAL);
225 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
226 		needfree = iov;
227 	} else {
228 		iov = aiov;
229 		needfree = NULL;
230 	}
231 	auio.uio_iov = iov;
232 	auio.uio_iovcnt = uap->iovcnt;
233 	auio.uio_rw = UIO_READ;
234 	auio.uio_segflg = UIO_USERSPACE;
235 	auio.uio_procp = p;
236 	auio.uio_offset = -1;
237 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
238 		goto done;
239 	auio.uio_resid = 0;
240 	for (i = 0; i < uap->iovcnt; i++) {
241 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
242 			error = EINVAL;
243 			goto done;
244 		}
245 		auio.uio_resid += iov->iov_len;
246 		iov++;
247 	}
248 #ifdef KTRACE
249 	/*
250 	 * if tracing, save a copy of iovec
251 	 */
252 	if (KTRPOINT(p, KTR_GENIO))  {
253 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
254 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
255 	}
256 #endif
257 	cnt = auio.uio_resid;
258 	if ((error = fo_read(fp, &auio, fp->f_cred, 0, p)))
259 		if (auio.uio_resid != cnt && (error == ERESTART ||
260 		    error == EINTR || error == EWOULDBLOCK))
261 			error = 0;
262 	cnt -= auio.uio_resid;
263 #ifdef KTRACE
264 	if (ktriov != NULL) {
265 		if (error == 0)
266 			ktrgenio(p->p_tracep, uap->fd, UIO_READ, ktriov,
267 			    cnt, error);
268 		FREE(ktriov, M_TEMP);
269 	}
270 #endif
271 	p->p_retval[0] = cnt;
272 done:
273 	if (needfree)
274 		FREE(needfree, M_IOV);
275 	return (error);
276 }
277 
278 /*
279  * Write system call
280  */
281 #ifndef _SYS_SYSPROTO_H_
282 struct write_args {
283 	int	fd;
284 	const void *buf;
285 	size_t	nbyte;
286 };
287 #endif
288 int
289 write(p, uap)
290 	struct proc *p;
291 	register struct write_args *uap;
292 {
293 	register struct file *fp;
294 
295 	if ((fp = getfp(p->p_fd, uap->fd, FWRITE)) == NULL)
296 		return (EBADF);
297 	return (dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0));
298 }
299 
300 /*
301  * Pwrite system call
302  */
303 #ifndef _SYS_SYSPROTO_H_
304 struct pwrite_args {
305 	int	fd;
306 	const void *buf;
307 	size_t	nbyte;
308 	int	pad;
309 	off_t	offset;
310 };
311 #endif
312 int
313 pwrite(p, uap)
314 	struct proc *p;
315 	register struct pwrite_args *uap;
316 {
317 	register struct file *fp;
318 
319 	if ((fp = getfp(p->p_fd, uap->fd, FWRITE)) == NULL)
320 		return (EBADF);
321 	if (fp->f_type != DTYPE_VNODE)
322 		return (ESPIPE);
323 	return (dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte, uap->offset,
324 	    FOF_OFFSET));
325 }
326 
327 static int
328 dofilewrite(p, fp, fd, buf, nbyte, offset, flags)
329 	struct proc *p;
330 	struct file *fp;
331 	int fd, flags;
332 	const void *buf;
333 	size_t nbyte;
334 	off_t offset;
335 {
336 	struct uio auio;
337 	struct iovec aiov;
338 	long cnt, error = 0;
339 #ifdef KTRACE
340 	struct iovec ktriov;
341 #endif
342 
343 	aiov.iov_base = (void *)buf;
344 	aiov.iov_len = nbyte;
345 	auio.uio_iov = &aiov;
346 	auio.uio_iovcnt = 1;
347 	auio.uio_offset = offset;
348 	if (nbyte > INT_MAX)
349 		return (EINVAL);
350 	auio.uio_resid = nbyte;
351 	auio.uio_rw = UIO_WRITE;
352 	auio.uio_segflg = UIO_USERSPACE;
353 	auio.uio_procp = p;
354 #ifdef KTRACE
355 	/*
356 	 * if tracing, save a copy of iovec
357 	 */
358 	if (KTRPOINT(p, KTR_GENIO))
359 		ktriov = aiov;
360 #endif
361 	cnt = nbyte;
362 	if ((error = fo_write(fp, &auio, fp->f_cred, flags, p))) {
363 		if (auio.uio_resid != cnt && (error == ERESTART ||
364 		    error == EINTR || error == EWOULDBLOCK))
365 			error = 0;
366 		if (error == EPIPE)
367 			psignal(p, SIGPIPE);
368 	}
369 	cnt -= auio.uio_resid;
370 #ifdef KTRACE
371 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
372 		ktrgenio(p->p_tracep, fd, UIO_WRITE,
373 		    &ktriov, cnt, error);
374 #endif
375 	p->p_retval[0] = cnt;
376 	return (error);
377 }
378 
379 /*
380  * Gather write system call
381  */
382 #ifndef _SYS_SYSPROTO_H_
383 struct writev_args {
384 	int	fd;
385 	struct	iovec *iovp;
386 	u_int	iovcnt;
387 };
388 #endif
389 int
390 writev(p, uap)
391 	struct proc *p;
392 	register struct writev_args *uap;
393 {
394 	register struct file *fp;
395 	register struct filedesc *fdp = p->p_fd;
396 	struct uio auio;
397 	register struct iovec *iov;
398 	struct iovec *needfree;
399 	struct iovec aiov[UIO_SMALLIOV];
400 	long i, cnt, error = 0;
401 	u_int iovlen;
402 #ifdef KTRACE
403 	struct iovec *ktriov = NULL;
404 #endif
405 
406 	if ((fp = getfp(fdp, uap->fd, FWRITE)) == NULL)
407 		return (EBADF);
408 	fhold(fp);
409 	/* note: can't use iovlen until iovcnt is validated */
410 	iovlen = uap->iovcnt * sizeof (struct iovec);
411 	if (uap->iovcnt > UIO_SMALLIOV) {
412 		if (uap->iovcnt > UIO_MAXIOV) {
413 			needfree = NULL;
414 			error = EINVAL;
415 			goto done;
416 		}
417 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
418 		needfree = iov;
419 	} else {
420 		iov = aiov;
421 		needfree = NULL;
422 	}
423 	auio.uio_iov = iov;
424 	auio.uio_iovcnt = uap->iovcnt;
425 	auio.uio_rw = UIO_WRITE;
426 	auio.uio_segflg = UIO_USERSPACE;
427 	auio.uio_procp = p;
428 	auio.uio_offset = -1;
429 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
430 		goto done;
431 	auio.uio_resid = 0;
432 	for (i = 0; i < uap->iovcnt; i++) {
433 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
434 			error = EINVAL;
435 			goto done;
436 		}
437 		auio.uio_resid += iov->iov_len;
438 		iov++;
439 	}
440 #ifdef KTRACE
441 	/*
442 	 * if tracing, save a copy of iovec
443 	 */
444 	if (KTRPOINT(p, KTR_GENIO))  {
445 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
446 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
447 	}
448 #endif
449 	cnt = auio.uio_resid;
450 	if ((error = fo_write(fp, &auio, fp->f_cred, 0, p))) {
451 		if (auio.uio_resid != cnt && (error == ERESTART ||
452 		    error == EINTR || error == EWOULDBLOCK))
453 			error = 0;
454 		if (error == EPIPE)
455 			psignal(p, SIGPIPE);
456 	}
457 	cnt -= auio.uio_resid;
458 #ifdef KTRACE
459 	if (ktriov != NULL) {
460 		if (error == 0)
461 			ktrgenio(p->p_tracep, uap->fd, UIO_WRITE,
462 				ktriov, cnt, error);
463 		FREE(ktriov, M_TEMP);
464 	}
465 #endif
466 	p->p_retval[0] = cnt;
467 done:
468 	fdrop(fp, p);
469 	if (needfree)
470 		FREE(needfree, M_IOV);
471 	return (error);
472 }
473 
474 /*
475  * Ioctl system call
476  */
477 #ifndef _SYS_SYSPROTO_H_
478 struct ioctl_args {
479 	int	fd;
480 	u_long	com;
481 	caddr_t	data;
482 };
483 #endif
484 /* ARGSUSED */
485 int
486 ioctl(p, uap)
487 	struct proc *p;
488 	register struct ioctl_args *uap;
489 {
490 	register struct file *fp;
491 	register struct filedesc *fdp;
492 	register u_long com;
493 	int error;
494 	register u_int size;
495 	caddr_t data, memp;
496 	int tmp;
497 #define STK_PARAMS	128
498 	char stkbuf[STK_PARAMS];
499 
500 	fdp = p->p_fd;
501 	if ((u_int)uap->fd >= fdp->fd_nfiles ||
502 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
503 		return (EBADF);
504 
505 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
506 		return (EBADF);
507 
508 	switch (com = uap->com) {
509 	case FIONCLEX:
510 		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
511 		return (0);
512 	case FIOCLEX:
513 		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
514 		return (0);
515 	}
516 
517 	/*
518 	 * Interpret high order word to find amount of data to be
519 	 * copied to/from the user's address space.
520 	 */
521 	size = IOCPARM_LEN(com);
522 	if (size > IOCPARM_MAX)
523 		return (ENOTTY);
524 	memp = NULL;
525 	if (size > sizeof (stkbuf)) {
526 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
527 		data = memp;
528 	} else
529 		data = stkbuf;
530 	if (com&IOC_IN) {
531 		if (size) {
532 			error = copyin(uap->data, data, (u_int)size);
533 			if (error) {
534 				if (memp)
535 					free(memp, M_IOCTLOPS);
536 				return (error);
537 			}
538 		} else
539 			*(caddr_t *)data = uap->data;
540 	} else if ((com&IOC_OUT) && size)
541 		/*
542 		 * Zero the buffer so the user always
543 		 * gets back something deterministic.
544 		 */
545 		bzero(data, size);
546 	else if (com&IOC_VOID)
547 		*(caddr_t *)data = uap->data;
548 
549 	switch (com) {
550 
551 	case FIONBIO:
552 		if ((tmp = *(int *)data))
553 			fp->f_flag |= FNONBLOCK;
554 		else
555 			fp->f_flag &= ~FNONBLOCK;
556 		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, p);
557 		break;
558 
559 	case FIOASYNC:
560 		if ((tmp = *(int *)data))
561 			fp->f_flag |= FASYNC;
562 		else
563 			fp->f_flag &= ~FASYNC;
564 		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, p);
565 		break;
566 
567 	default:
568 		error = fo_ioctl(fp, com, data, p);
569 		/*
570 		 * Copy any data to user, size was
571 		 * already set and checked above.
572 		 */
573 		if (error == 0 && (com&IOC_OUT) && size)
574 			error = copyout(data, uap->data, (u_int)size);
575 		break;
576 	}
577 	if (memp)
578 		free(memp, M_IOCTLOPS);
579 	return (error);
580 }
581 
582 static int	nselcoll;
583 int	selwait;
584 
585 /*
586  * Select system call.
587  */
588 #ifndef _SYS_SYSPROTO_H_
589 struct select_args {
590 	int	nd;
591 	fd_set	*in, *ou, *ex;
592 	struct	timeval *tv;
593 };
594 #endif
595 int
596 select(p, uap)
597 	register struct proc *p;
598 	register struct select_args *uap;
599 {
600 	/*
601 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
602 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
603 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
604 	 * of 256.
605 	 */
606 	fd_mask s_selbits[howmany(2048, NFDBITS)];
607 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
608 	struct timeval atv, rtv, ttv;
609 	int s, ncoll, error, timo;
610 	u_int nbufbytes, ncpbytes, nfdbits;
611 
612 	if (uap->nd < 0)
613 		return (EINVAL);
614 	if (uap->nd > p->p_fd->fd_nfiles)
615 		uap->nd = p->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
616 
617 	/*
618 	 * Allocate just enough bits for the non-null fd_sets.  Use the
619 	 * preallocated auto buffer if possible.
620 	 */
621 	nfdbits = roundup(uap->nd, NFDBITS);
622 	ncpbytes = nfdbits / NBBY;
623 	nbufbytes = 0;
624 	if (uap->in != NULL)
625 		nbufbytes += 2 * ncpbytes;
626 	if (uap->ou != NULL)
627 		nbufbytes += 2 * ncpbytes;
628 	if (uap->ex != NULL)
629 		nbufbytes += 2 * ncpbytes;
630 	if (nbufbytes <= sizeof s_selbits)
631 		selbits = &s_selbits[0];
632 	else
633 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
634 
635 	/*
636 	 * Assign pointers into the bit buffers and fetch the input bits.
637 	 * Put the output buffers together so that they can be bzeroed
638 	 * together.
639 	 */
640 	sbp = selbits;
641 #define	getbits(name, x) \
642 	do {								\
643 		if (uap->name == NULL)					\
644 			ibits[x] = NULL;				\
645 		else {							\
646 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
647 			obits[x] = sbp;					\
648 			sbp += ncpbytes / sizeof *sbp;			\
649 			error = copyin(uap->name, ibits[x], ncpbytes);	\
650 			if (error != 0)					\
651 				goto done;				\
652 		}							\
653 	} while (0)
654 	getbits(in, 0);
655 	getbits(ou, 1);
656 	getbits(ex, 2);
657 #undef	getbits
658 	if (nbufbytes != 0)
659 		bzero(selbits, nbufbytes / 2);
660 
661 	if (uap->tv) {
662 		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
663 			sizeof (atv));
664 		if (error)
665 			goto done;
666 		if (itimerfix(&atv)) {
667 			error = EINVAL;
668 			goto done;
669 		}
670 		getmicrouptime(&rtv);
671 		timevaladd(&atv, &rtv);
672 	} else
673 		atv.tv_sec = 0;
674 	timo = 0;
675 retry:
676 	ncoll = nselcoll;
677 	p->p_flag |= P_SELECT;
678 	error = selscan(p, ibits, obits, uap->nd);
679 	if (error || p->p_retval[0])
680 		goto done;
681 	if (atv.tv_sec) {
682 		getmicrouptime(&rtv);
683 		if (timevalcmp(&rtv, &atv, >=))
684 			goto done;
685 		ttv = atv;
686 		timevalsub(&ttv, &rtv);
687 		timo = ttv.tv_sec > 24 * 60 * 60 ?
688 		    24 * 60 * 60 * hz : tvtohz(&ttv);
689 	}
690 	s = splhigh();
691 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
692 		splx(s);
693 		goto retry;
694 	}
695 	p->p_flag &= ~P_SELECT;
696 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
697 	splx(s);
698 	if (error == 0)
699 		goto retry;
700 done:
701 	p->p_flag &= ~P_SELECT;
702 	/* select is not restarted after signals... */
703 	if (error == ERESTART)
704 		error = EINTR;
705 	if (error == EWOULDBLOCK)
706 		error = 0;
707 #define	putbits(name, x) \
708 	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
709 		error = error2;
710 	if (error == 0) {
711 		int error2;
712 
713 		putbits(in, 0);
714 		putbits(ou, 1);
715 		putbits(ex, 2);
716 #undef putbits
717 	}
718 	if (selbits != &s_selbits[0])
719 		free(selbits, M_SELECT);
720 	return (error);
721 }
722 
723 static int
724 selscan(p, ibits, obits, nfd)
725 	struct proc *p;
726 	fd_mask **ibits, **obits;
727 	int nfd;
728 {
729 	register struct filedesc *fdp = p->p_fd;
730 	register int msk, i, j, fd;
731 	register fd_mask bits;
732 	struct file *fp;
733 	int n = 0;
734 	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
735 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
736 
737 	for (msk = 0; msk < 3; msk++) {
738 		if (ibits[msk] == NULL)
739 			continue;
740 		for (i = 0; i < nfd; i += NFDBITS) {
741 			bits = ibits[msk][i/NFDBITS];
742 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
743 				bits &= ~(1 << j);
744 				fp = fdp->fd_ofiles[fd];
745 				if (fp == NULL)
746 					return (EBADF);
747 				if (fo_poll(fp, flag[msk], fp->f_cred, p)) {
748 					obits[msk][(fd)/NFDBITS] |=
749 						(1 << ((fd) % NFDBITS));
750 					n++;
751 				}
752 			}
753 		}
754 	}
755 	p->p_retval[0] = n;
756 	return (0);
757 }
758 
759 /*
760  * Poll system call.
761  */
762 #ifndef _SYS_SYSPROTO_H_
763 struct poll_args {
764 	struct pollfd *fds;
765 	u_int	nfds;
766 	int	timeout;
767 };
768 #endif
769 int
770 poll(p, uap)
771 	register struct proc *p;
772 	register struct poll_args *uap;
773 {
774 	caddr_t bits;
775 	char smallbits[32 * sizeof(struct pollfd)];
776 	struct timeval atv, rtv, ttv;
777 	int s, ncoll, error = 0, timo;
778 	size_t ni;
779 
780 	if (SCARG(uap, nfds) > p->p_fd->fd_nfiles) {
781 		/* forgiving; slightly wrong */
782 		SCARG(uap, nfds) = p->p_fd->fd_nfiles;
783 	}
784 	ni = SCARG(uap, nfds) * sizeof(struct pollfd);
785 	if (ni > sizeof(smallbits))
786 		bits = malloc(ni, M_TEMP, M_WAITOK);
787 	else
788 		bits = smallbits;
789 	error = copyin(SCARG(uap, fds), bits, ni);
790 	if (error)
791 		goto done;
792 	if (SCARG(uap, timeout) != INFTIM) {
793 		atv.tv_sec = SCARG(uap, timeout) / 1000;
794 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
795 		if (itimerfix(&atv)) {
796 			error = EINVAL;
797 			goto done;
798 		}
799 		getmicrouptime(&rtv);
800 		timevaladd(&atv, &rtv);
801 	} else
802 		atv.tv_sec = 0;
803 	timo = 0;
804 retry:
805 	ncoll = nselcoll;
806 	p->p_flag |= P_SELECT;
807 	error = pollscan(p, (struct pollfd *)bits, SCARG(uap, nfds));
808 	if (error || p->p_retval[0])
809 		goto done;
810 	if (atv.tv_sec) {
811 		getmicrouptime(&rtv);
812 		if (timevalcmp(&rtv, &atv, >=))
813 			goto done;
814 		ttv = atv;
815 		timevalsub(&ttv, &rtv);
816 		timo = ttv.tv_sec > 24 * 60 * 60 ?
817 		    24 * 60 * 60 * hz : tvtohz(&ttv);
818 	}
819 	s = splhigh();
820 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
821 		splx(s);
822 		goto retry;
823 	}
824 	p->p_flag &= ~P_SELECT;
825 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
826 	splx(s);
827 	if (error == 0)
828 		goto retry;
829 done:
830 	p->p_flag &= ~P_SELECT;
831 	/* poll is not restarted after signals... */
832 	if (error == ERESTART)
833 		error = EINTR;
834 	if (error == EWOULDBLOCK)
835 		error = 0;
836 	if (error == 0) {
837 		error = copyout(bits, SCARG(uap, fds), ni);
838 		if (error)
839 			goto out;
840 	}
841 out:
842 	if (ni > sizeof(smallbits))
843 		free(bits, M_TEMP);
844 	return (error);
845 }
846 
847 static int
848 pollscan(p, fds, nfd)
849 	struct proc *p;
850 	struct pollfd *fds;
851 	int nfd;
852 {
853 	register struct filedesc *fdp = p->p_fd;
854 	int i;
855 	struct file *fp;
856 	int n = 0;
857 
858 	for (i = 0; i < nfd; i++, fds++) {
859 		if (fds->fd >= fdp->fd_nfiles) {
860 			fds->revents = POLLNVAL;
861 			n++;
862 		} else if (fds->fd < 0) {
863 			fds->revents = 0;
864 		} else {
865 			fp = fdp->fd_ofiles[fds->fd];
866 			if (fp == 0) {
867 				fds->revents = POLLNVAL;
868 				n++;
869 			} else {
870 				/*
871 				 * Note: backend also returns POLLHUP and
872 				 * POLLERR if appropriate.
873 				 */
874 				fds->revents = fo_poll(fp, fds->events,
875 				    fp->f_cred, p);
876 				if (fds->revents != 0)
877 					n++;
878 			}
879 		}
880 	}
881 	p->p_retval[0] = n;
882 	return (0);
883 }
884 
885 /*
886  * OpenBSD poll system call.
887  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
888  */
889 #ifndef _SYS_SYSPROTO_H_
890 struct openbsd_poll_args {
891 	struct pollfd *fds;
892 	u_int	nfds;
893 	int	timeout;
894 };
895 #endif
896 int
897 openbsd_poll(p, uap)
898 	register struct proc *p;
899 	register struct openbsd_poll_args *uap;
900 {
901 	return (poll(p, (struct poll_args *)uap));
902 }
903 
904 /*ARGSUSED*/
905 int
906 seltrue(dev, events, p)
907 	dev_t dev;
908 	int events;
909 	struct proc *p;
910 {
911 
912 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
913 }
914 
915 /*
916  * Record a select request.
917  */
918 void
919 selrecord(selector, sip)
920 	struct proc *selector;
921 	struct selinfo *sip;
922 {
923 	struct proc *p;
924 	pid_t mypid;
925 
926 	mypid = selector->p_pid;
927 	if (sip->si_pid == mypid)
928 		return;
929 	if (sip->si_pid && (p = pfind(sip->si_pid)) &&
930 	    p->p_wchan == (caddr_t)&selwait)
931 		sip->si_flags |= SI_COLL;
932 	else
933 		sip->si_pid = mypid;
934 }
935 
936 /*
937  * Do a wakeup when a selectable event occurs.
938  */
939 void
940 selwakeup(sip)
941 	register struct selinfo *sip;
942 {
943 	register struct proc *p;
944 	int s;
945 
946 	if (sip->si_pid == 0)
947 		return;
948 	if (sip->si_flags & SI_COLL) {
949 		nselcoll++;
950 		sip->si_flags &= ~SI_COLL;
951 		wakeup((caddr_t)&selwait);
952 	}
953 	p = pfind(sip->si_pid);
954 	sip->si_pid = 0;
955 	if (p != NULL) {
956 		s = splhigh();
957 		if (p->p_wchan == (caddr_t)&selwait) {
958 			if (p->p_stat == SSLEEP)
959 				setrunnable(p);
960 			else
961 				unsleep(p);
962 		} else if (p->p_flag & P_SELECT)
963 			p->p_flag &= ~P_SELECT;
964 		splx(s);
965 	}
966 }
967