xref: /freebsd/sys/kern/sys_generic.c (revision 04c9749ff0148ec8f73b150cec8bc2c094a5d31a)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $FreeBSD$
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/signalvar.h>
53 #include <sys/socketvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/sysctl.h>
59 #include <sys/sysent.h>
60 #ifdef KTRACE
61 #include <sys/ktrace.h>
62 #endif
63 
64 #include <machine/limits.h>
65 
66 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
67 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
68 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
69 
70 static int	pollscan __P((struct proc *, struct pollfd *, int));
71 static int	selscan __P((struct proc *, fd_mask **, fd_mask **, int));
72 static int	dofileread __P((struct proc *, struct file *, int, void *,
73 		    size_t, off_t, int));
74 static int	dofilewrite __P((struct proc *, struct file *, int,
75 		    const void *, size_t, off_t, int));
76 
77 struct file*
78 getfp(fdp, fd, flag)
79 	struct filedesc* fdp;
80 	int fd, flag;
81 {
82 	struct file* fp;
83 
84 	if (((u_int)fd) >= fdp->fd_nfiles ||
85 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
86 	    (fp->f_flag & flag) == 0)
87 		return (NULL);
88 	return (fp);
89 }
90 
91 /*
92  * Read system call.
93  */
94 #ifndef _SYS_SYSPROTO_H_
95 struct read_args {
96 	int	fd;
97 	void	*buf;
98 	size_t	nbyte;
99 };
100 #endif
101 int
102 read(p, uap)
103 	struct proc *p;
104 	register struct read_args *uap;
105 {
106 	register struct file *fp;
107 
108 	if ((fp = getfp(p->p_fd, uap->fd, FREAD)) == NULL)
109 		return (EBADF);
110 	return (dofileread(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0));
111 }
112 
113 /*
114  * Pread system call
115  */
116 #ifndef _SYS_SYSPROTO_H_
117 struct pread_args {
118 	int	fd;
119 	void	*buf;
120 	size_t	nbyte;
121 	int	pad;
122 	off_t	offset;
123 };
124 #endif
125 int
126 pread(p, uap)
127 	struct proc *p;
128 	register struct pread_args *uap;
129 {
130 	register struct file *fp;
131 
132 	if ((fp = getfp(p->p_fd, uap->fd, FREAD)) == NULL)
133 		return (EBADF);
134 	if (fp->f_type != DTYPE_VNODE)
135 		return (ESPIPE);
136 	return (dofileread(p, fp, uap->fd, uap->buf, uap->nbyte, uap->offset,
137 	    FOF_OFFSET));
138 }
139 
140 /*
141  * Code common for read and pread
142  */
143 int
144 dofileread(p, fp, fd, buf, nbyte, offset, flags)
145 	struct proc *p;
146 	struct file *fp;
147 	int fd, flags;
148 	void *buf;
149 	size_t nbyte;
150 	off_t offset;
151 {
152 	struct uio auio;
153 	struct iovec aiov;
154 	long cnt, error = 0;
155 #ifdef KTRACE
156 	struct iovec ktriov;
157 	struct uio ktruio;
158 	int didktr = 0;
159 #endif
160 
161 	aiov.iov_base = (caddr_t)buf;
162 	aiov.iov_len = nbyte;
163 	auio.uio_iov = &aiov;
164 	auio.uio_iovcnt = 1;
165 	auio.uio_offset = offset;
166 	if (nbyte > INT_MAX)
167 		return (EINVAL);
168 	auio.uio_resid = nbyte;
169 	auio.uio_rw = UIO_READ;
170 	auio.uio_segflg = UIO_USERSPACE;
171 	auio.uio_procp = p;
172 #ifdef KTRACE
173 	/*
174 	 * if tracing, save a copy of iovec
175 	 */
176 	if (KTRPOINT(p, KTR_GENIO)) {
177 		ktriov = aiov;
178 		ktruio = auio;
179 		didktr = 1;
180 	}
181 #endif
182 	cnt = nbyte;
183 	if ((error = fo_read(fp, &auio, fp->f_cred, flags, p)))
184 		if (auio.uio_resid != cnt && (error == ERESTART ||
185 		    error == EINTR || error == EWOULDBLOCK))
186 			error = 0;
187 	cnt -= auio.uio_resid;
188 #ifdef KTRACE
189 	if (didktr && error == 0) {
190 		ktruio.uio_iov = &ktriov;
191 		ktruio.uio_resid = cnt;
192 		ktrgenio(p->p_tracep, fd, UIO_READ, &ktruio, error);
193 	}
194 #endif
195 	p->p_retval[0] = cnt;
196 	return (error);
197 }
198 
199 /*
200  * Scatter read system call.
201  */
202 #ifndef _SYS_SYSPROTO_H_
203 struct readv_args {
204 	int	fd;
205 	struct	iovec *iovp;
206 	u_int	iovcnt;
207 };
208 #endif
209 int
210 readv(p, uap)
211 	struct proc *p;
212 	register struct readv_args *uap;
213 {
214 	register struct file *fp;
215 	register struct filedesc *fdp = p->p_fd;
216 	struct uio auio;
217 	register struct iovec *iov;
218 	struct iovec *needfree;
219 	struct iovec aiov[UIO_SMALLIOV];
220 	long i, cnt, error = 0;
221 	u_int iovlen;
222 #ifdef KTRACE
223 	struct iovec *ktriov = NULL;
224 	struct uio ktruio;
225 #endif
226 
227 	if ((fp = getfp(fdp, uap->fd, FREAD)) == NULL)
228 		return (EBADF);
229 	/* note: can't use iovlen until iovcnt is validated */
230 	iovlen = uap->iovcnt * sizeof (struct iovec);
231 	if (uap->iovcnt > UIO_SMALLIOV) {
232 		if (uap->iovcnt > UIO_MAXIOV)
233 			return (EINVAL);
234 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
235 		needfree = iov;
236 	} else {
237 		iov = aiov;
238 		needfree = NULL;
239 	}
240 	auio.uio_iov = iov;
241 	auio.uio_iovcnt = uap->iovcnt;
242 	auio.uio_rw = UIO_READ;
243 	auio.uio_segflg = UIO_USERSPACE;
244 	auio.uio_procp = p;
245 	auio.uio_offset = -1;
246 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
247 		goto done;
248 	auio.uio_resid = 0;
249 	for (i = 0; i < uap->iovcnt; i++) {
250 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
251 			error = EINVAL;
252 			goto done;
253 		}
254 		auio.uio_resid += iov->iov_len;
255 		iov++;
256 	}
257 #ifdef KTRACE
258 	/*
259 	 * if tracing, save a copy of iovec
260 	 */
261 	if (KTRPOINT(p, KTR_GENIO))  {
262 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
263 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
264 		ktruio = auio;
265 	}
266 #endif
267 	cnt = auio.uio_resid;
268 	if ((error = fo_read(fp, &auio, fp->f_cred, 0, p)))
269 		if (auio.uio_resid != cnt && (error == ERESTART ||
270 		    error == EINTR || error == EWOULDBLOCK))
271 			error = 0;
272 	cnt -= auio.uio_resid;
273 #ifdef KTRACE
274 	if (ktriov != NULL) {
275 		if (error == 0) {
276 			ktruio.uio_iov = ktriov;
277 			ktruio.uio_resid = cnt;
278 			ktrgenio(p->p_tracep, uap->fd, UIO_READ, &ktruio,
279 			    error);
280 		}
281 		FREE(ktriov, M_TEMP);
282 	}
283 #endif
284 	p->p_retval[0] = cnt;
285 done:
286 	if (needfree)
287 		FREE(needfree, M_IOV);
288 	return (error);
289 }
290 
291 /*
292  * Write system call
293  */
294 #ifndef _SYS_SYSPROTO_H_
295 struct write_args {
296 	int	fd;
297 	const void *buf;
298 	size_t	nbyte;
299 };
300 #endif
301 int
302 write(p, uap)
303 	struct proc *p;
304 	register struct write_args *uap;
305 {
306 	register struct file *fp;
307 
308 	if ((fp = getfp(p->p_fd, uap->fd, FWRITE)) == NULL)
309 		return (EBADF);
310 	return (dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0));
311 }
312 
313 /*
314  * Pwrite system call
315  */
316 #ifndef _SYS_SYSPROTO_H_
317 struct pwrite_args {
318 	int	fd;
319 	const void *buf;
320 	size_t	nbyte;
321 	int	pad;
322 	off_t	offset;
323 };
324 #endif
325 int
326 pwrite(p, uap)
327 	struct proc *p;
328 	register struct pwrite_args *uap;
329 {
330 	register struct file *fp;
331 
332 	if ((fp = getfp(p->p_fd, uap->fd, FWRITE)) == NULL)
333 		return (EBADF);
334 	if (fp->f_type != DTYPE_VNODE)
335 		return (ESPIPE);
336 	return (dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte, uap->offset,
337 	    FOF_OFFSET));
338 }
339 
340 static int
341 dofilewrite(p, fp, fd, buf, nbyte, offset, flags)
342 	struct proc *p;
343 	struct file *fp;
344 	int fd, flags;
345 	const void *buf;
346 	size_t nbyte;
347 	off_t offset;
348 {
349 	struct uio auio;
350 	struct iovec aiov;
351 	long cnt, error = 0;
352 #ifdef KTRACE
353 	struct iovec ktriov;
354 	struct uio ktruio;
355 	int didktr = 0;
356 #endif
357 
358 	aiov.iov_base = (void *)(uintptr_t)buf;
359 	aiov.iov_len = nbyte;
360 	auio.uio_iov = &aiov;
361 	auio.uio_iovcnt = 1;
362 	auio.uio_offset = offset;
363 	if (nbyte > INT_MAX)
364 		return (EINVAL);
365 	auio.uio_resid = nbyte;
366 	auio.uio_rw = UIO_WRITE;
367 	auio.uio_segflg = UIO_USERSPACE;
368 	auio.uio_procp = p;
369 #ifdef KTRACE
370 	/*
371 	 * if tracing, save a copy of iovec and uio
372 	 */
373 	if (KTRPOINT(p, KTR_GENIO)) {
374 		ktriov = aiov;
375 		ktruio = auio;
376 		didktr = 1;
377 	}
378 #endif
379 	cnt = nbyte;
380 	if ((error = fo_write(fp, &auio, fp->f_cred, flags, p))) {
381 		if (auio.uio_resid != cnt && (error == ERESTART ||
382 		    error == EINTR || error == EWOULDBLOCK))
383 			error = 0;
384 		if (error == EPIPE)
385 			psignal(p, SIGPIPE);
386 	}
387 	cnt -= auio.uio_resid;
388 #ifdef KTRACE
389 	if (didktr && error == 0) {
390 		ktruio.uio_iov = &ktriov;
391 		ktruio.uio_resid = cnt;
392 		ktrgenio(p->p_tracep, fd, UIO_WRITE, &ktruio, error);
393 	}
394 #endif
395 	p->p_retval[0] = cnt;
396 	return (error);
397 }
398 
399 /*
400  * Gather write system call
401  */
402 #ifndef _SYS_SYSPROTO_H_
403 struct writev_args {
404 	int	fd;
405 	struct	iovec *iovp;
406 	u_int	iovcnt;
407 };
408 #endif
409 int
410 writev(p, uap)
411 	struct proc *p;
412 	register struct writev_args *uap;
413 {
414 	register struct file *fp;
415 	register struct filedesc *fdp = p->p_fd;
416 	struct uio auio;
417 	register struct iovec *iov;
418 	struct iovec *needfree;
419 	struct iovec aiov[UIO_SMALLIOV];
420 	long i, cnt, error = 0;
421 	u_int iovlen;
422 #ifdef KTRACE
423 	struct iovec *ktriov = NULL;
424 	struct uio ktruio;
425 #endif
426 
427 	if ((fp = getfp(fdp, uap->fd, FWRITE)) == NULL)
428 		return (EBADF);
429 	fhold(fp);
430 	/* note: can't use iovlen until iovcnt is validated */
431 	iovlen = uap->iovcnt * sizeof (struct iovec);
432 	if (uap->iovcnt > UIO_SMALLIOV) {
433 		if (uap->iovcnt > UIO_MAXIOV) {
434 			needfree = NULL;
435 			error = EINVAL;
436 			goto done;
437 		}
438 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
439 		needfree = iov;
440 	} else {
441 		iov = aiov;
442 		needfree = NULL;
443 	}
444 	auio.uio_iov = iov;
445 	auio.uio_iovcnt = uap->iovcnt;
446 	auio.uio_rw = UIO_WRITE;
447 	auio.uio_segflg = UIO_USERSPACE;
448 	auio.uio_procp = p;
449 	auio.uio_offset = -1;
450 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
451 		goto done;
452 	auio.uio_resid = 0;
453 	for (i = 0; i < uap->iovcnt; i++) {
454 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
455 			error = EINVAL;
456 			goto done;
457 		}
458 		auio.uio_resid += iov->iov_len;
459 		iov++;
460 	}
461 #ifdef KTRACE
462 	/*
463 	 * if tracing, save a copy of iovec and uio
464 	 */
465 	if (KTRPOINT(p, KTR_GENIO))  {
466 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
467 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
468 		ktruio = auio;
469 	}
470 #endif
471 	cnt = auio.uio_resid;
472 	if ((error = fo_write(fp, &auio, fp->f_cred, 0, p))) {
473 		if (auio.uio_resid != cnt && (error == ERESTART ||
474 		    error == EINTR || error == EWOULDBLOCK))
475 			error = 0;
476 		if (error == EPIPE)
477 			psignal(p, SIGPIPE);
478 	}
479 	cnt -= auio.uio_resid;
480 #ifdef KTRACE
481 	if (ktriov != NULL) {
482 		if (error == 0) {
483 			ktruio.uio_iov = ktriov;
484 			ktruio.uio_resid = cnt;
485 			ktrgenio(p->p_tracep, uap->fd, UIO_WRITE, &ktruio,
486 			    error);
487 		}
488 		FREE(ktriov, M_TEMP);
489 	}
490 #endif
491 	p->p_retval[0] = cnt;
492 done:
493 	fdrop(fp, p);
494 	if (needfree)
495 		FREE(needfree, M_IOV);
496 	return (error);
497 }
498 
499 /*
500  * Ioctl system call
501  */
502 #ifndef _SYS_SYSPROTO_H_
503 struct ioctl_args {
504 	int	fd;
505 	u_long	com;
506 	caddr_t	data;
507 };
508 #endif
509 /* ARGSUSED */
510 int
511 ioctl(p, uap)
512 	struct proc *p;
513 	register struct ioctl_args *uap;
514 {
515 	register struct file *fp;
516 	register struct filedesc *fdp;
517 	register u_long com;
518 	int error;
519 	register u_int size;
520 	caddr_t data, memp;
521 	int tmp;
522 #define STK_PARAMS	128
523 	union {
524 	    char stkbuf[STK_PARAMS];
525 	    long align;
526 	} ubuf;
527 
528 	fdp = p->p_fd;
529 	if ((u_int)uap->fd >= fdp->fd_nfiles ||
530 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
531 		return (EBADF);
532 
533 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
534 		return (EBADF);
535 
536 	switch (com = uap->com) {
537 	case FIONCLEX:
538 		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
539 		return (0);
540 	case FIOCLEX:
541 		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
542 		return (0);
543 	}
544 
545 	/*
546 	 * Interpret high order word to find amount of data to be
547 	 * copied to/from the user's address space.
548 	 */
549 	size = IOCPARM_LEN(com);
550 	if (size > IOCPARM_MAX)
551 		return (ENOTTY);
552 	memp = NULL;
553 	if (size > sizeof (ubuf.stkbuf)) {
554 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
555 		data = memp;
556 	} else
557 		data = ubuf.stkbuf;
558 	if (com&IOC_IN) {
559 		if (size) {
560 			error = copyin(uap->data, data, (u_int)size);
561 			if (error) {
562 				if (memp)
563 					free(memp, M_IOCTLOPS);
564 				return (error);
565 			}
566 		} else
567 			*(caddr_t *)data = uap->data;
568 	} else if ((com&IOC_OUT) && size)
569 		/*
570 		 * Zero the buffer so the user always
571 		 * gets back something deterministic.
572 		 */
573 		bzero(data, size);
574 	else if (com&IOC_VOID)
575 		*(caddr_t *)data = uap->data;
576 
577 	switch (com) {
578 
579 	case FIONBIO:
580 		if ((tmp = *(int *)data))
581 			fp->f_flag |= FNONBLOCK;
582 		else
583 			fp->f_flag &= ~FNONBLOCK;
584 		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, p);
585 		break;
586 
587 	case FIOASYNC:
588 		if ((tmp = *(int *)data))
589 			fp->f_flag |= FASYNC;
590 		else
591 			fp->f_flag &= ~FASYNC;
592 		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, p);
593 		break;
594 
595 	default:
596 		error = fo_ioctl(fp, com, data, p);
597 		/*
598 		 * Copy any data to user, size was
599 		 * already set and checked above.
600 		 */
601 		if (error == 0 && (com&IOC_OUT) && size)
602 			error = copyout(data, uap->data, (u_int)size);
603 		break;
604 	}
605 	if (memp)
606 		free(memp, M_IOCTLOPS);
607 	return (error);
608 }
609 
610 static int	nselcoll;	/* Select collisions since boot */
611 int	selwait;
612 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
613 
614 /*
615  * Select system call.
616  */
617 #ifndef _SYS_SYSPROTO_H_
618 struct select_args {
619 	int	nd;
620 	fd_set	*in, *ou, *ex;
621 	struct	timeval *tv;
622 };
623 #endif
624 int
625 select(p, uap)
626 	register struct proc *p;
627 	register struct select_args *uap;
628 {
629 	/*
630 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
631 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
632 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
633 	 * of 256.
634 	 */
635 	fd_mask s_selbits[howmany(2048, NFDBITS)];
636 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
637 	struct timeval atv, rtv, ttv;
638 	int s, ncoll, error, timo;
639 	u_int nbufbytes, ncpbytes, nfdbits;
640 
641 	if (uap->nd < 0)
642 		return (EINVAL);
643 	if (uap->nd > p->p_fd->fd_nfiles)
644 		uap->nd = p->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
645 
646 	/*
647 	 * Allocate just enough bits for the non-null fd_sets.  Use the
648 	 * preallocated auto buffer if possible.
649 	 */
650 	nfdbits = roundup(uap->nd, NFDBITS);
651 	ncpbytes = nfdbits / NBBY;
652 	nbufbytes = 0;
653 	if (uap->in != NULL)
654 		nbufbytes += 2 * ncpbytes;
655 	if (uap->ou != NULL)
656 		nbufbytes += 2 * ncpbytes;
657 	if (uap->ex != NULL)
658 		nbufbytes += 2 * ncpbytes;
659 	if (nbufbytes <= sizeof s_selbits)
660 		selbits = &s_selbits[0];
661 	else
662 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
663 
664 	/*
665 	 * Assign pointers into the bit buffers and fetch the input bits.
666 	 * Put the output buffers together so that they can be bzeroed
667 	 * together.
668 	 */
669 	sbp = selbits;
670 #define	getbits(name, x) \
671 	do {								\
672 		if (uap->name == NULL)					\
673 			ibits[x] = NULL;				\
674 		else {							\
675 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
676 			obits[x] = sbp;					\
677 			sbp += ncpbytes / sizeof *sbp;			\
678 			error = copyin(uap->name, ibits[x], ncpbytes);	\
679 			if (error != 0)					\
680 				goto done;				\
681 		}							\
682 	} while (0)
683 	getbits(in, 0);
684 	getbits(ou, 1);
685 	getbits(ex, 2);
686 #undef	getbits
687 	if (nbufbytes != 0)
688 		bzero(selbits, nbufbytes / 2);
689 
690 	if (uap->tv) {
691 		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
692 			sizeof (atv));
693 		if (error)
694 			goto done;
695 		if (itimerfix(&atv)) {
696 			error = EINVAL;
697 			goto done;
698 		}
699 		getmicrouptime(&rtv);
700 		timevaladd(&atv, &rtv);
701 	} else {
702 		atv.tv_sec = 0;
703 		atv.tv_usec = 0;
704 	}
705 	timo = 0;
706 retry:
707 	ncoll = nselcoll;
708 	p->p_flag |= P_SELECT;
709 	error = selscan(p, ibits, obits, uap->nd);
710 	if (error || p->p_retval[0])
711 		goto done;
712 	if (atv.tv_sec || atv.tv_usec) {
713 		getmicrouptime(&rtv);
714 		if (timevalcmp(&rtv, &atv, >=))
715 			goto done;
716 		ttv = atv;
717 		timevalsub(&ttv, &rtv);
718 		timo = ttv.tv_sec > 24 * 60 * 60 ?
719 		    24 * 60 * 60 * hz : tvtohz(&ttv);
720 	}
721 	s = splhigh();
722 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
723 		splx(s);
724 		goto retry;
725 	}
726 	p->p_flag &= ~P_SELECT;
727 
728 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
729 
730 	splx(s);
731 	if (error == 0)
732 		goto retry;
733 done:
734 	p->p_flag &= ~P_SELECT;
735 	/* select is not restarted after signals... */
736 	if (error == ERESTART)
737 		error = EINTR;
738 	if (error == EWOULDBLOCK)
739 		error = 0;
740 #define	putbits(name, x) \
741 	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
742 		error = error2;
743 	if (error == 0) {
744 		int error2;
745 
746 		putbits(in, 0);
747 		putbits(ou, 1);
748 		putbits(ex, 2);
749 #undef putbits
750 	}
751 	if (selbits != &s_selbits[0])
752 		free(selbits, M_SELECT);
753 	return (error);
754 }
755 
756 static int
757 selscan(p, ibits, obits, nfd)
758 	struct proc *p;
759 	fd_mask **ibits, **obits;
760 	int nfd;
761 {
762 	struct filedesc *fdp = p->p_fd;
763 	int msk, i, fd;
764 	fd_mask bits;
765 	struct file *fp;
766 	int n = 0;
767 	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
768 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
769 
770 	for (msk = 0; msk < 3; msk++) {
771 		if (ibits[msk] == NULL)
772 			continue;
773 		for (i = 0; i < nfd; i += NFDBITS) {
774 			bits = ibits[msk][i/NFDBITS];
775 			/* ffs(int mask) not portable, fd_mask is long */
776 			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
777 				if (!(bits & 1))
778 					continue;
779 				fp = fdp->fd_ofiles[fd];
780 				if (fp == NULL)
781 					return (EBADF);
782 				if (fo_poll(fp, flag[msk], fp->f_cred, p)) {
783 					obits[msk][(fd)/NFDBITS] |=
784 					    ((fd_mask)1 << ((fd) % NFDBITS));
785 					n++;
786 				}
787 			}
788 		}
789 	}
790 	p->p_retval[0] = n;
791 	return (0);
792 }
793 
794 /*
795  * Poll system call.
796  */
797 #ifndef _SYS_SYSPROTO_H_
798 struct poll_args {
799 	struct pollfd *fds;
800 	u_int	nfds;
801 	int	timeout;
802 };
803 #endif
804 int
805 poll(p, uap)
806 	register struct proc *p;
807 	register struct poll_args *uap;
808 {
809 	caddr_t bits;
810 	char smallbits[32 * sizeof(struct pollfd)];
811 	struct timeval atv, rtv, ttv;
812 	int s, ncoll, error = 0, timo;
813 	size_t ni;
814 
815 	if (SCARG(uap, nfds) > p->p_fd->fd_nfiles) {
816 		/* forgiving; slightly wrong */
817 		SCARG(uap, nfds) = p->p_fd->fd_nfiles;
818 	}
819 	ni = SCARG(uap, nfds) * sizeof(struct pollfd);
820 	if (ni > sizeof(smallbits))
821 		bits = malloc(ni, M_TEMP, M_WAITOK);
822 	else
823 		bits = smallbits;
824 	error = copyin(SCARG(uap, fds), bits, ni);
825 	if (error)
826 		goto done;
827 	if (SCARG(uap, timeout) != INFTIM) {
828 		atv.tv_sec = SCARG(uap, timeout) / 1000;
829 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
830 		if (itimerfix(&atv)) {
831 			error = EINVAL;
832 			goto done;
833 		}
834 		getmicrouptime(&rtv);
835 		timevaladd(&atv, &rtv);
836 	} else {
837 		atv.tv_sec = 0;
838 		atv.tv_usec = 0;
839 	}
840 	timo = 0;
841 retry:
842 	ncoll = nselcoll;
843 	p->p_flag |= P_SELECT;
844 	error = pollscan(p, (struct pollfd *)bits, SCARG(uap, nfds));
845 	if (error || p->p_retval[0])
846 		goto done;
847 	if (atv.tv_sec || atv.tv_usec) {
848 		getmicrouptime(&rtv);
849 		if (timevalcmp(&rtv, &atv, >=))
850 			goto done;
851 		ttv = atv;
852 		timevalsub(&ttv, &rtv);
853 		timo = ttv.tv_sec > 24 * 60 * 60 ?
854 		    24 * 60 * 60 * hz : tvtohz(&ttv);
855 	}
856 	s = splhigh();
857 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
858 		splx(s);
859 		goto retry;
860 	}
861 	p->p_flag &= ~P_SELECT;
862 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
863 	splx(s);
864 	if (error == 0)
865 		goto retry;
866 done:
867 	p->p_flag &= ~P_SELECT;
868 	/* poll is not restarted after signals... */
869 	if (error == ERESTART)
870 		error = EINTR;
871 	if (error == EWOULDBLOCK)
872 		error = 0;
873 	if (error == 0) {
874 		error = copyout(bits, SCARG(uap, fds), ni);
875 		if (error)
876 			goto out;
877 	}
878 out:
879 	if (ni > sizeof(smallbits))
880 		free(bits, M_TEMP);
881 	return (error);
882 }
883 
884 static int
885 pollscan(p, fds, nfd)
886 	struct proc *p;
887 	struct pollfd *fds;
888 	int nfd;
889 {
890 	register struct filedesc *fdp = p->p_fd;
891 	int i;
892 	struct file *fp;
893 	int n = 0;
894 
895 	for (i = 0; i < nfd; i++, fds++) {
896 		if (fds->fd >= fdp->fd_nfiles) {
897 			fds->revents = POLLNVAL;
898 			n++;
899 		} else if (fds->fd < 0) {
900 			fds->revents = 0;
901 		} else {
902 			fp = fdp->fd_ofiles[fds->fd];
903 			if (fp == 0) {
904 				fds->revents = POLLNVAL;
905 				n++;
906 			} else {
907 				/*
908 				 * Note: backend also returns POLLHUP and
909 				 * POLLERR if appropriate.
910 				 */
911 				fds->revents = fo_poll(fp, fds->events,
912 				    fp->f_cred, p);
913 				if (fds->revents != 0)
914 					n++;
915 			}
916 		}
917 	}
918 	p->p_retval[0] = n;
919 	return (0);
920 }
921 
922 /*
923  * OpenBSD poll system call.
924  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
925  */
926 #ifndef _SYS_SYSPROTO_H_
927 struct openbsd_poll_args {
928 	struct pollfd *fds;
929 	u_int	nfds;
930 	int	timeout;
931 };
932 #endif
933 int
934 openbsd_poll(p, uap)
935 	register struct proc *p;
936 	register struct openbsd_poll_args *uap;
937 {
938 	return (poll(p, (struct poll_args *)uap));
939 }
940 
941 /*ARGSUSED*/
942 int
943 seltrue(dev, events, p)
944 	dev_t dev;
945 	int events;
946 	struct proc *p;
947 {
948 
949 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
950 }
951 
952 /*
953  * Record a select request.
954  */
955 void
956 selrecord(selector, sip)
957 	struct proc *selector;
958 	struct selinfo *sip;
959 {
960 	struct proc *p;
961 	pid_t mypid;
962 
963 	mypid = selector->p_pid;
964 	if (sip->si_pid == mypid)
965 		return;
966 	if (sip->si_pid && (p = pfind(sip->si_pid)) &&
967 	    p->p_wchan == (caddr_t)&selwait)
968 		sip->si_flags |= SI_COLL;
969 	else
970 		sip->si_pid = mypid;
971 }
972 
973 /*
974  * Do a wakeup when a selectable event occurs.
975  */
976 void
977 selwakeup(sip)
978 	register struct selinfo *sip;
979 {
980 	register struct proc *p;
981 	int s;
982 
983 	if (sip->si_pid == 0)
984 		return;
985 	if (sip->si_flags & SI_COLL) {
986 		nselcoll++;
987 		sip->si_flags &= ~SI_COLL;
988 		wakeup((caddr_t)&selwait);
989 	}
990 	p = pfind(sip->si_pid);
991 	sip->si_pid = 0;
992 	if (p != NULL) {
993 		s = splhigh();
994 		if (p->p_wchan == (caddr_t)&selwait) {
995 			if (p->p_stat == SSLEEP)
996 				setrunnable(p);
997 			else
998 				unsleep(p);
999 		} else if (p->p_flag & P_SELECT)
1000 			p->p_flag &= ~P_SELECT;
1001 		splx(s);
1002 	}
1003 }
1004