xref: /freebsd/sys/kern/sys_generic.c (revision 77a0943ded95b9e6438f7db70c4a28e4d93946d4)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $FreeBSD$
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/signalvar.h>
53 #include <sys/socketvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/sysctl.h>
59 #include <sys/sysent.h>
60 #include <sys/bio.h>
61 #include <sys/buf.h>
62 #ifdef KTRACE
63 #include <sys/ktrace.h>
64 #endif
65 #include <vm/vm.h>
66 #include <vm/vm_page.h>
67 
68 #include <machine/limits.h>
69 
70 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
71 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
72 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
73 
74 static int	pollscan __P((struct proc *, struct pollfd *, int));
75 static int	selscan __P((struct proc *, fd_mask **, fd_mask **, int));
76 static int	dofileread __P((struct proc *, struct file *, int, void *,
77 		    size_t, off_t, int));
78 static int	dofilewrite __P((struct proc *, struct file *, int,
79 		    const void *, size_t, off_t, int));
80 
81 struct file*
82 holdfp(fdp, fd, flag)
83 	struct filedesc* fdp;
84 	int fd, flag;
85 {
86 	struct file* fp;
87 
88 	if (((u_int)fd) >= fdp->fd_nfiles ||
89 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
90 	    (fp->f_flag & flag) == 0) {
91 		return (NULL);
92 	}
93 	fhold(fp);
94 	return (fp);
95 }
96 
97 /*
98  * Read system call.
99  */
100 #ifndef _SYS_SYSPROTO_H_
101 struct read_args {
102 	int	fd;
103 	void	*buf;
104 	size_t	nbyte;
105 };
106 #endif
107 int
108 read(p, uap)
109 	struct proc *p;
110 	register struct read_args *uap;
111 {
112 	register struct file *fp;
113 	int error;
114 
115 	if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL)
116 		return (EBADF);
117 	error = dofileread(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0);
118 	fdrop(fp, p);
119 	return(error);
120 }
121 
122 /*
123  * Pread system call
124  */
125 #ifndef _SYS_SYSPROTO_H_
126 struct pread_args {
127 	int	fd;
128 	void	*buf;
129 	size_t	nbyte;
130 	int	pad;
131 	off_t	offset;
132 };
133 #endif
134 int
135 pread(p, uap)
136 	struct proc *p;
137 	register struct pread_args *uap;
138 {
139 	register struct file *fp;
140 	int error;
141 
142 	if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL)
143 		return (EBADF);
144 	if (fp->f_type != DTYPE_VNODE) {
145 		error = ESPIPE;
146 	} else {
147 	    error = dofileread(p, fp, uap->fd, uap->buf, uap->nbyte,
148 		uap->offset, FOF_OFFSET);
149 	}
150 	fdrop(fp, p);
151 	return(error);
152 }
153 
154 /*
155  * Code common for read and pread
156  */
157 int
158 dofileread(p, fp, fd, buf, nbyte, offset, flags)
159 	struct proc *p;
160 	struct file *fp;
161 	int fd, flags;
162 	void *buf;
163 	size_t nbyte;
164 	off_t offset;
165 {
166 	struct uio auio;
167 	struct iovec aiov;
168 	long cnt, error = 0;
169 #ifdef KTRACE
170 	struct iovec ktriov;
171 	struct uio ktruio;
172 	int didktr = 0;
173 #endif
174 
175 	aiov.iov_base = (caddr_t)buf;
176 	aiov.iov_len = nbyte;
177 	auio.uio_iov = &aiov;
178 	auio.uio_iovcnt = 1;
179 	auio.uio_offset = offset;
180 	if (nbyte > INT_MAX)
181 		return (EINVAL);
182 	auio.uio_resid = nbyte;
183 	auio.uio_rw = UIO_READ;
184 	auio.uio_segflg = UIO_USERSPACE;
185 	auio.uio_procp = p;
186 #ifdef KTRACE
187 	/*
188 	 * if tracing, save a copy of iovec
189 	 */
190 	if (KTRPOINT(p, KTR_GENIO)) {
191 		ktriov = aiov;
192 		ktruio = auio;
193 		didktr = 1;
194 	}
195 #endif
196 	cnt = nbyte;
197 
198 	if ((error = fo_read(fp, &auio, fp->f_cred, flags, p))) {
199 		if (auio.uio_resid != cnt && (error == ERESTART ||
200 		    error == EINTR || error == EWOULDBLOCK))
201 			error = 0;
202 	}
203 	cnt -= auio.uio_resid;
204 #ifdef KTRACE
205 	if (didktr && error == 0) {
206 		ktruio.uio_iov = &ktriov;
207 		ktruio.uio_resid = cnt;
208 		ktrgenio(p->p_tracep, fd, UIO_READ, &ktruio, error);
209 	}
210 #endif
211 	p->p_retval[0] = cnt;
212 	return (error);
213 }
214 
215 /*
216  * Scatter read system call.
217  */
218 #ifndef _SYS_SYSPROTO_H_
219 struct readv_args {
220 	int	fd;
221 	struct	iovec *iovp;
222 	u_int	iovcnt;
223 };
224 #endif
225 int
226 readv(p, uap)
227 	struct proc *p;
228 	register struct readv_args *uap;
229 {
230 	register struct file *fp;
231 	register struct filedesc *fdp = p->p_fd;
232 	struct uio auio;
233 	register struct iovec *iov;
234 	struct iovec *needfree;
235 	struct iovec aiov[UIO_SMALLIOV];
236 	long i, cnt, error = 0;
237 	u_int iovlen;
238 #ifdef KTRACE
239 	struct iovec *ktriov = NULL;
240 	struct uio ktruio;
241 #endif
242 
243 	if ((fp = holdfp(fdp, uap->fd, FREAD)) == NULL)
244 		return (EBADF);
245 	/* note: can't use iovlen until iovcnt is validated */
246 	iovlen = uap->iovcnt * sizeof (struct iovec);
247 	if (uap->iovcnt > UIO_SMALLIOV) {
248 		if (uap->iovcnt > UIO_MAXIOV)
249 			return (EINVAL);
250 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
251 		needfree = iov;
252 	} else {
253 		iov = aiov;
254 		needfree = NULL;
255 	}
256 	auio.uio_iov = iov;
257 	auio.uio_iovcnt = uap->iovcnt;
258 	auio.uio_rw = UIO_READ;
259 	auio.uio_segflg = UIO_USERSPACE;
260 	auio.uio_procp = p;
261 	auio.uio_offset = -1;
262 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
263 		goto done;
264 	auio.uio_resid = 0;
265 	for (i = 0; i < uap->iovcnt; i++) {
266 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
267 			error = EINVAL;
268 			goto done;
269 		}
270 		auio.uio_resid += iov->iov_len;
271 		iov++;
272 	}
273 #ifdef KTRACE
274 	/*
275 	 * if tracing, save a copy of iovec
276 	 */
277 	if (KTRPOINT(p, KTR_GENIO))  {
278 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
279 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
280 		ktruio = auio;
281 	}
282 #endif
283 	cnt = auio.uio_resid;
284 	if ((error = fo_read(fp, &auio, fp->f_cred, 0, p))) {
285 		if (auio.uio_resid != cnt && (error == ERESTART ||
286 		    error == EINTR || error == EWOULDBLOCK))
287 			error = 0;
288 	}
289 	cnt -= auio.uio_resid;
290 #ifdef KTRACE
291 	if (ktriov != NULL) {
292 		if (error == 0) {
293 			ktruio.uio_iov = ktriov;
294 			ktruio.uio_resid = cnt;
295 			ktrgenio(p->p_tracep, uap->fd, UIO_READ, &ktruio,
296 			    error);
297 		}
298 		FREE(ktriov, M_TEMP);
299 	}
300 #endif
301 	p->p_retval[0] = cnt;
302 done:
303 	fdrop(fp, p);
304 	if (needfree)
305 		FREE(needfree, M_IOV);
306 	return (error);
307 }
308 
309 /*
310  * Write system call
311  */
312 #ifndef _SYS_SYSPROTO_H_
313 struct write_args {
314 	int	fd;
315 	const void *buf;
316 	size_t	nbyte;
317 };
318 #endif
319 int
320 write(p, uap)
321 	struct proc *p;
322 	register struct write_args *uap;
323 {
324 	register struct file *fp;
325 	int error;
326 
327 	if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL)
328 		return (EBADF);
329 	error = dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0);
330 	fdrop(fp, p);
331 	return(error);
332 }
333 
334 /*
335  * Pwrite system call
336  */
337 #ifndef _SYS_SYSPROTO_H_
338 struct pwrite_args {
339 	int	fd;
340 	const void *buf;
341 	size_t	nbyte;
342 	int	pad;
343 	off_t	offset;
344 };
345 #endif
346 int
347 pwrite(p, uap)
348 	struct proc *p;
349 	register struct pwrite_args *uap;
350 {
351 	register struct file *fp;
352 	int error;
353 
354 	if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL)
355 		return (EBADF);
356 	if (fp->f_type != DTYPE_VNODE) {
357 		error = ESPIPE;
358 	} else {
359 	    error = dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte,
360 		uap->offset, FOF_OFFSET);
361 	}
362 	fdrop(fp, p);
363 	return(error);
364 }
365 
366 static int
367 dofilewrite(p, fp, fd, buf, nbyte, offset, flags)
368 	struct proc *p;
369 	struct file *fp;
370 	int fd, flags;
371 	const void *buf;
372 	size_t nbyte;
373 	off_t offset;
374 {
375 	struct uio auio;
376 	struct iovec aiov;
377 	long cnt, error = 0;
378 #ifdef KTRACE
379 	struct iovec ktriov;
380 	struct uio ktruio;
381 	int didktr = 0;
382 #endif
383 
384 	aiov.iov_base = (void *)(uintptr_t)buf;
385 	aiov.iov_len = nbyte;
386 	auio.uio_iov = &aiov;
387 	auio.uio_iovcnt = 1;
388 	auio.uio_offset = offset;
389 	if (nbyte > INT_MAX)
390 		return (EINVAL);
391 	auio.uio_resid = nbyte;
392 	auio.uio_rw = UIO_WRITE;
393 	auio.uio_segflg = UIO_USERSPACE;
394 	auio.uio_procp = p;
395 #ifdef KTRACE
396 	/*
397 	 * if tracing, save a copy of iovec and uio
398 	 */
399 	if (KTRPOINT(p, KTR_GENIO)) {
400 		ktriov = aiov;
401 		ktruio = auio;
402 		didktr = 1;
403 	}
404 #endif
405 	cnt = nbyte;
406 	bwillwrite();
407 	if ((error = fo_write(fp, &auio, fp->f_cred, flags, p))) {
408 		if (auio.uio_resid != cnt && (error == ERESTART ||
409 		    error == EINTR || error == EWOULDBLOCK))
410 			error = 0;
411 		if (error == EPIPE)
412 			psignal(p, SIGPIPE);
413 	}
414 	cnt -= auio.uio_resid;
415 #ifdef KTRACE
416 	if (didktr && error == 0) {
417 		ktruio.uio_iov = &ktriov;
418 		ktruio.uio_resid = cnt;
419 		ktrgenio(p->p_tracep, fd, UIO_WRITE, &ktruio, error);
420 	}
421 #endif
422 	p->p_retval[0] = cnt;
423 	return (error);
424 }
425 
426 /*
427  * Gather write system call
428  */
429 #ifndef _SYS_SYSPROTO_H_
430 struct writev_args {
431 	int	fd;
432 	struct	iovec *iovp;
433 	u_int	iovcnt;
434 };
435 #endif
436 int
437 writev(p, uap)
438 	struct proc *p;
439 	register struct writev_args *uap;
440 {
441 	register struct file *fp;
442 	register struct filedesc *fdp = p->p_fd;
443 	struct uio auio;
444 	register struct iovec *iov;
445 	struct iovec *needfree;
446 	struct iovec aiov[UIO_SMALLIOV];
447 	long i, cnt, error = 0;
448 	u_int iovlen;
449 #ifdef KTRACE
450 	struct iovec *ktriov = NULL;
451 	struct uio ktruio;
452 #endif
453 
454 	if ((fp = holdfp(fdp, uap->fd, FWRITE)) == NULL)
455 		return (EBADF);
456 	/* note: can't use iovlen until iovcnt is validated */
457 	iovlen = uap->iovcnt * sizeof (struct iovec);
458 	if (uap->iovcnt > UIO_SMALLIOV) {
459 		if (uap->iovcnt > UIO_MAXIOV) {
460 			needfree = NULL;
461 			error = EINVAL;
462 			goto done;
463 		}
464 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
465 		needfree = iov;
466 	} else {
467 		iov = aiov;
468 		needfree = NULL;
469 	}
470 	auio.uio_iov = iov;
471 	auio.uio_iovcnt = uap->iovcnt;
472 	auio.uio_rw = UIO_WRITE;
473 	auio.uio_segflg = UIO_USERSPACE;
474 	auio.uio_procp = p;
475 	auio.uio_offset = -1;
476 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
477 		goto done;
478 	auio.uio_resid = 0;
479 	for (i = 0; i < uap->iovcnt; i++) {
480 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
481 			error = EINVAL;
482 			goto done;
483 		}
484 		auio.uio_resid += iov->iov_len;
485 		iov++;
486 	}
487 #ifdef KTRACE
488 	/*
489 	 * if tracing, save a copy of iovec and uio
490 	 */
491 	if (KTRPOINT(p, KTR_GENIO))  {
492 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
493 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
494 		ktruio = auio;
495 	}
496 #endif
497 	cnt = auio.uio_resid;
498 	if ((error = fo_write(fp, &auio, fp->f_cred, 0, p))) {
499 		if (auio.uio_resid != cnt && (error == ERESTART ||
500 		    error == EINTR || error == EWOULDBLOCK))
501 			error = 0;
502 		if (error == EPIPE)
503 			psignal(p, SIGPIPE);
504 	}
505 	cnt -= auio.uio_resid;
506 #ifdef KTRACE
507 	if (ktriov != NULL) {
508 		if (error == 0) {
509 			ktruio.uio_iov = ktriov;
510 			ktruio.uio_resid = cnt;
511 			ktrgenio(p->p_tracep, uap->fd, UIO_WRITE, &ktruio,
512 			    error);
513 		}
514 		FREE(ktriov, M_TEMP);
515 	}
516 #endif
517 	p->p_retval[0] = cnt;
518 done:
519 	fdrop(fp, p);
520 	if (needfree)
521 		FREE(needfree, M_IOV);
522 	return (error);
523 }
524 
525 /*
526  * Ioctl system call
527  */
528 #ifndef _SYS_SYSPROTO_H_
529 struct ioctl_args {
530 	int	fd;
531 	u_long	com;
532 	caddr_t	data;
533 };
534 #endif
535 /* ARGSUSED */
536 int
537 ioctl(p, uap)
538 	struct proc *p;
539 	register struct ioctl_args *uap;
540 {
541 	register struct file *fp;
542 	register struct filedesc *fdp;
543 	register u_long com;
544 	int error;
545 	register u_int size;
546 	caddr_t data, memp;
547 	int tmp;
548 #define STK_PARAMS	128
549 	union {
550 	    char stkbuf[STK_PARAMS];
551 	    long align;
552 	} ubuf;
553 
554 	fdp = p->p_fd;
555 	if ((u_int)uap->fd >= fdp->fd_nfiles ||
556 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
557 		return (EBADF);
558 
559 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
560 		return (EBADF);
561 
562 	switch (com = uap->com) {
563 	case FIONCLEX:
564 		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
565 		return (0);
566 	case FIOCLEX:
567 		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
568 		return (0);
569 	}
570 
571 	/*
572 	 * Interpret high order word to find amount of data to be
573 	 * copied to/from the user's address space.
574 	 */
575 	size = IOCPARM_LEN(com);
576 	if (size > IOCPARM_MAX)
577 		return (ENOTTY);
578 
579 	fhold(fp);
580 
581 	memp = NULL;
582 	if (size > sizeof (ubuf.stkbuf)) {
583 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
584 		data = memp;
585 	} else {
586 		data = ubuf.stkbuf;
587 	}
588 	if (com&IOC_IN) {
589 		if (size) {
590 			error = copyin(uap->data, data, (u_int)size);
591 			if (error) {
592 				if (memp)
593 					free(memp, M_IOCTLOPS);
594 				fdrop(fp, p);
595 				return (error);
596 			}
597 		} else {
598 			*(caddr_t *)data = uap->data;
599 		}
600 	} else if ((com&IOC_OUT) && size) {
601 		/*
602 		 * Zero the buffer so the user always
603 		 * gets back something deterministic.
604 		 */
605 		bzero(data, size);
606 	} else if (com&IOC_VOID) {
607 		*(caddr_t *)data = uap->data;
608 	}
609 
610 	switch (com) {
611 
612 	case FIONBIO:
613 		if ((tmp = *(int *)data))
614 			fp->f_flag |= FNONBLOCK;
615 		else
616 			fp->f_flag &= ~FNONBLOCK;
617 		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, p);
618 		break;
619 
620 	case FIOASYNC:
621 		if ((tmp = *(int *)data))
622 			fp->f_flag |= FASYNC;
623 		else
624 			fp->f_flag &= ~FASYNC;
625 		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, p);
626 		break;
627 
628 	default:
629 		error = fo_ioctl(fp, com, data, p);
630 		/*
631 		 * Copy any data to user, size was
632 		 * already set and checked above.
633 		 */
634 		if (error == 0 && (com&IOC_OUT) && size)
635 			error = copyout(data, uap->data, (u_int)size);
636 		break;
637 	}
638 	if (memp)
639 		free(memp, M_IOCTLOPS);
640 	fdrop(fp, p);
641 	return (error);
642 }
643 
644 static int	nselcoll;	/* Select collisions since boot */
645 int	selwait;
646 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
647 
648 /*
649  * Select system call.
650  */
651 #ifndef _SYS_SYSPROTO_H_
652 struct select_args {
653 	int	nd;
654 	fd_set	*in, *ou, *ex;
655 	struct	timeval *tv;
656 };
657 #endif
658 int
659 select(p, uap)
660 	register struct proc *p;
661 	register struct select_args *uap;
662 {
663 	/*
664 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
665 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
666 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
667 	 * of 256.
668 	 */
669 	fd_mask s_selbits[howmany(2048, NFDBITS)];
670 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
671 	struct timeval atv, rtv, ttv;
672 	int s, ncoll, error, timo;
673 	u_int nbufbytes, ncpbytes, nfdbits;
674 
675 	if (uap->nd < 0)
676 		return (EINVAL);
677 	if (uap->nd > p->p_fd->fd_nfiles)
678 		uap->nd = p->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
679 
680 	/*
681 	 * Allocate just enough bits for the non-null fd_sets.  Use the
682 	 * preallocated auto buffer if possible.
683 	 */
684 	nfdbits = roundup(uap->nd, NFDBITS);
685 	ncpbytes = nfdbits / NBBY;
686 	nbufbytes = 0;
687 	if (uap->in != NULL)
688 		nbufbytes += 2 * ncpbytes;
689 	if (uap->ou != NULL)
690 		nbufbytes += 2 * ncpbytes;
691 	if (uap->ex != NULL)
692 		nbufbytes += 2 * ncpbytes;
693 	if (nbufbytes <= sizeof s_selbits)
694 		selbits = &s_selbits[0];
695 	else
696 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
697 
698 	/*
699 	 * Assign pointers into the bit buffers and fetch the input bits.
700 	 * Put the output buffers together so that they can be bzeroed
701 	 * together.
702 	 */
703 	sbp = selbits;
704 #define	getbits(name, x) \
705 	do {								\
706 		if (uap->name == NULL)					\
707 			ibits[x] = NULL;				\
708 		else {							\
709 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
710 			obits[x] = sbp;					\
711 			sbp += ncpbytes / sizeof *sbp;			\
712 			error = copyin(uap->name, ibits[x], ncpbytes);	\
713 			if (error != 0)					\
714 				goto done;				\
715 		}							\
716 	} while (0)
717 	getbits(in, 0);
718 	getbits(ou, 1);
719 	getbits(ex, 2);
720 #undef	getbits
721 	if (nbufbytes != 0)
722 		bzero(selbits, nbufbytes / 2);
723 
724 	if (uap->tv) {
725 		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
726 			sizeof (atv));
727 		if (error)
728 			goto done;
729 		if (itimerfix(&atv)) {
730 			error = EINVAL;
731 			goto done;
732 		}
733 		getmicrouptime(&rtv);
734 		timevaladd(&atv, &rtv);
735 	} else {
736 		atv.tv_sec = 0;
737 		atv.tv_usec = 0;
738 	}
739 	timo = 0;
740 retry:
741 	ncoll = nselcoll;
742 	p->p_flag |= P_SELECT;
743 	error = selscan(p, ibits, obits, uap->nd);
744 	if (error || p->p_retval[0])
745 		goto done;
746 	if (atv.tv_sec || atv.tv_usec) {
747 		getmicrouptime(&rtv);
748 		if (timevalcmp(&rtv, &atv, >=))
749 			goto done;
750 		ttv = atv;
751 		timevalsub(&ttv, &rtv);
752 		timo = ttv.tv_sec > 24 * 60 * 60 ?
753 		    24 * 60 * 60 * hz : tvtohz(&ttv);
754 	}
755 	s = splhigh();
756 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
757 		splx(s);
758 		goto retry;
759 	}
760 	p->p_flag &= ~P_SELECT;
761 
762 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
763 
764 	splx(s);
765 	if (error == 0)
766 		goto retry;
767 done:
768 	p->p_flag &= ~P_SELECT;
769 	/* select is not restarted after signals... */
770 	if (error == ERESTART)
771 		error = EINTR;
772 	if (error == EWOULDBLOCK)
773 		error = 0;
774 #define	putbits(name, x) \
775 	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
776 		error = error2;
777 	if (error == 0) {
778 		int error2;
779 
780 		putbits(in, 0);
781 		putbits(ou, 1);
782 		putbits(ex, 2);
783 #undef putbits
784 	}
785 	if (selbits != &s_selbits[0])
786 		free(selbits, M_SELECT);
787 	return (error);
788 }
789 
790 static int
791 selscan(p, ibits, obits, nfd)
792 	struct proc *p;
793 	fd_mask **ibits, **obits;
794 	int nfd;
795 {
796 	struct filedesc *fdp = p->p_fd;
797 	int msk, i, fd;
798 	fd_mask bits;
799 	struct file *fp;
800 	int n = 0;
801 	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
802 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
803 
804 	for (msk = 0; msk < 3; msk++) {
805 		if (ibits[msk] == NULL)
806 			continue;
807 		for (i = 0; i < nfd; i += NFDBITS) {
808 			bits = ibits[msk][i/NFDBITS];
809 			/* ffs(int mask) not portable, fd_mask is long */
810 			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
811 				if (!(bits & 1))
812 					continue;
813 				fp = fdp->fd_ofiles[fd];
814 				if (fp == NULL)
815 					return (EBADF);
816 				if (fo_poll(fp, flag[msk], fp->f_cred, p)) {
817 					obits[msk][(fd)/NFDBITS] |=
818 					    ((fd_mask)1 << ((fd) % NFDBITS));
819 					n++;
820 				}
821 			}
822 		}
823 	}
824 	p->p_retval[0] = n;
825 	return (0);
826 }
827 
828 /*
829  * Poll system call.
830  */
831 #ifndef _SYS_SYSPROTO_H_
832 struct poll_args {
833 	struct pollfd *fds;
834 	u_int	nfds;
835 	int	timeout;
836 };
837 #endif
838 int
839 poll(p, uap)
840 	register struct proc *p;
841 	register struct poll_args *uap;
842 {
843 	caddr_t bits;
844 	char smallbits[32 * sizeof(struct pollfd)];
845 	struct timeval atv, rtv, ttv;
846 	int s, ncoll, error = 0, timo;
847 	size_t ni;
848 
849 	if (SCARG(uap, nfds) > p->p_fd->fd_nfiles) {
850 		/* forgiving; slightly wrong */
851 		SCARG(uap, nfds) = p->p_fd->fd_nfiles;
852 	}
853 	ni = SCARG(uap, nfds) * sizeof(struct pollfd);
854 	if (ni > sizeof(smallbits))
855 		bits = malloc(ni, M_TEMP, M_WAITOK);
856 	else
857 		bits = smallbits;
858 	error = copyin(SCARG(uap, fds), bits, ni);
859 	if (error)
860 		goto done;
861 	if (SCARG(uap, timeout) != INFTIM) {
862 		atv.tv_sec = SCARG(uap, timeout) / 1000;
863 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
864 		if (itimerfix(&atv)) {
865 			error = EINVAL;
866 			goto done;
867 		}
868 		getmicrouptime(&rtv);
869 		timevaladd(&atv, &rtv);
870 	} else {
871 		atv.tv_sec = 0;
872 		atv.tv_usec = 0;
873 	}
874 	timo = 0;
875 retry:
876 	ncoll = nselcoll;
877 	p->p_flag |= P_SELECT;
878 	error = pollscan(p, (struct pollfd *)bits, SCARG(uap, nfds));
879 	if (error || p->p_retval[0])
880 		goto done;
881 	if (atv.tv_sec || atv.tv_usec) {
882 		getmicrouptime(&rtv);
883 		if (timevalcmp(&rtv, &atv, >=))
884 			goto done;
885 		ttv = atv;
886 		timevalsub(&ttv, &rtv);
887 		timo = ttv.tv_sec > 24 * 60 * 60 ?
888 		    24 * 60 * 60 * hz : tvtohz(&ttv);
889 	}
890 	s = splhigh();
891 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
892 		splx(s);
893 		goto retry;
894 	}
895 	p->p_flag &= ~P_SELECT;
896 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
897 	splx(s);
898 	if (error == 0)
899 		goto retry;
900 done:
901 	p->p_flag &= ~P_SELECT;
902 	/* poll is not restarted after signals... */
903 	if (error == ERESTART)
904 		error = EINTR;
905 	if (error == EWOULDBLOCK)
906 		error = 0;
907 	if (error == 0) {
908 		error = copyout(bits, SCARG(uap, fds), ni);
909 		if (error)
910 			goto out;
911 	}
912 out:
913 	if (ni > sizeof(smallbits))
914 		free(bits, M_TEMP);
915 	return (error);
916 }
917 
918 static int
919 pollscan(p, fds, nfd)
920 	struct proc *p;
921 	struct pollfd *fds;
922 	int nfd;
923 {
924 	register struct filedesc *fdp = p->p_fd;
925 	int i;
926 	struct file *fp;
927 	int n = 0;
928 
929 	for (i = 0; i < nfd; i++, fds++) {
930 		if (fds->fd >= fdp->fd_nfiles) {
931 			fds->revents = POLLNVAL;
932 			n++;
933 		} else if (fds->fd < 0) {
934 			fds->revents = 0;
935 		} else {
936 			fp = fdp->fd_ofiles[fds->fd];
937 			if (fp == NULL) {
938 				fds->revents = POLLNVAL;
939 				n++;
940 			} else {
941 				/*
942 				 * Note: backend also returns POLLHUP and
943 				 * POLLERR if appropriate.
944 				 */
945 				fds->revents = fo_poll(fp, fds->events,
946 				    fp->f_cred, p);
947 				if (fds->revents != 0)
948 					n++;
949 			}
950 		}
951 	}
952 	p->p_retval[0] = n;
953 	return (0);
954 }
955 
956 /*
957  * OpenBSD poll system call.
958  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
959  */
960 #ifndef _SYS_SYSPROTO_H_
961 struct openbsd_poll_args {
962 	struct pollfd *fds;
963 	u_int	nfds;
964 	int	timeout;
965 };
966 #endif
967 int
968 openbsd_poll(p, uap)
969 	register struct proc *p;
970 	register struct openbsd_poll_args *uap;
971 {
972 	return (poll(p, (struct poll_args *)uap));
973 }
974 
/*
 * Poll routine that reports ordinary read and write readiness
 * unconditionally: returns whichever of POLLIN, POLLOUT, POLLRDNORM and
 * POLLWRNORM were requested in 'events'.  'dev' and 'p' are not used.
 */
/*ARGSUSED*/
int
seltrue(dev, events, p)
	dev_t dev;
	int events;
	struct proc *p;
{
	int always_ready;

	always_ready = POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM;
	return (events & always_ready);
}
985 
986 /*
987  * Record a select request.
988  */
989 void
990 selrecord(selector, sip)
991 	struct proc *selector;
992 	struct selinfo *sip;
993 {
994 	struct proc *p;
995 	pid_t mypid;
996 
997 	mypid = selector->p_pid;
998 	if (sip->si_pid == mypid)
999 		return;
1000 	if (sip->si_pid && (p = pfind(sip->si_pid)) &&
1001 	    p->p_wchan == (caddr_t)&selwait)
1002 		sip->si_flags |= SI_COLL;
1003 	else
1004 		sip->si_pid = mypid;
1005 }
1006 
1007 /*
1008  * Do a wakeup when a selectable event occurs.
1009  */
1010 void
1011 selwakeup(sip)
1012 	register struct selinfo *sip;
1013 {
1014 	register struct proc *p;
1015 	int s;
1016 
1017 	if (sip->si_pid == 0)
1018 		return;
1019 	if (sip->si_flags & SI_COLL) {
1020 		nselcoll++;
1021 		sip->si_flags &= ~SI_COLL;
1022 		wakeup((caddr_t)&selwait);
1023 	}
1024 	p = pfind(sip->si_pid);
1025 	sip->si_pid = 0;
1026 	if (p != NULL) {
1027 		s = splhigh();
1028 		mtx_enter(&sched_lock, MTX_SPIN);
1029 		if (p->p_wchan == (caddr_t)&selwait) {
1030 			if (p->p_stat == SSLEEP)
1031 				setrunnable(p);
1032 			else
1033 				unsleep(p);
1034 		} else if (p->p_flag & P_SELECT)
1035 			p->p_flag &= ~P_SELECT;
1036 		mtx_exit(&sched_lock, MTX_SPIN);
1037 		splx(s);
1038 	}
1039 }
1040