xref: /freebsd/sys/kern/sys_generic.c (revision 5521ff5a4d1929056e7ffc982fac3341ca54df7c)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $FreeBSD$
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/signalvar.h>
53 #include <sys/socketvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/resourcevar.h>
59 #include <sys/selinfo.h>
60 #include <sys/sysctl.h>
61 #include <sys/sysent.h>
62 #include <sys/bio.h>
63 #include <sys/buf.h>
64 #include <sys/condvar.h>
65 #ifdef KTRACE
66 #include <sys/ktrace.h>
67 #endif
68 #include <vm/vm.h>
69 #include <vm/vm_page.h>
70 
71 #include <machine/limits.h>
72 
/*
 * Malloc types used by this file.  M_IOCTLOPS backs oversized ioctl()
 * argument buffers and M_SELECT backs oversized select() bit buffers;
 * both are private to this file.
 */
static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
/*
 * M_IOV is not static.  In this file it backs the iovec arrays that
 * readv()/writev() allocate when iovcnt exceeds UIO_SMALLIOV.
 */
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

/* Forward declarations of the file-local helpers defined below. */
static int	pollscan __P((struct proc *, struct pollfd *, u_int));
static int	pollholddrop __P((struct proc *, struct pollfd *, u_int, int));
static int	selscan __P((struct proc *, fd_mask **, fd_mask **, int));
static int	selholddrop __P((struct proc *, fd_mask *, fd_mask *, int, int));
static int	dofileread __P((struct proc *, struct file *, int, void *,
		    size_t, off_t, int));
static int	dofilewrite __P((struct proc *, struct file *, int,
		    const void *, size_t, off_t, int));
85 
86 struct file*
87 holdfp(fdp, fd, flag)
88 	struct filedesc* fdp;
89 	int fd, flag;
90 {
91 	struct file* fp;
92 
93 	if (((u_int)fd) >= fdp->fd_nfiles ||
94 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
95 	    (fp->f_flag & flag) == 0) {
96 		return (NULL);
97 	}
98 	fhold(fp);
99 	return (fp);
100 }
101 
102 /*
103  * Read system call.
104  */
105 #ifndef _SYS_SYSPROTO_H_
106 struct read_args {
107 	int	fd;
108 	void	*buf;
109 	size_t	nbyte;
110 };
111 #endif
112 int
113 read(p, uap)
114 	struct proc *p;
115 	register struct read_args *uap;
116 {
117 	register struct file *fp;
118 	int error;
119 
120 	if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL)
121 		return (EBADF);
122 	error = dofileread(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0);
123 	fdrop(fp, p);
124 	return(error);
125 }
126 
127 /*
128  * Pread system call
129  */
130 #ifndef _SYS_SYSPROTO_H_
131 struct pread_args {
132 	int	fd;
133 	void	*buf;
134 	size_t	nbyte;
135 	int	pad;
136 	off_t	offset;
137 };
138 #endif
139 int
140 pread(p, uap)
141 	struct proc *p;
142 	register struct pread_args *uap;
143 {
144 	register struct file *fp;
145 	int error;
146 
147 	if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL)
148 		return (EBADF);
149 	if (fp->f_type != DTYPE_VNODE) {
150 		error = ESPIPE;
151 	} else {
152 	    error = dofileread(p, fp, uap->fd, uap->buf, uap->nbyte,
153 		uap->offset, FOF_OFFSET);
154 	}
155 	fdrop(fp, p);
156 	return(error);
157 }
158 
159 /*
160  * Code common for read and pread
161  */
162 int
163 dofileread(p, fp, fd, buf, nbyte, offset, flags)
164 	struct proc *p;
165 	struct file *fp;
166 	int fd, flags;
167 	void *buf;
168 	size_t nbyte;
169 	off_t offset;
170 {
171 	struct uio auio;
172 	struct iovec aiov;
173 	long cnt, error = 0;
174 #ifdef KTRACE
175 	struct iovec ktriov;
176 	struct uio ktruio;
177 	int didktr = 0;
178 #endif
179 
180 	aiov.iov_base = (caddr_t)buf;
181 	aiov.iov_len = nbyte;
182 	auio.uio_iov = &aiov;
183 	auio.uio_iovcnt = 1;
184 	auio.uio_offset = offset;
185 	if (nbyte > INT_MAX)
186 		return (EINVAL);
187 	auio.uio_resid = nbyte;
188 	auio.uio_rw = UIO_READ;
189 	auio.uio_segflg = UIO_USERSPACE;
190 	auio.uio_procp = p;
191 #ifdef KTRACE
192 	/*
193 	 * if tracing, save a copy of iovec
194 	 */
195 	if (KTRPOINT(p, KTR_GENIO)) {
196 		ktriov = aiov;
197 		ktruio = auio;
198 		didktr = 1;
199 	}
200 #endif
201 	cnt = nbyte;
202 
203 	if ((error = fo_read(fp, &auio, fp->f_cred, flags, p))) {
204 		if (auio.uio_resid != cnt && (error == ERESTART ||
205 		    error == EINTR || error == EWOULDBLOCK))
206 			error = 0;
207 	}
208 	cnt -= auio.uio_resid;
209 #ifdef KTRACE
210 	if (didktr && error == 0) {
211 		ktruio.uio_iov = &ktriov;
212 		ktruio.uio_resid = cnt;
213 		ktrgenio(p->p_tracep, fd, UIO_READ, &ktruio, error);
214 	}
215 #endif
216 	p->p_retval[0] = cnt;
217 	return (error);
218 }
219 
220 /*
221  * Scatter read system call.
222  */
223 #ifndef _SYS_SYSPROTO_H_
224 struct readv_args {
225 	int	fd;
226 	struct	iovec *iovp;
227 	u_int	iovcnt;
228 };
229 #endif
230 int
231 readv(p, uap)
232 	struct proc *p;
233 	register struct readv_args *uap;
234 {
235 	register struct file *fp;
236 	register struct filedesc *fdp = p->p_fd;
237 	struct uio auio;
238 	register struct iovec *iov;
239 	struct iovec *needfree;
240 	struct iovec aiov[UIO_SMALLIOV];
241 	long i, cnt, error = 0;
242 	u_int iovlen;
243 #ifdef KTRACE
244 	struct iovec *ktriov = NULL;
245 	struct uio ktruio;
246 #endif
247 
248 	if ((fp = holdfp(fdp, uap->fd, FREAD)) == NULL)
249 		return (EBADF);
250 	/* note: can't use iovlen until iovcnt is validated */
251 	iovlen = uap->iovcnt * sizeof (struct iovec);
252 	if (uap->iovcnt > UIO_SMALLIOV) {
253 		if (uap->iovcnt > UIO_MAXIOV)
254 			return (EINVAL);
255 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
256 		needfree = iov;
257 	} else {
258 		iov = aiov;
259 		needfree = NULL;
260 	}
261 	auio.uio_iov = iov;
262 	auio.uio_iovcnt = uap->iovcnt;
263 	auio.uio_rw = UIO_READ;
264 	auio.uio_segflg = UIO_USERSPACE;
265 	auio.uio_procp = p;
266 	auio.uio_offset = -1;
267 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
268 		goto done;
269 	auio.uio_resid = 0;
270 	for (i = 0; i < uap->iovcnt; i++) {
271 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
272 			error = EINVAL;
273 			goto done;
274 		}
275 		auio.uio_resid += iov->iov_len;
276 		iov++;
277 	}
278 #ifdef KTRACE
279 	/*
280 	 * if tracing, save a copy of iovec
281 	 */
282 	if (KTRPOINT(p, KTR_GENIO))  {
283 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
284 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
285 		ktruio = auio;
286 	}
287 #endif
288 	cnt = auio.uio_resid;
289 	if ((error = fo_read(fp, &auio, fp->f_cred, 0, p))) {
290 		if (auio.uio_resid != cnt && (error == ERESTART ||
291 		    error == EINTR || error == EWOULDBLOCK))
292 			error = 0;
293 	}
294 	cnt -= auio.uio_resid;
295 #ifdef KTRACE
296 	if (ktriov != NULL) {
297 		if (error == 0) {
298 			ktruio.uio_iov = ktriov;
299 			ktruio.uio_resid = cnt;
300 			ktrgenio(p->p_tracep, uap->fd, UIO_READ, &ktruio,
301 			    error);
302 		}
303 		FREE(ktriov, M_TEMP);
304 	}
305 #endif
306 	p->p_retval[0] = cnt;
307 done:
308 	fdrop(fp, p);
309 	if (needfree)
310 		FREE(needfree, M_IOV);
311 	return (error);
312 }
313 
314 /*
315  * Write system call
316  */
317 #ifndef _SYS_SYSPROTO_H_
318 struct write_args {
319 	int	fd;
320 	const void *buf;
321 	size_t	nbyte;
322 };
323 #endif
324 int
325 write(p, uap)
326 	struct proc *p;
327 	register struct write_args *uap;
328 {
329 	register struct file *fp;
330 	int error;
331 
332 	if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL)
333 		return (EBADF);
334 	error = dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0);
335 	fdrop(fp, p);
336 	return(error);
337 }
338 
339 /*
340  * Pwrite system call
341  */
342 #ifndef _SYS_SYSPROTO_H_
343 struct pwrite_args {
344 	int	fd;
345 	const void *buf;
346 	size_t	nbyte;
347 	int	pad;
348 	off_t	offset;
349 };
350 #endif
351 int
352 pwrite(p, uap)
353 	struct proc *p;
354 	register struct pwrite_args *uap;
355 {
356 	register struct file *fp;
357 	int error;
358 
359 	if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL)
360 		return (EBADF);
361 	if (fp->f_type != DTYPE_VNODE) {
362 		error = ESPIPE;
363 	} else {
364 	    error = dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte,
365 		uap->offset, FOF_OFFSET);
366 	}
367 	fdrop(fp, p);
368 	return(error);
369 }
370 
371 static int
372 dofilewrite(p, fp, fd, buf, nbyte, offset, flags)
373 	struct proc *p;
374 	struct file *fp;
375 	int fd, flags;
376 	const void *buf;
377 	size_t nbyte;
378 	off_t offset;
379 {
380 	struct uio auio;
381 	struct iovec aiov;
382 	long cnt, error = 0;
383 #ifdef KTRACE
384 	struct iovec ktriov;
385 	struct uio ktruio;
386 	int didktr = 0;
387 #endif
388 
389 	aiov.iov_base = (void *)(uintptr_t)buf;
390 	aiov.iov_len = nbyte;
391 	auio.uio_iov = &aiov;
392 	auio.uio_iovcnt = 1;
393 	auio.uio_offset = offset;
394 	if (nbyte > INT_MAX)
395 		return (EINVAL);
396 	auio.uio_resid = nbyte;
397 	auio.uio_rw = UIO_WRITE;
398 	auio.uio_segflg = UIO_USERSPACE;
399 	auio.uio_procp = p;
400 #ifdef KTRACE
401 	/*
402 	 * if tracing, save a copy of iovec and uio
403 	 */
404 	if (KTRPOINT(p, KTR_GENIO)) {
405 		ktriov = aiov;
406 		ktruio = auio;
407 		didktr = 1;
408 	}
409 #endif
410 	cnt = nbyte;
411 	if (fp->f_type == DTYPE_VNODE)
412 		bwillwrite();
413 	if ((error = fo_write(fp, &auio, fp->f_cred, flags, p))) {
414 		if (auio.uio_resid != cnt && (error == ERESTART ||
415 		    error == EINTR || error == EWOULDBLOCK))
416 			error = 0;
417 		if (error == EPIPE) {
418 			PROC_LOCK(p);
419 			psignal(p, SIGPIPE);
420 			PROC_UNLOCK(p);
421 		}
422 	}
423 	cnt -= auio.uio_resid;
424 #ifdef KTRACE
425 	if (didktr && error == 0) {
426 		ktruio.uio_iov = &ktriov;
427 		ktruio.uio_resid = cnt;
428 		ktrgenio(p->p_tracep, fd, UIO_WRITE, &ktruio, error);
429 	}
430 #endif
431 	p->p_retval[0] = cnt;
432 	return (error);
433 }
434 
435 /*
436  * Gather write system call
437  */
438 #ifndef _SYS_SYSPROTO_H_
439 struct writev_args {
440 	int	fd;
441 	struct	iovec *iovp;
442 	u_int	iovcnt;
443 };
444 #endif
445 int
446 writev(p, uap)
447 	struct proc *p;
448 	register struct writev_args *uap;
449 {
450 	register struct file *fp;
451 	register struct filedesc *fdp = p->p_fd;
452 	struct uio auio;
453 	register struct iovec *iov;
454 	struct iovec *needfree;
455 	struct iovec aiov[UIO_SMALLIOV];
456 	long i, cnt, error = 0;
457 	u_int iovlen;
458 #ifdef KTRACE
459 	struct iovec *ktriov = NULL;
460 	struct uio ktruio;
461 #endif
462 
463 	if ((fp = holdfp(fdp, uap->fd, FWRITE)) == NULL)
464 		return (EBADF);
465 	/* note: can't use iovlen until iovcnt is validated */
466 	iovlen = uap->iovcnt * sizeof (struct iovec);
467 	if (uap->iovcnt > UIO_SMALLIOV) {
468 		if (uap->iovcnt > UIO_MAXIOV) {
469 			needfree = NULL;
470 			error = EINVAL;
471 			goto done;
472 		}
473 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
474 		needfree = iov;
475 	} else {
476 		iov = aiov;
477 		needfree = NULL;
478 	}
479 	auio.uio_iov = iov;
480 	auio.uio_iovcnt = uap->iovcnt;
481 	auio.uio_rw = UIO_WRITE;
482 	auio.uio_segflg = UIO_USERSPACE;
483 	auio.uio_procp = p;
484 	auio.uio_offset = -1;
485 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
486 		goto done;
487 	auio.uio_resid = 0;
488 	for (i = 0; i < uap->iovcnt; i++) {
489 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
490 			error = EINVAL;
491 			goto done;
492 		}
493 		auio.uio_resid += iov->iov_len;
494 		iov++;
495 	}
496 #ifdef KTRACE
497 	/*
498 	 * if tracing, save a copy of iovec and uio
499 	 */
500 	if (KTRPOINT(p, KTR_GENIO))  {
501 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
502 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
503 		ktruio = auio;
504 	}
505 #endif
506 	cnt = auio.uio_resid;
507 	if (fp->f_type == DTYPE_VNODE)
508 		bwillwrite();
509 	if ((error = fo_write(fp, &auio, fp->f_cred, 0, p))) {
510 		if (auio.uio_resid != cnt && (error == ERESTART ||
511 		    error == EINTR || error == EWOULDBLOCK))
512 			error = 0;
513 		if (error == EPIPE) {
514 			PROC_LOCK(p);
515 			psignal(p, SIGPIPE);
516 			PROC_UNLOCK(p);
517 		}
518 	}
519 	cnt -= auio.uio_resid;
520 #ifdef KTRACE
521 	if (ktriov != NULL) {
522 		if (error == 0) {
523 			ktruio.uio_iov = ktriov;
524 			ktruio.uio_resid = cnt;
525 			ktrgenio(p->p_tracep, uap->fd, UIO_WRITE, &ktruio,
526 			    error);
527 		}
528 		FREE(ktriov, M_TEMP);
529 	}
530 #endif
531 	p->p_retval[0] = cnt;
532 done:
533 	fdrop(fp, p);
534 	if (needfree)
535 		FREE(needfree, M_IOV);
536 	return (error);
537 }
538 
539 /*
540  * Ioctl system call
541  */
542 #ifndef _SYS_SYSPROTO_H_
543 struct ioctl_args {
544 	int	fd;
545 	u_long	com;
546 	caddr_t	data;
547 };
548 #endif
549 /* ARGSUSED */
550 int
551 ioctl(p, uap)
552 	struct proc *p;
553 	register struct ioctl_args *uap;
554 {
555 	register struct file *fp;
556 	register struct filedesc *fdp;
557 	register u_long com;
558 	int error;
559 	register u_int size;
560 	caddr_t data, memp;
561 	int tmp;
562 #define STK_PARAMS	128
563 	union {
564 	    char stkbuf[STK_PARAMS];
565 	    long align;
566 	} ubuf;
567 
568 	fdp = p->p_fd;
569 	if ((u_int)uap->fd >= fdp->fd_nfiles ||
570 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
571 		return (EBADF);
572 
573 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
574 		return (EBADF);
575 
576 	switch (com = uap->com) {
577 	case FIONCLEX:
578 		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
579 		return (0);
580 	case FIOCLEX:
581 		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
582 		return (0);
583 	}
584 
585 	/*
586 	 * Interpret high order word to find amount of data to be
587 	 * copied to/from the user's address space.
588 	 */
589 	size = IOCPARM_LEN(com);
590 	if (size > IOCPARM_MAX)
591 		return (ENOTTY);
592 
593 	fhold(fp);
594 
595 	memp = NULL;
596 	if (size > sizeof (ubuf.stkbuf)) {
597 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
598 		data = memp;
599 	} else {
600 		data = ubuf.stkbuf;
601 	}
602 	if (com&IOC_IN) {
603 		if (size) {
604 			error = copyin(uap->data, data, (u_int)size);
605 			if (error) {
606 				if (memp)
607 					free(memp, M_IOCTLOPS);
608 				fdrop(fp, p);
609 				return (error);
610 			}
611 		} else {
612 			*(caddr_t *)data = uap->data;
613 		}
614 	} else if ((com&IOC_OUT) && size) {
615 		/*
616 		 * Zero the buffer so the user always
617 		 * gets back something deterministic.
618 		 */
619 		bzero(data, size);
620 	} else if (com&IOC_VOID) {
621 		*(caddr_t *)data = uap->data;
622 	}
623 
624 	switch (com) {
625 
626 	case FIONBIO:
627 		if ((tmp = *(int *)data))
628 			fp->f_flag |= FNONBLOCK;
629 		else
630 			fp->f_flag &= ~FNONBLOCK;
631 		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, p);
632 		break;
633 
634 	case FIOASYNC:
635 		if ((tmp = *(int *)data))
636 			fp->f_flag |= FASYNC;
637 		else
638 			fp->f_flag &= ~FASYNC;
639 		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, p);
640 		break;
641 
642 	default:
643 		error = fo_ioctl(fp, com, data, p);
644 		/*
645 		 * Copy any data to user, size was
646 		 * already set and checked above.
647 		 */
648 		if (error == 0 && (com&IOC_OUT) && size)
649 			error = copyout(data, uap->data, (u_int)size);
650 		break;
651 	}
652 	if (memp)
653 		free(memp, M_IOCTLOPS);
654 	fdrop(fp, p);
655 	return (error);
656 }
657 
/*
 * Shared state for select() and poll().
 *
 * nselcoll is bumped by selwakeup() each time it finds SI_COLL set; the
 * wait loops in select()/poll() compare it against a saved copy to
 * decide whether a rescan is needed.  It is exported read-only through
 * sysctl under _kern.
 *
 * selwait is the single condition variable on which select() and poll()
 * in this file sleep; it is initialised in selectinit().
 */
static int	nselcoll;	/* Select collisions since boot */
struct cv	selwait;
SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
661 
/*
 * Select system call.
 *
 * Waits until at least one descriptor named in the in/ou/ex sets is
 * ready, the optional timeout uap->tv expires, or a signal arrives.
 *
 * Outline:
 *   1. Clamp nd and size the bit buffers (stack buffers when they fit,
 *      M_SELECT allocations otherwise).
 *   2. Copy in each non-NULL fd_set; OR them together into hibits.
 *   3. Turn the timeout, if any, into an absolute uptime deadline.
 *   4. Take a reference on every file named in hibits (selholddrop),
 *      recording what was held in hobits.
 *   5. Loop: scan with selscan(); if nothing is ready and the deadline
 *      has not passed, sleep on selwait and retry.
 *   6. Drop the references, translate the error, copy the result sets
 *      back out and free any allocated buffers.
 *
 * Returns EINVAL for a negative nd or a timeout rejected by
 * itimerfix(), the copyin/copyout error, EBADF from selscan(), or EINTR
 * when interrupted (ERESTART is mapped to EINTR).  The number of ready
 * descriptors is left in p->p_retval[0] by selscan().
 */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
int
select(p, uap)
	register struct proc *p;
	register struct select_args *uap;
{
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask s_heldbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp, *heldbits, *hibits, *hobits;
	struct timeval atv, rtv, ttv;
	int ncoll, error, timo, i;
	u_int nbufbytes, ncpbytes, nfdbits;

	if (uap->nd < 0)
		return (EINVAL);
	if (uap->nd > p->p_fd->fd_nfiles)
		uap->nd = p->p_fd->fd_nfiles;   /* forgiving; slightly wrong */

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 *
	 * Each non-null set needs 2 * ncpbytes in selbits: one copy for
	 * the input bits and one for the output bits.
	 */
	nfdbits = roundup(uap->nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (uap->in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
	/*
	 * heldbits always holds two sets: the union of the inputs and
	 * the record of which descriptors were actually held.  It must
	 * start out zeroed because bits are only ever ORed into it.
	 */
	if (2 * ncpbytes <= sizeof s_heldbits) {
		bzero(s_heldbits, sizeof(s_heldbits));
		heldbits = &s_heldbits[0];
	} else
		heldbits = malloc(2 * ncpbytes, M_SELECT, M_WAITOK | M_ZERO);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 *
	 * Layout of selbits: the first nbufbytes / 2 bytes are the output
	 * sets, the second half the input sets.  Layout of heldbits: the
	 * first ncpbytes are hobits (held record), the next ncpbytes are
	 * hibits (union of all input sets).
	 */
	sbp = selbits;
	hibits = heldbits + ncpbytes / sizeof *heldbits;
	hobits = heldbits;
	/*
	 * getbits(name, x): if the user passed set `name', carve out its
	 * output and input areas, copy the set in, and fold it into
	 * hibits; otherwise mark ibits[x] NULL so later code skips it.
	 */
#define	getbits(name, x) \
	do {								\
		if (uap->name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(uap->name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_noproclock;			\
			for (i = 0;					\
			     i < ncpbytes / sizeof ibits[i][0];		\
			     i++)					\
				hibits[i] |= ibits[x][i];		\
		}							\
	} while (0)
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
	/* Clear all output sets in one go (they are contiguous). */
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (uap->tv) {
		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
			sizeof (atv));
		if (error)
			goto done_noproclock;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_noproclock;
		}
		/* Convert the relative timeout into an absolute deadline. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* A zero atv means "no deadline" in the loop below. */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	/*
	 * Hold every file named in the input sets.  The return value is
	 * not checked here; a missing file is also reported by selscan()
	 * below.
	 */
	selholddrop(p, hibits, hobits, uap->nd, 1);
	timo = 0;
	PROC_LOCK(p);
retry:
	/*
	 * Remember the collision count and set P_SELECT before scanning
	 * so that a wakeup arriving during the scan can be detected.
	 * The scan itself runs without the process lock.
	 */
	ncoll = nselcoll;
	p->p_flag |= P_SELECT;
	PROC_UNLOCK(p);
	error = selscan(p, ibits, obits, uap->nd);
	PROC_LOCK(p);
	if (error || p->p_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=)) {
			/*
			 * An event of our interest may occur during locking
			 * a process.  In order to avoid missing the event
			 * that occurred during locking the process, test
			 * P_SELECT and rescan file descriptors if necessary.
			 */
			if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
				ncoll = nselcoll;
				p->p_flag |= P_SELECT;
				PROC_UNLOCK(p);
				error = selscan(p, ibits, obits, uap->nd);
				PROC_LOCK(p);
			}
			goto done;
		}
		/* Time left until the deadline, capped at 24 hours. */
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	p->p_flag &= ~P_SELECT;

	/* timo == 0 means no deadline: sleep until woken or signalled. */
	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &p->p_mtx, timo);
	else
		error = cv_wait_sig(&selwait, &p->p_mtx);

	/* Woken normally: rescan.  The process lock is held here. */
	if (error == 0)
		goto retry;

done:
	p->p_flag &= ~P_SELECT;
	PROC_UNLOCK(p);
	/* Release the references recorded in hobits. */
	selholddrop(p, hibits, hobits, uap->nd, 0);
done_noproclock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	/* A timed-out sleep is not an error. */
	if (error == EWOULDBLOCK)
		error = 0;
	/* putbits(name, x): copy result set x back out if it was given. */
#define	putbits(name, x) \
	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);
	if (heldbits != &s_heldbits[0])
		free(heldbits, M_SELECT);
	return (error);
}
836 
837 static int
838 selholddrop(p, ibits, obits, nfd, hold)
839 	struct proc *p;
840 	fd_mask *ibits, *obits;
841 	int nfd, hold;
842 {
843 	struct filedesc *fdp = p->p_fd;
844 	int i, fd;
845 	fd_mask bits;
846 	struct file *fp;
847 
848 	for (i = 0; i < nfd; i += NFDBITS) {
849 		if (hold)
850 			bits = ibits[i/NFDBITS];
851 		else
852 			bits = obits[i/NFDBITS];
853 		/* ffs(int mask) not portable, fd_mask is long */
854 		for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
855 			if (!(bits & 1))
856 				continue;
857 			fp = fdp->fd_ofiles[fd];
858 			if (fp == NULL)
859 				return (EBADF);
860 			if (hold) {
861 				fhold(fp);
862 				obits[(fd)/NFDBITS] |=
863 				    ((fd_mask)1 << ((fd) % NFDBITS));
864 			} else
865 				fdrop(fp, p);
866 		}
867 	}
868 	return (0);
869 }
870 
871 static int
872 selscan(p, ibits, obits, nfd)
873 	struct proc *p;
874 	fd_mask **ibits, **obits;
875 	int nfd;
876 {
877 	struct filedesc *fdp = p->p_fd;
878 	int msk, i, fd;
879 	fd_mask bits;
880 	struct file *fp;
881 	int n = 0;
882 	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
883 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
884 
885 	for (msk = 0; msk < 3; msk++) {
886 		if (ibits[msk] == NULL)
887 			continue;
888 		for (i = 0; i < nfd; i += NFDBITS) {
889 			bits = ibits[msk][i/NFDBITS];
890 			/* ffs(int mask) not portable, fd_mask is long */
891 			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
892 				if (!(bits & 1))
893 					continue;
894 				fp = fdp->fd_ofiles[fd];
895 				if (fp == NULL)
896 					return (EBADF);
897 				if (fo_poll(fp, flag[msk], fp->f_cred, p)) {
898 					obits[msk][(fd)/NFDBITS] |=
899 					    ((fd_mask)1 << ((fd) % NFDBITS));
900 					n++;
901 				}
902 			}
903 		}
904 	}
905 	p->p_retval[0] = n;
906 	return (0);
907 }
908 
/*
 * Poll system call.
 *
 * Waits until at least one entry of the user's pollfd array reports an
 * event, the timeout (milliseconds, INFTIM for none) expires, or a
 * signal arrives.
 *
 * Two kernel copies of the array are kept: `bits' is the working copy
 * that pollscan() fills in and that is copied back to the user, and
 * `heldbits' is a scratch copy in which pollholddrop() records, via the
 * revents field, which files it took a reference on.  Arrays of up to
 * 32 entries use stack storage; larger ones are allocated from M_TEMP.
 *
 * Returns EINVAL if nfds exceeds both the RLIMIT_NOFILE soft limit and
 * FD_SETSIZE or if itimerfix() rejects the timeout, the copyin/copyout
 * error, or EINTR when interrupted (ERESTART is mapped to EINTR).  The
 * number of entries with events is left in p->p_retval[0] by pollscan().
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
int
poll(p, uap)
	struct proc *p;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int ncoll, error = 0, timo;
	u_int nfds;
	size_t ni;
	struct pollfd p_heldbits[32];
	struct pollfd *heldbits;

	nfds = SCARG(uap, nfds);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE)
		return (EINVAL);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	if (ni > sizeof(p_heldbits))
		heldbits = malloc(ni, M_TEMP, M_WAITOK);
	else {
		bzero(p_heldbits, sizeof(p_heldbits));
		heldbits = p_heldbits;
	}
	error = copyin(SCARG(uap, fds), bits, ni);
	if (error)
		goto done_noproclock;
	/* heldbits starts out as an exact copy of the user's array. */
	bcopy(bits, heldbits, ni);
	if (SCARG(uap, timeout) != INFTIM) {
		/* Milliseconds to timeval, then to an absolute deadline. */
		atv.tv_sec = SCARG(uap, timeout) / 1000;
		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_noproclock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* A zero atv means "no deadline" in the loop below. */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	/* Hold every open file named in the array for the duration. */
	pollholddrop(p, heldbits, nfds, 1);
	timo = 0;
	PROC_LOCK(p);
retry:
	/*
	 * Remember the collision count and set P_SELECT before scanning
	 * so that a wakeup arriving during the scan can be detected.
	 * The scan itself runs without the process lock.
	 */
	ncoll = nselcoll;
	p->p_flag |= P_SELECT;
	PROC_UNLOCK(p);
	error = pollscan(p, (struct pollfd *)bits, nfds);
	PROC_LOCK(p);
	if (error || p->p_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=)) {
			/*
			 * An event of our interest may occur during locking
			 * a process.  In order to avoid missing the event
			 * that occurred during locking the process, test
			 * P_SELECT and rescan file descriptors if necessary.
			 */
			if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
				ncoll = nselcoll;
				p->p_flag |= P_SELECT;
				PROC_UNLOCK(p);
				error = pollscan(p, (struct pollfd *)bits, nfds);
				PROC_LOCK(p);
			}
			goto done;
		}
		/* Time left until the deadline, capped at 24 hours. */
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	p->p_flag &= ~P_SELECT;
	/* timo == 0 means no deadline: sleep until woken or signalled. */
	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &p->p_mtx, timo);
	else
		error = cv_wait_sig(&selwait, &p->p_mtx);
	/* Woken normally: rescan.  The process lock is held here. */
	if (error == 0)
		goto retry;

done:
	p->p_flag &= ~P_SELECT;
	PROC_UNLOCK(p);
	/* Release the references recorded in heldbits. */
	pollholddrop(p, heldbits, nfds, 0);
done_noproclock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	/* A timed-out sleep is not an error. */
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, SCARG(uap, fds), ni);
		if (error)
			goto out;
	}
out:
	/* Free whichever buffers did not fit in the stack storage. */
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
	if (ni > sizeof(p_heldbits))
		free(heldbits, M_TEMP);
	return (error);
}
1035 
1036 static int
1037 pollholddrop(p, fds, nfd, hold)
1038 	struct proc *p;
1039 	struct pollfd *fds;
1040 	u_int nfd;
1041 	int hold;
1042 {
1043 	register struct filedesc *fdp = p->p_fd;
1044 	int i;
1045 	struct file *fp;
1046 
1047 	for (i = 0; i < nfd; i++, fds++) {
1048 		if (0 <= fds->fd && fds->fd < fdp->fd_nfiles) {
1049 			fp = fdp->fd_ofiles[fds->fd];
1050 			if (hold) {
1051 				if (fp != NULL) {
1052 					fhold(fp);
1053 					fds->revents = 1;
1054 				} else
1055 					fds->revents = 0;
1056 			} else if(fp != NULL && fds->revents)
1057 				fdrop(fp, p);
1058 		}
1059 	}
1060 	return (0);
1061 }
1062 
1063 static int
1064 pollscan(p, fds, nfd)
1065 	struct proc *p;
1066 	struct pollfd *fds;
1067 	u_int nfd;
1068 {
1069 	register struct filedesc *fdp = p->p_fd;
1070 	int i;
1071 	struct file *fp;
1072 	int n = 0;
1073 
1074 	for (i = 0; i < nfd; i++, fds++) {
1075 		if (fds->fd >= fdp->fd_nfiles) {
1076 			fds->revents = POLLNVAL;
1077 			n++;
1078 		} else if (fds->fd < 0) {
1079 			fds->revents = 0;
1080 		} else {
1081 			fp = fdp->fd_ofiles[fds->fd];
1082 			if (fp == NULL) {
1083 				fds->revents = POLLNVAL;
1084 				n++;
1085 			} else {
1086 				/*
1087 				 * Note: backend also returns POLLHUP and
1088 				 * POLLERR if appropriate.
1089 				 */
1090 				fds->revents = fo_poll(fp, fds->events,
1091 				    fp->f_cred, p);
1092 				if (fds->revents != 0)
1093 					n++;
1094 			}
1095 		}
1096 	}
1097 	p->p_retval[0] = n;
1098 	return (0);
1099 }
1100 
1101 /*
1102  * OpenBSD poll system call.
1103  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1104  */
1105 #ifndef _SYS_SYSPROTO_H_
1106 struct openbsd_poll_args {
1107 	struct pollfd *fds;
1108 	u_int	nfds;
1109 	int	timeout;
1110 };
1111 #endif
1112 int
1113 openbsd_poll(p, uap)
1114 	register struct proc *p;
1115 	register struct openbsd_poll_args *uap;
1116 {
1117 	return (poll(p, (struct poll_args *)uap));
1118 }
1119 
/*
 * Poll routine that reports every normal read or write event as ready:
 * returns the subset of `events' made up of POLLIN, POLLOUT, POLLRDNORM
 * and POLLWRNORM.  `dev' and `p' are not used.
 */
/*ARGSUSED*/
int
seltrue(dev, events, p)
	dev_t dev;
	int events;
	struct proc *p;
{
	int always_ready;

	always_ready = POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM;
	return (events & always_ready);
}
1130 
1131 /*
1132  * Record a select request.
1133  */
1134 void
1135 selrecord(selector, sip)
1136 	struct proc *selector;
1137 	struct selinfo *sip;
1138 {
1139 	struct proc *p;
1140 	pid_t mypid;
1141 
1142 	mypid = selector->p_pid;
1143 	if (sip->si_pid == mypid)
1144 		return;
1145 	if (sip->si_pid && (p = pfind(sip->si_pid))) {
1146 		mtx_lock_spin(&sched_lock);
1147 	    	if (p->p_wchan == (caddr_t)&selwait) {
1148 			mtx_unlock_spin(&sched_lock);
1149 			PROC_UNLOCK(p);
1150 			sip->si_flags |= SI_COLL;
1151 			return;
1152 		}
1153 		mtx_unlock_spin(&sched_lock);
1154 		PROC_UNLOCK(p);
1155 	}
1156 	sip->si_pid = mypid;
1157 }
1158 
1159 /*
1160  * Do a wakeup when a selectable event occurs.
1161  */
1162 void
1163 selwakeup(sip)
1164 	register struct selinfo *sip;
1165 {
1166 	register struct proc *p;
1167 
1168 	if (sip->si_pid == 0)
1169 		return;
1170 	if (sip->si_flags & SI_COLL) {
1171 		nselcoll++;
1172 		sip->si_flags &= ~SI_COLL;
1173 		cv_broadcast(&selwait);
1174 	}
1175 	p = pfind(sip->si_pid);
1176 	sip->si_pid = 0;
1177 	if (p != NULL) {
1178 		mtx_lock_spin(&sched_lock);
1179 		if (p->p_wchan == (caddr_t)&selwait) {
1180 			if (p->p_stat == SSLEEP)
1181 				setrunnable(p);
1182 			else
1183 				cv_waitq_remove(p);
1184 		} else
1185 			p->p_flag &= ~P_SELECT;
1186 		mtx_unlock_spin(&sched_lock);
1187 		PROC_UNLOCK(p);
1188 	}
1189 }
1190 
1191 static void selectinit __P((void *));
1192 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1193 
1194 /* ARGSUSED*/
1195 static void
1196 selectinit(dummy)
1197 	void *dummy;
1198 {
1199 	cv_init(&selwait, "select");
1200 }
1201