xref: /freebsd/sys/kern/sys_generic.c (revision 1d66272a85cde1c8a69c58f4b5dd649babd6eca6)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $FreeBSD$
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/signalvar.h>
53 #include <sys/socketvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/selinfo.h>
59 #include <sys/sysctl.h>
60 #include <sys/sysent.h>
61 #include <sys/bio.h>
62 #include <sys/buf.h>
63 #ifdef KTRACE
64 #include <sys/ktrace.h>
65 #endif
66 #include <vm/vm.h>
67 #include <vm/vm_page.h>
68 
69 #include <machine/limits.h>
70 
71 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
72 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
73 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
74 
75 static int	pollscan __P((struct proc *, struct pollfd *, int));
76 static int	selscan __P((struct proc *, fd_mask **, fd_mask **, int));
77 static int	dofileread __P((struct proc *, struct file *, int, void *,
78 		    size_t, off_t, int));
79 static int	dofilewrite __P((struct proc *, struct file *, int,
80 		    const void *, size_t, off_t, int));
81 
82 struct file*
83 holdfp(fdp, fd, flag)
84 	struct filedesc* fdp;
85 	int fd, flag;
86 {
87 	struct file* fp;
88 
89 	if (((u_int)fd) >= fdp->fd_nfiles ||
90 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
91 	    (fp->f_flag & flag) == 0) {
92 		return (NULL);
93 	}
94 	fhold(fp);
95 	return (fp);
96 }
97 
98 /*
99  * Read system call.
100  */
101 #ifndef _SYS_SYSPROTO_H_
102 struct read_args {
103 	int	fd;
104 	void	*buf;
105 	size_t	nbyte;
106 };
107 #endif
108 int
109 read(p, uap)
110 	struct proc *p;
111 	register struct read_args *uap;
112 {
113 	register struct file *fp;
114 	int error;
115 
116 	if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL)
117 		return (EBADF);
118 	error = dofileread(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0);
119 	fdrop(fp, p);
120 	return(error);
121 }
122 
123 /*
124  * Pread system call
125  */
126 #ifndef _SYS_SYSPROTO_H_
127 struct pread_args {
128 	int	fd;
129 	void	*buf;
130 	size_t	nbyte;
131 	int	pad;
132 	off_t	offset;
133 };
134 #endif
135 int
136 pread(p, uap)
137 	struct proc *p;
138 	register struct pread_args *uap;
139 {
140 	register struct file *fp;
141 	int error;
142 
143 	if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL)
144 		return (EBADF);
145 	if (fp->f_type != DTYPE_VNODE) {
146 		error = ESPIPE;
147 	} else {
148 	    error = dofileread(p, fp, uap->fd, uap->buf, uap->nbyte,
149 		uap->offset, FOF_OFFSET);
150 	}
151 	fdrop(fp, p);
152 	return(error);
153 }
154 
155 /*
156  * Code common for read and pread
157  */
158 int
159 dofileread(p, fp, fd, buf, nbyte, offset, flags)
160 	struct proc *p;
161 	struct file *fp;
162 	int fd, flags;
163 	void *buf;
164 	size_t nbyte;
165 	off_t offset;
166 {
167 	struct uio auio;
168 	struct iovec aiov;
169 	long cnt, error = 0;
170 #ifdef KTRACE
171 	struct iovec ktriov;
172 	struct uio ktruio;
173 	int didktr = 0;
174 #endif
175 
176 	aiov.iov_base = (caddr_t)buf;
177 	aiov.iov_len = nbyte;
178 	auio.uio_iov = &aiov;
179 	auio.uio_iovcnt = 1;
180 	auio.uio_offset = offset;
181 	if (nbyte > INT_MAX)
182 		return (EINVAL);
183 	auio.uio_resid = nbyte;
184 	auio.uio_rw = UIO_READ;
185 	auio.uio_segflg = UIO_USERSPACE;
186 	auio.uio_procp = p;
187 #ifdef KTRACE
188 	/*
189 	 * if tracing, save a copy of iovec
190 	 */
191 	if (KTRPOINT(p, KTR_GENIO)) {
192 		ktriov = aiov;
193 		ktruio = auio;
194 		didktr = 1;
195 	}
196 #endif
197 	cnt = nbyte;
198 
199 	if ((error = fo_read(fp, &auio, fp->f_cred, flags, p))) {
200 		if (auio.uio_resid != cnt && (error == ERESTART ||
201 		    error == EINTR || error == EWOULDBLOCK))
202 			error = 0;
203 	}
204 	cnt -= auio.uio_resid;
205 #ifdef KTRACE
206 	if (didktr && error == 0) {
207 		ktruio.uio_iov = &ktriov;
208 		ktruio.uio_resid = cnt;
209 		ktrgenio(p->p_tracep, fd, UIO_READ, &ktruio, error);
210 	}
211 #endif
212 	p->p_retval[0] = cnt;
213 	return (error);
214 }
215 
216 /*
217  * Scatter read system call.
218  */
219 #ifndef _SYS_SYSPROTO_H_
220 struct readv_args {
221 	int	fd;
222 	struct	iovec *iovp;
223 	u_int	iovcnt;
224 };
225 #endif
226 int
227 readv(p, uap)
228 	struct proc *p;
229 	register struct readv_args *uap;
230 {
231 	register struct file *fp;
232 	register struct filedesc *fdp = p->p_fd;
233 	struct uio auio;
234 	register struct iovec *iov;
235 	struct iovec *needfree;
236 	struct iovec aiov[UIO_SMALLIOV];
237 	long i, cnt, error = 0;
238 	u_int iovlen;
239 #ifdef KTRACE
240 	struct iovec *ktriov = NULL;
241 	struct uio ktruio;
242 #endif
243 
244 	if ((fp = holdfp(fdp, uap->fd, FREAD)) == NULL)
245 		return (EBADF);
246 	/* note: can't use iovlen until iovcnt is validated */
247 	iovlen = uap->iovcnt * sizeof (struct iovec);
248 	if (uap->iovcnt > UIO_SMALLIOV) {
249 		if (uap->iovcnt > UIO_MAXIOV)
250 			return (EINVAL);
251 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
252 		needfree = iov;
253 	} else {
254 		iov = aiov;
255 		needfree = NULL;
256 	}
257 	auio.uio_iov = iov;
258 	auio.uio_iovcnt = uap->iovcnt;
259 	auio.uio_rw = UIO_READ;
260 	auio.uio_segflg = UIO_USERSPACE;
261 	auio.uio_procp = p;
262 	auio.uio_offset = -1;
263 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
264 		goto done;
265 	auio.uio_resid = 0;
266 	for (i = 0; i < uap->iovcnt; i++) {
267 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
268 			error = EINVAL;
269 			goto done;
270 		}
271 		auio.uio_resid += iov->iov_len;
272 		iov++;
273 	}
274 #ifdef KTRACE
275 	/*
276 	 * if tracing, save a copy of iovec
277 	 */
278 	if (KTRPOINT(p, KTR_GENIO))  {
279 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
280 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
281 		ktruio = auio;
282 	}
283 #endif
284 	cnt = auio.uio_resid;
285 	if ((error = fo_read(fp, &auio, fp->f_cred, 0, p))) {
286 		if (auio.uio_resid != cnt && (error == ERESTART ||
287 		    error == EINTR || error == EWOULDBLOCK))
288 			error = 0;
289 	}
290 	cnt -= auio.uio_resid;
291 #ifdef KTRACE
292 	if (ktriov != NULL) {
293 		if (error == 0) {
294 			ktruio.uio_iov = ktriov;
295 			ktruio.uio_resid = cnt;
296 			ktrgenio(p->p_tracep, uap->fd, UIO_READ, &ktruio,
297 			    error);
298 		}
299 		FREE(ktriov, M_TEMP);
300 	}
301 #endif
302 	p->p_retval[0] = cnt;
303 done:
304 	fdrop(fp, p);
305 	if (needfree)
306 		FREE(needfree, M_IOV);
307 	return (error);
308 }
309 
310 /*
311  * Write system call
312  */
313 #ifndef _SYS_SYSPROTO_H_
314 struct write_args {
315 	int	fd;
316 	const void *buf;
317 	size_t	nbyte;
318 };
319 #endif
320 int
321 write(p, uap)
322 	struct proc *p;
323 	register struct write_args *uap;
324 {
325 	register struct file *fp;
326 	int error;
327 
328 	if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL)
329 		return (EBADF);
330 	error = dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0);
331 	fdrop(fp, p);
332 	return(error);
333 }
334 
335 /*
336  * Pwrite system call
337  */
338 #ifndef _SYS_SYSPROTO_H_
339 struct pwrite_args {
340 	int	fd;
341 	const void *buf;
342 	size_t	nbyte;
343 	int	pad;
344 	off_t	offset;
345 };
346 #endif
347 int
348 pwrite(p, uap)
349 	struct proc *p;
350 	register struct pwrite_args *uap;
351 {
352 	register struct file *fp;
353 	int error;
354 
355 	if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL)
356 		return (EBADF);
357 	if (fp->f_type != DTYPE_VNODE) {
358 		error = ESPIPE;
359 	} else {
360 	    error = dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte,
361 		uap->offset, FOF_OFFSET);
362 	}
363 	fdrop(fp, p);
364 	return(error);
365 }
366 
367 static int
368 dofilewrite(p, fp, fd, buf, nbyte, offset, flags)
369 	struct proc *p;
370 	struct file *fp;
371 	int fd, flags;
372 	const void *buf;
373 	size_t nbyte;
374 	off_t offset;
375 {
376 	struct uio auio;
377 	struct iovec aiov;
378 	long cnt, error = 0;
379 #ifdef KTRACE
380 	struct iovec ktriov;
381 	struct uio ktruio;
382 	int didktr = 0;
383 #endif
384 
385 	aiov.iov_base = (void *)(uintptr_t)buf;
386 	aiov.iov_len = nbyte;
387 	auio.uio_iov = &aiov;
388 	auio.uio_iovcnt = 1;
389 	auio.uio_offset = offset;
390 	if (nbyte > INT_MAX)
391 		return (EINVAL);
392 	auio.uio_resid = nbyte;
393 	auio.uio_rw = UIO_WRITE;
394 	auio.uio_segflg = UIO_USERSPACE;
395 	auio.uio_procp = p;
396 #ifdef KTRACE
397 	/*
398 	 * if tracing, save a copy of iovec and uio
399 	 */
400 	if (KTRPOINT(p, KTR_GENIO)) {
401 		ktriov = aiov;
402 		ktruio = auio;
403 		didktr = 1;
404 	}
405 #endif
406 	cnt = nbyte;
407 	if (fp->f_type == DTYPE_VNODE)
408 		bwillwrite();
409 	if ((error = fo_write(fp, &auio, fp->f_cred, flags, p))) {
410 		if (auio.uio_resid != cnt && (error == ERESTART ||
411 		    error == EINTR || error == EWOULDBLOCK))
412 			error = 0;
413 		if (error == EPIPE)
414 			psignal(p, SIGPIPE);
415 	}
416 	cnt -= auio.uio_resid;
417 #ifdef KTRACE
418 	if (didktr && error == 0) {
419 		ktruio.uio_iov = &ktriov;
420 		ktruio.uio_resid = cnt;
421 		ktrgenio(p->p_tracep, fd, UIO_WRITE, &ktruio, error);
422 	}
423 #endif
424 	p->p_retval[0] = cnt;
425 	return (error);
426 }
427 
428 /*
429  * Gather write system call
430  */
431 #ifndef _SYS_SYSPROTO_H_
432 struct writev_args {
433 	int	fd;
434 	struct	iovec *iovp;
435 	u_int	iovcnt;
436 };
437 #endif
438 int
439 writev(p, uap)
440 	struct proc *p;
441 	register struct writev_args *uap;
442 {
443 	register struct file *fp;
444 	register struct filedesc *fdp = p->p_fd;
445 	struct uio auio;
446 	register struct iovec *iov;
447 	struct iovec *needfree;
448 	struct iovec aiov[UIO_SMALLIOV];
449 	long i, cnt, error = 0;
450 	u_int iovlen;
451 #ifdef KTRACE
452 	struct iovec *ktriov = NULL;
453 	struct uio ktruio;
454 #endif
455 
456 	if ((fp = holdfp(fdp, uap->fd, FWRITE)) == NULL)
457 		return (EBADF);
458 	/* note: can't use iovlen until iovcnt is validated */
459 	iovlen = uap->iovcnt * sizeof (struct iovec);
460 	if (uap->iovcnt > UIO_SMALLIOV) {
461 		if (uap->iovcnt > UIO_MAXIOV) {
462 			needfree = NULL;
463 			error = EINVAL;
464 			goto done;
465 		}
466 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
467 		needfree = iov;
468 	} else {
469 		iov = aiov;
470 		needfree = NULL;
471 	}
472 	auio.uio_iov = iov;
473 	auio.uio_iovcnt = uap->iovcnt;
474 	auio.uio_rw = UIO_WRITE;
475 	auio.uio_segflg = UIO_USERSPACE;
476 	auio.uio_procp = p;
477 	auio.uio_offset = -1;
478 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
479 		goto done;
480 	auio.uio_resid = 0;
481 	for (i = 0; i < uap->iovcnt; i++) {
482 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
483 			error = EINVAL;
484 			goto done;
485 		}
486 		auio.uio_resid += iov->iov_len;
487 		iov++;
488 	}
489 #ifdef KTRACE
490 	/*
491 	 * if tracing, save a copy of iovec and uio
492 	 */
493 	if (KTRPOINT(p, KTR_GENIO))  {
494 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
495 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
496 		ktruio = auio;
497 	}
498 #endif
499 	cnt = auio.uio_resid;
500 	if (fp->f_type == DTYPE_VNODE)
501 		bwillwrite();
502 	if ((error = fo_write(fp, &auio, fp->f_cred, 0, p))) {
503 		if (auio.uio_resid != cnt && (error == ERESTART ||
504 		    error == EINTR || error == EWOULDBLOCK))
505 			error = 0;
506 		if (error == EPIPE)
507 			psignal(p, SIGPIPE);
508 	}
509 	cnt -= auio.uio_resid;
510 #ifdef KTRACE
511 	if (ktriov != NULL) {
512 		if (error == 0) {
513 			ktruio.uio_iov = ktriov;
514 			ktruio.uio_resid = cnt;
515 			ktrgenio(p->p_tracep, uap->fd, UIO_WRITE, &ktruio,
516 			    error);
517 		}
518 		FREE(ktriov, M_TEMP);
519 	}
520 #endif
521 	p->p_retval[0] = cnt;
522 done:
523 	fdrop(fp, p);
524 	if (needfree)
525 		FREE(needfree, M_IOV);
526 	return (error);
527 }
528 
529 /*
530  * Ioctl system call
531  */
532 #ifndef _SYS_SYSPROTO_H_
533 struct ioctl_args {
534 	int	fd;
535 	u_long	com;
536 	caddr_t	data;
537 };
538 #endif
539 /* ARGSUSED */
540 int
541 ioctl(p, uap)
542 	struct proc *p;
543 	register struct ioctl_args *uap;
544 {
545 	register struct file *fp;
546 	register struct filedesc *fdp;
547 	register u_long com;
548 	int error;
549 	register u_int size;
550 	caddr_t data, memp;
551 	int tmp;
552 #define STK_PARAMS	128
553 	union {
554 	    char stkbuf[STK_PARAMS];
555 	    long align;
556 	} ubuf;
557 
558 	fdp = p->p_fd;
559 	if ((u_int)uap->fd >= fdp->fd_nfiles ||
560 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
561 		return (EBADF);
562 
563 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
564 		return (EBADF);
565 
566 	switch (com = uap->com) {
567 	case FIONCLEX:
568 		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
569 		return (0);
570 	case FIOCLEX:
571 		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
572 		return (0);
573 	}
574 
575 	/*
576 	 * Interpret high order word to find amount of data to be
577 	 * copied to/from the user's address space.
578 	 */
579 	size = IOCPARM_LEN(com);
580 	if (size > IOCPARM_MAX)
581 		return (ENOTTY);
582 
583 	fhold(fp);
584 
585 	memp = NULL;
586 	if (size > sizeof (ubuf.stkbuf)) {
587 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
588 		data = memp;
589 	} else {
590 		data = ubuf.stkbuf;
591 	}
592 	if (com&IOC_IN) {
593 		if (size) {
594 			error = copyin(uap->data, data, (u_int)size);
595 			if (error) {
596 				if (memp)
597 					free(memp, M_IOCTLOPS);
598 				fdrop(fp, p);
599 				return (error);
600 			}
601 		} else {
602 			*(caddr_t *)data = uap->data;
603 		}
604 	} else if ((com&IOC_OUT) && size) {
605 		/*
606 		 * Zero the buffer so the user always
607 		 * gets back something deterministic.
608 		 */
609 		bzero(data, size);
610 	} else if (com&IOC_VOID) {
611 		*(caddr_t *)data = uap->data;
612 	}
613 
614 	switch (com) {
615 
616 	case FIONBIO:
617 		if ((tmp = *(int *)data))
618 			fp->f_flag |= FNONBLOCK;
619 		else
620 			fp->f_flag &= ~FNONBLOCK;
621 		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, p);
622 		break;
623 
624 	case FIOASYNC:
625 		if ((tmp = *(int *)data))
626 			fp->f_flag |= FASYNC;
627 		else
628 			fp->f_flag &= ~FASYNC;
629 		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, p);
630 		break;
631 
632 	default:
633 		error = fo_ioctl(fp, com, data, p);
634 		/*
635 		 * Copy any data to user, size was
636 		 * already set and checked above.
637 		 */
638 		if (error == 0 && (com&IOC_OUT) && size)
639 			error = copyout(data, uap->data, (u_int)size);
640 		break;
641 	}
642 	if (memp)
643 		free(memp, M_IOCTLOPS);
644 	fdrop(fp, p);
645 	return (error);
646 }
647 
648 static int	nselcoll;	/* Select collisions since boot */
649 int	selwait;
650 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
651 
652 /*
653  * Select system call.
654  */
655 #ifndef _SYS_SYSPROTO_H_
656 struct select_args {
657 	int	nd;
658 	fd_set	*in, *ou, *ex;
659 	struct	timeval *tv;
660 };
661 #endif
662 int
663 select(p, uap)
664 	register struct proc *p;
665 	register struct select_args *uap;
666 {
667 	/*
668 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
669 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
670 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
671 	 * of 256.
672 	 */
673 	fd_mask s_selbits[howmany(2048, NFDBITS)];
674 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
675 	struct timeval atv, rtv, ttv;
676 	int s, ncoll, error, timo;
677 	u_int nbufbytes, ncpbytes, nfdbits;
678 
679 	if (uap->nd < 0)
680 		return (EINVAL);
681 	if (uap->nd > p->p_fd->fd_nfiles)
682 		uap->nd = p->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
683 
684 	/*
685 	 * Allocate just enough bits for the non-null fd_sets.  Use the
686 	 * preallocated auto buffer if possible.
687 	 */
688 	nfdbits = roundup(uap->nd, NFDBITS);
689 	ncpbytes = nfdbits / NBBY;
690 	nbufbytes = 0;
691 	if (uap->in != NULL)
692 		nbufbytes += 2 * ncpbytes;
693 	if (uap->ou != NULL)
694 		nbufbytes += 2 * ncpbytes;
695 	if (uap->ex != NULL)
696 		nbufbytes += 2 * ncpbytes;
697 	if (nbufbytes <= sizeof s_selbits)
698 		selbits = &s_selbits[0];
699 	else
700 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
701 
702 	/*
703 	 * Assign pointers into the bit buffers and fetch the input bits.
704 	 * Put the output buffers together so that they can be bzeroed
705 	 * together.
706 	 */
707 	sbp = selbits;
708 #define	getbits(name, x) \
709 	do {								\
710 		if (uap->name == NULL)					\
711 			ibits[x] = NULL;				\
712 		else {							\
713 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
714 			obits[x] = sbp;					\
715 			sbp += ncpbytes / sizeof *sbp;			\
716 			error = copyin(uap->name, ibits[x], ncpbytes);	\
717 			if (error != 0)					\
718 				goto done;				\
719 		}							\
720 	} while (0)
721 	getbits(in, 0);
722 	getbits(ou, 1);
723 	getbits(ex, 2);
724 #undef	getbits
725 	if (nbufbytes != 0)
726 		bzero(selbits, nbufbytes / 2);
727 
728 	if (uap->tv) {
729 		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
730 			sizeof (atv));
731 		if (error)
732 			goto done;
733 		if (itimerfix(&atv)) {
734 			error = EINVAL;
735 			goto done;
736 		}
737 		getmicrouptime(&rtv);
738 		timevaladd(&atv, &rtv);
739 	} else {
740 		atv.tv_sec = 0;
741 		atv.tv_usec = 0;
742 	}
743 	timo = 0;
744 retry:
745 	ncoll = nselcoll;
746 	p->p_flag |= P_SELECT;
747 	error = selscan(p, ibits, obits, uap->nd);
748 	if (error || p->p_retval[0])
749 		goto done;
750 	if (atv.tv_sec || atv.tv_usec) {
751 		getmicrouptime(&rtv);
752 		if (timevalcmp(&rtv, &atv, >=))
753 			goto done;
754 		ttv = atv;
755 		timevalsub(&ttv, &rtv);
756 		timo = ttv.tv_sec > 24 * 60 * 60 ?
757 		    24 * 60 * 60 * hz : tvtohz(&ttv);
758 	}
759 	s = splhigh();
760 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
761 		splx(s);
762 		goto retry;
763 	}
764 	p->p_flag &= ~P_SELECT;
765 
766 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
767 
768 	splx(s);
769 	if (error == 0)
770 		goto retry;
771 done:
772 	p->p_flag &= ~P_SELECT;
773 	/* select is not restarted after signals... */
774 	if (error == ERESTART)
775 		error = EINTR;
776 	if (error == EWOULDBLOCK)
777 		error = 0;
778 #define	putbits(name, x) \
779 	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
780 		error = error2;
781 	if (error == 0) {
782 		int error2;
783 
784 		putbits(in, 0);
785 		putbits(ou, 1);
786 		putbits(ex, 2);
787 #undef putbits
788 	}
789 	if (selbits != &s_selbits[0])
790 		free(selbits, M_SELECT);
791 	return (error);
792 }
793 
794 static int
795 selscan(p, ibits, obits, nfd)
796 	struct proc *p;
797 	fd_mask **ibits, **obits;
798 	int nfd;
799 {
800 	struct filedesc *fdp = p->p_fd;
801 	int msk, i, fd;
802 	fd_mask bits;
803 	struct file *fp;
804 	int n = 0;
805 	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
806 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
807 
808 	for (msk = 0; msk < 3; msk++) {
809 		if (ibits[msk] == NULL)
810 			continue;
811 		for (i = 0; i < nfd; i += NFDBITS) {
812 			bits = ibits[msk][i/NFDBITS];
813 			/* ffs(int mask) not portable, fd_mask is long */
814 			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
815 				if (!(bits & 1))
816 					continue;
817 				fp = fdp->fd_ofiles[fd];
818 				if (fp == NULL)
819 					return (EBADF);
820 				if (fo_poll(fp, flag[msk], fp->f_cred, p)) {
821 					obits[msk][(fd)/NFDBITS] |=
822 					    ((fd_mask)1 << ((fd) % NFDBITS));
823 					n++;
824 				}
825 			}
826 		}
827 	}
828 	p->p_retval[0] = n;
829 	return (0);
830 }
831 
832 /*
833  * Poll system call.
834  */
835 #ifndef _SYS_SYSPROTO_H_
836 struct poll_args {
837 	struct pollfd *fds;
838 	u_int	nfds;
839 	int	timeout;
840 };
841 #endif
842 int
843 poll(p, uap)
844 	register struct proc *p;
845 	register struct poll_args *uap;
846 {
847 	caddr_t bits;
848 	char smallbits[32 * sizeof(struct pollfd)];
849 	struct timeval atv, rtv, ttv;
850 	int s, ncoll, error = 0, timo;
851 	size_t ni;
852 
853 	if (SCARG(uap, nfds) > p->p_fd->fd_nfiles) {
854 		/* forgiving; slightly wrong */
855 		SCARG(uap, nfds) = p->p_fd->fd_nfiles;
856 	}
857 	ni = SCARG(uap, nfds) * sizeof(struct pollfd);
858 	if (ni > sizeof(smallbits))
859 		bits = malloc(ni, M_TEMP, M_WAITOK);
860 	else
861 		bits = smallbits;
862 	error = copyin(SCARG(uap, fds), bits, ni);
863 	if (error)
864 		goto done;
865 	if (SCARG(uap, timeout) != INFTIM) {
866 		atv.tv_sec = SCARG(uap, timeout) / 1000;
867 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
868 		if (itimerfix(&atv)) {
869 			error = EINVAL;
870 			goto done;
871 		}
872 		getmicrouptime(&rtv);
873 		timevaladd(&atv, &rtv);
874 	} else {
875 		atv.tv_sec = 0;
876 		atv.tv_usec = 0;
877 	}
878 	timo = 0;
879 retry:
880 	ncoll = nselcoll;
881 	p->p_flag |= P_SELECT;
882 	error = pollscan(p, (struct pollfd *)bits, SCARG(uap, nfds));
883 	if (error || p->p_retval[0])
884 		goto done;
885 	if (atv.tv_sec || atv.tv_usec) {
886 		getmicrouptime(&rtv);
887 		if (timevalcmp(&rtv, &atv, >=))
888 			goto done;
889 		ttv = atv;
890 		timevalsub(&ttv, &rtv);
891 		timo = ttv.tv_sec > 24 * 60 * 60 ?
892 		    24 * 60 * 60 * hz : tvtohz(&ttv);
893 	}
894 	s = splhigh();
895 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
896 		splx(s);
897 		goto retry;
898 	}
899 	p->p_flag &= ~P_SELECT;
900 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
901 	splx(s);
902 	if (error == 0)
903 		goto retry;
904 done:
905 	p->p_flag &= ~P_SELECT;
906 	/* poll is not restarted after signals... */
907 	if (error == ERESTART)
908 		error = EINTR;
909 	if (error == EWOULDBLOCK)
910 		error = 0;
911 	if (error == 0) {
912 		error = copyout(bits, SCARG(uap, fds), ni);
913 		if (error)
914 			goto out;
915 	}
916 out:
917 	if (ni > sizeof(smallbits))
918 		free(bits, M_TEMP);
919 	return (error);
920 }
921 
922 static int
923 pollscan(p, fds, nfd)
924 	struct proc *p;
925 	struct pollfd *fds;
926 	int nfd;
927 {
928 	register struct filedesc *fdp = p->p_fd;
929 	int i;
930 	struct file *fp;
931 	int n = 0;
932 
933 	for (i = 0; i < nfd; i++, fds++) {
934 		if (fds->fd >= fdp->fd_nfiles) {
935 			fds->revents = POLLNVAL;
936 			n++;
937 		} else if (fds->fd < 0) {
938 			fds->revents = 0;
939 		} else {
940 			fp = fdp->fd_ofiles[fds->fd];
941 			if (fp == NULL) {
942 				fds->revents = POLLNVAL;
943 				n++;
944 			} else {
945 				/*
946 				 * Note: backend also returns POLLHUP and
947 				 * POLLERR if appropriate.
948 				 */
949 				fds->revents = fo_poll(fp, fds->events,
950 				    fp->f_cred, p);
951 				if (fds->revents != 0)
952 					n++;
953 			}
954 		}
955 	}
956 	p->p_retval[0] = n;
957 	return (0);
958 }
959 
960 /*
961  * OpenBSD poll system call.
962  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
963  */
964 #ifndef _SYS_SYSPROTO_H_
965 struct openbsd_poll_args {
966 	struct pollfd *fds;
967 	u_int	nfds;
968 	int	timeout;
969 };
970 #endif
971 int
972 openbsd_poll(p, uap)
973 	register struct proc *p;
974 	register struct openbsd_poll_args *uap;
975 {
976 	return (poll(p, (struct poll_args *)uap));
977 }
978 
979 /*ARGSUSED*/
980 int
981 seltrue(dev, events, p)
982 	dev_t dev;
983 	int events;
984 	struct proc *p;
985 {
986 
987 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
988 }
989 
990 /*
991  * Record a select request.
992  */
993 void
994 selrecord(selector, sip)
995 	struct proc *selector;
996 	struct selinfo *sip;
997 {
998 	struct proc *p;
999 	pid_t mypid;
1000 
1001 	mypid = selector->p_pid;
1002 	if (sip->si_pid == mypid)
1003 		return;
1004 	if (sip->si_pid && (p = pfind(sip->si_pid)) &&
1005 	    p->p_wchan == (caddr_t)&selwait)
1006 		sip->si_flags |= SI_COLL;
1007 	else
1008 		sip->si_pid = mypid;
1009 }
1010 
1011 /*
1012  * Do a wakeup when a selectable event occurs.
1013  */
1014 void
1015 selwakeup(sip)
1016 	register struct selinfo *sip;
1017 {
1018 	register struct proc *p;
1019 	int s;
1020 
1021 	if (sip->si_pid == 0)
1022 		return;
1023 	if (sip->si_flags & SI_COLL) {
1024 		nselcoll++;
1025 		sip->si_flags &= ~SI_COLL;
1026 		wakeup((caddr_t)&selwait);
1027 	}
1028 	p = pfind(sip->si_pid);
1029 	sip->si_pid = 0;
1030 	if (p != NULL) {
1031 		s = splhigh();
1032 		mtx_enter(&sched_lock, MTX_SPIN);
1033 		if (p->p_wchan == (caddr_t)&selwait) {
1034 			if (p->p_stat == SSLEEP)
1035 				setrunnable(p);
1036 			else
1037 				unsleep(p);
1038 		} else if (p->p_flag & P_SELECT)
1039 			p->p_flag &= ~P_SELECT;
1040 		mtx_exit(&sched_lock, MTX_SPIN);
1041 		splx(s);
1042 	}
1043 }
1044