xref: /freebsd/sys/kern/sys_generic.c (revision a79b71281cd63ad7a6cc43a6d5673a2510b51630)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $FreeBSD$
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/signalvar.h>
53 #include <sys/socketvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/sysctl.h>
59 #include <sys/sysent.h>
60 #ifdef KTRACE
61 #include <sys/ktrace.h>
62 #endif
63 
64 #include <machine/limits.h>
65 
66 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
67 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
68 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
69 
70 static int	pollscan __P((struct proc *, struct pollfd *, int));
71 static int	selscan __P((struct proc *, fd_mask **, fd_mask **, int));
72 static int	dofileread __P((struct proc *, struct file *, int, void *,
73 		    size_t, off_t, int));
74 static int	dofilewrite __P((struct proc *, struct file *, int,
75 		    const void *, size_t, off_t, int));
76 
77 struct file*
78 getfp(fdp, fd, flag)
79 	struct filedesc* fdp;
80 	int fd, flag;
81 {
82 	struct file* fp;
83 
84 	if (((u_int)fd) >= fdp->fd_nfiles ||
85 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
86 	    (fp->f_flag & flag) == 0)
87 		return (NULL);
88 	return (fp);
89 }
90 
91 /*
92  * Read system call.
93  */
94 #ifndef _SYS_SYSPROTO_H_
95 struct read_args {
96 	int	fd;
97 	void	*buf;
98 	size_t	nbyte;
99 };
100 #endif
101 int
102 read(p, uap)
103 	struct proc *p;
104 	register struct read_args *uap;
105 {
106 	register struct file *fp;
107 
108 	if ((fp = getfp(p->p_fd, uap->fd, FREAD)) == NULL)
109 		return (EBADF);
110 	return (dofileread(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0));
111 }
112 
113 /*
114  * Pread system call
115  */
116 #ifndef _SYS_SYSPROTO_H_
117 struct pread_args {
118 	int	fd;
119 	void	*buf;
120 	size_t	nbyte;
121 	int	pad;
122 	off_t	offset;
123 };
124 #endif
125 int
126 pread(p, uap)
127 	struct proc *p;
128 	register struct pread_args *uap;
129 {
130 	register struct file *fp;
131 
132 	if ((fp = getfp(p->p_fd, uap->fd, FREAD)) == NULL)
133 		return (EBADF);
134 	if (fp->f_type != DTYPE_VNODE)
135 		return (ESPIPE);
136 	return (dofileread(p, fp, uap->fd, uap->buf, uap->nbyte, uap->offset,
137 	    FOF_OFFSET));
138 }
139 
140 /*
141  * Code common for read and pread
142  */
143 int
144 dofileread(p, fp, fd, buf, nbyte, offset, flags)
145 	struct proc *p;
146 	struct file *fp;
147 	int fd, flags;
148 	void *buf;
149 	size_t nbyte;
150 	off_t offset;
151 {
152 	struct uio auio;
153 	struct iovec aiov;
154 	long cnt, error = 0;
155 #ifdef KTRACE
156 	struct iovec ktriov;
157 #endif
158 
159 	aiov.iov_base = (caddr_t)buf;
160 	aiov.iov_len = nbyte;
161 	auio.uio_iov = &aiov;
162 	auio.uio_iovcnt = 1;
163 	auio.uio_offset = offset;
164 	if (nbyte > INT_MAX)
165 		return (EINVAL);
166 	auio.uio_resid = nbyte;
167 	auio.uio_rw = UIO_READ;
168 	auio.uio_segflg = UIO_USERSPACE;
169 	auio.uio_procp = p;
170 #ifdef KTRACE
171 	/*
172 	 * if tracing, save a copy of iovec
173 	 */
174 	if (KTRPOINT(p, KTR_GENIO))
175 		ktriov = aiov;
176 #endif
177 	cnt = nbyte;
178 	if ((error = fo_read(fp, &auio, fp->f_cred, flags, p)))
179 		if (auio.uio_resid != cnt && (error == ERESTART ||
180 		    error == EINTR || error == EWOULDBLOCK))
181 			error = 0;
182 	cnt -= auio.uio_resid;
183 #ifdef KTRACE
184 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
185 		ktrgenio(p->p_tracep, fd, UIO_READ, &ktriov, cnt, error);
186 #endif
187 	p->p_retval[0] = cnt;
188 	return (error);
189 }
190 
191 /*
192  * Scatter read system call.
193  */
194 #ifndef _SYS_SYSPROTO_H_
195 struct readv_args {
196 	int	fd;
197 	struct	iovec *iovp;
198 	u_int	iovcnt;
199 };
200 #endif
201 int
202 readv(p, uap)
203 	struct proc *p;
204 	register struct readv_args *uap;
205 {
206 	register struct file *fp;
207 	register struct filedesc *fdp = p->p_fd;
208 	struct uio auio;
209 	register struct iovec *iov;
210 	struct iovec *needfree;
211 	struct iovec aiov[UIO_SMALLIOV];
212 	long i, cnt, error = 0;
213 	u_int iovlen;
214 #ifdef KTRACE
215 	struct iovec *ktriov = NULL;
216 #endif
217 
218 	if ((fp = getfp(fdp, uap->fd, FREAD)) == NULL)
219 		return (EBADF);
220 	/* note: can't use iovlen until iovcnt is validated */
221 	iovlen = uap->iovcnt * sizeof (struct iovec);
222 	if (uap->iovcnt > UIO_SMALLIOV) {
223 		if (uap->iovcnt > UIO_MAXIOV)
224 			return (EINVAL);
225 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
226 		needfree = iov;
227 	} else {
228 		iov = aiov;
229 		needfree = NULL;
230 	}
231 	auio.uio_iov = iov;
232 	auio.uio_iovcnt = uap->iovcnt;
233 	auio.uio_rw = UIO_READ;
234 	auio.uio_segflg = UIO_USERSPACE;
235 	auio.uio_procp = p;
236 	auio.uio_offset = -1;
237 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
238 		goto done;
239 	auio.uio_resid = 0;
240 	for (i = 0; i < uap->iovcnt; i++) {
241 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
242 			error = EINVAL;
243 			goto done;
244 		}
245 		auio.uio_resid += iov->iov_len;
246 		iov++;
247 	}
248 #ifdef KTRACE
249 	/*
250 	 * if tracing, save a copy of iovec
251 	 */
252 	if (KTRPOINT(p, KTR_GENIO))  {
253 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
254 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
255 	}
256 #endif
257 	cnt = auio.uio_resid;
258 	if ((error = fo_read(fp, &auio, fp->f_cred, 0, p)))
259 		if (auio.uio_resid != cnt && (error == ERESTART ||
260 		    error == EINTR || error == EWOULDBLOCK))
261 			error = 0;
262 	cnt -= auio.uio_resid;
263 #ifdef KTRACE
264 	if (ktriov != NULL) {
265 		if (error == 0)
266 			ktrgenio(p->p_tracep, uap->fd, UIO_READ, ktriov,
267 			    cnt, error);
268 		FREE(ktriov, M_TEMP);
269 	}
270 #endif
271 	p->p_retval[0] = cnt;
272 done:
273 	if (needfree)
274 		FREE(needfree, M_IOV);
275 	return (error);
276 }
277 
278 /*
279  * Write system call
280  */
281 #ifndef _SYS_SYSPROTO_H_
282 struct write_args {
283 	int	fd;
284 	const void *buf;
285 	size_t	nbyte;
286 };
287 #endif
288 int
289 write(p, uap)
290 	struct proc *p;
291 	register struct write_args *uap;
292 {
293 	register struct file *fp;
294 
295 	if ((fp = getfp(p->p_fd, uap->fd, FWRITE)) == NULL)
296 		return (EBADF);
297 	return (dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0));
298 }
299 
300 /*
301  * Pwrite system call
302  */
303 #ifndef _SYS_SYSPROTO_H_
304 struct pwrite_args {
305 	int	fd;
306 	const void *buf;
307 	size_t	nbyte;
308 	int	pad;
309 	off_t	offset;
310 };
311 #endif
312 int
313 pwrite(p, uap)
314 	struct proc *p;
315 	register struct pwrite_args *uap;
316 {
317 	register struct file *fp;
318 
319 	if ((fp = getfp(p->p_fd, uap->fd, FWRITE)) == NULL)
320 		return (EBADF);
321 	if (fp->f_type != DTYPE_VNODE)
322 		return (ESPIPE);
323 	return (dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte, uap->offset,
324 	    FOF_OFFSET));
325 }
326 
327 static int
328 dofilewrite(p, fp, fd, buf, nbyte, offset, flags)
329 	struct proc *p;
330 	struct file *fp;
331 	int fd, flags;
332 	const void *buf;
333 	size_t nbyte;
334 	off_t offset;
335 {
336 	struct uio auio;
337 	struct iovec aiov;
338 	long cnt, error = 0;
339 #ifdef KTRACE
340 	struct iovec ktriov;
341 #endif
342 
343 	aiov.iov_base = (void *)buf;
344 	aiov.iov_len = nbyte;
345 	auio.uio_iov = &aiov;
346 	auio.uio_iovcnt = 1;
347 	auio.uio_offset = offset;
348 	if (nbyte > INT_MAX)
349 		return (EINVAL);
350 	auio.uio_resid = nbyte;
351 	auio.uio_rw = UIO_WRITE;
352 	auio.uio_segflg = UIO_USERSPACE;
353 	auio.uio_procp = p;
354 #ifdef KTRACE
355 	/*
356 	 * if tracing, save a copy of iovec
357 	 */
358 	if (KTRPOINT(p, KTR_GENIO))
359 		ktriov = aiov;
360 #endif
361 	cnt = nbyte;
362 	if ((error = fo_write(fp, &auio, fp->f_cred, flags, p))) {
363 		if (auio.uio_resid != cnt && (error == ERESTART ||
364 		    error == EINTR || error == EWOULDBLOCK))
365 			error = 0;
366 		if (error == EPIPE)
367 			psignal(p, SIGPIPE);
368 	}
369 	cnt -= auio.uio_resid;
370 #ifdef KTRACE
371 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
372 		ktrgenio(p->p_tracep, fd, UIO_WRITE,
373 		    &ktriov, cnt, error);
374 #endif
375 	p->p_retval[0] = cnt;
376 	return (error);
377 }
378 
379 /*
380  * Gather write system call
381  */
382 #ifndef _SYS_SYSPROTO_H_
383 struct writev_args {
384 	int	fd;
385 	struct	iovec *iovp;
386 	u_int	iovcnt;
387 };
388 #endif
389 int
390 writev(p, uap)
391 	struct proc *p;
392 	register struct writev_args *uap;
393 {
394 	register struct file *fp;
395 	register struct filedesc *fdp = p->p_fd;
396 	struct uio auio;
397 	register struct iovec *iov;
398 	struct iovec *needfree;
399 	struct iovec aiov[UIO_SMALLIOV];
400 	long i, cnt, error = 0;
401 	u_int iovlen;
402 #ifdef KTRACE
403 	struct iovec *ktriov = NULL;
404 #endif
405 
406 	if ((fp = getfp(fdp, uap->fd, FWRITE)) == NULL)
407 		return (EBADF);
408 	fhold(fp);
409 	/* note: can't use iovlen until iovcnt is validated */
410 	iovlen = uap->iovcnt * sizeof (struct iovec);
411 	if (uap->iovcnt > UIO_SMALLIOV) {
412 		if (uap->iovcnt > UIO_MAXIOV) {
413 			needfree = NULL;
414 			error = EINVAL;
415 			goto done;
416 		}
417 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
418 		needfree = iov;
419 	} else {
420 		iov = aiov;
421 		needfree = NULL;
422 	}
423 	auio.uio_iov = iov;
424 	auio.uio_iovcnt = uap->iovcnt;
425 	auio.uio_rw = UIO_WRITE;
426 	auio.uio_segflg = UIO_USERSPACE;
427 	auio.uio_procp = p;
428 	auio.uio_offset = -1;
429 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
430 		goto done;
431 	auio.uio_resid = 0;
432 	for (i = 0; i < uap->iovcnt; i++) {
433 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
434 			error = EINVAL;
435 			goto done;
436 		}
437 		auio.uio_resid += iov->iov_len;
438 		iov++;
439 	}
440 #ifdef KTRACE
441 	/*
442 	 * if tracing, save a copy of iovec
443 	 */
444 	if (KTRPOINT(p, KTR_GENIO))  {
445 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
446 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
447 	}
448 #endif
449 	cnt = auio.uio_resid;
450 	if ((error = fo_write(fp, &auio, fp->f_cred, 0, p))) {
451 		if (auio.uio_resid != cnt && (error == ERESTART ||
452 		    error == EINTR || error == EWOULDBLOCK))
453 			error = 0;
454 		if (error == EPIPE)
455 			psignal(p, SIGPIPE);
456 	}
457 	cnt -= auio.uio_resid;
458 #ifdef KTRACE
459 	if (ktriov != NULL) {
460 		if (error == 0)
461 			ktrgenio(p->p_tracep, uap->fd, UIO_WRITE,
462 				ktriov, cnt, error);
463 		FREE(ktriov, M_TEMP);
464 	}
465 #endif
466 	p->p_retval[0] = cnt;
467 done:
468 	fdrop(fp, p);
469 	if (needfree)
470 		FREE(needfree, M_IOV);
471 	return (error);
472 }
473 
474 /*
475  * Ioctl system call
476  */
477 #ifndef _SYS_SYSPROTO_H_
478 struct ioctl_args {
479 	int	fd;
480 	u_long	com;
481 	caddr_t	data;
482 };
483 #endif
484 /* ARGSUSED */
485 int
486 ioctl(p, uap)
487 	struct proc *p;
488 	register struct ioctl_args *uap;
489 {
490 	register struct file *fp;
491 	register struct filedesc *fdp;
492 	register u_long com;
493 	int error;
494 	register u_int size;
495 	caddr_t data, memp;
496 	int tmp;
497 #define STK_PARAMS	128
498 	union {
499 	    char stkbuf[STK_PARAMS];
500 	    long align;
501 	} ubuf;
502 
503 	fdp = p->p_fd;
504 	if ((u_int)uap->fd >= fdp->fd_nfiles ||
505 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
506 		return (EBADF);
507 
508 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
509 		return (EBADF);
510 
511 	switch (com = uap->com) {
512 	case FIONCLEX:
513 		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
514 		return (0);
515 	case FIOCLEX:
516 		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
517 		return (0);
518 	}
519 
520 	/*
521 	 * Interpret high order word to find amount of data to be
522 	 * copied to/from the user's address space.
523 	 */
524 	size = IOCPARM_LEN(com);
525 	if (size > IOCPARM_MAX)
526 		return (ENOTTY);
527 	memp = NULL;
528 	if (size > sizeof (ubuf.stkbuf)) {
529 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
530 		data = memp;
531 	} else
532 		data = ubuf.stkbuf;
533 	if (com&IOC_IN) {
534 		if (size) {
535 			error = copyin(uap->data, data, (u_int)size);
536 			if (error) {
537 				if (memp)
538 					free(memp, M_IOCTLOPS);
539 				return (error);
540 			}
541 		} else
542 			*(caddr_t *)data = uap->data;
543 	} else if ((com&IOC_OUT) && size)
544 		/*
545 		 * Zero the buffer so the user always
546 		 * gets back something deterministic.
547 		 */
548 		bzero(data, size);
549 	else if (com&IOC_VOID)
550 		*(caddr_t *)data = uap->data;
551 
552 	switch (com) {
553 
554 	case FIONBIO:
555 		if ((tmp = *(int *)data))
556 			fp->f_flag |= FNONBLOCK;
557 		else
558 			fp->f_flag &= ~FNONBLOCK;
559 		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, p);
560 		break;
561 
562 	case FIOASYNC:
563 		if ((tmp = *(int *)data))
564 			fp->f_flag |= FASYNC;
565 		else
566 			fp->f_flag &= ~FASYNC;
567 		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, p);
568 		break;
569 
570 	default:
571 		error = fo_ioctl(fp, com, data, p);
572 		/*
573 		 * Copy any data to user, size was
574 		 * already set and checked above.
575 		 */
576 		if (error == 0 && (com&IOC_OUT) && size)
577 			error = copyout(data, uap->data, (u_int)size);
578 		break;
579 	}
580 	if (memp)
581 		free(memp, M_IOCTLOPS);
582 	return (error);
583 }
584 
585 static int	nselcoll;	/* Select collisions since boot */
586 int	selwait;
587 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
588 
589 /*
590  * Select system call.
591  */
592 #ifndef _SYS_SYSPROTO_H_
593 struct select_args {
594 	int	nd;
595 	fd_set	*in, *ou, *ex;
596 	struct	timeval *tv;
597 };
598 #endif
599 int
600 select(p, uap)
601 	register struct proc *p;
602 	register struct select_args *uap;
603 {
604 	/*
605 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
606 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
607 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
608 	 * of 256.
609 	 */
610 	fd_mask s_selbits[howmany(2048, NFDBITS)];
611 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
612 	struct timeval atv, rtv, ttv;
613 	int s, ncoll, error, timo;
614 	u_int nbufbytes, ncpbytes, nfdbits;
615 
616 	if (uap->nd < 0)
617 		return (EINVAL);
618 	if (uap->nd > p->p_fd->fd_nfiles)
619 		uap->nd = p->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
620 
621 	/*
622 	 * Allocate just enough bits for the non-null fd_sets.  Use the
623 	 * preallocated auto buffer if possible.
624 	 */
625 	nfdbits = roundup(uap->nd, NFDBITS);
626 	ncpbytes = nfdbits / NBBY;
627 	nbufbytes = 0;
628 	if (uap->in != NULL)
629 		nbufbytes += 2 * ncpbytes;
630 	if (uap->ou != NULL)
631 		nbufbytes += 2 * ncpbytes;
632 	if (uap->ex != NULL)
633 		nbufbytes += 2 * ncpbytes;
634 	if (nbufbytes <= sizeof s_selbits)
635 		selbits = &s_selbits[0];
636 	else
637 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
638 
639 	/*
640 	 * Assign pointers into the bit buffers and fetch the input bits.
641 	 * Put the output buffers together so that they can be bzeroed
642 	 * together.
643 	 */
644 	sbp = selbits;
645 #define	getbits(name, x) \
646 	do {								\
647 		if (uap->name == NULL)					\
648 			ibits[x] = NULL;				\
649 		else {							\
650 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
651 			obits[x] = sbp;					\
652 			sbp += ncpbytes / sizeof *sbp;			\
653 			error = copyin(uap->name, ibits[x], ncpbytes);	\
654 			if (error != 0)					\
655 				goto done;				\
656 		}							\
657 	} while (0)
658 	getbits(in, 0);
659 	getbits(ou, 1);
660 	getbits(ex, 2);
661 #undef	getbits
662 	if (nbufbytes != 0)
663 		bzero(selbits, nbufbytes / 2);
664 
665 	if (uap->tv) {
666 		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
667 			sizeof (atv));
668 		if (error)
669 			goto done;
670 		if (itimerfix(&atv)) {
671 			error = EINVAL;
672 			goto done;
673 		}
674 		getmicrouptime(&rtv);
675 		timevaladd(&atv, &rtv);
676 	} else
677 		atv.tv_sec = 0;
678 	timo = 0;
679 retry:
680 	ncoll = nselcoll;
681 	p->p_flag |= P_SELECT;
682 	error = selscan(p, ibits, obits, uap->nd);
683 	if (error || p->p_retval[0])
684 		goto done;
685 	if (atv.tv_sec) {
686 		getmicrouptime(&rtv);
687 		if (timevalcmp(&rtv, &atv, >=))
688 			goto done;
689 		ttv = atv;
690 		timevalsub(&ttv, &rtv);
691 		timo = ttv.tv_sec > 24 * 60 * 60 ?
692 		    24 * 60 * 60 * hz : tvtohz(&ttv);
693 	}
694 	s = splhigh();
695 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
696 		splx(s);
697 		goto retry;
698 	}
699 	p->p_flag &= ~P_SELECT;
700 
701 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
702 
703 	splx(s);
704 	if (error == 0)
705 		goto retry;
706 done:
707 	p->p_flag &= ~P_SELECT;
708 	/* select is not restarted after signals... */
709 	if (error == ERESTART)
710 		error = EINTR;
711 	if (error == EWOULDBLOCK)
712 		error = 0;
713 #define	putbits(name, x) \
714 	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
715 		error = error2;
716 	if (error == 0) {
717 		int error2;
718 
719 		putbits(in, 0);
720 		putbits(ou, 1);
721 		putbits(ex, 2);
722 #undef putbits
723 	}
724 	if (selbits != &s_selbits[0])
725 		free(selbits, M_SELECT);
726 	return (error);
727 }
728 
729 static int
730 selscan(p, ibits, obits, nfd)
731 	struct proc *p;
732 	fd_mask **ibits, **obits;
733 	int nfd;
734 {
735 	struct filedesc *fdp = p->p_fd;
736 	int msk, i, fd;
737 	fd_mask bits;
738 	struct file *fp;
739 	int n = 0;
740 	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
741 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
742 
743 	for (msk = 0; msk < 3; msk++) {
744 		if (ibits[msk] == NULL)
745 			continue;
746 		for (i = 0; i < nfd; i += NFDBITS) {
747 			bits = ibits[msk][i/NFDBITS];
748 			/* ffs(int mask) not portable, fd_mask is long */
749 			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
750 				if (!(bits & 1))
751 					continue;
752 				fp = fdp->fd_ofiles[fd];
753 				if (fp == NULL)
754 					return (EBADF);
755 				if (fo_poll(fp, flag[msk], fp->f_cred, p)) {
756 					obits[msk][(fd)/NFDBITS] |=
757 					    ((fd_mask)1 << ((fd) % NFDBITS));
758 					n++;
759 				}
760 			}
761 		}
762 	}
763 	p->p_retval[0] = n;
764 	return (0);
765 }
766 
767 /*
768  * Poll system call.
769  */
770 #ifndef _SYS_SYSPROTO_H_
771 struct poll_args {
772 	struct pollfd *fds;
773 	u_int	nfds;
774 	int	timeout;
775 };
776 #endif
777 int
778 poll(p, uap)
779 	register struct proc *p;
780 	register struct poll_args *uap;
781 {
782 	caddr_t bits;
783 	char smallbits[32 * sizeof(struct pollfd)];
784 	struct timeval atv, rtv, ttv;
785 	int s, ncoll, error = 0, timo;
786 	size_t ni;
787 
788 	if (SCARG(uap, nfds) > p->p_fd->fd_nfiles) {
789 		/* forgiving; slightly wrong */
790 		SCARG(uap, nfds) = p->p_fd->fd_nfiles;
791 	}
792 	ni = SCARG(uap, nfds) * sizeof(struct pollfd);
793 	if (ni > sizeof(smallbits))
794 		bits = malloc(ni, M_TEMP, M_WAITOK);
795 	else
796 		bits = smallbits;
797 	error = copyin(SCARG(uap, fds), bits, ni);
798 	if (error)
799 		goto done;
800 	if (SCARG(uap, timeout) != INFTIM) {
801 		atv.tv_sec = SCARG(uap, timeout) / 1000;
802 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
803 		if (itimerfix(&atv)) {
804 			error = EINVAL;
805 			goto done;
806 		}
807 		getmicrouptime(&rtv);
808 		timevaladd(&atv, &rtv);
809 	} else
810 		atv.tv_sec = 0;
811 	timo = 0;
812 retry:
813 	ncoll = nselcoll;
814 	p->p_flag |= P_SELECT;
815 	error = pollscan(p, (struct pollfd *)bits, SCARG(uap, nfds));
816 	if (error || p->p_retval[0])
817 		goto done;
818 	if (atv.tv_sec) {
819 		getmicrouptime(&rtv);
820 		if (timevalcmp(&rtv, &atv, >=))
821 			goto done;
822 		ttv = atv;
823 		timevalsub(&ttv, &rtv);
824 		timo = ttv.tv_sec > 24 * 60 * 60 ?
825 		    24 * 60 * 60 * hz : tvtohz(&ttv);
826 	}
827 	s = splhigh();
828 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
829 		splx(s);
830 		goto retry;
831 	}
832 	p->p_flag &= ~P_SELECT;
833 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
834 	splx(s);
835 	if (error == 0)
836 		goto retry;
837 done:
838 	p->p_flag &= ~P_SELECT;
839 	/* poll is not restarted after signals... */
840 	if (error == ERESTART)
841 		error = EINTR;
842 	if (error == EWOULDBLOCK)
843 		error = 0;
844 	if (error == 0) {
845 		error = copyout(bits, SCARG(uap, fds), ni);
846 		if (error)
847 			goto out;
848 	}
849 out:
850 	if (ni > sizeof(smallbits))
851 		free(bits, M_TEMP);
852 	return (error);
853 }
854 
855 static int
856 pollscan(p, fds, nfd)
857 	struct proc *p;
858 	struct pollfd *fds;
859 	int nfd;
860 {
861 	register struct filedesc *fdp = p->p_fd;
862 	int i;
863 	struct file *fp;
864 	int n = 0;
865 
866 	for (i = 0; i < nfd; i++, fds++) {
867 		if (fds->fd >= fdp->fd_nfiles) {
868 			fds->revents = POLLNVAL;
869 			n++;
870 		} else if (fds->fd < 0) {
871 			fds->revents = 0;
872 		} else {
873 			fp = fdp->fd_ofiles[fds->fd];
874 			if (fp == 0) {
875 				fds->revents = POLLNVAL;
876 				n++;
877 			} else {
878 				/*
879 				 * Note: backend also returns POLLHUP and
880 				 * POLLERR if appropriate.
881 				 */
882 				fds->revents = fo_poll(fp, fds->events,
883 				    fp->f_cred, p);
884 				if (fds->revents != 0)
885 					n++;
886 			}
887 		}
888 	}
889 	p->p_retval[0] = n;
890 	return (0);
891 }
892 
893 /*
894  * OpenBSD poll system call.
895  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
896  */
897 #ifndef _SYS_SYSPROTO_H_
898 struct openbsd_poll_args {
899 	struct pollfd *fds;
900 	u_int	nfds;
901 	int	timeout;
902 };
903 #endif
904 int
905 openbsd_poll(p, uap)
906 	register struct proc *p;
907 	register struct openbsd_poll_args *uap;
908 {
909 	return (poll(p, (struct poll_args *)uap));
910 }
911 
912 /*ARGSUSED*/
913 int
914 seltrue(dev, events, p)
915 	dev_t dev;
916 	int events;
917 	struct proc *p;
918 {
919 
920 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
921 }
922 
923 /*
924  * Record a select request.
925  */
926 void
927 selrecord(selector, sip)
928 	struct proc *selector;
929 	struct selinfo *sip;
930 {
931 	struct proc *p;
932 	pid_t mypid;
933 
934 	mypid = selector->p_pid;
935 	if (sip->si_pid == mypid)
936 		return;
937 	if (sip->si_pid && (p = pfind(sip->si_pid)) &&
938 	    p->p_wchan == (caddr_t)&selwait)
939 		sip->si_flags |= SI_COLL;
940 	else
941 		sip->si_pid = mypid;
942 }
943 
944 /*
945  * Do a wakeup when a selectable event occurs.
946  */
947 void
948 selwakeup(sip)
949 	register struct selinfo *sip;
950 {
951 	register struct proc *p;
952 	int s;
953 
954 	if (sip->si_pid == 0)
955 		return;
956 	if (sip->si_flags & SI_COLL) {
957 		nselcoll++;
958 		sip->si_flags &= ~SI_COLL;
959 		wakeup((caddr_t)&selwait);
960 	}
961 	p = pfind(sip->si_pid);
962 	sip->si_pid = 0;
963 	if (p != NULL) {
964 		s = splhigh();
965 		if (p->p_wchan == (caddr_t)&selwait) {
966 			if (p->p_stat == SSLEEP)
967 				setrunnable(p);
968 			else
969 				unsleep(p);
970 		} else if (p->p_flag & P_SELECT)
971 			p->p_flag &= ~P_SELECT;
972 		splx(s);
973 	}
974 }
975