xref: /freebsd/sys/kern/sys_generic.c (revision 6af83ee0d2941d18880b6aaa2b4facd1d30c6106)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_ktrace.h"
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/sysproto.h>
45 #include <sys/filedesc.h>
46 #include <sys/filio.h>
47 #include <sys/fcntl.h>
48 #include <sys/file.h>
49 #include <sys/proc.h>
50 #include <sys/signalvar.h>
51 #include <sys/socketvar.h>
52 #include <sys/uio.h>
53 #include <sys/kernel.h>
54 #include <sys/limits.h>
55 #include <sys/malloc.h>
56 #include <sys/poll.h>
57 #include <sys/resourcevar.h>
58 #include <sys/selinfo.h>
59 #include <sys/sleepqueue.h>
60 #include <sys/syscallsubr.h>
61 #include <sys/sysctl.h>
62 #include <sys/sysent.h>
63 #include <sys/vnode.h>
64 #include <sys/bio.h>
65 #include <sys/buf.h>
66 #include <sys/condvar.h>
67 #ifdef KTRACE
68 #include <sys/ktrace.h>
69 #endif
70 #include <vm/vm.h>
71 #include <vm/vm_page.h>
72 
73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
75 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
76 
77 static int	pollscan(struct thread *, struct pollfd *, u_int);
78 static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
79 static int	dofileread(struct thread *, struct file *, int, void *,
80 		    size_t, off_t, int);
81 static int	dofilewrite(struct thread *, struct file *, int,
82 		    const void *, size_t, off_t, int);
83 static void	doselwakeup(struct selinfo *, int);
84 
85 /*
86  * Read system call.
87  */
88 #ifndef _SYS_SYSPROTO_H_
89 struct read_args {
90 	int	fd;
91 	void	*buf;
92 	size_t	nbyte;
93 };
94 #endif
95 /*
96  * MPSAFE
97  */
98 int
99 read(td, uap)
100 	struct thread *td;
101 	struct read_args *uap;
102 {
103 	struct file *fp;
104 	int error;
105 
106 	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
107 		error = dofileread(td, fp, uap->fd, uap->buf,
108 			    uap->nbyte, (off_t)-1, 0);
109 		fdrop(fp, td);
110 	}
111 	return(error);
112 }
113 
114 /*
115  * Pread system call
116  */
117 #ifndef _SYS_SYSPROTO_H_
118 struct pread_args {
119 	int	fd;
120 	void	*buf;
121 	size_t	nbyte;
122 	int	pad;
123 	off_t	offset;
124 };
125 #endif
126 /*
127  * MPSAFE
128  */
129 int
130 pread(td, uap)
131 	struct thread *td;
132 	struct pread_args *uap;
133 {
134 	struct file *fp;
135 	int error;
136 
137 	if ((error = fget_read(td, uap->fd, &fp)) != 0)
138 		return (error);
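	/*
	 * pread() requires a seekable descriptor (otherwise ESPIPE); a
	 * negative offset is accepted only for character devices.
	 */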
139 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
140 		error = ESPIPE;
141 	else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
142 		error = EINVAL;
143 	else {
144 		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
145 			    uap->offset, FOF_OFFSET);
146 	}
147 	fdrop(fp, td);
148 	return(error);
149 }
150 
151 /*
152  * Code common for read and pread
153  */
154 static int
155 dofileread(td, fp, fd, buf, nbyte, offset, flags)
156 	struct thread *td;
157 	struct file *fp;
158 	int fd, flags;
159 	void *buf;
160 	size_t nbyte;
161 	off_t offset;
162 {
163 	struct uio auio;
164 	struct iovec aiov;
165 	ssize_t cnt;
166 	long error = 0;
167 #ifdef KTRACE
168 	struct uio *ktruio = NULL;
169 #endif
170 
171 	/* Finish zero length reads right here */
172 	if (nbyte == 0) {
173 		td->td_retval[0] = 0;
174 		return(0);
175 	}
176 	aiov.iov_base = buf;
177 	aiov.iov_len = nbyte;
178 	auio.uio_iov = &aiov;
179 	auio.uio_iovcnt = 1;
180 	auio.uio_offset = offset;
181 	if (nbyte > INT_MAX)
182 		return (EINVAL);
183 	auio.uio_resid = nbyte;
184 	auio.uio_rw = UIO_READ;
185 	auio.uio_segflg = UIO_USERSPACE;
186 	auio.uio_td = td;
187 #ifdef KTRACE
188 	if (KTRPOINT(td, KTR_GENIO))
189 		ktruio = cloneuio(&auio);
190 #endif
191 	cnt = nbyte;
192 
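	/*
	 * If the read was interrupted or would have blocked after some
	 * data had already been transferred, report the partial count
	 * as success instead of returning the error.
	 */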
193 	if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
194 		if (auio.uio_resid != cnt && (error == ERESTART ||
195 		    error == EINTR || error == EWOULDBLOCK))
196 			error = 0;
197 	}
198 	cnt -= auio.uio_resid;
199 #ifdef KTRACE
200 	if (ktruio != NULL) {
201 		ktruio->uio_resid = cnt;
202 		ktrgenio(fd, UIO_READ, ktruio, error);
203 	}
204 #endif
205 	td->td_retval[0] = cnt;
206 	return (error);
207 }
208 
209 /*
210  * Scatter read system call.
211  */
212 #ifndef _SYS_SYSPROTO_H_
213 struct readv_args {
214 	int	fd;
215 	struct	iovec *iovp;
216 	u_int	iovcnt;
217 };
218 #endif
219 /*
220  * MPSAFE
221  */
222 int
223 readv(struct thread *td, struct readv_args *uap)
224 {
225 	struct file *fp;
226 	struct uio *auio = NULL;
227 	long cnt;
228 	int error;
229 #ifdef KTRACE
230 	struct uio *ktruio = NULL;
231 #endif
232 
233 	error = fget_read(td, uap->fd, &fp);
234 	if (error)
235 		return (error);
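	/*
	 * copyinuio() copies the iovec array in from user space and
	 * returns a malloc'ed uio which must be freed with M_IOV.
	 */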
236 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
237 	if (error) {
238 		fdrop(fp, td);
239 		return (error);
240 	}
241 	/* Finish zero length reads right here */
242 	if (auio->uio_resid == 0) {
243 		td->td_retval[0] = 0;
244 		free(auio, M_IOV);
245 		fdrop(fp, td);
246 		return(0);
247 	}
248 	auio->uio_rw = UIO_READ;
249 	auio->uio_td = td;
250 #ifdef KTRACE
251 	if (KTRPOINT(td, KTR_GENIO))
252 		ktruio = cloneuio(auio);
253 #endif
254 	cnt = auio->uio_resid;
255 	if ((error = fo_read(fp, auio, td->td_ucred, 0, td))) {
256 		if (auio->uio_resid != cnt && (error == ERESTART ||
257 		    error == EINTR || error == EWOULDBLOCK))
258 			error = 0;
259 	}
260 	cnt -= auio->uio_resid;
261 #ifdef KTRACE
262 	if (ktruio != NULL) {
263 		ktruio->uio_resid = cnt;
264 		ktrgenio(uap->fd, UIO_READ, ktruio, error);
265 	}
266 #endif
267 	td->td_retval[0] = cnt;
268 	free(auio, M_IOV);
269 	fdrop(fp, td);
270 	return (error);
271 }
272 
273 /*
274  * Write system call
275  */
276 #ifndef _SYS_SYSPROTO_H_
277 struct write_args {
278 	int	fd;
279 	const void *buf;
280 	size_t	nbyte;
281 };
282 #endif
283 /*
284  * MPSAFE
285  */
286 int
287 write(td, uap)
288 	struct thread *td;
289 	struct write_args *uap;
290 {
291 	struct file *fp;
292 	int error;
293 
294 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
295 		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
296 			    (off_t)-1, 0);
297 		fdrop(fp, td);
298 	} else {
299 		error = EBADF;	/* XXX this can't be right */
300 	}
301 	return(error);
302 }
303 
304 /*
305  * Pwrite system call
306  */
307 #ifndef _SYS_SYSPROTO_H_
308 struct pwrite_args {
309 	int	fd;
310 	const void *buf;
311 	size_t	nbyte;
312 	int	pad;
313 	off_t	offset;
314 };
315 #endif
316 /*
317  * MPSAFE
318  */
319 int
320 pwrite(td, uap)
321 	struct thread *td;
322 	struct pwrite_args *uap;
323 {
324 	struct file *fp;
325 	int error;
326 
327 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
328 		if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
329 			error = ESPIPE;
330 		else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
331 			error = EINVAL;
332 		else {
333 			error = dofilewrite(td, fp, uap->fd, uap->buf,
334 				    uap->nbyte, uap->offset, FOF_OFFSET);
335 		}
336 		fdrop(fp, td);
337 	} else {
338 		error = EBADF;	/* this can't be right */
339 	}
340 	return(error);
341 }
342 
343 static int
344 dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
345 	struct thread *td;
346 	struct file *fp;
347 	int fd, flags;
348 	const void *buf;
349 	size_t nbyte;
350 	off_t offset;
351 {
352 	struct uio auio;
353 	struct iovec aiov;
354 	ssize_t cnt;
355 	long error = 0;
356 #ifdef KTRACE
357 	struct uio *ktruio = NULL;
358 #endif
359 
360 	aiov.iov_base = (void *)(uintptr_t)buf;
361 	aiov.iov_len = nbyte;
362 	auio.uio_iov = &aiov;
363 	auio.uio_iovcnt = 1;
364 	auio.uio_offset = offset;
365 	if (nbyte > INT_MAX)
366 		return (EINVAL);
367 	auio.uio_resid = nbyte;
368 	auio.uio_rw = UIO_WRITE;
369 	auio.uio_segflg = UIO_USERSPACE;
370 	auio.uio_td = td;
371 #ifdef KTRACE
372 	if (KTRPOINT(td, KTR_GENIO))
373 		ktruio = cloneuio(&auio);
374 #endif
375 	cnt = nbyte;
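	/*
	 * For vnode-backed files, let the buffer cache throttle us
	 * before we queue up more dirty buffers.
	 */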
376 	if (fp->f_type == DTYPE_VNODE)
377 		bwillwrite();
378 	if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
379 		if (auio.uio_resid != cnt && (error == ERESTART ||
380 		    error == EINTR || error == EWOULDBLOCK))
381 			error = 0;
382 		/* Socket layer is responsible for issuing SIGPIPE. */
383 		if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
384 			PROC_LOCK(td->td_proc);
385 			psignal(td->td_proc, SIGPIPE);
386 			PROC_UNLOCK(td->td_proc);
387 		}
388 	}
389 	cnt -= auio.uio_resid;
390 #ifdef KTRACE
391 	if (ktruio != NULL) {
392 		ktruio->uio_resid = cnt;
393 		ktrgenio(fd, UIO_WRITE, ktruio, error);
394 	}
395 #endif
396 	td->td_retval[0] = cnt;
397 	return (error);
398 }
399 
400 /*
401  * Gather write system call
402  */
403 #ifndef _SYS_SYSPROTO_H_
404 struct writev_args {
405 	int	fd;
406 	struct	iovec *iovp;
407 	u_int	iovcnt;
408 };
409 #endif
410 /*
411  * MPSAFE
412  */
413 int
414 writev(struct thread *td, struct writev_args *uap)
415 {
416 	struct file *fp;
417 	struct uio *auio = NULL;
418 	long cnt;
419 	int error;
420 #ifdef KTRACE
421 	struct uio *ktruio = NULL;
422 #endif
423 
424 	error = fget_write(td, uap->fd, &fp);
425 	if (error)
426 		return (EBADF);
427 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
428 	if (error) {
429 		fdrop(fp, td);
430 		return (error);
431 	}
432 	auio->uio_rw = UIO_WRITE;
433 	auio->uio_td = td;
434 #ifdef KTRACE
435 	if (KTRPOINT(td, KTR_GENIO))
436 		ktruio = cloneuio(auio);
437 #endif
438 	cnt = auio->uio_resid;
439 	if (fp->f_type == DTYPE_VNODE)
440 		bwillwrite();
441 	if ((error = fo_write(fp, auio, td->td_ucred, 0, td))) {
442 		if (auio->uio_resid != cnt && (error == ERESTART ||
443 		    error == EINTR || error == EWOULDBLOCK))
444 			error = 0;
445 		if (error == EPIPE) {
446 			PROC_LOCK(td->td_proc);
447 			psignal(td->td_proc, SIGPIPE);
448 			PROC_UNLOCK(td->td_proc);
449 		}
450 	}
451 	cnt -= auio->uio_resid;
452 #ifdef KTRACE
453 	if (ktruio != NULL) {
454 		ktruio->uio_resid = cnt;
455 		ktrgenio(uap->fd, UIO_WRITE, ktruio, error);
456 	}
457 #endif
458 	td->td_retval[0] = cnt;
459 	fdrop(fp, td);
460 	free(auio, M_IOV);
461 	return (error);
462 }
463 
464 /*
465  * Ioctl system call
466  */
467 #ifndef _SYS_SYSPROTO_H_
468 struct ioctl_args {
469 	int	fd;
470 	u_long	com;
471 	caddr_t	data;
472 };
473 #endif
474 /*
475  * MPSAFE
476  */
477 /* ARGSUSED */
478 int
479 ioctl(struct thread *td, struct ioctl_args *uap)
480 {
481 	struct file *fp;
482 	struct filedesc *fdp;
483 	u_long com;
484 	int error = 0;
485 	u_int size;
486 	caddr_t data, memp;
487 	int tmp;
488 
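	/*
	 * 32-bit callers may have sign-extended the command word; mask
	 * it back down to 32 bits and warn about it.
	 */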
489 	if (uap->com > 0xffffffff) {
490 		printf(
491 		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
492 		    td->td_proc->p_pid, td->td_proc->p_comm, uap->com);
493 		uap->com &= 0xffffffff;
494 	}
495 	if ((error = fget(td, uap->fd, &fp)) != 0)
496 		return (error);
497 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
498 		fdrop(fp, td);
499 		return (EBADF);
500 	}
501 	fdp = td->td_proc->p_fd;
502 	switch (com = uap->com) {
503 	case FIONCLEX:
504 		FILEDESC_LOCK_FAST(fdp);
505 		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
506 		FILEDESC_UNLOCK_FAST(fdp);
507 		fdrop(fp, td);
508 		return (0);
509 	case FIOCLEX:
510 		FILEDESC_LOCK_FAST(fdp);
511 		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
512 		FILEDESC_UNLOCK_FAST(fdp);
513 		fdrop(fp, td);
514 		return (0);
515 	}
516 
517 	/*
518 	 * Interpret high order word to find amount of data to be
519 	 * copied to/from the user's address space.
520 	 */
521 	size = IOCPARM_LEN(com);
522 	if ((size > IOCPARM_MAX) ||
523 	    ((com & (IOC_VOID  | IOC_IN | IOC_OUT)) == 0) ||
524 	    ((com & IOC_VOID) && size > 0) ||
525 	    ((com & (IOC_IN | IOC_OUT)) && size == 0)) {
526 		fdrop(fp, td);
527 		return (ENOTTY);
528 	}
529 
530 	if (size > 0) {
531 		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
532 		data = memp;
533 	} else {
534 		memp = NULL;
535 		data = (void *)&uap->data;
536 	}
537 	if (com & IOC_IN) {
538 		error = copyin(uap->data, data, (u_int)size);
539 		if (error) {
540 			free(memp, M_IOCTLOPS);
541 			fdrop(fp, td);
542 			return (error);
543 		}
544 	} else if (com & IOC_OUT) {
545 		/*
546 		 * Zero the buffer so the user always
547 		 * gets back something deterministic.
548 		 */
549 		bzero(data, size);
550 	}
551 
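	/*
	 * FIONBIO and FIOASYNC also toggle the corresponding f_flag bits
	 * on the struct file; stash the argument in a local so the
	 * object's fo_ioctl below still sees the integer value.
	 */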
552 	if (com == FIONBIO) {
553 		FILE_LOCK(fp);
554 		if ((tmp = *(int *)data))
555 			fp->f_flag |= FNONBLOCK;
556 		else
557 			fp->f_flag &= ~FNONBLOCK;
558 		FILE_UNLOCK(fp);
559 		data = (void *)&tmp;
560 	} else if (com == FIOASYNC) {
561 		FILE_LOCK(fp);
562 		if ((tmp = *(int *)data))
563 			fp->f_flag |= FASYNC;
564 		else
565 			fp->f_flag &= ~FASYNC;
566 		FILE_UNLOCK(fp);
567 		data = (void *)&tmp;
568 	}
569 
570 	error = fo_ioctl(fp, com, data, td->td_ucred, td);
571 
572 	if (error == 0 && (com & IOC_OUT))
573 		error = copyout(data, uap->data, (u_int)size);
574 
575 	if (memp != NULL)
576 		free(memp, M_IOCTLOPS);
577 	fdrop(fp, td);
578 	return (error);
579 }
580 
581 /*
582  * sellock and selwait are initialized in selectinit() via SYSINIT.
583  */
584 struct mtx	sellock;
585 struct cv	selwait;
586 u_int		nselcoll;	/* Select collisions since boot */
587 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
588 
589 /*
590  * Select system call.
591  */
592 #ifndef _SYS_SYSPROTO_H_
593 struct select_args {
594 	int	nd;
595 	fd_set	*in, *ou, *ex;
596 	struct	timeval *tv;
597 };
598 #endif
599 /*
600  * MPSAFE
601  */
602 int
603 select(td, uap)
604 	register struct thread *td;
605 	register struct select_args *uap;
606 {
607 	struct timeval tv, *tvp;
608 	int error;
609 
610 	if (uap->tv != NULL) {
611 		error = copyin(uap->tv, &tv, sizeof(tv));
612 		if (error)
613 			return (error);
614 		tvp = &tv;
615 	} else
616 		tvp = NULL;
617 
618 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
619 }
620 
621 int
622 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
623     fd_set *fd_ex, struct timeval *tvp)
624 {
625 	struct filedesc *fdp;
626 	/*
627 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
628 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
629 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
630 	 * of 256.
631 	 */
632 	fd_mask s_selbits[howmany(2048, NFDBITS)];
633 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
634 	struct timeval atv, rtv, ttv;
635 	int error, timo;
636 	u_int ncoll, nbufbytes, ncpbytes, nfdbits;
637 
638 	if (nd < 0)
639 		return (EINVAL);
640 	fdp = td->td_proc->p_fd;
641 
642 	FILEDESC_LOCK_FAST(fdp);
643 
644 	if (nd > td->td_proc->p_fd->fd_nfiles)
645 		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
646 	FILEDESC_UNLOCK_FAST(fdp);
647 
648 	/*
649 	 * Allocate just enough bits for the non-null fd_sets.  Use the
650 	 * preallocated auto buffer if possible.
651 	 */
652 	nfdbits = roundup(nd, NFDBITS);
653 	ncpbytes = nfdbits / NBBY;
654 	nbufbytes = 0;
655 	if (fd_in != NULL)
656 		nbufbytes += 2 * ncpbytes;
657 	if (fd_ou != NULL)
658 		nbufbytes += 2 * ncpbytes;
659 	if (fd_ex != NULL)
660 		nbufbytes += 2 * ncpbytes;
661 	if (nbufbytes <= sizeof s_selbits)
662 		selbits = &s_selbits[0];
663 	else
664 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
665 
666 	/*
667 	 * Assign pointers into the bit buffers and fetch the input bits.
668 	 * Put the output buffers together so that they can be bzeroed
669 	 * together.
670 	 */
671 	sbp = selbits;
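	/*
	 * getbits() carves one input and one output word array per
	 * non-null set out of selbits: outputs are packed at the front
	 * so they can all be cleared with a single bzero() below, while
	 * inputs live in the second half and are filled by copyin().
	 */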
672 #define	getbits(name, x) \
673 	do {								\
674 		if (name == NULL)					\
675 			ibits[x] = NULL;				\
676 		else {							\
677 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
678 			obits[x] = sbp;					\
679 			sbp += ncpbytes / sizeof *sbp;			\
680 			error = copyin(name, ibits[x], ncpbytes);	\
681 			if (error != 0)					\
682 				goto done_nosellock;			\
683 		}							\
684 	} while (0)
685 	getbits(fd_in, 0);
686 	getbits(fd_ou, 1);
687 	getbits(fd_ex, 2);
688 #undef	getbits
689 	if (nbufbytes != 0)
690 		bzero(selbits, nbufbytes / 2);
691 
692 	if (tvp != NULL) {
693 		atv = *tvp;
694 		if (itimerfix(&atv)) {
695 			error = EINVAL;
696 			goto done_nosellock;
697 		}
698 		getmicrouptime(&rtv);
699 		timevaladd(&atv, &rtv);
700 	} else {
701 		atv.tv_sec = 0;
702 		atv.tv_usec = 0;
703 	}
704 	timo = 0;
705 	TAILQ_INIT(&td->td_selq);
706 	mtx_lock(&sellock);
707 retry:
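	/*
	 * Snapshot the collision count and advertise TDF_SELECT before
	 * dropping sellock; if either changes while we scan unlocked,
	 * the check below forces a rescan.
	 */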
708 	ncoll = nselcoll;
709 	mtx_lock_spin(&sched_lock);
710 	td->td_flags |= TDF_SELECT;
711 	mtx_unlock_spin(&sched_lock);
712 	mtx_unlock(&sellock);
713 
714 	error = selscan(td, ibits, obits, nd);
715 	mtx_lock(&sellock);
716 	if (error || td->td_retval[0])
717 		goto done;
718 	if (atv.tv_sec || atv.tv_usec) {
719 		getmicrouptime(&rtv);
720 		if (timevalcmp(&rtv, &atv, >=))
721 			goto done;
722 		ttv = atv;
723 		timevalsub(&ttv, &rtv);
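		/*
		 * Clamp very long timeouts to 24 hours' worth of ticks so
		 * the conversion to ticks stays in range.
		 */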
724 		timo = ttv.tv_sec > 24 * 60 * 60 ?
725 		    24 * 60 * 60 * hz : tvtohz(&ttv);
726 	}
727 
728 	/*
729 	 * An event of interest may occur while we do not hold
730 	 * sellock, so check TDF_SELECT and the number of
731 	 * collisions and rescan the file descriptors if
732 	 * necessary.
733 	 */
734 	mtx_lock_spin(&sched_lock);
735 	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
736 		mtx_unlock_spin(&sched_lock);
737 		goto retry;
738 	}
739 	mtx_unlock_spin(&sched_lock);
740 
741 	if (timo > 0)
742 		error = cv_timedwait_sig(&selwait, &sellock, timo);
743 	else
744 		error = cv_wait_sig(&selwait, &sellock);
745 
746 	if (error == 0)
747 		goto retry;
748 
749 done:
750 	clear_selinfo_list(td);
751 	mtx_lock_spin(&sched_lock);
752 	td->td_flags &= ~TDF_SELECT;
753 	mtx_unlock_spin(&sched_lock);
754 	mtx_unlock(&sellock);
755 
756 done_nosellock:
757 	/* select is not restarted after signals... */
758 	if (error == ERESTART)
759 		error = EINTR;
760 	if (error == EWOULDBLOCK)
761 		error = 0;
762 #define	putbits(name, x) \
763 	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
764 		error = error2;
765 	if (error == 0) {
766 		int error2;
767 
768 		putbits(fd_in, 0);
769 		putbits(fd_ou, 1);
770 		putbits(fd_ex, 2);
771 #undef putbits
772 	}
773 	if (selbits != &s_selbits[0])
774 		free(selbits, M_SELECT);
775 
776 	return (error);
777 }
778 
779 static int
780 selscan(td, ibits, obits, nfd)
781 	struct thread *td;
782 	fd_mask **ibits, **obits;
783 	int nfd;
784 {
785 	int msk, i, fd;
786 	fd_mask bits;
787 	struct file *fp;
788 	int n = 0;
789 	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
790 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
791 	struct filedesc *fdp = td->td_proc->p_fd;
792 
793 	FILEDESC_LOCK(fdp);
794 	for (msk = 0; msk < 3; msk++) {
795 		if (ibits[msk] == NULL)
796 			continue;
797 		for (i = 0; i < nfd; i += NFDBITS) {
798 			bits = ibits[msk][i/NFDBITS];
799 			/* ffs(int mask) not portable, fd_mask is long */
800 			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
801 				if (!(bits & 1))
802 					continue;
803 				if ((fp = fget_locked(fdp, fd)) == NULL) {
804 					FILEDESC_UNLOCK(fdp);
805 					return (EBADF);
806 				}
807 				if (fo_poll(fp, flag[msk], td->td_ucred,
808 				    td)) {
809 					obits[msk][(fd)/NFDBITS] |=
810 					    ((fd_mask)1 << ((fd) % NFDBITS));
811 					n++;
812 				}
813 			}
814 		}
815 	}
816 	FILEDESC_UNLOCK(fdp);
817 	td->td_retval[0] = n;
818 	return (0);
819 }
820 
821 /*
822  * Poll system call.
823  */
824 #ifndef _SYS_SYSPROTO_H_
825 struct poll_args {
826 	struct pollfd *fds;
827 	u_int	nfds;
828 	int	timeout;
829 };
830 #endif
831 /*
832  * MPSAFE
833  */
834 int
835 poll(td, uap)
836 	struct thread *td;
837 	struct poll_args *uap;
838 {
839 	struct pollfd *bits;
840 	struct pollfd smallbits[32];
841 	struct timeval atv, rtv, ttv;
842 	int error = 0, timo;
843 	u_int ncoll, nfds;
844 	size_t ni;
845 
846 	nfds = uap->nfds;
847 
848 	/*
849 	 * This is kinda bogus.  We have fd limits, but that is not
850 	 * really related to the size of the pollfd array.  Make sure
851 	 * we let the process use at least FD_SETSIZE entries and at
852 	 * least enough for the current limits.  We want to be reasonably
853 	 * safe, but not overly restrictive.
854 	 */
855 	PROC_LOCK(td->td_proc);
856 	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
857 	    (nfds > FD_SETSIZE)) {
858 		PROC_UNLOCK(td->td_proc);
859 		error = EINVAL;
860 		goto done2;
861 	}
862 	PROC_UNLOCK(td->td_proc);
863 	ni = nfds * sizeof(struct pollfd);
864 	if (ni > sizeof(smallbits))
865 		bits = malloc(ni, M_TEMP, M_WAITOK);
866 	else
867 		bits = smallbits;
868 	error = copyin(uap->fds, bits, ni);
869 	if (error)
870 		goto done_nosellock;
871 	if (uap->timeout != INFTIM) {
872 		atv.tv_sec = uap->timeout / 1000;
873 		atv.tv_usec = (uap->timeout % 1000) * 1000;
874 		if (itimerfix(&atv)) {
875 			error = EINVAL;
876 			goto done_nosellock;
877 		}
878 		getmicrouptime(&rtv);
879 		timevaladd(&atv, &rtv);
880 	} else {
881 		atv.tv_sec = 0;
882 		atv.tv_usec = 0;
883 	}
884 	timo = 0;
885 	TAILQ_INIT(&td->td_selq);
886 	mtx_lock(&sellock);
887 retry:
888 	ncoll = nselcoll;
889 	mtx_lock_spin(&sched_lock);
890 	td->td_flags |= TDF_SELECT;
891 	mtx_unlock_spin(&sched_lock);
892 	mtx_unlock(&sellock);
893 
894 	error = pollscan(td, bits, nfds);
895 	mtx_lock(&sellock);
896 	if (error || td->td_retval[0])
897 		goto done;
898 	if (atv.tv_sec || atv.tv_usec) {
899 		getmicrouptime(&rtv);
900 		if (timevalcmp(&rtv, &atv, >=))
901 			goto done;
902 		ttv = atv;
903 		timevalsub(&ttv, &rtv);
904 		timo = ttv.tv_sec > 24 * 60 * 60 ?
905 		    24 * 60 * 60 * hz : tvtohz(&ttv);
906 	}
907 	/*
908 	 * An event of interest may occur while we do not hold
909 	 * sellock, so check TDF_SELECT and the number of collisions
910 	 * and rescan the file descriptors if necessary.
911 	 */
912 	mtx_lock_spin(&sched_lock);
913 	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
914 		mtx_unlock_spin(&sched_lock);
915 		goto retry;
916 	}
917 	mtx_unlock_spin(&sched_lock);
918 
919 	if (timo > 0)
920 		error = cv_timedwait_sig(&selwait, &sellock, timo);
921 	else
922 		error = cv_wait_sig(&selwait, &sellock);
923 
924 	if (error == 0)
925 		goto retry;
926 
927 done:
928 	clear_selinfo_list(td);
929 	mtx_lock_spin(&sched_lock);
930 	td->td_flags &= ~TDF_SELECT;
931 	mtx_unlock_spin(&sched_lock);
932 	mtx_unlock(&sellock);
933 
934 done_nosellock:
935 	/* poll is not restarted after signals... */
936 	if (error == ERESTART)
937 		error = EINTR;
938 	if (error == EWOULDBLOCK)
939 		error = 0;
940 	if (error == 0) {
941 		error = copyout(bits, uap->fds, ni);
942 		if (error)
943 			goto out;
944 	}
945 out:
946 	if (ni > sizeof(smallbits))
947 		free(bits, M_TEMP);
948 done2:
949 	return (error);
950 }
951 
952 static int
953 pollscan(td, fds, nfd)
954 	struct thread *td;
955 	struct pollfd *fds;
956 	u_int nfd;
957 {
958 	register struct filedesc *fdp = td->td_proc->p_fd;
959 	int i;
960 	struct file *fp;
961 	int n = 0;
962 
963 	FILEDESC_LOCK(fdp);
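	/*
	 * Out-of-range and closed descriptors report POLLNVAL; negative
	 * descriptors are ignored and report no events.
	 */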
964 	for (i = 0; i < nfd; i++, fds++) {
965 		if (fds->fd >= fdp->fd_nfiles) {
966 			fds->revents = POLLNVAL;
967 			n++;
968 		} else if (fds->fd < 0) {
969 			fds->revents = 0;
970 		} else {
971 			fp = fdp->fd_ofiles[fds->fd];
972 			if (fp == NULL) {
973 				fds->revents = POLLNVAL;
974 				n++;
975 			} else {
976 				/*
977 				 * Note: backend also returns POLLHUP and
978 				 * POLLERR if appropriate.
979 				 */
980 				fds->revents = fo_poll(fp, fds->events,
981 				    td->td_ucred, td);
982 				if (fds->revents != 0)
983 					n++;
984 			}
985 		}
986 	}
987 	FILEDESC_UNLOCK(fdp);
988 	td->td_retval[0] = n;
989 	return (0);
990 }
991 
992 /*
993  * OpenBSD poll system call.
994  * XXX this isn't quite a true representation.  OpenBSD uses select ops.
995  */
996 #ifndef _SYS_SYSPROTO_H_
997 struct openbsd_poll_args {
998 	struct pollfd *fds;
999 	u_int	nfds;
1000 	int	timeout;
1001 };
1002 #endif
1003 /*
1004  * MPSAFE
1005  */
1006 int
1007 openbsd_poll(td, uap)
1008 	register struct thread *td;
1009 	register struct openbsd_poll_args *uap;
1010 {
1011 	return (poll(td, (struct poll_args *)uap));
1012 }
1013 
1014 /*
1015  * Remove the references to the thread from all of the objects
1016  * we were polling.
1017  *
1018  * This code assumes that the underlying owner of the selinfo
1019  * structure will hold sellock before it changes it, and that
1020  * it will unlink itself from our list if it goes away.
1021  */
1022 void
1023 clear_selinfo_list(td)
1024 	struct thread *td;
1025 {
1026 	struct selinfo *si;
1027 
1028 	mtx_assert(&sellock, MA_OWNED);
1029 	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1030 		si->si_thread = NULL;
1031 	TAILQ_INIT(&td->td_selq);
1032 }
1033 
1034 /*
1035  * Record a select request.
1036  */
1037 void
1038 selrecord(selector, sip)
1039 	struct thread *selector;
1040 	struct selinfo *sip;
1041 {
1042 
1043 	mtx_lock(&sellock);
1044 	/*
1045 	 * If the selinfo's thread pointer is NULL then take ownership of it.
1046 	 *
1047 	 * If the thread pointer is not NULL and it points to another
1048 	 * thread, then we have a collision.
1049 	 *
1050 	 * If the thread pointer is not NULL and points back to us then leave
1051 	 * it alone as we've already pointed it at us and added it to
1052 	 * our list.
1053 	 */
1054 	if (sip->si_thread == NULL) {
1055 		sip->si_thread = selector;
1056 		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1057 	} else if (sip->si_thread != selector) {
1058 		sip->si_flags |= SI_COLL;
1059 	}
1060 
1061 	mtx_unlock(&sellock);
1062 }
1063 
1064 /* Wake up a selecting thread. */
1065 void
1066 selwakeup(sip)
1067 	struct selinfo *sip;
1068 {
1069 	doselwakeup(sip, -1);
1070 }
1071 
1072 /* Wake up a selecting thread, and set its priority. */
1073 void
1074 selwakeuppri(sip, pri)
1075 	struct selinfo *sip;
1076 	int pri;
1077 {
1078 	doselwakeup(sip, pri);
1079 }
1080 
1081 /*
1082  * Do a wakeup when a selectable event occurs.
1083  */
1084 static void
1085 doselwakeup(sip, pri)
1086 	struct selinfo *sip;
1087 	int pri;
1088 {
1089 	struct thread *td;
1090 
1091 	mtx_lock(&sellock);
1092 	td = sip->si_thread;
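	/*
	 * A collision means more than one thread selected on this
	 * object, so wake all threads sleeping in select; otherwise only
	 * the single recorded thread is woken below.
	 */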
1093 	if ((sip->si_flags & SI_COLL) != 0) {
1094 		nselcoll++;
1095 		sip->si_flags &= ~SI_COLL;
1096 		cv_broadcastpri(&selwait, pri);
1097 	}
1098 	if (td == NULL) {
1099 		mtx_unlock(&sellock);
1100 		return;
1101 	}
1102 	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
1103 	sip->si_thread = NULL;
1104 	mtx_lock_spin(&sched_lock);
1105 	td->td_flags &= ~TDF_SELECT;
1106 	mtx_unlock_spin(&sched_lock);
1107 	sleepq_remove(td, &selwait);
1108 	mtx_unlock(&sellock);
1109 }
1110 
1111 static void selectinit(void *);
1112 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1113 
1114 /* ARGSUSED*/
1115 static void
1116 selectinit(dummy)
1117 	void *dummy;
1118 {
1119 	cv_init(&selwait, "select");
1120 	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1121 }
1122