xref: /freebsd/sys/kern/sys_generic.c (revision 6b806d21d144c25f4fad714e1c0cf780f5e27d7e)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_ktrace.h"
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/sysproto.h>
45 #include <sys/filedesc.h>
46 #include <sys/filio.h>
47 #include <sys/fcntl.h>
48 #include <sys/file.h>
49 #include <sys/proc.h>
50 #include <sys/signalvar.h>
51 #include <sys/socketvar.h>
52 #include <sys/uio.h>
53 #include <sys/kernel.h>
54 #include <sys/limits.h>
55 #include <sys/malloc.h>
56 #include <sys/poll.h>
57 #include <sys/resourcevar.h>
58 #include <sys/selinfo.h>
59 #include <sys/sleepqueue.h>
60 #include <sys/syscallsubr.h>
61 #include <sys/sysctl.h>
62 #include <sys/sysent.h>
63 #include <sys/vnode.h>
64 #include <sys/bio.h>
65 #include <sys/buf.h>
66 #include <sys/condvar.h>
67 #ifdef KTRACE
68 #include <sys/ktrace.h>
69 #endif
70 #include <vm/vm.h>
71 #include <vm/vm_page.h>
72 
73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
75 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
76 
77 static int	pollscan(struct thread *, struct pollfd *, u_int);
78 static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
79 static int	dofileread(struct thread *, struct file *, int, void *,
80 		    size_t, off_t, int);
81 static int	dofilewrite(struct thread *, struct file *, int,
82 		    const void *, size_t, off_t, int);
83 static void	doselwakeup(struct selinfo *, int);
84 
85 /*
86  * Read system call.
87  */
88 #ifndef _SYS_SYSPROTO_H_
89 struct read_args {
90 	int	fd;
91 	void	*buf;
92 	size_t	nbyte;
93 };
94 #endif
95 /*
96  * MPSAFE
97  */
98 int
99 read(td, uap)
100 	struct thread *td;
101 	struct read_args *uap;
102 {
103 	struct file *fp;
104 	int error;
105 
106 	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
107 		error = dofileread(td, fp, uap->fd, uap->buf,
108 			    uap->nbyte, (off_t)-1, 0);
109 		fdrop(fp, td);
110 	}
111 	return(error);
112 }
113 
114 /*
115  * Pread system call
116  */
117 #ifndef _SYS_SYSPROTO_H_
118 struct pread_args {
119 	int	fd;
120 	void	*buf;
121 	size_t	nbyte;
122 	int	pad;
123 	off_t	offset;
124 };
125 #endif
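/*
 * Explanatory note (not part of the original header): the pad member
 * keeps offset starting on a 64-bit boundary for ABIs that pass system
 * call arguments in 32-bit slots.
 */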
126 /*
127  * MPSAFE
128  */
129 int
130 pread(td, uap)
131 	struct thread *td;
132 	struct pread_args *uap;
133 {
134 	struct file *fp;
135 	int error;
136 
137 	if ((error = fget_read(td, uap->fd, &fp)) != 0)
138 		return (error);
139 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
140 		error = ESPIPE;
141 	else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
142 		error = EINVAL;
143 	else {
144 		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
145 			    uap->offset, FOF_OFFSET);
146 	}
147 	fdrop(fp, td);
148 	return(error);
149 }
150 
151 /*
152  * Code common for read and pread
153  */
154 static int
155 dofileread(td, fp, fd, buf, nbyte, offset, flags)
156 	struct thread *td;
157 	struct file *fp;
158 	int fd, flags;
159 	void *buf;
160 	size_t nbyte;
161 	off_t offset;
162 {
163 	struct uio auio;
164 	struct iovec aiov;
165 	long cnt, error = 0;
166 #ifdef KTRACE
167 	struct uio *ktruio = NULL;
168 #endif
169 
170 	/* Finish zero length reads right here */
171 	if (nbyte == 0) {
172 		td->td_retval[0] = 0;
173 		return(0);
174 	}
175 	aiov.iov_base = buf;
176 	aiov.iov_len = nbyte;
177 	auio.uio_iov = &aiov;
178 	auio.uio_iovcnt = 1;
179 	auio.uio_offset = offset;
180 	if (nbyte > INT_MAX)
181 		return (EINVAL);
182 	auio.uio_resid = nbyte;
183 	auio.uio_rw = UIO_READ;
184 	auio.uio_segflg = UIO_USERSPACE;
185 	auio.uio_td = td;
186 #ifdef KTRACE
187 	if (KTRPOINT(td, KTR_GENIO))
188 		ktruio = cloneuio(&auio);
189 #endif
190 	cnt = nbyte;
191 
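	/*
	 * If the read is interrupted or would block after some data has
	 * already been transferred, suppress the error and report the
	 * partial count below instead.
	 */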
192 	if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
193 		if (auio.uio_resid != cnt && (error == ERESTART ||
194 		    error == EINTR || error == EWOULDBLOCK))
195 			error = 0;
196 	}
197 	cnt -= auio.uio_resid;
198 #ifdef KTRACE
199 	if (ktruio != NULL) {
200 		ktruio->uio_resid = cnt;
201 		ktrgenio(fd, UIO_READ, ktruio, error);
202 	}
203 #endif
204 	td->td_retval[0] = cnt;
205 	return (error);
206 }
207 
208 /*
209  * Scatter read system call.
210  */
211 #ifndef _SYS_SYSPROTO_H_
212 struct readv_args {
213 	int	fd;
214 	struct	iovec *iovp;
215 	u_int	iovcnt;
216 };
217 #endif
218 /*
219  * MPSAFE
220  */
221 int
222 readv(struct thread *td, struct readv_args *uap)
223 {
224 	struct file *fp;
225 	struct uio *auio = NULL;
226 	long cnt;
227 	int error;
228 #ifdef KTRACE
229 	struct uio *ktruio = NULL;
230 #endif
231 
232 	error = fget_read(td, uap->fd, &fp);
233 	if (error)
234 		return (error);
235 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
236 	if (error) {
237 		fdrop(fp, td);
238 		return (error);
239 	}
240 	/* Finish zero length reads right here */
241 	if (auio->uio_resid == 0) {
242 		td->td_retval[0] = 0;
243 		free(auio, M_IOV);
244 		fdrop(fp, td);
245 		return(0);
246 	}
247 	auio->uio_rw = UIO_READ;
248 	auio->uio_td = td;
249 #ifdef KTRACE
250 	if (KTRPOINT(td, KTR_GENIO))
251 		ktruio = cloneuio(auio);
252 #endif
253 	cnt = auio->uio_resid;
254 	if ((error = fo_read(fp, auio, td->td_ucred, 0, td))) {
255 		if (auio->uio_resid != cnt && (error == ERESTART ||
256 		    error == EINTR || error == EWOULDBLOCK))
257 			error = 0;
258 	}
259 	cnt -= auio->uio_resid;
260 #ifdef KTRACE
261 	if (ktruio != NULL) {
262 		ktruio->uio_resid = cnt;
263 		ktrgenio(uap->fd, UIO_READ, ktruio, error);
264 	}
265 #endif
266 	td->td_retval[0] = cnt;
267 	free(auio, M_IOV);
268 	fdrop(fp, td);
269 	return (error);
270 }
271 
272 /*
273  * Write system call
274  */
275 #ifndef _SYS_SYSPROTO_H_
276 struct write_args {
277 	int	fd;
278 	const void *buf;
279 	size_t	nbyte;
280 };
281 #endif
282 /*
283  * MPSAFE
284  */
285 int
286 write(td, uap)
287 	struct thread *td;
288 	struct write_args *uap;
289 {
290 	struct file *fp;
291 	int error;
292 
293 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
294 		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
295 			    (off_t)-1, 0);
296 		fdrop(fp, td);
297 	} else {
298 		error = EBADF;	/* XXX this can't be right */
299 	}
300 	return(error);
301 }
302 
303 /*
304  * Pwrite system call
305  */
306 #ifndef _SYS_SYSPROTO_H_
307 struct pwrite_args {
308 	int	fd;
309 	const void *buf;
310 	size_t	nbyte;
311 	int	pad;
312 	off_t	offset;
313 };
314 #endif
315 /*
316  * MPSAFE
317  */
318 int
319 pwrite(td, uap)
320 	struct thread *td;
321 	struct pwrite_args *uap;
322 {
323 	struct file *fp;
324 	int error;
325 
326 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
327 		if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
328 			error = ESPIPE;
329 		else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
330 			error = EINVAL;
331 		else {
332 			error = dofilewrite(td, fp, uap->fd, uap->buf,
333 				    uap->nbyte, uap->offset, FOF_OFFSET);
334 		}
335 		fdrop(fp, td);
336 	} else {
337 		error = EBADF;	/* this can't be right */
338 	}
339 	return(error);
340 }
341 
342 static int
343 dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
344 	struct thread *td;
345 	struct file *fp;
346 	int fd, flags;
347 	const void *buf;
348 	size_t nbyte;
349 	off_t offset;
350 {
351 	struct uio auio;
352 	struct iovec aiov;
353 	long cnt, error = 0;
354 #ifdef KTRACE
355 	struct uio *ktruio = NULL;
356 #endif
357 
358 	aiov.iov_base = (void *)(uintptr_t)buf;
359 	aiov.iov_len = nbyte;
360 	auio.uio_iov = &aiov;
361 	auio.uio_iovcnt = 1;
362 	auio.uio_offset = offset;
363 	if (nbyte > INT_MAX)
364 		return (EINVAL);
365 	auio.uio_resid = nbyte;
366 	auio.uio_rw = UIO_WRITE;
367 	auio.uio_segflg = UIO_USERSPACE;
368 	auio.uio_td = td;
369 #ifdef KTRACE
370 	if (KTRPOINT(td, KTR_GENIO))
371 		ktruio = cloneuio(&auio);
372 #endif
373 	cnt = nbyte;
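	/*
	 * For vnode-backed files, bwillwrite() may block here until the
	 * buffer cache has flushed enough dirty buffers to take on more.
	 */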
374 	if (fp->f_type == DTYPE_VNODE)
375 		bwillwrite();
376 	if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
377 		if (auio.uio_resid != cnt && (error == ERESTART ||
378 		    error == EINTR || error == EWOULDBLOCK))
379 			error = 0;
380 		/* Socket layer is responsible for issuing SIGPIPE. */
381 		if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
382 			PROC_LOCK(td->td_proc);
383 			psignal(td->td_proc, SIGPIPE);
384 			PROC_UNLOCK(td->td_proc);
385 		}
386 	}
387 	cnt -= auio.uio_resid;
388 #ifdef KTRACE
389 	if (ktruio != NULL) {
390 		ktruio->uio_resid = cnt;
391 		ktrgenio(fd, UIO_WRITE, ktruio, error);
392 	}
393 #endif
394 	td->td_retval[0] = cnt;
395 	return (error);
396 }
397 
398 /*
399  * Gather write system call
400  */
401 #ifndef _SYS_SYSPROTO_H_
402 struct writev_args {
403 	int	fd;
404 	struct	iovec *iovp;
405 	u_int	iovcnt;
406 };
407 #endif
408 /*
409  * MPSAFE
410  */
411 int
412 writev(struct thread *td, struct writev_args *uap)
413 {
414 	struct file *fp;
415 	struct uio *auio = NULL;
416 	long cnt;
417 	int error;
418 #ifdef KTRACE
419 	struct uio *ktruio = NULL;
420 #endif
421 
422 	error = fget_write(td, uap->fd, &fp);
423 	if (error)
424 		return (EBADF);
425 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
426 	if (error) {
427 		fdrop(fp, td);
428 		return (error);
429 	}
430 	auio->uio_rw = UIO_WRITE;
431 	auio->uio_td = td;
432 #ifdef KTRACE
433 	if (KTRPOINT(td, KTR_GENIO))
434 		ktruio = cloneuio(auio);
435 #endif
436 	cnt = auio->uio_resid;
437 	if (fp->f_type == DTYPE_VNODE)
438 		bwillwrite();
439 	if ((error = fo_write(fp, auio, td->td_ucred, 0, td))) {
440 		if (auio->uio_resid != cnt && (error == ERESTART ||
441 		    error == EINTR || error == EWOULDBLOCK))
442 			error = 0;
443 		if (error == EPIPE) {
444 			PROC_LOCK(td->td_proc);
445 			psignal(td->td_proc, SIGPIPE);
446 			PROC_UNLOCK(td->td_proc);
447 		}
448 	}
449 	cnt -= auio->uio_resid;
450 #ifdef KTRACE
451 	if (ktruio != NULL) {
452 		ktruio->uio_resid = cnt;
453 		ktrgenio(uap->fd, UIO_WRITE, ktruio, error);
454 	}
455 #endif
456 	td->td_retval[0] = cnt;
457 	fdrop(fp, td);
458 	free(auio, M_IOV);
459 	return (error);
460 }
461 
462 /*
463  * Ioctl system call
464  */
465 #ifndef _SYS_SYSPROTO_H_
466 struct ioctl_args {
467 	int	fd;
468 	u_long	com;
469 	caddr_t	data;
470 };
471 #endif
472 /*
473  * MPSAFE
474  */
475 /* ARGSUSED */
476 int
477 ioctl(struct thread *td, struct ioctl_args *uap)
478 {
479 	struct file *fp;
480 	struct filedesc *fdp;
481 	u_long com;
482 	int error = 0;
483 	u_int size;
484 	caddr_t data, memp;
485 	int tmp;
486 
487 	if (uap->com > 0xffffffff) {
488 		printf(
489 		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
490 		    td->td_proc->p_pid, td->td_proc->p_comm, uap->com);
491 		uap->com &= 0xffffffff;
492 	}
493 	if ((error = fget(td, uap->fd, &fp)) != 0)
494 		return (error);
495 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
496 		fdrop(fp, td);
497 		return (EBADF);
498 	}
499 	fdp = td->td_proc->p_fd;
500 	switch (com = uap->com) {
501 	case FIONCLEX:
502 		FILEDESC_LOCK_FAST(fdp);
503 		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
504 		FILEDESC_UNLOCK_FAST(fdp);
505 		fdrop(fp, td);
506 		return (0);
507 	case FIOCLEX:
508 		FILEDESC_LOCK_FAST(fdp);
509 		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
510 		FILEDESC_UNLOCK_FAST(fdp);
511 		fdrop(fp, td);
512 		return (0);
513 	}
514 
515 	/*
516 	 * Interpret high order word to find amount of data to be
517 	 * copied to/from the user's address space.
518 	 */
519 	size = IOCPARM_LEN(com);
520 	if ((size > IOCPARM_MAX) ||
521 	    ((com & (IOC_VOID  | IOC_IN | IOC_OUT)) == 0) ||
522 	    ((com & IOC_VOID) && size > 0) ||
523 	    ((com & (IOC_IN | IOC_OUT)) && size == 0)) {
524 		fdrop(fp, td);
525 		return (ENOTTY);
526 	}
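	/*
	 * Example of the encoding just checked: FIONBIO is defined with
	 * _IOW() on an int, so its command word carries IOC_IN and a
	 * parameter length of sizeof(int), which IOCPARM_LEN() recovers.
	 */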
527 
528 	if (size > 0) {
529 		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
530 		data = memp;
531 	} else {
532 		memp = NULL;
533 		data = (void *)&uap->data;
534 	}
535 	if (com & IOC_IN) {
536 		error = copyin(uap->data, data, (u_int)size);
537 		if (error) {
538 			free(memp, M_IOCTLOPS);
539 			fdrop(fp, td);
540 			return (error);
541 		}
542 	} else if (com & IOC_OUT) {
543 		/*
544 		 * Zero the buffer so the user always
545 		 * gets back something deterministic.
546 		 */
547 		bzero(data, size);
548 	}
549 
550 	if (com == FIONBIO) {
551 		FILE_LOCK(fp);
552 		if ((tmp = *(int *)data))
553 			fp->f_flag |= FNONBLOCK;
554 		else
555 			fp->f_flag &= ~FNONBLOCK;
556 		FILE_UNLOCK(fp);
557 		data = (void *)&tmp;
558 	} else if (com == FIOASYNC) {
559 		FILE_LOCK(fp);
560 		if ((tmp = *(int *)data))
561 			fp->f_flag |= FASYNC;
562 		else
563 			fp->f_flag &= ~FASYNC;
564 		FILE_UNLOCK(fp);
565 		data = (void *)&tmp;
566 	}
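	/*
	 * For FIONBIO and FIOASYNC the flag has already been updated on
	 * the file above; data is redirected to the local copy so the
	 * object's own ioctl method still sees the requested value.
	 */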
567 
568 	error = fo_ioctl(fp, com, data, td->td_ucred, td);
569 
570 	if (error == 0 && (com & IOC_OUT))
571 		error = copyout(data, uap->data, (u_int)size);
572 
573 	if (memp != NULL)
574 		free(memp, M_IOCTLOPS);
575 	fdrop(fp, td);
576 	return (error);
577 }
578 
579 /*
580  * sellock and selwait are initialized in selectinit() via SYSINIT.
581  */
582 struct mtx	sellock;
583 struct cv	selwait;
584 u_int		nselcoll;	/* Select collisions since boot */
585 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
586 
587 /*
588  * Select system call.
589  */
590 #ifndef _SYS_SYSPROTO_H_
591 struct select_args {
592 	int	nd;
593 	fd_set	*in, *ou, *ex;
594 	struct	timeval *tv;
595 };
596 #endif
597 /*
598  * MPSAFE
599  */
600 int
601 select(td, uap)
602 	register struct thread *td;
603 	register struct select_args *uap;
604 {
605 	struct timeval tv, *tvp;
606 	int error;
607 
608 	if (uap->tv != NULL) {
609 		error = copyin(uap->tv, &tv, sizeof(tv));
610 		if (error)
611 			return (error);
612 		tvp = &tv;
613 	} else
614 		tvp = NULL;
615 
616 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
617 }
618 
619 int
620 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
621     fd_set *fd_ex, struct timeval *tvp)
622 {
623 	struct filedesc *fdp;
624 	/*
625 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
626 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
627 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
628 	 * of 256.
629 	 */
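	/*
	 * Worked out: 2048 bits is 256 bytes of stack.  One in/out pair
	 * at the new FD_SETSIZE needs 2 * (1024 / NBBY) = 256 bytes, and
	 * all three pairs at the old FD_SETSIZE need 2 * 3 * (256 / NBBY)
	 * = 192 bytes.
	 */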
630 	fd_mask s_selbits[howmany(2048, NFDBITS)];
631 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
632 	struct timeval atv, rtv, ttv;
633 	int error, timo;
634 	u_int ncoll, nbufbytes, ncpbytes, nfdbits;
635 
636 	if (nd < 0)
637 		return (EINVAL);
638 	fdp = td->td_proc->p_fd;
639 
640 	FILEDESC_LOCK_FAST(fdp);
641 
642 	if (nd > td->td_proc->p_fd->fd_nfiles)
643 		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
644 	FILEDESC_UNLOCK_FAST(fdp);
645 
646 	/*
647 	 * Allocate just enough bits for the non-null fd_sets.  Use the
648 	 * preallocated auto buffer if possible.
649 	 */
650 	nfdbits = roundup(nd, NFDBITS);
651 	ncpbytes = nfdbits / NBBY;
652 	nbufbytes = 0;
653 	if (fd_in != NULL)
654 		nbufbytes += 2 * ncpbytes;
655 	if (fd_ou != NULL)
656 		nbufbytes += 2 * ncpbytes;
657 	if (fd_ex != NULL)
658 		nbufbytes += 2 * ncpbytes;
659 	if (nbufbytes <= sizeof s_selbits)
660 		selbits = &s_selbits[0];
661 	else
662 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
663 
664 	/*
665 	 * Assign pointers into the bit buffers and fetch the input bits.
666 	 * Put the output buffers together so that they can be bzeroed
667 	 * together.
668 	 */
669 	sbp = selbits;
670 #define	getbits(name, x) \
671 	do {								\
672 		if (name == NULL)					\
673 			ibits[x] = NULL;				\
674 		else {							\
675 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
676 			obits[x] = sbp;					\
677 			sbp += ncpbytes / sizeof *sbp;			\
678 			error = copyin(name, ibits[x], ncpbytes);	\
679 			if (error != 0)					\
680 				goto done_nosellock;			\
681 		}							\
682 	} while (0)
683 	getbits(fd_in, 0);
684 	getbits(fd_ou, 1);
685 	getbits(fd_ex, 2);
686 #undef	getbits
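	/*
	 * selbits is now split in half: obits[] point into the low half,
	 * which is cleared below and copied back out on return, while
	 * ibits[] point into the high half holding the user's input bits.
	 */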
687 	if (nbufbytes != 0)
688 		bzero(selbits, nbufbytes / 2);
689 
690 	if (tvp != NULL) {
691 		atv = *tvp;
692 		if (itimerfix(&atv)) {
693 			error = EINVAL;
694 			goto done_nosellock;
695 		}
696 		getmicrouptime(&rtv);
697 		timevaladd(&atv, &rtv);
698 	} else {
699 		atv.tv_sec = 0;
700 		atv.tv_usec = 0;
701 	}
702 	timo = 0;
703 	TAILQ_INIT(&td->td_selq);
704 	mtx_lock(&sellock);
705 retry:
706 	ncoll = nselcoll;
707 	mtx_lock_spin(&sched_lock);
708 	td->td_flags |= TDF_SELECT;
709 	mtx_unlock_spin(&sched_lock);
710 	mtx_unlock(&sellock);
711 
712 	error = selscan(td, ibits, obits, nd);
713 	mtx_lock(&sellock);
714 	if (error || td->td_retval[0])
715 		goto done;
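	/*
	 * A timeout was given: bail out if the deadline has already
	 * passed, otherwise convert the time remaining into ticks,
	 * clamped at 24 hours (presumably to keep the result within
	 * tvtohz()'s range).
	 */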
716 	if (atv.tv_sec || atv.tv_usec) {
717 		getmicrouptime(&rtv);
718 		if (timevalcmp(&rtv, &atv, >=))
719 			goto done;
720 		ttv = atv;
721 		timevalsub(&ttv, &rtv);
722 		timo = ttv.tv_sec > 24 * 60 * 60 ?
723 		    24 * 60 * 60 * hz : tvtohz(&ttv);
724 	}
725 
726 	/*
727 	 * An event of interest may occur while we do not hold
728 	 * sellock, so check TDF_SELECT and the number of
729 	 * collisions and rescan the file descriptors if
730 	 * necessary.
731 	 */
732 	mtx_lock_spin(&sched_lock);
733 	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
734 		mtx_unlock_spin(&sched_lock);
735 		goto retry;
736 	}
737 	mtx_unlock_spin(&sched_lock);
738 
739 	if (timo > 0)
740 		error = cv_timedwait_sig(&selwait, &sellock, timo);
741 	else
742 		error = cv_wait_sig(&selwait, &sellock);
743 
744 	if (error == 0)
745 		goto retry;
746 
747 done:
748 	clear_selinfo_list(td);
749 	mtx_lock_spin(&sched_lock);
750 	td->td_flags &= ~TDF_SELECT;
751 	mtx_unlock_spin(&sched_lock);
752 	mtx_unlock(&sellock);
753 
754 done_nosellock:
755 	/* select is not restarted after signals... */
756 	if (error == ERESTART)
757 		error = EINTR;
758 	if (error == EWOULDBLOCK)
759 		error = 0;
760 #define	putbits(name, x) \
761 	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
762 		error = error2;
763 	if (error == 0) {
764 		int error2;
765 
766 		putbits(fd_in, 0);
767 		putbits(fd_ou, 1);
768 		putbits(fd_ex, 2);
769 #undef putbits
770 	}
771 	if (selbits != &s_selbits[0])
772 		free(selbits, M_SELECT);
773 
774 	return (error);
775 }
776 
777 static int
778 selscan(td, ibits, obits, nfd)
779 	struct thread *td;
780 	fd_mask **ibits, **obits;
781 	int nfd;
782 {
783 	int msk, i, fd;
784 	fd_mask bits;
785 	struct file *fp;
786 	int n = 0;
787 	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
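	/* Indices 0, 1 and 2 correspond to the read, write and except sets. */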
788 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
789 	struct filedesc *fdp = td->td_proc->p_fd;
790 
791 	FILEDESC_LOCK(fdp);
792 	for (msk = 0; msk < 3; msk++) {
793 		if (ibits[msk] == NULL)
794 			continue;
795 		for (i = 0; i < nfd; i += NFDBITS) {
796 			bits = ibits[msk][i/NFDBITS];
797 			/* ffs(int mask) not portable, fd_mask is long */
798 			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
799 				if (!(bits & 1))
800 					continue;
801 				if ((fp = fget_locked(fdp, fd)) == NULL) {
802 					FILEDESC_UNLOCK(fdp);
803 					return (EBADF);
804 				}
805 				if (fo_poll(fp, flag[msk], td->td_ucred,
806 				    td)) {
807 					obits[msk][(fd)/NFDBITS] |=
808 					    ((fd_mask)1 << ((fd) % NFDBITS));
809 					n++;
810 				}
811 			}
812 		}
813 	}
814 	FILEDESC_UNLOCK(fdp);
815 	td->td_retval[0] = n;
816 	return (0);
817 }
818 
819 /*
820  * Poll system call.
821  */
822 #ifndef _SYS_SYSPROTO_H_
823 struct poll_args {
824 	struct pollfd *fds;
825 	u_int	nfds;
826 	int	timeout;
827 };
828 #endif
829 /*
830  * MPSAFE
831  */
832 int
833 poll(td, uap)
834 	struct thread *td;
835 	struct poll_args *uap;
836 {
837 	struct pollfd *bits;
838 	struct pollfd smallbits[32];
839 	struct timeval atv, rtv, ttv;
840 	int error = 0, timo;
841 	u_int ncoll, nfds;
842 	size_t ni;
843 
844 	nfds = uap->nfds;
845 
846 	/*
847 	 * This is kinda bogus.  We have fd limits, but that is not
848 	 * really related to the size of the pollfd array.  Make sure
849 	 * we let the process use at least FD_SETSIZE entries and at
850 	 * least enough for the current limits.  We want to be reasonably
851 	 * safe, but not overly restrictive.
852 	 */
853 	PROC_LOCK(td->td_proc);
854 	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
855 	    (nfds > FD_SETSIZE)) {
856 		PROC_UNLOCK(td->td_proc);
857 		error = EINVAL;
858 		goto done2;
859 	}
860 	PROC_UNLOCK(td->td_proc);
861 	ni = nfds * sizeof(struct pollfd);
862 	if (ni > sizeof(smallbits))
863 		bits = malloc(ni, M_TEMP, M_WAITOK);
864 	else
865 		bits = smallbits;
866 	error = copyin(uap->fds, bits, ni);
867 	if (error)
868 		goto done_nosellock;
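	/*
	 * The timeout is given in milliseconds, with INFTIM meaning wait
	 * forever; e.g. 1500 becomes 1 second and 500000 microseconds.
	 */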
869 	if (uap->timeout != INFTIM) {
870 		atv.tv_sec = uap->timeout / 1000;
871 		atv.tv_usec = (uap->timeout % 1000) * 1000;
872 		if (itimerfix(&atv)) {
873 			error = EINVAL;
874 			goto done_nosellock;
875 		}
876 		getmicrouptime(&rtv);
877 		timevaladd(&atv, &rtv);
878 	} else {
879 		atv.tv_sec = 0;
880 		atv.tv_usec = 0;
881 	}
882 	timo = 0;
883 	TAILQ_INIT(&td->td_selq);
884 	mtx_lock(&sellock);
885 retry:
886 	ncoll = nselcoll;
887 	mtx_lock_spin(&sched_lock);
888 	td->td_flags |= TDF_SELECT;
889 	mtx_unlock_spin(&sched_lock);
890 	mtx_unlock(&sellock);
891 
892 	error = pollscan(td, bits, nfds);
893 	mtx_lock(&sellock);
894 	if (error || td->td_retval[0])
895 		goto done;
896 	if (atv.tv_sec || atv.tv_usec) {
897 		getmicrouptime(&rtv);
898 		if (timevalcmp(&rtv, &atv, >=))
899 			goto done;
900 		ttv = atv;
901 		timevalsub(&ttv, &rtv);
902 		timo = ttv.tv_sec > 24 * 60 * 60 ?
903 		    24 * 60 * 60 * hz : tvtohz(&ttv);
904 	}
905 	/*
906 	 * An event of interest may occur while we do not hold
907 	 * sellock, so check TDF_SELECT and the number of collisions
908 	 * and rescan the file descriptors if necessary.
909 	 */
910 	mtx_lock_spin(&sched_lock);
911 	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
912 		mtx_unlock_spin(&sched_lock);
913 		goto retry;
914 	}
915 	mtx_unlock_spin(&sched_lock);
916 
917 	if (timo > 0)
918 		error = cv_timedwait_sig(&selwait, &sellock, timo);
919 	else
920 		error = cv_wait_sig(&selwait, &sellock);
921 
922 	if (error == 0)
923 		goto retry;
924 
925 done:
926 	clear_selinfo_list(td);
927 	mtx_lock_spin(&sched_lock);
928 	td->td_flags &= ~TDF_SELECT;
929 	mtx_unlock_spin(&sched_lock);
930 	mtx_unlock(&sellock);
931 
932 done_nosellock:
933 	/* poll is not restarted after signals... */
934 	if (error == ERESTART)
935 		error = EINTR;
936 	if (error == EWOULDBLOCK)
937 		error = 0;
938 	if (error == 0) {
939 		error = copyout(bits, uap->fds, ni);
940 		if (error)
941 			goto out;
942 	}
943 out:
944 	if (ni > sizeof(smallbits))
945 		free(bits, M_TEMP);
946 done2:
947 	return (error);
948 }
949 
950 static int
951 pollscan(td, fds, nfd)
952 	struct thread *td;
953 	struct pollfd *fds;
954 	u_int nfd;
955 {
956 	register struct filedesc *fdp = td->td_proc->p_fd;
957 	int i;
958 	struct file *fp;
959 	int n = 0;
960 
961 	FILEDESC_LOCK(fdp);
962 	for (i = 0; i < nfd; i++, fds++) {
963 		if (fds->fd >= fdp->fd_nfiles) {
964 			fds->revents = POLLNVAL;
965 			n++;
966 		} else if (fds->fd < 0) {
967 			fds->revents = 0;
968 		} else {
969 			fp = fdp->fd_ofiles[fds->fd];
970 			if (fp == NULL) {
971 				fds->revents = POLLNVAL;
972 				n++;
973 			} else {
974 				/*
975 				 * Note: backend also returns POLLHUP and
976 				 * POLLERR if appropriate.
977 				 */
978 				fds->revents = fo_poll(fp, fds->events,
979 				    td->td_ucred, td);
980 				if (fds->revents != 0)
981 					n++;
982 			}
983 		}
984 	}
985 	FILEDESC_UNLOCK(fdp);
986 	td->td_retval[0] = n;
987 	return (0);
988 }
989 
990 /*
991  * OpenBSD poll system call.
992  * XXX this isn't quite a true representation.  OpenBSD uses select ops.
993  */
994 #ifndef _SYS_SYSPROTO_H_
995 struct openbsd_poll_args {
996 	struct pollfd *fds;
997 	u_int	nfds;
998 	int	timeout;
999 };
1000 #endif
1001 /*
1002  * MPSAFE
1003  */
1004 int
1005 openbsd_poll(td, uap)
1006 	register struct thread *td;
1007 	register struct openbsd_poll_args *uap;
1008 {
1009 	return (poll(td, (struct poll_args *)uap));
1010 }
1011 
1012 /*
1013  * Remove the references to the thread from all of the objects
1014  * we were polling.
1015  *
1016  * This code assumes that the underlying owner of the selinfo
1017  * structure will hold sellock before it changes it, and that
1018  * it will unlink itself from our list if it goes away.
1019  */
1020 void
1021 clear_selinfo_list(td)
1022 	struct thread *td;
1023 {
1024 	struct selinfo *si;
1025 
1026 	mtx_assert(&sellock, MA_OWNED);
1027 	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1028 		si->si_thread = NULL;
1029 	TAILQ_INIT(&td->td_selq);
1030 }
1031 
1032 /*
1033  * Record a select request.
1034  */
1035 void
1036 selrecord(selector, sip)
1037 	struct thread *selector;
1038 	struct selinfo *sip;
1039 {
1040 
1041 	mtx_lock(&sellock);
1042 	/*
1043 	 * If the selinfo's thread pointer is NULL then take ownership of it.
1044 	 *
1045 	 * If the thread pointer is not NULL and it points to another
1046 	 * thread, then we have a collision.
1047 	 *
1048 	 * If the thread pointer is not NULL and points back to us then leave
1049 	 * it alone, as we've already pointed it at us and added it to
1050 	 * our list.
1051 	 */
1052 	if (sip->si_thread == NULL) {
1053 		sip->si_thread = selector;
1054 		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1055 	} else if (sip->si_thread != selector) {
1056 		sip->si_flags |= SI_COLL;
1057 	}
1058 
1059 	mtx_unlock(&sellock);
1060 }
1061 
1062 /* Wake up a selecting thread. */
1063 void
1064 selwakeup(sip)
1065 	struct selinfo *sip;
1066 {
1067 	doselwakeup(sip, -1);
1068 }
1069 
1070 /* Wake up a selecting thread, and set its priority. */
1071 void
1072 selwakeuppri(sip, pri)
1073 	struct selinfo *sip;
1074 	int pri;
1075 {
1076 	doselwakeup(sip, pri);
1077 }
1078 
1079 /*
1080  * Do a wakeup when a selectable event occurs.
1081  */
1082 static void
1083 doselwakeup(sip, pri)
1084 	struct selinfo *sip;
1085 	int pri;
1086 {
1087 	struct thread *td;
1088 
1089 	mtx_lock(&sellock);
1090 	td = sip->si_thread;
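	/*
	 * A collision means more than one thread selected on this object,
	 * so we cannot tell which one to wake; broadcast and let every
	 * selecting thread rescan its descriptors.
	 */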
1091 	if ((sip->si_flags & SI_COLL) != 0) {
1092 		nselcoll++;
1093 		sip->si_flags &= ~SI_COLL;
1094 		cv_broadcastpri(&selwait, pri);
1095 	}
1096 	if (td == NULL) {
1097 		mtx_unlock(&sellock);
1098 		return;
1099 	}
1100 	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
1101 	sip->si_thread = NULL;
1102 	mtx_lock_spin(&sched_lock);
1103 	td->td_flags &= ~TDF_SELECT;
1104 	mtx_unlock_spin(&sched_lock);
1105 	sleepq_remove(td, &selwait);
1106 	mtx_unlock(&sellock);
1107 }
1108 
1109 static void selectinit(void *);
1110 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1111 
1112 /* ARGSUSED*/
1113 static void
1114 selectinit(dummy)
1115 	void *dummy;
1116 {
1117 	cv_init(&selwait, "select");
1118 	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1119 }
1120