xref: /freebsd/sys/kern/sys_generic.c (revision c0020399a650364d0134f79f3fa319f84064372d)
/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/sleepqueue.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <security/audit/audit.h>

#include <vm/uma.h>

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int	pollout(struct pollfd *, struct pollfd *, u_int);
static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	pollrescan(struct thread *);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	selrescan(struct thread *, fd_mask **, fd_mask **);
static void	selfdalloc(struct thread *, void *);
static void	selfdfree(struct seltd *, struct selfd *);
static int	dofileread(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static void	doselwakeup(struct selinfo *, int);
static void	seltdinit(struct thread *);
static int	seltdwait(struct thread *, int);
static void	seltdclear(struct thread *);
/*
 * One seltd is allocated per thread, on demand, the first time the
 * thread uses select() or poll().
 *
 *	t - protected by st_mtx
 *	k - Only accessed by curthread or read-only
 */
struct seltd {
	STAILQ_HEAD(, selfd)	st_selq;	/* (k) List of selfds. */
	struct selfd		*st_free1;	/* (k) free fd for read set. */
	struct selfd		*st_free2;	/* (k) free fd for write set. */
	struct mtx		st_mtx;		/* Protects struct seltd */
	struct cv		st_wait;	/* (t) Wait channel. */
	int			st_flags;	/* (t) SELTD_ flags. */
};

#define	SELTD_PENDING	0x0001			/* We have pending events. */
#define	SELTD_RESCAN	0x0002			/* Doing a rescan. */

/*
 * One selfd allocated per-thread per-file-descriptor.
 *	f - protected by sf_mtx
 */
struct selfd {
	STAILQ_ENTRY(selfd)	sf_link;	/* (k) fds owned by this td. */
	TAILQ_ENTRY(selfd)	sf_threads;	/* (f) fds on this selinfo. */
	struct selinfo		*sf_si;		/* (f) selinfo when linked. */
	struct mtx		*sf_mtx;	/* Pointer to selinfo mtx. */
	struct seltd		*sf_td;		/* (k) owning seltd. */
	void			*sf_cookie;	/* (k) fd or pollfd. */
};
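
/*
 * How the pieces fit together: each polling thread owns one seltd, and
 * for every descriptor it waits on it queues a selfd on both the seltd's
 * st_selq and the file's selinfo (si_tdlist).  When an event fires,
 * doselwakeup() unlinks the selfd from the selinfo, clears sf_si, and
 * wakes the thread; a NULL sf_si is later read by the rescan logic as
 * "this descriptor fired".
 */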

static uma_zone_t selfd_zone;

#ifndef _SYS_SYSPROTO_H_
struct read_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
};
#endif
int
read(struct thread *td, struct read_args *uap)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > INT_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_readv(td, uap->fd, &auio);
	return (error);
}

/*
 * Positioned read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
pread(struct thread *td, struct pread_args *uap)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > INT_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_preadv(td, uap->fd, &auio, uap->offset);
	return (error);
}

int
freebsd6_pread(struct thread *td, struct freebsd6_pread_args *uap)
{
	struct pread_args oargs;

	oargs.fd = uap->fd;
	oargs.buf = uap->buf;
	oargs.nbyte = uap->nbyte;
	oargs.offset = uap->offset;
	return (pread(td, &oargs));
}

/*
 * Scatter read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
};
#endif
int
readv(struct thread *td, struct readv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_readv(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

int
kern_readv(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	int error;

	error = fget_read(td, fd, &fp);
	if (error)
		return (error);
	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Scatter positioned read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct preadv_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
preadv(struct thread *td, struct preadv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_preadv(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

int
kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset)
{
	struct file *fp;
	int error;

	error = fget_read(td, fd, &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
		error = EINVAL;
	else
		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}

/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 */
static int
dofileread(struct thread *td, int fd, struct file *fp, struct uio *auio,
    off_t offset, int flags)
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	/* Finish zero-length reads right here. */
	if (auio->uio_resid == 0) {
		td->td_retval[0] = 0;
		return (0);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_offset = offset;
	auio->uio_td = td;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
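
/*
 * Note that a partially completed read that is interrupted (ERESTART,
 * EINTR, EWOULDBLOCK) is not reported as an error: the error is
 * suppressed above so the caller sees the short transfer count instead.
 */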

#ifndef _SYS_SYSPROTO_H_
struct write_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
};
#endif
int
write(struct thread *td, struct write_args *uap)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > INT_MAX)
		return (EINVAL);
	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_writev(td, uap->fd, &auio);
	return (error);
}

/*
 * Positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
pwrite(struct thread *td, struct pwrite_args *uap)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > INT_MAX)
		return (EINVAL);
	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_pwritev(td, uap->fd, &auio, uap->offset);
	return (error);
}

int
freebsd6_pwrite(struct thread *td, struct freebsd6_pwrite_args *uap)
{
	struct pwrite_args oargs;

	oargs.fd = uap->fd;
	oargs.buf = uap->buf;
	oargs.nbyte = uap->nbyte;
	oargs.offset = uap->offset;
	return (pwrite(td, &oargs));
}

/*
 * Gather write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
};
#endif
int
writev(struct thread *td, struct writev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_writev(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

int
kern_writev(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	int error;

	error = fget_write(td, fd, &fp);
	if (error)
		return (error);
	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Gather positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwritev_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
pwritev(struct thread *td, struct pwritev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_pwritev(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

int
kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset)
{
	struct file *fp;
	int error;

	error = fget_write(td, fd, &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
		error = EINVAL;
	else
		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}

/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 */
static int
dofilewrite(struct thread *td, int fd, struct file *fp, struct uio *auio,
    off_t offset, int flags)
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	auio->uio_rw = UIO_WRITE;
	auio->uio_td = td;
	auio->uio_offset = offset;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

/*
 * Truncate a file given a file descriptor.
 *
 * Can't use fget_write() here, since we must return EINVAL and not EBADF
 * if the descriptor isn't writable.
 */
int
kern_ftruncate(struct thread *td, int fd, off_t length)
{
	struct file *fp;
	int error;

	AUDIT_ARG(fd, fd);
	if (length < 0)
		return (EINVAL);
	error = fget(td, fd, &fp);
	if (error)
		return (error);
	AUDIT_ARG(file, td->td_proc, fp);
	if (!(fp->f_flag & FWRITE)) {
		fdrop(fp, td);
		return (EINVAL);
	}
	error = fo_truncate(fp, length, td->td_ucred, td);
	fdrop(fp, td);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct ftruncate_args {
	int	fd;
	int	pad;
	off_t	length;
};
#endif
int
ftruncate(struct thread *td, struct ftruncate_args *uap)
{

	return (kern_ftruncate(td, uap->fd, uap->length));
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct oftruncate_args {
	int	fd;
	long	length;
};
#endif
int
oftruncate(struct thread *td, struct oftruncate_args *uap)
{

	return (kern_ftruncate(td, uap->fd, uap->length));
}
#endif /* COMPAT_43 */

#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/* ARGSUSED */
int
ioctl(struct thread *td, struct ioctl_args *uap)
{
	u_long com;
	int arg, error;
	u_int size;
	caddr_t data;

	if (uap->com > 0xffffffff) {
		printf(
		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
		    td->td_proc->p_pid, td->td_name, uap->com);
		uap->com &= 0xffffffff;
	}
	com = uap->com;

	/*
	 * Interpret the high-order word to find the amount of data to be
	 * copied to/from the user's address space.
	 */
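	/*
	 * The size and direction are encoded in the command word by the
	 * _IO*() macros in <sys/ioccom.h>: for example, FIONREAD is
	 * defined as _IOR('f', 127, int), which sets IOC_OUT and stores
	 * sizeof(int) in the length field extracted by IOCPARM_LEN().
	 */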
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID  | IOC_IN | IOC_OUT)) == 0) ||
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
	    ((com & IOC_OUT) && size == 0) ||
#else
	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
#endif
	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
		return (ENOTTY);

	if (size > 0) {
		if (com & IOC_VOID) {
			/* Integer argument. */
			arg = (intptr_t)uap->data;
			data = (void *)&arg;
			size = 0;
		} else
			data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
	} else
		data = (void *)&uap->data;
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error) {
			if (size > 0)
				free(data, M_IOCTLOPS);
			return (error);
		}
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	}

	error = kern_ioctl(td, uap->fd, com, data);

	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

	if (size > 0)
		free(data, M_IOCTLOPS);
	return (error);
}

int
kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
{
	struct file *fp;
	struct filedesc *fdp;
	int error;
	int tmp;

	if ((error = fget(td, fd, &fp)) != 0)
		return (error);
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	switch (com) {
	case FIONCLEX:
		FILEDESC_XLOCK(fdp);
		fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
		FILEDESC_XUNLOCK(fdp);
		goto out;
	case FIOCLEX:
		FILEDESC_XLOCK(fdp);
		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
		FILEDESC_XUNLOCK(fdp);
		goto out;
	case FIONBIO:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clear_int(&fp->f_flag, FNONBLOCK);
		data = (void *)&tmp;
		break;
	case FIOASYNC:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FASYNC);
		else
			atomic_clear_int(&fp->f_flag, FASYNC);
		data = (void *)&tmp;
		break;
	}

	error = fo_ioctl(fp, com, data, td->td_ucred, td);
out:
	fdrop(fp, td);
	return (error);
}

int
poll_no_poll(int events)
{
	/*
	 * Return true for read/write.  If the user asked for something
	 * special, return POLLNVAL, so that clients have a way of
	 * determining reliably whether or not the extended
	 * functionality is present without hard-coding knowledge
	 * of specific filesystem implementations.
	 */
	if (events & ~POLLSTANDARD)
		return (POLLNVAL);

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
int
select(struct thread *td, struct select_args *uap)
{
	struct timeval tv, *tvp;
	int error;

	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
}

int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;

	FILEDESC_SLOCK(fdp);
	if (nd > fdp->fd_nfiles)
		nd = fdp->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_SUNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done;				\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
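	/*
	 * The buffer now holds all output masks in its first half and all
	 * input masks in its second half, so the single bzero() below
	 * clears every output set at once.
	 */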
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	seltdinit(td);
	/* Iterate until the timeout expires or descriptors become ready. */
	for (;;) {
		error = selscan(td, ibits, obits, nd);
		if (error || td->td_retval[0] != 0)
			break;
		if (atv.tv_sec || atv.tv_usec) {
			getmicrouptime(&rtv);
			if (timevalcmp(&rtv, &atv, >=))
				break;
			ttv = atv;
			timevalsub(&ttv, &rtv);
			timo = ttv.tv_sec > 24 * 60 * 60 ?
			    24 * 60 * 60 * hz : tvtohz(&ttv);
		}
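		/*
		 * A timo of 0 (no timeout) makes seltdwait() sleep until
		 * a wakeup or a signal arrives; longer waits are capped
		 * at 24 hours of ticks per loop iteration.
		 */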
		error = seltdwait(td, timo);
		if (error)
			break;
		error = selrescan(td, ibits, obits);
		if (error || td->td_retval[0] != 0)
			break;
	}
	seltdclear(td);

done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}

/*
 * Convert a select bit set to poll flags.
 *
 * The backend always returns POLLHUP/POLLERR if appropriate and we
 * return this as a set bit in any set.
 */
static int select_flags[3] = {
    POLLRDNORM | POLLHUP | POLLERR,
    POLLWRNORM | POLLHUP | POLLERR,
    POLLRDBAND | POLLHUP | POLLERR
};

/*
 * Compute the fo_poll flags required for a fd given by the index and
 * bit position in the fd_mask array.
 */
static __inline int
selflags(fd_mask **ibits, int idx, fd_mask bit)
{
	int flags;
	int msk;

	flags = 0;
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		if ((ibits[msk][idx] & bit) == 0)
			continue;
		flags |= select_flags[msk];
	}
	return (flags);
}
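
/*
 * A descriptor maps to its (idx, bit) pair as idx = fd / NFDBITS and
 * bit = (fd_mask)1 << (fd % NFDBITS); with a 32-bit fd_mask, fd 70
 * lands in word 2 at bit 6.
 */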

/*
 * Set the appropriate output bits given a mask of fired events and the
 * input bits originally requested.
 */
static __inline int
selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events)
{
	int msk;
	int n;

	n = 0;
	for (msk = 0; msk < 3; msk++) {
		if ((events & select_flags[msk]) == 0)
			continue;
		if (ibits[msk] == NULL)
			continue;
		if ((ibits[msk][idx] & bit) == 0)
			continue;
		/*
		 * XXX Check for a duplicate set.  This can occur because a
		 * socket calls selrecord() twice for each poll() call
		 * resulting in two selfds per real fd.  selrescan() will
		 * call selsetbits twice as a result.
		 */
		if ((obits[msk][idx] & bit) != 0)
			continue;
		obits[msk][idx] |= bit;
		n++;
	}

	return (n);
}

/*
 * Traverse the list of fds attached to this thread's seltd and check for
 * completion.
 */
static int
selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
{
	struct filedesc *fdp;
	struct selinfo *si;
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct file *fp;
	fd_mask bit;
	int fd, ev, n, idx;

	fdp = td->td_proc->p_fd;
	stp = td->td_sel;
	n = 0;
	FILEDESC_SLOCK(fdp);
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		fd = (int)(uintptr_t)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared the event didn't fire. */
		if (si != NULL)
			continue;
		if ((fp = fget_locked(fdp, fd)) == NULL) {
			FILEDESC_SUNLOCK(fdp);
			return (EBADF);
		}
		idx = fd / NFDBITS;
		bit = (fd_mask)1 << (fd % NFDBITS);
		ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td);
		if (ev != 0)
			n += selsetbits(ibits, obits, idx, bit, ev);
	}
	FILEDESC_SUNLOCK(fdp);
	stp->st_flags = 0;
	td->td_retval[0] = n;
	return (0);
}

/*
 * Perform the initial filedescriptor scan and register ourselves with
 * each selinfo.
 */
static int
selscan(struct thread *td, fd_mask **ibits, fd_mask **obits, int nfd)
{
	struct filedesc *fdp;
	struct file *fp;
	fd_mask bit;
	int ev, flags, end, fd;
	int n, idx;

	fdp = td->td_proc->p_fd;
	n = 0;
	FILEDESC_SLOCK(fdp);
	for (idx = 0, fd = 0; fd < nfd; idx++) {
		end = imin(fd + NFDBITS, nfd);
		for (bit = 1; fd < end; bit <<= 1, fd++) {
			/* Compute the list of events we're interested in. */
			flags = selflags(ibits, idx, bit);
			if (flags == 0)
				continue;
			if ((fp = fget_locked(fdp, fd)) == NULL) {
				FILEDESC_SUNLOCK(fdp);
				return (EBADF);
			}
			selfdalloc(td, (void *)(uintptr_t)fd);
			ev = fo_poll(fp, flags, td->td_ucred, td);
			if (ev != 0)
				n += selsetbits(ibits, obits, idx, bit, ev);
		}
	}

	FILEDESC_SUNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
int
poll(struct thread *td, struct poll_args *uap)
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int nfds;
	size_t ni;

	nfds = uap->nfds;
	if (nfds > maxfilesperproc && nfds > FD_SETSIZE)
		return (EINVAL);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done;
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	seltdinit(td);
	/* Iterate until the timeout expires or descriptors become ready. */
	for (;;) {
		error = pollscan(td, bits, nfds);
		if (error || td->td_retval[0] != 0)
			break;
		if (atv.tv_sec || atv.tv_usec) {
			getmicrouptime(&rtv);
			if (timevalcmp(&rtv, &atv, >=))
				break;
			ttv = atv;
			timevalsub(&ttv, &rtv);
			timo = ttv.tv_sec > 24 * 60 * 60 ?
			    24 * 60 * 60 * hz : tvtohz(&ttv);
		}
		error = seltdwait(td, timo);
		if (error)
			break;
		error = pollrescan(td);
		if (error || td->td_retval[0] != 0)
			break;
	}
	seltdclear(td);

done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = pollout(bits, uap->fds, nfds);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
	return (error);
}

static int
pollrescan(struct thread *td)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct selinfo *si;
	struct filedesc *fdp;
	struct file *fp;
	struct pollfd *fd;
	int n;

	n = 0;
	fdp = td->td_proc->p_fd;
	stp = td->td_sel;
	FILEDESC_SLOCK(fdp);
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		fd = (struct pollfd *)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared the event didn't fire. */
		if (si != NULL)
			continue;
		fp = fdp->fd_ofiles[fd->fd];
		if (fp == NULL) {
			fd->revents = POLLNVAL;
			n++;
			continue;
		}
		/*
		 * Note: backend also returns POLLHUP and
		 * POLLERR if appropriate.
		 */
		fd->revents = fo_poll(fp, fd->events, td->td_ucred, td);
		if (fd->revents != 0)
			n++;
	}
	FILEDESC_SUNLOCK(fdp);
	stp->st_flags = 0;
	td->td_retval[0] = n;
	return (0);
}

static int
pollout(struct pollfd *fds, struct pollfd *ufds, u_int nfd)
{
	int error = 0;
	u_int i = 0;

	for (i = 0; i < nfd; i++) {
		error = copyout(&fds->revents, &ufds->revents,
		    sizeof(ufds->revents));
		if (error)
			return (error);
		fds++;
		ufds++;
	}
	return (0);
}
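
/*
 * Note that pollout() copies back only the revents field of each entry,
 * one copyout per descriptor; the user's fd and events fields are left
 * untouched.
 */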

static int
pollscan(struct thread *td, struct pollfd *fds, u_int nfd)
{
	struct filedesc *fdp = td->td_proc->p_fd;
	u_int i;
	struct file *fp;
	int n = 0;

	FILEDESC_SLOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				selfdalloc(td, fds);
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_SUNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 *
 * XXX this isn't quite a true representation.  OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
int
openbsd_poll(struct thread *td, struct openbsd_poll_args *uap)
{
	return (poll(td, (struct poll_args *)uap));
}

/*
 * XXX This was created specifically to support netncp and netsmb.  This
 * allows the caller to specify a socket to wait for events on.  It returns
 * 0 if any events matched and an error otherwise.  There is no way to
 * determine which events fired.
 */
int
selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
{
	struct timeval atv, rtv, ttv;
	int error, timo;

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv))
			return (EINVAL);
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}

	timo = 0;
	seltdinit(td);
	/*
	 * Iterate until the timeout expires or the socket becomes ready.
	 */
	for (;;) {
		selfdalloc(td, NULL);
		error = sopoll(so, events, NULL, td);
		/* error here is actually the ready events. */
		if (error)
			return (0);
		if (atv.tv_sec || atv.tv_usec) {
			getmicrouptime(&rtv);
			if (timevalcmp(&rtv, &atv, >=)) {
				seltdclear(td);
				return (EWOULDBLOCK);
			}
			ttv = atv;
			timevalsub(&ttv, &rtv);
			timo = ttv.tv_sec > 24 * 60 * 60 ?
			    24 * 60 * 60 * hz : tvtohz(&ttv);
		}
		error = seltdwait(td, timo);
		seltdclear(td);
		if (error)
			break;
	}
	/* XXX Duplicates ncp/smb behavior. */
	if (error == ERESTART)
		error = 0;
	return (error);
}

/*
 * Preallocate two selfds associated with 'cookie'.  Some fo_poll routines
 * have two select sets, one for read and another for write.
 */
static void
selfdalloc(struct thread *td, void *cookie)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp->st_free1 == NULL)
		stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
	stp->st_free1->sf_td = stp;
	stp->st_free1->sf_cookie = cookie;
	if (stp->st_free2 == NULL)
		stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
	stp->st_free2->sf_td = stp;
	stp->st_free2->sf_cookie = cookie;
}

static void
selfdfree(struct seltd *stp, struct selfd *sfp)
{
	STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link);
	mtx_lock(sfp->sf_mtx);
	if (sfp->sf_si)
		TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads);
	mtx_unlock(sfp->sf_mtx);
	uma_zfree(selfd_zone, sfp);
}
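
/*
 * Note that selfdfree() takes sf_mtx before looking at sf_si:
 * doselwakeup() clears sf_si under the same lock, so the check-and-unlink
 * here cannot race with a concurrent wakeup.
 */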

/*
 * Record a select request.
 */
void
selrecord(struct thread *selector, struct selinfo *sip)
{
	struct selfd *sfp;
	struct seltd *stp;
	struct mtx *mtxp;

	stp = selector->td_sel;
	/*
	 * Don't record when doing a rescan.
	 */
	if (stp->st_flags & SELTD_RESCAN)
		return;
	/*
	 * Grab one of the preallocated descriptors.
	 */
	if ((sfp = stp->st_free1) != NULL)
		stp->st_free1 = NULL;
	else if ((sfp = stp->st_free2) != NULL)
		stp->st_free2 = NULL;
	else
		panic("selrecord: No free selfd on selq");
	mtxp = mtx_pool_find(mtxpool_sleep, sip);
	/*
	 * Initialize the sfp and queue it in the thread.
	 */
	sfp->sf_si = sip;
	sfp->sf_mtx = mtxp;
	STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link);
	/*
	 * Lock the sip and check for initialization.
	 */
	mtx_lock(mtxp);
	if (sip->si_mtx == NULL) {
		sip->si_mtx = mtxp;
		TAILQ_INIT(&sip->si_tdlist);
	}
	/*
	 * Add this thread to the list of selfds listening on this selinfo.
	 */
	TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads);
	mtx_unlock(sip->si_mtx);
}
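
/*
 * The selinfo's lock comes from a shared pool (mtx_pool_find() hashes on
 * the selinfo address), so no per-selinfo mutex ever has to be
 * initialized or destroyed by the object's owner.
 */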

/* Wake up a selecting thread. */
void
selwakeup(struct selinfo *sip)
{
	doselwakeup(sip, -1);
}

/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(struct selinfo *sip, int pri)
{
	doselwakeup(sip, pri);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(struct selinfo *sip, int pri)
{
	struct selfd *sfp;
	struct selfd *sfn;
	struct seltd *stp;

	/* If it's not initialized there can't be any waiters. */
	if (sip->si_mtx == NULL)
		return;
	/*
	 * Locking the selinfo locks all selfds associated with it.
	 */
	mtx_lock(sip->si_mtx);
	TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) {
		/*
		 * Once we remove this sfp from the list and clear sf_si,
		 * seltdclear() will know to ignore this si.
		 */
		TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads);
		sfp->sf_si = NULL;
		stp = sfp->sf_td;
		mtx_lock(&stp->st_mtx);
		stp->st_flags |= SELTD_PENDING;
		cv_broadcastpri(&stp->st_wait, pri);
		mtx_unlock(&stp->st_mtx);
	}
	mtx_unlock(sip->si_mtx);
}
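
/*
 * Clearing sf_si is the wakeup handshake: selrescan() and pollrescan()
 * treat a NULL sf_si as proof that this descriptor's event fired, and
 * re-poll only those descriptors.
 */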

static void
seltdinit(struct thread *td)
{
	struct seltd *stp;

	if ((stp = td->td_sel) != NULL)
		goto out;
	td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO);
	mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF);
	cv_init(&stp->st_wait, "select");
out:
	stp->st_flags = 0;
	STAILQ_INIT(&stp->st_selq);
}

static int
seltdwait(struct thread *td, int timo)
{
	struct seltd *stp;
	int error;

	stp = td->td_sel;
	/*
	 * An event of interest may occur while we do not hold the seltd
	 * locked so check the pending flag before we sleep.
	 */
	mtx_lock(&stp->st_mtx);
	/*
	 * Any further calls to selrecord will be a rescan.
	 */
	stp->st_flags |= SELTD_RESCAN;
	if (stp->st_flags & SELTD_PENDING) {
		mtx_unlock(&stp->st_mtx);
		return (0);
	}
	if (timo > 0)
		error = cv_timedwait_sig(&stp->st_wait, &stp->st_mtx, timo);
	else
		error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
	mtx_unlock(&stp->st_mtx);

	return (error);
}
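
/*
 * Once seltdwait() sets SELTD_RESCAN, selrecord() becomes a no-op, so
 * subsequent rescans re-poll descriptors without re-registering a
 * duplicate selfd on each selinfo.
 */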

void
seltdfini(struct thread *td)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp == NULL)
		return;
	if (stp->st_free1)
		uma_zfree(selfd_zone, stp->st_free1);
	if (stp->st_free2)
		uma_zfree(selfd_zone, stp->st_free2);
	td->td_sel = NULL;
	free(stp, M_SELECT);
}

/*
 * Remove the references to the thread from all of the objects we were
 * polling.
 */
static void
seltdclear(struct thread *td)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;

	stp = td->td_sel;
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn)
		selfdfree(stp, sfp);
	stp->st_flags = 0;
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL);
static void
selectinit(void *dummy __unused)
{
	selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
}