xref: /freebsd/sys/kern/sys_generic.c (revision a0dd79dbdf917a8fbe2762d668f05a7c9f682b22)
/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_capsicum.h"
#include "opt_compat.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/capability.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/sleepqueue.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <security/audit/audit.h>

int iosize_max_clamp = 1;
SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW, &iosize_max_clamp, 0,
    "Clamp max i/o size to INT_MAX");
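
/*
 * Editor's sketch (not part of the original source): iosize_max_clamp is
 * exported above as a read/write sysctl, so it can be inspected or toggled
 * from userspace with the standard sysctl(3) interface.  A minimal,
 * hypothetical example (needs <sys/types.h>, <sys/sysctl.h>, <err.h>):
 *
 *	int on = 0;
 *
 *	if (sysctlbyname("debug.iosize_max_clamp", NULL, NULL,
 *	    &on, sizeof(on)) == -1)
 *		err(1, "sysctlbyname");
 */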

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int	pollout(struct thread *, struct pollfd *, struct pollfd *,
		    u_int);
static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	pollrescan(struct thread *);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	selrescan(struct thread *, fd_mask **, fd_mask **);
static void	selfdalloc(struct thread *, void *);
static void	selfdfree(struct seltd *, struct selfd *);
static int	dofileread(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static void	doselwakeup(struct selinfo *, int);
static void	seltdinit(struct thread *);
static int	seltdwait(struct thread *, int);
static void	seltdclear(struct thread *);

/*
 * One seltd per-thread allocated on demand as needed.
 *
 *	t - protected by st_mtx
 *	k - Only accessed by curthread or read-only
 */
struct seltd {
	STAILQ_HEAD(, selfd)	st_selq;	/* (k) List of selfds. */
	struct selfd		*st_free1;	/* (k) free fd for read set. */
	struct selfd		*st_free2;	/* (k) free fd for write set. */
	struct mtx		st_mtx;		/* Protects struct seltd */
	struct cv		st_wait;	/* (t) Wait channel. */
	int			st_flags;	/* (t) SELTD_ flags. */
};

#define	SELTD_PENDING	0x0001			/* We have pending events. */
#define	SELTD_RESCAN	0x0002			/* Doing a rescan. */

/*
 * One selfd allocated per-thread per-file-descriptor.
 *	f - protected by sf_mtx
 */
struct selfd {
	STAILQ_ENTRY(selfd)	sf_link;	/* (k) fds owned by this td. */
	TAILQ_ENTRY(selfd)	sf_threads;	/* (f) fds on this selinfo. */
	struct selinfo		*sf_si;		/* (f) selinfo when linked. */
	struct mtx		*sf_mtx;	/* Pointer to selinfo mtx. */
	struct seltd		*sf_td;		/* (k) owning seltd. */
	void			*sf_cookie;	/* (k) fd or pollfd. */
};

static uma_zone_t selfd_zone;
static struct mtx_pool *mtxpool_select;

#ifndef _SYS_SYSPROTO_H_
struct read_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
};
#endif
int
sys_read(td, uap)
	struct thread *td;
	struct read_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_readv(td, uap->fd, &auio);
	return(error);
}
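
/*
 * Editor's sketch (not part of the original source): userspace reaches
 * sys_read() through read(2).  The IOSIZE_MAX test above means an
 * oversized request fails up front with EINVAL; a plausible caller
 * (fd is assumed to be an open descriptor):
 *
 *	char buf[512];
 *	ssize_t n;
 *
 *	n = read(fd, buf, sizeof(buf));
 *	if (n == -1 && errno == EINVAL)
 *		warnx("request rejected before any I/O was attempted");
 */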

/*
 * Positioned read system call
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
sys_pread(td, uap)
	struct thread *td;
	struct pread_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_preadv(td, uap->fd, &auio, uap->offset);
	return(error);
}

int
freebsd6_pread(td, uap)
	struct thread *td;
	struct freebsd6_pread_args *uap;
{
	struct pread_args oargs;

	oargs.fd = uap->fd;
	oargs.buf = uap->buf;
	oargs.nbyte = uap->nbyte;
	oargs.offset = uap->offset;
	return (sys_pread(td, &oargs));
}

/*
 * Scatter read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
};
#endif
int
sys_readv(struct thread *td, struct readv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_readv(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}
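
/*
 * Editor's sketch (not part of the original source): sys_readv() pulls the
 * iovec array in with copyinuio() before handing off to kern_readv().  The
 * matching userspace call scatters one read across several buffers
 * (fd assumed open; needs <sys/uio.h>):
 *
 *	char hdr[16], body[4096];
 *	struct iovec iov[2] = {
 *		{ .iov_base = hdr,  .iov_len = sizeof(hdr)  },
 *		{ .iov_base = body, .iov_len = sizeof(body) },
 *	};
 *	ssize_t n = readv(fd, iov, 2);
 */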

int
kern_readv(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	int error;

	error = fget_read(td, fd, CAP_READ | CAP_SEEK, &fp);
	if (error)
		return (error);
	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Scatter positioned read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct preadv_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
sys_preadv(struct thread *td, struct preadv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_preadv(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

int
kern_preadv(td, fd, auio, offset)
	struct thread *td;
	int fd;
	struct uio *auio;
	off_t offset;
{
	struct file *fp;
	int error;

	error = fget_read(td, fd, CAP_READ, &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
		error = EINVAL;
	else
		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}

/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 */
static int
dofileread(td, fd, fp, auio, offset, flags)
	struct thread *td;
	int fd;
	struct file *fp;
	struct uio *auio;
	off_t offset;
	int flags;
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	/* Finish zero length reads right here */
	if (auio->uio_resid == 0) {
		td->td_retval[0] = 0;
		return(0);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_offset = offset;
	auio->uio_td = td;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
#if SSIZE_MAX > LONG_MAX
	td->td_retval[1] = cnt >> (sizeof(register_t) * CHAR_BIT);
	td->td_retval[0] = cnt;
#else
	td->td_retval[0] = cnt;
#endif
	return (error);
}
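
/*
 * Editor's sketch (not part of the original source): dofileread() above
 * suppresses ERESTART/EINTR/EWOULDBLOCK once some bytes have transferred,
 * so userspace sees a short count instead of an error.  A caller that
 * needs the full amount therefore loops on short reads:
 *
 *	size_t done = 0;
 *	ssize_t n;
 *
 *	while (done < len) {
 *		n = read(fd, buf + done, len - done);
 *		if (n == -1 && errno == EINTR)
 *			continue;
 *		if (n == -1 || n == 0)
 *			break;
 *		done += (size_t)n;
 *	}
 */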

#ifndef _SYS_SYSPROTO_H_
struct write_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
};
#endif
int
sys_write(td, uap)
	struct thread *td;
	struct write_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_writev(td, uap->fd, &auio);
	return(error);
}

/*
 * Positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
sys_pwrite(td, uap)
	struct thread *td;
	struct pwrite_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_pwritev(td, uap->fd, &auio, uap->offset);
	return(error);
}
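
/*
 * Editor's sketch (not part of the original source): because sys_pwrite()
 * passes FOF_OFFSET down through kern_pwritev(), the data goes to the
 * explicit offset and the descriptor's seek position is left alone:
 *
 *	if (pwrite(fd, "hello world!", 12, 100) != 12)
 *		err(1, "pwrite");
 *
 * This overwrites bytes 100-111 without the lseek(2)/write(2) dance, and
 * is safe against concurrent users of the same descriptor.
 */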

int
freebsd6_pwrite(td, uap)
	struct thread *td;
	struct freebsd6_pwrite_args *uap;
{
	struct pwrite_args oargs;

	oargs.fd = uap->fd;
	oargs.buf = uap->buf;
	oargs.nbyte = uap->nbyte;
	oargs.offset = uap->offset;
	return (sys_pwrite(td, &oargs));
}

/*
 * Gather write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
};
#endif
int
sys_writev(struct thread *td, struct writev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_writev(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

int
kern_writev(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	int error;

	error = fget_write(td, fd, CAP_WRITE | CAP_SEEK, &fp);
	if (error)
		return (error);
	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Gather positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwritev_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
sys_pwritev(struct thread *td, struct pwritev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_pwritev(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

int
kern_pwritev(td, fd, auio, offset)
	struct thread *td;
	struct uio *auio;
	int fd;
	off_t offset;
{
	struct file *fp;
	int error;

	error = fget_write(td, fd, CAP_WRITE, &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
		error = EINVAL;
	else
		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}

/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 */
static int
dofilewrite(td, fd, fp, auio, offset, flags)
	struct thread *td;
	int fd;
	struct file *fp;
	struct uio *auio;
	off_t offset;
	int flags;
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	auio->uio_rw = UIO_WRITE;
	auio->uio_td = td;
	auio->uio_offset = offset;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, ktruio, error);
	}
#endif
#if SSIZE_MAX > LONG_MAX
	td->td_retval[1] = cnt >> (sizeof(register_t) * CHAR_BIT);
	td->td_retval[0] = cnt;
#else
	td->td_retval[0] = cnt;
#endif
	return (error);
}

/*
 * Truncate a file given a file descriptor.
 *
 * Can't use fget_write() here, since must return EINVAL and not EBADF if the
 * descriptor isn't writable.
 */
int
kern_ftruncate(td, fd, length)
	struct thread *td;
	int fd;
	off_t length;
{
	struct file *fp;
	int error;

	AUDIT_ARG_FD(fd);
	if (length < 0)
		return (EINVAL);
	error = fget(td, fd, CAP_FTRUNCATE, &fp);
	if (error)
		return (error);
	AUDIT_ARG_FILE(td->td_proc, fp);
	if (!(fp->f_flag & FWRITE)) {
		fdrop(fp, td);
		return (EINVAL);
	}
	error = fo_truncate(fp, length, td->td_ucred, td);
	fdrop(fp, td);
	return (error);
}
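
/*
 * Editor's sketch (not part of the original source): the error contract
 * documented above is visible from userspace; ftruncate(2) on a valid but
 * read-only descriptor fails with EINVAL rather than EBADF ("/tmp/x" is a
 * placeholder path):
 *
 *	int fd = open("/tmp/x", O_RDONLY);
 *
 *	if (fd != -1 && ftruncate(fd, 0) == -1 && errno == EINVAL)
 *		warnx("descriptor is open but not writable");
 */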

#ifndef _SYS_SYSPROTO_H_
struct ftruncate_args {
	int	fd;
	int	pad;
	off_t	length;
};
#endif
int
sys_ftruncate(td, uap)
	struct thread *td;
	struct ftruncate_args *uap;
{

	return (kern_ftruncate(td, uap->fd, uap->length));
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct oftruncate_args {
	int	fd;
	long	length;
};
#endif
int
oftruncate(td, uap)
	struct thread *td;
	struct oftruncate_args *uap;
{

	return (kern_ftruncate(td, uap->fd, uap->length));
}
#endif /* COMPAT_43 */

#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/* ARGSUSED */
int
sys_ioctl(struct thread *td, struct ioctl_args *uap)
{
	u_long com;
	int arg, error;
	u_int size;
	caddr_t data;

	if (uap->com > 0xffffffff) {
		printf(
		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
		    td->td_proc->p_pid, td->td_name, uap->com);
		uap->com &= 0xffffffff;
	}
	com = uap->com;

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID  | IOC_IN | IOC_OUT)) == 0) ||
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
	    ((com & IOC_OUT) && size == 0) ||
#else
	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
#endif
	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
		return (ENOTTY);

	if (size > 0) {
		if (com & IOC_VOID) {
			/* Integer argument. */
			arg = (intptr_t)uap->data;
			data = (void *)&arg;
			size = 0;
		} else
			data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
	} else
		data = (void *)&uap->data;
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error) {
			if (size > 0)
				free(data, M_IOCTLOPS);
			return (error);
		}
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	}

	error = kern_ioctl(td, uap->fd, com, data);

	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

	if (size > 0)
		free(data, M_IOCTLOPS);
	return (error);
}
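
/*
 * Editor's sketch (not part of the original source): the size that
 * IOCPARM_LEN() extracts above is encoded into the command word by the
 * _IO* macros in <sys/ioccom.h>.  A hypothetical command that copies a
 * structure both in and out (MYIOC_GET and struct mydata are invented
 * names for illustration):
 *
 *	struct mydata {
 *		int	len;
 *		char	name[32];
 *	};
 *	#define	MYIOC_GET	_IOWR('M', 1, struct mydata)
 *
 *	struct mydata md;
 *	if (ioctl(fd, MYIOC_GET, &md) == -1)
 *		err(1, "ioctl");
 *
 * IOC_IN|IOC_OUT and sizeof(struct mydata) ride in the upper bits of the
 * command word, which is how sys_ioctl() sizes its copyin/copyout.
 */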

int
kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
{
	struct file *fp;
	struct filedesc *fdp;
	int error;
	int tmp;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_CMD(com);
	if ((error = fget(td, fd, CAP_IOCTL, &fp)) != 0)
		return (error);
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	switch (com) {
	case FIONCLEX:
		FILEDESC_XLOCK(fdp);
		fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
		FILEDESC_XUNLOCK(fdp);
		goto out;
	case FIOCLEX:
		FILEDESC_XLOCK(fdp);
		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
		FILEDESC_XUNLOCK(fdp);
		goto out;
	case FIONBIO:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clear_int(&fp->f_flag, FNONBLOCK);
		data = (void *)&tmp;
		break;
	case FIOASYNC:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FASYNC);
		else
			atomic_clear_int(&fp->f_flag, FASYNC);
		data = (void *)&tmp;
		break;
	}

	error = fo_ioctl(fp, com, data, td->td_ucred, td);
out:
	fdrop(fp, td);
	return (error);
}
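
/*
 * Editor's sketch (not part of the original source): FIONBIO and FIOASYNC
 * are handled generically above by toggling f_flag, so any descriptor type
 * can be switched to non-blocking mode the same way:
 *
 *	int on = 1;
 *
 *	if (ioctl(fd, FIONBIO, &on) == -1)
 *		err(1, "FIONBIO");
 *
 * fcntl(fd, F_SETFL, flags | O_NONBLOCK) reaches the same FNONBLOCK flag.
 */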

int
poll_no_poll(int events)
{
	/*
	 * Return true for read/write.  If the user asked for something
	 * special, return POLLNVAL, so that clients have a way of
	 * determining reliably whether or not the extended
	 * functionality is present without hard-coding knowledge
	 * of specific filesystem implementations.
	 */
	if (events & ~POLLSTANDARD)
		return (POLLNVAL);

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

int
sys_pselect(struct thread *td, struct pselect_args *uap)
{
	struct timespec ts;
	struct timeval tv, *tvp;
	sigset_t set, *uset;
	int error;

	if (uap->ts != NULL) {
		error = copyin(uap->ts, &ts, sizeof(ts));
		if (error != 0)
			return (error);
		TIMESPEC_TO_TIMEVAL(&tv, &ts);
		tvp = &tv;
	} else
		tvp = NULL;
	if (uap->sm != NULL) {
		error = copyin(uap->sm, &set, sizeof(set));
		if (error != 0)
			return (error);
		uset = &set;
	} else
		uset = NULL;
	return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
	    uset, NFDBITS));
}

int
kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    struct timeval *tvp, sigset_t *uset, int abi_nfdbits)
{
	int error;

	if (uset != NULL) {
		error = kern_sigprocmask(td, SIG_SETMASK, uset,
		    &td->td_oldsigmask, 0);
		if (error != 0)
			return (error);
		td->td_pflags |= TDP_OLDMASK;
		/*
		 * Make sure that ast() is called on return to
		 * usermode and TDP_OLDMASK is cleared, restoring old
		 * sigmask.
		 */
		thread_lock(td);
		td->td_flags |= TDF_ASTPENDING;
		thread_unlock(td);
	}
	error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
int
sys_select(struct thread *td, struct select_args *uap)
{
	struct timeval tv, *tvp;
	int error;

	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
	    NFDBITS));
}
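
/*
 * Editor's sketch (not part of the original source): the classic userspace
 * pattern serviced by kern_select(); wait up to two seconds for fd to
 * become readable (do_read() is a placeholder):
 *
 *	fd_set rset;
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 0 };
 *
 *	FD_ZERO(&rset);
 *	FD_SET(fd, &rset);
 *	switch (select(fd + 1, &rset, NULL, NULL, &tv)) {
 *	case -1:
 *		err(1, "select");
 *	case 0:
 *		warnx("timed out");
 *		break;
 *	default:
 *		if (FD_ISSET(fd, &rset))
 *			do_read(fd);
 *	}
 *
 * Note nd is the highest descriptor plus one, which is what the
 * fd_lastfile clamp in kern_select() relies on.
 */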

/*
 * In the unlikely case when the user specified nd greater than the last
 * open file descriptor, check that no bits are set after the last
 * valid fd.  We must return EBADF if any is set.
 *
 * There are applications that rely on this behaviour.
 *
 * nd is fd_lastfile + 1.
 */
static int
select_check_badfd(fd_set *fd_in, int nd, int ndu, int abi_nfdbits)
{
	char *addr, *oaddr;
	int b, i, res;
	uint8_t bits;

	if (nd >= ndu || fd_in == NULL)
		return (0);

	oaddr = NULL;
	bits = 0; /* silence gcc */
	for (i = nd; i < ndu; i++) {
		b = i / NBBY;
#if BYTE_ORDER == LITTLE_ENDIAN
		addr = (char *)fd_in + b;
#else
		addr = (char *)fd_in;
		if (abi_nfdbits == NFDBITS) {
			addr += rounddown(b, sizeof(fd_mask)) +
			    sizeof(fd_mask) - 1 - b % sizeof(fd_mask);
		} else {
			addr += rounddown(b, sizeof(uint32_t)) +
			    sizeof(uint32_t) - 1 - b % sizeof(uint32_t);
		}
#endif
		if (addr != oaddr) {
			res = fubyte(addr);
			if (res == -1)
				return (EFAULT);
			oaddr = addr;
			bits = res;
		}
		if ((bits & (1 << (i % NBBY))) != 0)
			return (EBADF);
	}
	return (0);
}

int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, lf, ndu, timo;
	u_int nbufbytes, ncpbytes, ncpubytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	ndu = nd;
	lf = fdp->fd_lastfile;
	if (nd > lf + 1)
		nd = lf + 1;

	error = select_check_badfd(fd_in, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);
	error = select_check_badfd(fd_ou, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);
	error = select_check_badfd(fd_ex, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	ncpubytes = roundup(nd, abi_nfdbits) / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL) {					\
			ibits[x] = NULL;				\
			obits[x] = NULL;				\
		} else {						\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpubytes);	\
			if (error != 0)					\
				goto done;				\
			bzero((char *)ibits[x] + ncpubytes,		\
			    ncpbytes - ncpubytes);			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits

#if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__)
	/*
	 * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS,
	 * we are running under 32-bit emulation. This should be more
	 * generic.
	 */
#define swizzle_fdset(bits)						\
	if (abi_nfdbits != NFDBITS && bits != NULL) {			\
		int i;							\
		for (i = 0; i < ncpbytes / sizeof *sbp; i++)		\
			bits[i] = (bits[i] >> 32) | (bits[i] << 32);	\
	}
#else
#define swizzle_fdset(bits)
#endif

	/* Make sure the bit order makes it through an ABI transition */
	swizzle_fdset(ibits[0]);
	swizzle_fdset(ibits[1]);
	swizzle_fdset(ibits[2]);

	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	seltdinit(td);
	/* Iterate until the timeout expires or descriptors become ready. */
	for (;;) {
		error = selscan(td, ibits, obits, nd);
		if (error || td->td_retval[0] != 0)
			break;
		if (atv.tv_sec || atv.tv_usec) {
			getmicrouptime(&rtv);
			if (timevalcmp(&rtv, &atv, >=))
				break;
			ttv = atv;
			timevalsub(&ttv, &rtv);
			timo = ttv.tv_sec > 24 * 60 * 60 ?
			    24 * 60 * 60 * hz : tvtohz(&ttv);
		}
		error = seltdwait(td, timo);
		if (error)
			break;
		error = selrescan(td, ibits, obits);
		if (error || td->td_retval[0] != 0)
			break;
	}
	seltdclear(td);

done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;

	/* swizzle bit order back, if necessary */
	swizzle_fdset(obits[0]);
	swizzle_fdset(obits[1]);
	swizzle_fdset(obits[2]);
#undef swizzle_fdset

#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpubytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}
/*
 * Convert a select bit set to poll flags.
 *
 * The backend always returns POLLHUP/POLLERR if appropriate and we
 * return this as a set bit in any set.
 */
static int select_flags[3] = {
    POLLRDNORM | POLLHUP | POLLERR,
    POLLWRNORM | POLLHUP | POLLERR,
    POLLRDBAND | POLLERR
};

/*
 * Compute the fo_poll flags required for a fd given by the index and
 * bit position in the fd_mask array.
 */
static __inline int
selflags(fd_mask **ibits, int idx, fd_mask bit)
{
	int flags;
	int msk;

	flags = 0;
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		if ((ibits[msk][idx] & bit) == 0)
			continue;
		flags |= select_flags[msk];
	}
	return (flags);
}

/*
 * Set the appropriate output bits given a mask of fired events and the
 * input bits originally requested.
 */
static __inline int
selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events)
{
	int msk;
	int n;

	n = 0;
	for (msk = 0; msk < 3; msk++) {
		if ((events & select_flags[msk]) == 0)
			continue;
		if (ibits[msk] == NULL)
			continue;
		if ((ibits[msk][idx] & bit) == 0)
			continue;
		/*
		 * XXX Check for a duplicate set.  This can occur because a
		 * socket calls selrecord() twice for each poll() call
		 * resulting in two selfds per real fd.  selrescan() will
		 * call selsetbits twice as a result.
		 */
		if ((obits[msk][idx] & bit) != 0)
			continue;
		obits[msk][idx] |= bit;
		n++;
	}

	return (n);
}

static __inline int
getselfd_cap(struct filedesc *fdp, int fd, struct file **fpp)
{
	struct file *fp;
#ifdef CAPABILITIES
	struct file *fp_fromcap;
	int error;
#endif

	if ((fp = fget_unlocked(fdp, fd)) == NULL)
		return (EBADF);
#ifdef CAPABILITIES
	/*
	 * If the file descriptor is for a capability, test rights and use
	 * the file descriptor referenced by the capability.
	 */
	error = cap_funwrap(fp, CAP_POLL_EVENT, &fp_fromcap);
	if (error) {
		fdrop(fp, curthread);
		return (error);
	}
	if (fp != fp_fromcap) {
		fhold(fp_fromcap);
		fdrop(fp, curthread);
		fp = fp_fromcap;
	}
#endif /* CAPABILITIES */
	*fpp = fp;
	return (0);
}

/*
 * Traverse the list of fds attached to this thread's seltd and check for
 * completion.
 */
static int
selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
{
	struct filedesc *fdp;
	struct selinfo *si;
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct file *fp;
	fd_mask bit;
	int fd, ev, n, idx;
	int error;

	fdp = td->td_proc->p_fd;
	stp = td->td_sel;
	n = 0;
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		fd = (int)(uintptr_t)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared the event didn't fire. */
		if (si != NULL)
			continue;
		error = getselfd_cap(fdp, fd, &fp);
		if (error)
			return (error);
		idx = fd / NFDBITS;
		bit = (fd_mask)1 << (fd % NFDBITS);
		ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td);
		fdrop(fp, td);
		if (ev != 0)
			n += selsetbits(ibits, obits, idx, bit, ev);
	}
	stp->st_flags = 0;
	td->td_retval[0] = n;
	return (0);
}

/*
 * Perform the initial filedescriptor scan and register ourselves with
 * each selinfo.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	struct filedesc *fdp;
	struct file *fp;
	fd_mask bit;
	int ev, flags, end, fd;
	int n, idx;
	int error;

	fdp = td->td_proc->p_fd;
	n = 0;
	for (idx = 0, fd = 0; fd < nfd; idx++) {
		end = imin(fd + NFDBITS, nfd);
		for (bit = 1; fd < end; bit <<= 1, fd++) {
			/* Compute the list of events we're interested in. */
			flags = selflags(ibits, idx, bit);
			if (flags == 0)
				continue;
			error = getselfd_cap(fdp, fd, &fp);
			if (error)
				return (error);
			selfdalloc(td, (void *)(uintptr_t)fd);
			ev = fo_poll(fp, flags, td->td_ucred, td);
			fdrop(fp, td);
			if (ev != 0)
				n += selsetbits(ibits, obits, idx, bit, ev);
		}
	}

	td->td_retval[0] = n;
	return (0);
}
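
/*
 * Editor's sketch (not part of the original source): the idx/bit pair used
 * by selscan() and selrescan() is simply the position of fd within the
 * fd_mask array, matching FD_SET()'s layout.  With NFDBITS == 64 (LP64),
 * fd 70 maps to:
 *
 *	idx = fd / NFDBITS;			(70 / 64 == 1)
 *	bit = (fd_mask)1 << (fd % NFDBITS);	(1 << 6)
 *
 * i.e. bit 6 of the second mask word.
 */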

#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
int
sys_poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int nfds;
	size_t ni;

	nfds = uap->nfds;
	if (nfds > maxfilesperproc && nfds > FD_SETSIZE)
		return (EINVAL);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done;
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	seltdinit(td);
	/* Iterate until the timeout expires or descriptors become ready. */
	for (;;) {
		error = pollscan(td, bits, nfds);
		if (error || td->td_retval[0] != 0)
			break;
		if (atv.tv_sec || atv.tv_usec) {
			getmicrouptime(&rtv);
			if (timevalcmp(&rtv, &atv, >=))
				break;
			ttv = atv;
			timevalsub(&ttv, &rtv);
			timo = ttv.tv_sec > 24 * 60 * 60 ?
			    24 * 60 * 60 * hz : tvtohz(&ttv);
		}
		error = seltdwait(td, timo);
		if (error)
			break;
		error = pollrescan(td);
		if (error || td->td_retval[0] != 0)
			break;
	}
	seltdclear(td);

done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = pollout(td, bits, uap->fds, nfds);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
	return (error);
}
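
/*
 * Editor's sketch (not part of the original source): the userspace half of
 * the loop above; wait up to one second for input on two descriptors
 * (fd0, fd1 and handle() are placeholders):
 *
 *	struct pollfd pfd[2];
 *
 *	pfd[0].fd = fd0;	pfd[0].events = POLLIN;
 *	pfd[1].fd = fd1;	pfd[1].events = POLLIN;
 *	switch (poll(pfd, 2, 1000)) {
 *	case -1:
 *		err(1, "poll");
 *	case 0:
 *		warnx("timed out");
 *		break;
 *	default:
 *		if (pfd[0].revents & POLLIN)
 *			handle(fd0);
 *		if (pfd[1].revents & POLLIN)
 *			handle(fd1);
 *	}
 */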

static int
pollrescan(struct thread *td)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct selinfo *si;
	struct filedesc *fdp;
	struct file *fp;
	struct pollfd *fd;
	int n;

	n = 0;
	fdp = td->td_proc->p_fd;
	stp = td->td_sel;
	FILEDESC_SLOCK(fdp);
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		fd = (struct pollfd *)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared the event didn't fire. */
		if (si != NULL)
			continue;
		fp = fdp->fd_ofiles[fd->fd];
#ifdef CAPABILITIES
		if ((fp == NULL)
		    || (cap_funwrap(fp, CAP_POLL_EVENT, &fp) != 0)) {
#else
		if (fp == NULL) {
#endif
			fd->revents = POLLNVAL;
			n++;
			continue;
		}

		/*
		 * Note: backend also returns POLLHUP and
		 * POLLERR if appropriate.
		 */
		fd->revents = fo_poll(fp, fd->events, td->td_ucred, td);
		if (fd->revents != 0)
			n++;
	}
	FILEDESC_SUNLOCK(fdp);
	stp->st_flags = 0;
	td->td_retval[0] = n;
	return (0);
}


static int
pollout(td, fds, ufds, nfd)
	struct thread *td;
	struct pollfd *fds;
	struct pollfd *ufds;
	u_int nfd;
{
	int error = 0;
	u_int i = 0;
	u_int n = 0;

	for (i = 0; i < nfd; i++) {
		error = copyout(&fds->revents, &ufds->revents,
		    sizeof(ufds->revents));
		if (error)
			return (error);
		if (fds->revents != 0)
			n++;
		fds++;
		ufds++;
	}
	td->td_retval[0] = n;
	return (0);
}

static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_SLOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
#ifdef CAPABILITIES
			if ((fp == NULL)
			    || (cap_funwrap(fp, CAP_POLL_EVENT, &fp) != 0)) {
#else
			if (fp == NULL) {
#endif
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				selfdalloc(td, fds);
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				/*
				 * POSIX requires that POLLOUT never be
				 * set simultaneously with POLLHUP.
				 */
				if ((fds->revents & POLLHUP) != 0)
					fds->revents &= ~POLLOUT;

				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_SUNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 *
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
int
sys_openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	return (sys_poll(td, (struct poll_args *)uap));
}

/*
 * XXX This was created specifically to support netncp and netsmb.  This
 * allows the caller to specify a socket to wait for events on.  It returns
 * 0 if any events matched and an error otherwise.  There is no way to
 * determine which events fired.
 */
int
selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
{
	struct timeval atv, rtv, ttv;
	int error, timo;

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv))
			return (EINVAL);
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}

	timo = 0;
	seltdinit(td);
	/*
	 * Iterate until the timeout expires or the socket becomes ready.
	 */
	for (;;) {
		selfdalloc(td, NULL);
		error = sopoll(so, events, NULL, td);
		/* error here is actually the ready events. */
		if (error)
			return (0);
		if (atv.tv_sec || atv.tv_usec) {
			getmicrouptime(&rtv);
			if (timevalcmp(&rtv, &atv, >=)) {
				seltdclear(td);
				return (EWOULDBLOCK);
			}
			ttv = atv;
			timevalsub(&ttv, &rtv);
			timo = ttv.tv_sec > 24 * 60 * 60 ?
			    24 * 60 * 60 * hz : tvtohz(&ttv);
		}
		error = seltdwait(td, timo);
		seltdclear(td);
		if (error)
			break;
	}
	/* XXX Duplicates ncp/smb behavior. */
	if (error == ERESTART)
		error = 0;
	return (error);
}

/*
 * Preallocate two selfds associated with 'cookie'.  Some fo_poll routines
 * have two select sets, one for read and another for write.
 */
static void
selfdalloc(struct thread *td, void *cookie)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp->st_free1 == NULL)
		stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
	stp->st_free1->sf_td = stp;
	stp->st_free1->sf_cookie = cookie;
	if (stp->st_free2 == NULL)
		stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
	stp->st_free2->sf_td = stp;
	stp->st_free2->sf_cookie = cookie;
}

static void
selfdfree(struct seltd *stp, struct selfd *sfp)
{
	STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link);
	mtx_lock(sfp->sf_mtx);
	if (sfp->sf_si)
		TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads);
	mtx_unlock(sfp->sf_mtx);
	uma_zfree(selfd_zone, sfp);
}

/* Drain the waiters tied to all the selfd belonging to the specified selinfo. */
void
seldrain(sip)
        struct selinfo *sip;
{

	/*
	 * This functionality is already provided by doselwakeup(), so
	 * calling it here is sufficient.  Ultimately the caller should
	 * take care to avoid races between a thread calling
	 * select()/poll() and the file descriptor being detached, but,
	 * again, those races are just the same as for selwakeup().
	 */
        doselwakeup(sip, -1);
}

/*
 * Record a select request.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{
	struct selfd *sfp;
	struct seltd *stp;
	struct mtx *mtxp;

	stp = selector->td_sel;
	/*
	 * Don't record when doing a rescan.
	 */
	if (stp->st_flags & SELTD_RESCAN)
		return;
	/*
	 * Grab one of the preallocated descriptors.
	 */
	sfp = NULL;
	if ((sfp = stp->st_free1) != NULL)
		stp->st_free1 = NULL;
	else if ((sfp = stp->st_free2) != NULL)
		stp->st_free2 = NULL;
	else
		panic("selrecord: No free selfd on selq");
	mtxp = sip->si_mtx;
	if (mtxp == NULL)
		mtxp = mtx_pool_find(mtxpool_select, sip);
	/*
	 * Initialize the sfp and queue it in the thread.
	 */
	sfp->sf_si = sip;
	sfp->sf_mtx = mtxp;
	STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link);
	/*
	 * Now that we've locked the sip, check for initialization.
	 */
	mtx_lock(mtxp);
	if (sip->si_mtx == NULL) {
		sip->si_mtx = mtxp;
		TAILQ_INIT(&sip->si_tdlist);
	}
	/*
	 * Add this thread to the list of selfds listening on this selinfo.
	 */
	TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads);
	mtx_unlock(sip->si_mtx);
}
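
/*
 * Editor's sketch (not part of the original source): selrecord() pairs
 * with selwakeup() in the standard driver poll protocol.  A cdev poll
 * method typically looks like this (mydev_softc, sc_havedata and sc_rsel
 * are hypothetical softc members):
 *
 *	static int
 *	mydev_poll(struct cdev *dev, int events, struct thread *td)
 *	{
 *		struct mydev_softc *sc = dev->si_drv1;
 *		int revents = 0;
 *
 *		if (events & (POLLIN | POLLRDNORM)) {
 *			if (sc->sc_havedata)
 *				revents = events & (POLLIN | POLLRDNORM);
 *			else
 *				selrecord(td, &sc->sc_rsel);
 *		}
 *		return (revents);
 *	}
 *
 * The data-arrival path later calls selwakeup(&sc->sc_rsel), which lands
 * in doselwakeup() below and wakes any thread recorded here.
 */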

/* Wake up a selecting thread. */
void
selwakeup(sip)
	struct selinfo *sip;
{
	doselwakeup(sip, -1);
}

/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(sip, pri)
	struct selinfo *sip;
	int pri;
{
	doselwakeup(sip, pri);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct selfd *sfp;
	struct selfd *sfn;
	struct seltd *stp;

	/* If it's not initialized there can't be any waiters. */
	if (sip->si_mtx == NULL)
		return;
	/*
	 * Locking the selinfo locks all selfds associated with it.
	 */
	mtx_lock(sip->si_mtx);
	TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) {
		/*
		 * Once we remove this sfp from the list and clear the
		 * sf_si, seltdclear will know to ignore this si.
		 */
		TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads);
		sfp->sf_si = NULL;
		stp = sfp->sf_td;
		mtx_lock(&stp->st_mtx);
		stp->st_flags |= SELTD_PENDING;
		cv_broadcastpri(&stp->st_wait, pri);
		mtx_unlock(&stp->st_mtx);
	}
	mtx_unlock(sip->si_mtx);
}

static void
seltdinit(struct thread *td)
{
	struct seltd *stp;

	if ((stp = td->td_sel) != NULL)
		goto out;
	td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO);
	mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF);
	cv_init(&stp->st_wait, "select");
out:
	stp->st_flags = 0;
	STAILQ_INIT(&stp->st_selq);
}

static int
seltdwait(struct thread *td, int timo)
{
	struct seltd *stp;
	int error;

	stp = td->td_sel;
	/*
	 * An event of interest may occur while we do not hold the seltd
	 * locked, so check the pending flag before we sleep.
	 */
	mtx_lock(&stp->st_mtx);
	/*
	 * Any further calls to selrecord will be a rescan.
	 */
	stp->st_flags |= SELTD_RESCAN;
	if (stp->st_flags & SELTD_PENDING) {
		mtx_unlock(&stp->st_mtx);
		return (0);
	}
	if (timo > 0)
		error = cv_timedwait_sig(&stp->st_wait, &stp->st_mtx, timo);
	else
		error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
	mtx_unlock(&stp->st_mtx);

	return (error);
}

void
seltdfini(struct thread *td)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp == NULL)
		return;
	if (stp->st_free1)
		uma_zfree(selfd_zone, stp->st_free1);
	if (stp->st_free2)
		uma_zfree(selfd_zone, stp->st_free2);
	td->td_sel = NULL;
	free(stp, M_SELECT);
}

/*
 * Remove the references to the thread from all of the objects we were
 * polling.
 */
static void
seltdclear(struct thread *td)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;

	stp = td->td_sel;
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn)
		selfdfree(stp, sfp);
	stp->st_flags = 0;
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL);
static void
selectinit(void *dummy __unused)
{

	selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF);
}