xref: /freebsd/sys/kern/sys_generic.c (revision b78ee15e9f04ae15c3e1200df974473167524d17)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_capsicum.h"
41 #include "opt_compat.h"
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/capsicum.h>
48 #include <sys/filedesc.h>
49 #include <sys/filio.h>
50 #include <sys/fcntl.h>
51 #include <sys/file.h>
52 #include <sys/lock.h>
53 #include <sys/proc.h>
54 #include <sys/signalvar.h>
55 #include <sys/socketvar.h>
56 #include <sys/uio.h>
57 #include <sys/kernel.h>
58 #include <sys/ktr.h>
59 #include <sys/limits.h>
60 #include <sys/malloc.h>
61 #include <sys/poll.h>
62 #include <sys/resourcevar.h>
63 #include <sys/selinfo.h>
64 #include <sys/sleepqueue.h>
65 #include <sys/syscallsubr.h>
66 #include <sys/sysctl.h>
67 #include <sys/sysent.h>
68 #include <sys/vnode.h>
69 #include <sys/bio.h>
70 #include <sys/buf.h>
71 #include <sys/condvar.h>
72 #ifdef KTRACE
73 #include <sys/ktrace.h>
74 #endif
75 
76 #include <security/audit/audit.h>
77 
78 /*
79  * The following macro defines how many bytes will be allocated from
80  * the stack instead of memory allocated when passing the IOCTL data
81  * structures from userspace and to the kernel. Some IOCTLs having
82  * small data structures are used very frequently and this small
83  * buffer on the stack gives a significant speedup improvement for
84  * those requests. The value of this define should be greater or equal
85  * to 64 bytes and should also be power of two. The data structure is
86  * currently hard-aligned to a 8-byte boundary on the stack. This
87  * should currently be sufficient for all supported platforms.
88  */
89 #define	SYS_IOCTL_SMALL_SIZE	128	/* bytes */
90 #define	SYS_IOCTL_SMALL_ALIGN	8	/* bytes */
91 
92 int iosize_max_clamp = 0;
93 SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW,
94     &iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX");
95 int devfs_iosize_max_clamp = 1;
96 SYSCTL_INT(_debug, OID_AUTO, devfs_iosize_max_clamp, CTLFLAG_RW,
97     &devfs_iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX for devices");
98 
99 /*
100  * Assert that the return value of read(2) and write(2) syscalls fits
101  * into a register.  If not, an architecture will need to provide the
102  * usermode wrappers to reconstruct the result.
103  */
104 CTASSERT(sizeof(register_t) >= sizeof(size_t));
105 
106 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
107 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
108 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
109 
110 static int	pollout(struct thread *, struct pollfd *, struct pollfd *,
111 		    u_int);
112 static int	pollscan(struct thread *, struct pollfd *, u_int);
113 static int	pollrescan(struct thread *);
114 static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
115 static int	selrescan(struct thread *, fd_mask **, fd_mask **);
116 static void	selfdalloc(struct thread *, void *);
117 static void	selfdfree(struct seltd *, struct selfd *);
118 static int	dofileread(struct thread *, int, struct file *, struct uio *,
119 		    off_t, int);
120 static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
121 		    off_t, int);
122 static void	doselwakeup(struct selinfo *, int);
123 static void	seltdinit(struct thread *);
124 static int	seltdwait(struct thread *, sbintime_t, sbintime_t);
125 static void	seltdclear(struct thread *);
126 
/*
 * One seltd per-thread allocated on demand as needed.
 *
 *	t - protected by st_mtx
 * 	k - Only accessed by curthread or read-only
 *
 * NOTE(review): st_free1/st_free2 look like preallocated spare selfds
 * handed out by selfdalloc() so selrecord paths need not allocate —
 * confirm against selfdalloc()/selfdfree().
 */
struct seltd {
	STAILQ_HEAD(, selfd)	st_selq;	/* (k) List of selfds. */
	struct selfd		*st_free1;	/* (k) free fd for read set. */
	struct selfd		*st_free2;	/* (k) free fd for write set. */
	struct mtx		st_mtx;		/* Protects struct seltd */
	struct cv		st_wait;	/* (t) Wait channel. */
	int			st_flags;	/* (t) SELTD_ flags. */
};
141 
142 #define	SELTD_PENDING	0x0001			/* We have pending events. */
143 #define	SELTD_RESCAN	0x0002			/* Doing a rescan. */
144 
/*
 * One selfd allocated per-thread per-file-descriptor.
 *	f - protected by sf_mtx
 *
 * A selfd ties one polled descriptor (sf_cookie) owned by a thread's
 * seltd (sf_td) to the selinfo it is waiting on (sf_si); selrescan()
 * recovers the fd from sf_cookie after wakeup.
 */
struct selfd {
	STAILQ_ENTRY(selfd)	sf_link;	/* (k) fds owned by this td. */
	TAILQ_ENTRY(selfd)	sf_threads;	/* (f) fds on this selinfo. */
	struct selinfo		*sf_si;		/* (f) selinfo when linked. */
	struct mtx		*sf_mtx;	/* Pointer to selinfo mtx. */
	struct seltd		*sf_td;		/* (k) owning seltd. */
	void			*sf_cookie;	/* (k) fd or pollfd. */
};
157 
158 static uma_zone_t selfd_zone;
159 static struct mtx_pool *mtxpool_select;
160 
161 #ifndef _SYS_SYSPROTO_H_
162 struct read_args {
163 	int	fd;
164 	void	*buf;
165 	size_t	nbyte;
166 };
167 #endif
168 int
169 sys_read(td, uap)
170 	struct thread *td;
171 	struct read_args *uap;
172 {
173 	struct uio auio;
174 	struct iovec aiov;
175 	int error;
176 
177 	if (uap->nbyte > IOSIZE_MAX)
178 		return (EINVAL);
179 	aiov.iov_base = uap->buf;
180 	aiov.iov_len = uap->nbyte;
181 	auio.uio_iov = &aiov;
182 	auio.uio_iovcnt = 1;
183 	auio.uio_resid = uap->nbyte;
184 	auio.uio_segflg = UIO_USERSPACE;
185 	error = kern_readv(td, uap->fd, &auio);
186 	return(error);
187 }
188 
189 /*
190  * Positioned read system call
191  */
192 #ifndef _SYS_SYSPROTO_H_
193 struct pread_args {
194 	int	fd;
195 	void	*buf;
196 	size_t	nbyte;
197 	int	pad;
198 	off_t	offset;
199 };
200 #endif
201 int
202 sys_pread(td, uap)
203 	struct thread *td;
204 	struct pread_args *uap;
205 {
206 	struct uio auio;
207 	struct iovec aiov;
208 	int error;
209 
210 	if (uap->nbyte > IOSIZE_MAX)
211 		return (EINVAL);
212 	aiov.iov_base = uap->buf;
213 	aiov.iov_len = uap->nbyte;
214 	auio.uio_iov = &aiov;
215 	auio.uio_iovcnt = 1;
216 	auio.uio_resid = uap->nbyte;
217 	auio.uio_segflg = UIO_USERSPACE;
218 	error = kern_preadv(td, uap->fd, &auio, uap->offset);
219 	return(error);
220 }
221 
#if defined(COMPAT_FREEBSD6)
/*
 * Translate the legacy FreeBSD 6 pread(2) argument layout onto the
 * current one and forward to sys_pread().  (oargs.pad is intentionally
 * left uninitialized; sys_pread() never reads it.)  Converted from a
 * K&R-style definition to a prototype definition.
 */
int
freebsd6_pread(struct thread *td, struct freebsd6_pread_args *uap)
{
	struct pread_args oargs;

	oargs.fd = uap->fd;
	oargs.buf = uap->buf;
	oargs.nbyte = uap->nbyte;
	oargs.offset = uap->offset;
	return (sys_pread(td, &oargs));
}
#endif
237 
238 /*
239  * Scatter read system call.
240  */
241 #ifndef _SYS_SYSPROTO_H_
242 struct readv_args {
243 	int	fd;
244 	struct	iovec *iovp;
245 	u_int	iovcnt;
246 };
247 #endif
248 int
249 sys_readv(struct thread *td, struct readv_args *uap)
250 {
251 	struct uio *auio;
252 	int error;
253 
254 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
255 	if (error)
256 		return (error);
257 	error = kern_readv(td, uap->fd, auio);
258 	free(auio, M_IOV);
259 	return (error);
260 }
261 
262 int
263 kern_readv(struct thread *td, int fd, struct uio *auio)
264 {
265 	struct file *fp;
266 	cap_rights_t rights;
267 	int error;
268 
269 	error = fget_read(td, fd, cap_rights_init(&rights, CAP_READ), &fp);
270 	if (error)
271 		return (error);
272 	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
273 	fdrop(fp, td);
274 	return (error);
275 }
276 
277 /*
278  * Scatter positioned read system call.
279  */
280 #ifndef _SYS_SYSPROTO_H_
281 struct preadv_args {
282 	int	fd;
283 	struct	iovec *iovp;
284 	u_int	iovcnt;
285 	off_t	offset;
286 };
287 #endif
288 int
289 sys_preadv(struct thread *td, struct preadv_args *uap)
290 {
291 	struct uio *auio;
292 	int error;
293 
294 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
295 	if (error)
296 		return (error);
297 	error = kern_preadv(td, uap->fd, auio, uap->offset);
298 	free(auio, M_IOV);
299 	return (error);
300 }
301 
302 int
303 kern_preadv(td, fd, auio, offset)
304 	struct thread *td;
305 	int fd;
306 	struct uio *auio;
307 	off_t offset;
308 {
309 	struct file *fp;
310 	cap_rights_t rights;
311 	int error;
312 
313 	error = fget_read(td, fd, cap_rights_init(&rights, CAP_PREAD), &fp);
314 	if (error)
315 		return (error);
316 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
317 		error = ESPIPE;
318 	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
319 		error = EINVAL;
320 	else
321 		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
322 	fdrop(fp, td);
323 	return (error);
324 }
325 
326 /*
327  * Common code for readv and preadv that reads data in
328  * from a file using the passed in uio, offset, and flags.
329  */
330 static int
331 dofileread(td, fd, fp, auio, offset, flags)
332 	struct thread *td;
333 	int fd;
334 	struct file *fp;
335 	struct uio *auio;
336 	off_t offset;
337 	int flags;
338 {
339 	ssize_t cnt;
340 	int error;
341 #ifdef KTRACE
342 	struct uio *ktruio = NULL;
343 #endif
344 
345 	/* Finish zero length reads right here */
346 	if (auio->uio_resid == 0) {
347 		td->td_retval[0] = 0;
348 		return(0);
349 	}
350 	auio->uio_rw = UIO_READ;
351 	auio->uio_offset = offset;
352 	auio->uio_td = td;
353 #ifdef KTRACE
354 	if (KTRPOINT(td, KTR_GENIO))
355 		ktruio = cloneuio(auio);
356 #endif
357 	cnt = auio->uio_resid;
358 	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
359 		if (auio->uio_resid != cnt && (error == ERESTART ||
360 		    error == EINTR || error == EWOULDBLOCK))
361 			error = 0;
362 	}
363 	cnt -= auio->uio_resid;
364 #ifdef KTRACE
365 	if (ktruio != NULL) {
366 		ktruio->uio_resid = cnt;
367 		ktrgenio(fd, UIO_READ, ktruio, error);
368 	}
369 #endif
370 	td->td_retval[0] = cnt;
371 	return (error);
372 }
373 
374 #ifndef _SYS_SYSPROTO_H_
375 struct write_args {
376 	int	fd;
377 	const void *buf;
378 	size_t	nbyte;
379 };
380 #endif
381 int
382 sys_write(td, uap)
383 	struct thread *td;
384 	struct write_args *uap;
385 {
386 	struct uio auio;
387 	struct iovec aiov;
388 	int error;
389 
390 	if (uap->nbyte > IOSIZE_MAX)
391 		return (EINVAL);
392 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
393 	aiov.iov_len = uap->nbyte;
394 	auio.uio_iov = &aiov;
395 	auio.uio_iovcnt = 1;
396 	auio.uio_resid = uap->nbyte;
397 	auio.uio_segflg = UIO_USERSPACE;
398 	error = kern_writev(td, uap->fd, &auio);
399 	return(error);
400 }
401 
402 /*
403  * Positioned write system call.
404  */
405 #ifndef _SYS_SYSPROTO_H_
406 struct pwrite_args {
407 	int	fd;
408 	const void *buf;
409 	size_t	nbyte;
410 	int	pad;
411 	off_t	offset;
412 };
413 #endif
414 int
415 sys_pwrite(td, uap)
416 	struct thread *td;
417 	struct pwrite_args *uap;
418 {
419 	struct uio auio;
420 	struct iovec aiov;
421 	int error;
422 
423 	if (uap->nbyte > IOSIZE_MAX)
424 		return (EINVAL);
425 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
426 	aiov.iov_len = uap->nbyte;
427 	auio.uio_iov = &aiov;
428 	auio.uio_iovcnt = 1;
429 	auio.uio_resid = uap->nbyte;
430 	auio.uio_segflg = UIO_USERSPACE;
431 	error = kern_pwritev(td, uap->fd, &auio, uap->offset);
432 	return(error);
433 }
434 
#if defined(COMPAT_FREEBSD6)
/*
 * Translate the legacy FreeBSD 6 pwrite(2) argument layout onto the
 * current one and forward to sys_pwrite().  (oargs.pad is intentionally
 * left uninitialized; sys_pwrite() never reads it.)  Converted from a
 * K&R-style definition to a prototype definition.
 */
int
freebsd6_pwrite(struct thread *td, struct freebsd6_pwrite_args *uap)
{
	struct pwrite_args oargs;

	oargs.fd = uap->fd;
	oargs.buf = uap->buf;
	oargs.nbyte = uap->nbyte;
	oargs.offset = uap->offset;
	return (sys_pwrite(td, &oargs));
}
#endif
450 
451 /*
452  * Gather write system call.
453  */
454 #ifndef _SYS_SYSPROTO_H_
455 struct writev_args {
456 	int	fd;
457 	struct	iovec *iovp;
458 	u_int	iovcnt;
459 };
460 #endif
461 int
462 sys_writev(struct thread *td, struct writev_args *uap)
463 {
464 	struct uio *auio;
465 	int error;
466 
467 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
468 	if (error)
469 		return (error);
470 	error = kern_writev(td, uap->fd, auio);
471 	free(auio, M_IOV);
472 	return (error);
473 }
474 
475 int
476 kern_writev(struct thread *td, int fd, struct uio *auio)
477 {
478 	struct file *fp;
479 	cap_rights_t rights;
480 	int error;
481 
482 	error = fget_write(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
483 	if (error)
484 		return (error);
485 	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
486 	fdrop(fp, td);
487 	return (error);
488 }
489 
490 /*
491  * Gather positioned write system call.
492  */
493 #ifndef _SYS_SYSPROTO_H_
494 struct pwritev_args {
495 	int	fd;
496 	struct	iovec *iovp;
497 	u_int	iovcnt;
498 	off_t	offset;
499 };
500 #endif
501 int
502 sys_pwritev(struct thread *td, struct pwritev_args *uap)
503 {
504 	struct uio *auio;
505 	int error;
506 
507 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
508 	if (error)
509 		return (error);
510 	error = kern_pwritev(td, uap->fd, auio, uap->offset);
511 	free(auio, M_IOV);
512 	return (error);
513 }
514 
515 int
516 kern_pwritev(td, fd, auio, offset)
517 	struct thread *td;
518 	struct uio *auio;
519 	int fd;
520 	off_t offset;
521 {
522 	struct file *fp;
523 	cap_rights_t rights;
524 	int error;
525 
526 	error = fget_write(td, fd, cap_rights_init(&rights, CAP_PWRITE), &fp);
527 	if (error)
528 		return (error);
529 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
530 		error = ESPIPE;
531 	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
532 		error = EINVAL;
533 	else
534 		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
535 	fdrop(fp, td);
536 	return (error);
537 }
538 
539 /*
540  * Common code for writev and pwritev that writes data to
541  * a file using the passed in uio, offset, and flags.
542  */
543 static int
544 dofilewrite(td, fd, fp, auio, offset, flags)
545 	struct thread *td;
546 	int fd;
547 	struct file *fp;
548 	struct uio *auio;
549 	off_t offset;
550 	int flags;
551 {
552 	ssize_t cnt;
553 	int error;
554 #ifdef KTRACE
555 	struct uio *ktruio = NULL;
556 #endif
557 
558 	auio->uio_rw = UIO_WRITE;
559 	auio->uio_td = td;
560 	auio->uio_offset = offset;
561 #ifdef KTRACE
562 	if (KTRPOINT(td, KTR_GENIO))
563 		ktruio = cloneuio(auio);
564 #endif
565 	cnt = auio->uio_resid;
566 	if (fp->f_type == DTYPE_VNODE &&
567 	    (fp->f_vnread_flags & FDEVFS_VNODE) == 0)
568 		bwillwrite();
569 	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
570 		if (auio->uio_resid != cnt && (error == ERESTART ||
571 		    error == EINTR || error == EWOULDBLOCK))
572 			error = 0;
573 		/* Socket layer is responsible for issuing SIGPIPE. */
574 		if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
575 			PROC_LOCK(td->td_proc);
576 			tdsignal(td, SIGPIPE);
577 			PROC_UNLOCK(td->td_proc);
578 		}
579 	}
580 	cnt -= auio->uio_resid;
581 #ifdef KTRACE
582 	if (ktruio != NULL) {
583 		ktruio->uio_resid = cnt;
584 		ktrgenio(fd, UIO_WRITE, ktruio, error);
585 	}
586 #endif
587 	td->td_retval[0] = cnt;
588 	return (error);
589 }
590 
591 /*
592  * Truncate a file given a file descriptor.
593  *
594  * Can't use fget_write() here, since must return EINVAL and not EBADF if the
595  * descriptor isn't writable.
596  */
597 int
598 kern_ftruncate(td, fd, length)
599 	struct thread *td;
600 	int fd;
601 	off_t length;
602 {
603 	struct file *fp;
604 	cap_rights_t rights;
605 	int error;
606 
607 	AUDIT_ARG_FD(fd);
608 	if (length < 0)
609 		return (EINVAL);
610 	error = fget(td, fd, cap_rights_init(&rights, CAP_FTRUNCATE), &fp);
611 	if (error)
612 		return (error);
613 	AUDIT_ARG_FILE(td->td_proc, fp);
614 	if (!(fp->f_flag & FWRITE)) {
615 		fdrop(fp, td);
616 		return (EINVAL);
617 	}
618 	error = fo_truncate(fp, length, td->td_ucred, td);
619 	fdrop(fp, td);
620 	return (error);
621 }
622 
623 #ifndef _SYS_SYSPROTO_H_
624 struct ftruncate_args {
625 	int	fd;
626 	int	pad;
627 	off_t	length;
628 };
629 #endif
630 int
631 sys_ftruncate(td, uap)
632 	struct thread *td;
633 	struct ftruncate_args *uap;
634 {
635 
636 	return (kern_ftruncate(td, uap->fd, uap->length));
637 }
638 
#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct oftruncate_args {
	int	fd;
	long	length;
};
#endif
/*
 * Old (4.3BSD) ftruncate(2) with a long length argument; forwards to
 * kern_ftruncate().  Converted from a K&R-style definition to a
 * prototype definition.
 */
int
oftruncate(struct thread *td, struct oftruncate_args *uap)
{

	return (kern_ftruncate(td, uap->fd, uap->length));
}
#endif /* COMPAT_43 */
655 
656 #ifndef _SYS_SYSPROTO_H_
657 struct ioctl_args {
658 	int	fd;
659 	u_long	com;
660 	caddr_t	data;
661 };
662 #endif
/* ARGSUSED */
int
sys_ioctl(struct thread *td, struct ioctl_args *uap)
{
	/* Stack buffer for small ioctl payloads; avoids malloc on hot paths. */
	u_char smalldata[SYS_IOCTL_SMALL_SIZE] __aligned(SYS_IOCTL_SMALL_ALIGN);
	u_long com;
	int arg, error;
	u_int size;
	caddr_t data;

	/*
	 * Warn about (and truncate away) high bits in the command word,
	 * which the warning text attributes to sign extension by some ABIs.
	 */
	if (uap->com > 0xffffffff) {
		printf(
		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
		    td->td_proc->p_pid, td->td_name, uap->com);
		uap->com &= 0xffffffff;
	}
	com = uap->com;

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID  | IOC_IN | IOC_OUT)) == 0) ||
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
	    /* Old ABIs tolerated IOC_IN with a zero size; keep accepting it. */
	    ((com & IOC_OUT) && size == 0) ||
#else
	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
#endif
	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
		return (ENOTTY);

	if (size > 0) {
		if (com & IOC_VOID) {
			/* Integer argument. */
			arg = (intptr_t)uap->data;
			data = (void *)&arg;
			/* size = 0 also prevents the free at "out:" below. */
			size = 0;
		} else {
			if (size > SYS_IOCTL_SMALL_SIZE)
				data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
			else
				data = smalldata;
		}
	} else
		/* No payload: pass a pointer to the raw argument word. */
		data = (void *)&uap->data;
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error != 0)
			goto out;
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	}

	error = kern_ioctl(td, uap->fd, com, data);

	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

out:
	/* Only the large-payload path allocated; size was zeroed otherwise. */
	if (size > SYS_IOCTL_SMALL_SIZE)
		free(data, M_IOCTLOPS);
	return (error);
}
732 
/*
 * Kernel backend for ioctl(2): resolve the descriptor, handle the
 * generic commands (close-on-exec and FNONBLOCK/FASYNC flags) inline,
 * and forward everything else to the file's fo_ioctl method.
 *
 * The filedesc lock state is tracked in 'locked' so the single "out:"
 * path can release exactly what is still held.
 */
int
kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
{
	struct file *fp;
	struct filedesc *fdp;
#ifndef CAPABILITIES
	cap_rights_t rights;
#endif
	int error, tmp, locked;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_CMD(com);

	fdp = td->td_proc->p_fd;

	switch (com) {
	case FIONCLEX:
	case FIOCLEX:
		/* These mutate fde_flags below, so take the lock exclusive. */
		FILEDESC_XLOCK(fdp);
		locked = LA_XLOCKED;
		break;
	default:
#ifdef CAPABILITIES
		/* Shared lock covers the fget_locked/cap_ioctl_check pair. */
		FILEDESC_SLOCK(fdp);
		locked = LA_SLOCKED;
#else
		locked = LA_UNLOCKED;
#endif
		break;
	}

#ifdef CAPABILITIES
	if ((fp = fget_locked(fdp, fd)) == NULL) {
		error = EBADF;
		goto out;
	}
	if ((error = cap_ioctl_check(fdp, fd, com)) != 0) {
		fp = NULL;	/* fhold() was not called yet */
		goto out;
	}
	fhold(fp);
	/* Drop the shared lock early; the hold keeps fp alive. */
	if (locked == LA_SLOCKED) {
		FILEDESC_SUNLOCK(fdp);
		locked = LA_UNLOCKED;
	}
#else
	error = fget(td, fd, cap_rights_init(&rights, CAP_IOCTL), &fp);
	if (error != 0) {
		fp = NULL;
		goto out;
	}
#endif
	/* ioctl requires a descriptor opened for reading or writing. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		goto out;
	}

	switch (com) {
	case FIONCLEX:
		fdp->fd_ofiles[fd].fde_flags &= ~UF_EXCLOSE;
		goto out;
	case FIOCLEX:
		fdp->fd_ofiles[fd].fde_flags |= UF_EXCLOSE;
		goto out;
	case FIONBIO:
		/* Update f_flag here, then still forward to fo_ioctl below. */
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clear_int(&fp->f_flag, FNONBLOCK);
		data = (void *)&tmp;
		break;
	case FIOASYNC:
		/* Update f_flag here, then still forward to fo_ioctl below. */
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FASYNC);
		else
			atomic_clear_int(&fp->f_flag, FASYNC);
		data = (void *)&tmp;
		break;
	}

	error = fo_ioctl(fp, com, data, td->td_ucred, td);
out:
	/* Release whatever filedesc lock is still held. */
	switch (locked) {
	case LA_XLOCKED:
		FILEDESC_XUNLOCK(fdp);
		break;
#ifdef CAPABILITIES
	case LA_SLOCKED:
		FILEDESC_SUNLOCK(fdp);
		break;
#endif
	default:
		FILEDESC_UNLOCK_ASSERT(fdp);
		break;
	}
	if (fp != NULL)
		fdrop(fp, td);
	return (error);
}
832 
833 int
834 poll_no_poll(int events)
835 {
836 	/*
837 	 * Return true for read/write.  If the user asked for something
838 	 * special, return POLLNVAL, so that clients have a way of
839 	 * determining reliably whether or not the extended
840 	 * functionality is present without hard-coding knowledge
841 	 * of specific filesystem implementations.
842 	 */
843 	if (events & ~POLLSTANDARD)
844 		return (POLLNVAL);
845 
846 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
847 }
848 
849 int
850 sys_pselect(struct thread *td, struct pselect_args *uap)
851 {
852 	struct timespec ts;
853 	struct timeval tv, *tvp;
854 	sigset_t set, *uset;
855 	int error;
856 
857 	if (uap->ts != NULL) {
858 		error = copyin(uap->ts, &ts, sizeof(ts));
859 		if (error != 0)
860 		    return (error);
861 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
862 		tvp = &tv;
863 	} else
864 		tvp = NULL;
865 	if (uap->sm != NULL) {
866 		error = copyin(uap->sm, &set, sizeof(set));
867 		if (error != 0)
868 			return (error);
869 		uset = &set;
870 	} else
871 		uset = NULL;
872 	return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
873 	    uset, NFDBITS));
874 }
875 
/*
 * Backend for pselect(2): atomically install the caller-supplied signal
 * mask (if any), arrange for the previous mask to be restored on return
 * to user mode, then perform an ordinary select.
 */
int
kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    struct timeval *tvp, sigset_t *uset, int abi_nfdbits)
{
	int error;

	if (uset != NULL) {
		/* Save the old mask in td_oldsigmask for later restoration. */
		error = kern_sigprocmask(td, SIG_SETMASK, uset,
		    &td->td_oldsigmask, 0);
		if (error != 0)
			return (error);
		td->td_pflags |= TDP_OLDMASK;
		/*
		 * Make sure that ast() is called on return to
		 * usermode and TDP_OLDMASK is cleared, restoring old
		 * sigmask.
		 */
		thread_lock(td);
		td->td_flags |= TDF_ASTPENDING;
		thread_unlock(td);
	}
	error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits);
	return (error);
}
900 
901 #ifndef _SYS_SYSPROTO_H_
902 struct select_args {
903 	int	nd;
904 	fd_set	*in, *ou, *ex;
905 	struct	timeval *tv;
906 };
907 #endif
908 int
909 sys_select(struct thread *td, struct select_args *uap)
910 {
911 	struct timeval tv, *tvp;
912 	int error;
913 
914 	if (uap->tv != NULL) {
915 		error = copyin(uap->tv, &tv, sizeof(tv));
916 		if (error)
917 			return (error);
918 		tvp = &tv;
919 	} else
920 		tvp = NULL;
921 
922 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
923 	    NFDBITS));
924 }
925 
/*
 * In the unlikely case when user specified n greater than the last
 * open file descriptor, check that no bits are set after the last
 * valid fd.  We must return EBADF if any is set.
 *
 * There are applications that rely on the behaviour.
 *
 * nd is fd_lastfile + 1.
 */
static int
select_check_badfd(fd_set *fd_in, int nd, int ndu, int abi_nfdbits)
{
	char *addr, *oaddr;
	int b, i, res;
	uint8_t bits;

	/* Nothing to check if the set fits within the open fd range. */
	if (nd >= ndu || fd_in == NULL)
		return (0);

	oaddr = NULL;
	bits = 0; /* silence gcc */
	for (i = nd; i < ndu; i++) {
		/* b is the byte index (within fd_in) that holds bit i. */
		b = i / NBBY;
#if BYTE_ORDER == LITTLE_ENDIAN
		addr = (char *)fd_in + b;
#else
		/*
		 * On big-endian hosts the byte holding bit i depends on the
		 * ABI's fd_mask word width: locate the containing word, then
		 * index its bytes from the opposite end.
		 */
		addr = (char *)fd_in;
		if (abi_nfdbits == NFDBITS) {
			addr += rounddown(b, sizeof(fd_mask)) +
			    sizeof(fd_mask) - 1 - b % sizeof(fd_mask);
		} else {
			addr += rounddown(b, sizeof(uint32_t)) +
			    sizeof(uint32_t) - 1 - b % sizeof(uint32_t);
		}
#endif
		if (addr != oaddr) {
			/* Fetch each user byte at most once; -1 means fault. */
			res = fubyte(addr);
			if (res == -1)
				return (EFAULT);
			oaddr = addr;
			bits = res;
		}
		if ((bits & (1 << (i % NBBY))) != 0)
			return (EBADF);
	}
	return (0);
}
973 
/*
 * Backend for select(2)/pselect(2): copy in up to three descriptor
 * sets, scan them (sleeping as needed until the timeout), and copy the
 * result sets back out.  abi_nfdbits is the fd_mask width of the
 * calling ABI, which may differ from the kernel's NFDBITS under
 * compat32 emulation.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval rtv;
	sbintime_t asbt, precision, rsbt;
	u_int nbufbytes, ncpbytes, ncpubytes, nfdbits;
	int error, lf, ndu;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	/* Clamp nd to the open range, remembering the user's value in ndu. */
	ndu = nd;
	lf = fdp->fd_lastfile;
	if (nd > lf + 1)
		nd = lf + 1;

	/* Bits set beyond the last open fd must yield EBADF (see above). */
	error = select_check_badfd(fd_in, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);
	error = select_check_badfd(fd_ou, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);
	error = select_check_badfd(fd_ex, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	ncpubytes = roundup(nd, abi_nfdbits) / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL) {					\
			ibits[x] = NULL;				\
			obits[x] = NULL;				\
		} else {						\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpubytes);	\
			if (error != 0)					\
				goto done;				\
			bzero((char *)ibits[x] + ncpubytes,		\
			    ncpbytes - ncpubytes);			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits

#if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__)
	/*
	 * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS,
	 * we are running under 32-bit emulation. This should be more
	 * generic.
	 */
#define swizzle_fdset(bits)						\
	if (abi_nfdbits != NFDBITS && bits != NULL) {			\
		int i;							\
		for (i = 0; i < ncpbytes / sizeof *sbp; i++)		\
			bits[i] = (bits[i] >> 32) | (bits[i] << 32);	\
	}
#else
#define swizzle_fdset(bits)
#endif

	/* Make sure the bit order makes it through an ABI transition */
	swizzle_fdset(ibits[0]);
	swizzle_fdset(ibits[1]);
	swizzle_fdset(ibits[2]);

	/* The output halves share the front of the buffer; clear them. */
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/*
	 * Convert the timeout to an absolute sbintime deadline plus a
	 * precision hint; asbt == -1 means wait forever, asbt == 0 means
	 * poll without sleeping.
	 */
	precision = 0;
	if (tvp != NULL) {
		rtv = *tvp;
		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
		    rtv.tv_usec >= 1000000) {
			error = EINVAL;
			goto done;
		}
		if (!timevalisset(&rtv))
			asbt = 0;
		else if (rtv.tv_sec <= INT32_MAX) {
			rsbt = tvtosbt(rtv);
			precision = rsbt;
			precision >>= tc_precexp;
			if (TIMESEL(&asbt, rsbt))
				asbt += tc_tick_sbt;
			if (asbt <= SBT_MAX - rsbt)
				asbt += rsbt;
			else
				asbt = -1;
		} else
			asbt = -1;
	} else
		asbt = -1;
	seltdinit(td);
	/* Iterate until the timeout expires or descriptors become ready. */
	for (;;) {
		error = selscan(td, ibits, obits, nd);
		if (error || td->td_retval[0] != 0)
			break;
		error = seltdwait(td, asbt, precision);
		if (error)
			break;
		error = selrescan(td, ibits, obits);
		if (error || td->td_retval[0] != 0)
			break;
	}
	seltdclear(td);

done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;

	/* swizzle bit order back, if necessary */
	swizzle_fdset(obits[0]);
	swizzle_fdset(obits[1]);
	swizzle_fdset(obits[2]);
#undef swizzle_fdset

#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpubytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}
/*
 * Convert a select bit set to poll flags.
 *
 * The backend always returns POLLHUP/POLLERR if appropriate and we
 * return this as a set bit in any set.
 */
static int select_flags[3] = {
    POLLRDNORM | POLLHUP | POLLERR,	/* [0] read set */
    POLLWRNORM | POLLHUP | POLLERR,	/* [1] write set */
    POLLRDBAND | POLLERR		/* [2] except set */
};
1159 
1160 /*
1161  * Compute the fo_poll flags required for a fd given by the index and
1162  * bit position in the fd_mask array.
1163  */
1164 static __inline int
1165 selflags(fd_mask **ibits, int idx, fd_mask bit)
1166 {
1167 	int flags;
1168 	int msk;
1169 
1170 	flags = 0;
1171 	for (msk = 0; msk < 3; msk++) {
1172 		if (ibits[msk] == NULL)
1173 			continue;
1174 		if ((ibits[msk][idx] & bit) == 0)
1175 			continue;
1176 		flags |= select_flags[msk];
1177 	}
1178 	return (flags);
1179 }
1180 
1181 /*
1182  * Set the appropriate output bits given a mask of fired events and the
1183  * input bits originally requested.
1184  */
1185 static __inline int
1186 selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events)
1187 {
1188 	int msk;
1189 	int n;
1190 
1191 	n = 0;
1192 	for (msk = 0; msk < 3; msk++) {
1193 		if ((events & select_flags[msk]) == 0)
1194 			continue;
1195 		if (ibits[msk] == NULL)
1196 			continue;
1197 		if ((ibits[msk][idx] & bit) == 0)
1198 			continue;
1199 		/*
1200 		 * XXX Check for a duplicate set.  This can occur because a
1201 		 * socket calls selrecord() twice for each poll() call
1202 		 * resulting in two selfds per real fd.  selrescan() will
1203 		 * call selsetbits twice as a result.
1204 		 */
1205 		if ((obits[msk][idx] & bit) != 0)
1206 			continue;
1207 		obits[msk][idx] |= bit;
1208 		n++;
1209 	}
1210 
1211 	return (n);
1212 }
1213 
/*
 * Resolve fd to a file pointer, requiring only the CAP_EVENT capability
 * right.  On success the caller owns a reference and must fdrop() it.
 */
static __inline int
getselfd_cap(struct filedesc *fdp, int fd, struct file **fpp)
{
	cap_rights_t rights;

	cap_rights_init(&rights, CAP_EVENT);

	return (fget_unlocked(fdp, fd, &rights, fpp, NULL));
}
1223 
1224 /*
1225  * Traverse the list of fds attached to this thread's seltd and check for
1226  * completion.
1227  */
static int
selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
{
	struct filedesc *fdp;
	struct selinfo *si;
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct file *fp;
	fd_mask bit;
	int fd, ev, n, idx;
	int error;

	fdp = td->td_proc->p_fd;
	stp = td->td_sel;
	n = 0;
	/* _SAFE traversal: selfdfree() unlinks sfp from st_selq. */
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		/* The cookie stored by selscan() is the fd number. */
		fd = (int)(uintptr_t)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared the event didn't fire. */
		if (si != NULL)
			continue;
		/* Re-resolve the fd; it may have gone away while we slept. */
		error = getselfd_cap(fdp, fd, &fp);
		if (error)
			return (error);
		idx = fd / NFDBITS;
		bit = (fd_mask)1 << (fd % NFDBITS);
		/* Re-poll with only the events originally requested. */
		ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td);
		fdrop(fp, td);
		if (ev != 0)
			n += selsetbits(ibits, obits, idx, bit, ev);
	}
	stp->st_flags = 0;
	/* Number of ready descriptors is the syscall's return value. */
	td->td_retval[0] = n;
	return (0);
}
1265 
1266 /*
1267  * Perform the initial filedescriptor scan and register ourselves with
1268  * each selinfo.
1269  */
1270 static int
1271 selscan(td, ibits, obits, nfd)
1272 	struct thread *td;
1273 	fd_mask **ibits, **obits;
1274 	int nfd;
1275 {
1276 	struct filedesc *fdp;
1277 	struct file *fp;
1278 	fd_mask bit;
1279 	int ev, flags, end, fd;
1280 	int n, idx;
1281 	int error;
1282 
1283 	fdp = td->td_proc->p_fd;
1284 	n = 0;
1285 	for (idx = 0, fd = 0; fd < nfd; idx++) {
1286 		end = imin(fd + NFDBITS, nfd);
1287 		for (bit = 1; fd < end; bit <<= 1, fd++) {
1288 			/* Compute the list of events we're interested in. */
1289 			flags = selflags(ibits, idx, bit);
1290 			if (flags == 0)
1291 				continue;
1292 			error = getselfd_cap(fdp, fd, &fp);
1293 			if (error)
1294 				return (error);
1295 			selfdalloc(td, (void *)(uintptr_t)fd);
1296 			ev = fo_poll(fp, flags, td->td_ucred, td);
1297 			fdrop(fp, td);
1298 			if (ev != 0)
1299 				n += selsetbits(ibits, obits, idx, bit, ev);
1300 		}
1301 	}
1302 
1303 	td->td_retval[0] = n;
1304 	return (0);
1305 }
1306 
1307 int
1308 sys_poll(struct thread *td, struct poll_args *uap)
1309 {
1310 	struct timespec ts, *tsp;
1311 
1312 	if (uap->timeout != INFTIM) {
1313 		if (uap->timeout < 0)
1314 			return (EINVAL);
1315 		ts.tv_sec = uap->timeout / 1000;
1316 		ts.tv_nsec = (uap->timeout % 1000) * 1000000;
1317 		tsp = &ts;
1318 	} else
1319 		tsp = NULL;
1320 
1321 	return (kern_poll(td, uap->fds, uap->nfds, tsp, NULL));
1322 }
1323 
1324 int
1325 kern_poll(struct thread *td, struct pollfd *fds, u_int nfds,
1326     struct timespec *tsp, sigset_t *uset)
1327 {
1328 	struct pollfd *bits;
1329 	struct pollfd smallbits[32];
1330 	sbintime_t sbt, precision, tmp;
1331 	time_t over;
1332 	struct timespec ts;
1333 	int error;
1334 	size_t ni;
1335 
1336 	precision = 0;
1337 	if (tsp != NULL) {
1338 		if (tsp->tv_sec < 0)
1339 			return (EINVAL);
1340 		if (tsp->tv_nsec < 0 || tsp->tv_nsec >= 1000000000)
1341 			return (EINVAL);
1342 		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
1343 			sbt = 0;
1344 		else {
1345 			ts = *tsp;
1346 			if (ts.tv_sec > INT32_MAX / 2) {
1347 				over = ts.tv_sec - INT32_MAX / 2;
1348 				ts.tv_sec -= over;
1349 			} else
1350 				over = 0;
1351 			tmp = tstosbt(ts);
1352 			precision = tmp;
1353 			precision >>= tc_precexp;
1354 			if (TIMESEL(&sbt, tmp))
1355 				sbt += tc_tick_sbt;
1356 			sbt += tmp;
1357 		}
1358 	} else
1359 		sbt = -1;
1360 
1361 	if (nfds > maxfilesperproc && nfds > FD_SETSIZE)
1362 		return (EINVAL);
1363 	ni = nfds * sizeof(struct pollfd);
1364 	if (ni > sizeof(smallbits))
1365 		bits = malloc(ni, M_TEMP, M_WAITOK);
1366 	else
1367 		bits = smallbits;
1368 	error = copyin(fds, bits, ni);
1369 	if (error)
1370 		goto done;
1371 
1372 	if (uset != NULL) {
1373 		error = kern_sigprocmask(td, SIG_SETMASK, uset,
1374 		    &td->td_oldsigmask, 0);
1375 		if (error)
1376 			goto done;
1377 		td->td_pflags |= TDP_OLDMASK;
1378 		/*
1379 		 * Make sure that ast() is called on return to
1380 		 * usermode and TDP_OLDMASK is cleared, restoring old
1381 		 * sigmask.
1382 		 */
1383 		thread_lock(td);
1384 		td->td_flags |= TDF_ASTPENDING;
1385 		thread_unlock(td);
1386 	}
1387 
1388 	seltdinit(td);
1389 	/* Iterate until the timeout expires or descriptors become ready. */
1390 	for (;;) {
1391 		error = pollscan(td, bits, nfds);
1392 		if (error || td->td_retval[0] != 0)
1393 			break;
1394 		error = seltdwait(td, sbt, precision);
1395 		if (error)
1396 			break;
1397 		error = pollrescan(td);
1398 		if (error || td->td_retval[0] != 0)
1399 			break;
1400 	}
1401 	seltdclear(td);
1402 
1403 done:
1404 	/* poll is not restarted after signals... */
1405 	if (error == ERESTART)
1406 		error = EINTR;
1407 	if (error == EWOULDBLOCK)
1408 		error = 0;
1409 	if (error == 0) {
1410 		error = pollout(td, bits, fds, nfds);
1411 		if (error)
1412 			goto out;
1413 	}
1414 out:
1415 	if (ni > sizeof(smallbits))
1416 		free(bits, M_TEMP);
1417 	return (error);
1418 }
1419 
1420 int
1421 sys_ppoll(struct thread *td, struct ppoll_args *uap)
1422 {
1423 	struct timespec ts, *tsp;
1424 	sigset_t set, *ssp;
1425 	int error;
1426 
1427 	if (uap->ts != NULL) {
1428 		error = copyin(uap->ts, &ts, sizeof(ts));
1429 		if (error)
1430 			return (error);
1431 		tsp = &ts;
1432 	} else
1433 		tsp = NULL;
1434 	if (uap->set != NULL) {
1435 		error = copyin(uap->set, &set, sizeof(set));
1436 		if (error)
1437 			return (error);
1438 		ssp = &set;
1439 	} else
1440 		ssp = NULL;
1441 	/*
1442 	 * fds is still a pointer to user space. kern_poll() will
1443 	 * take care of copyin that array to the kernel space.
1444 	 */
1445 
1446 	return (kern_poll(td, uap->fds, uap->nfds, tsp, ssp));
1447 }
1448 
/*
 * Re-examine the pollfds registered on this thread's seltd after a
 * wakeup, counting entries whose selinfo was fired.  Returns 0 with the
 * ready count in td->td_retval[0].
 */
static int
pollrescan(struct thread *td)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct selinfo *si;
	struct filedesc *fdp;
	struct file *fp;
	struct pollfd *fd;
#ifdef CAPABILITIES
	cap_rights_t rights;
#endif
	int n;

	n = 0;
	fdp = td->td_proc->p_fd;
	stp = td->td_sel;
	FILEDESC_SLOCK(fdp);
	/* _SAFE traversal: selfdfree() unlinks entries as we go. */
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		/* The cookie stored by pollscan() is the kernel pollfd slot. */
		fd = (struct pollfd *)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared the event didn't fire. */
		if (si != NULL)
			continue;
		fp = fdp->fd_ofiles[fd->fd].fde_file;
#ifdef CAPABILITIES
		if (fp == NULL ||
		    cap_check(cap_rights(fdp, fd->fd),
		    cap_rights_init(&rights, CAP_EVENT)) != 0)
#else
		if (fp == NULL)
#endif
		{
			/* Descriptor closed (or lost CAP_EVENT) while asleep. */
			fd->revents = POLLNVAL;
			n++;
			continue;
		}

		/*
		 * Note: backend also returns POLLHUP and
		 * POLLERR if appropriate.
		 */
		fd->revents = fo_poll(fp, fd->events, td->td_ucred, td);
		if (fd->revents != 0)
			n++;
	}
	FILEDESC_SUNLOCK(fdp);
	stp->st_flags = 0;
	td->td_retval[0] = n;
	return (0);
}
1502 
1503 
1504 static int
1505 pollout(td, fds, ufds, nfd)
1506 	struct thread *td;
1507 	struct pollfd *fds;
1508 	struct pollfd *ufds;
1509 	u_int nfd;
1510 {
1511 	int error = 0;
1512 	u_int i = 0;
1513 	u_int n = 0;
1514 
1515 	for (i = 0; i < nfd; i++) {
1516 		error = copyout(&fds->revents, &ufds->revents,
1517 		    sizeof(ufds->revents));
1518 		if (error)
1519 			return (error);
1520 		if (fds->revents != 0)
1521 			n++;
1522 		fds++;
1523 		ufds++;
1524 	}
1525 	td->td_retval[0] = n;
1526 	return (0);
1527 }
1528 
1529 static int
1530 pollscan(td, fds, nfd)
1531 	struct thread *td;
1532 	struct pollfd *fds;
1533 	u_int nfd;
1534 {
1535 	struct filedesc *fdp = td->td_proc->p_fd;
1536 	struct file *fp;
1537 #ifdef CAPABILITIES
1538 	cap_rights_t rights;
1539 #endif
1540 	int i, n = 0;
1541 
1542 	FILEDESC_SLOCK(fdp);
1543 	for (i = 0; i < nfd; i++, fds++) {
1544 		if (fds->fd > fdp->fd_lastfile) {
1545 			fds->revents = POLLNVAL;
1546 			n++;
1547 		} else if (fds->fd < 0) {
1548 			fds->revents = 0;
1549 		} else {
1550 			fp = fdp->fd_ofiles[fds->fd].fde_file;
1551 #ifdef CAPABILITIES
1552 			if (fp == NULL ||
1553 			    cap_check(cap_rights(fdp, fds->fd),
1554 			    cap_rights_init(&rights, CAP_EVENT)) != 0)
1555 #else
1556 			if (fp == NULL)
1557 #endif
1558 			{
1559 				fds->revents = POLLNVAL;
1560 				n++;
1561 			} else {
1562 				/*
1563 				 * Note: backend also returns POLLHUP and
1564 				 * POLLERR if appropriate.
1565 				 */
1566 				selfdalloc(td, fds);
1567 				fds->revents = fo_poll(fp, fds->events,
1568 				    td->td_ucred, td);
1569 				/*
1570 				 * POSIX requires POLLOUT to be never
1571 				 * set simultaneously with POLLHUP.
1572 				 */
1573 				if ((fds->revents & POLLHUP) != 0)
1574 					fds->revents &= ~POLLOUT;
1575 
1576 				if (fds->revents != 0)
1577 					n++;
1578 			}
1579 		}
1580 	}
1581 	FILEDESC_SUNLOCK(fdp);
1582 	td->td_retval[0] = n;
1583 	return (0);
1584 }
1585 
1586 /*
1587  * OpenBSD poll system call.
1588  *
1589  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1590  */
1591 #ifndef _SYS_SYSPROTO_H_
1592 struct openbsd_poll_args {
1593 	struct pollfd *fds;
1594 	u_int	nfds;
1595 	int	timeout;
1596 };
1597 #endif
1598 int
1599 sys_openbsd_poll(td, uap)
1600 	register struct thread *td;
1601 	register struct openbsd_poll_args *uap;
1602 {
1603 	return (sys_poll(td, (struct poll_args *)uap));
1604 }
1605 
1606 /*
1607  * XXX This was created specifically to support netncp and netsmb.  This
1608  * allows the caller to specify a socket to wait for events on.  It returns
1609  * 0 if any events matched and an error otherwise.  There is no way to
1610  * determine which events fired.
1611  */
int
selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
{
	struct timeval rtv;
	sbintime_t asbt, precision, rsbt;
	int error;

	precision = 0;	/* stupid gcc! */
	if (tvp != NULL) {
		rtv = *tvp;
		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
		    rtv.tv_usec >= 1000000)
			return (EINVAL);
		if (!timevalisset(&rtv))
			asbt = 0;	/* Zero timeout: do not block. */
		else if (rtv.tv_sec <= INT32_MAX) {
			rsbt = tvtosbt(rtv);
			precision = rsbt;
			precision >>= tc_precexp;
			if (TIMESEL(&asbt, rsbt))
				asbt += tc_tick_sbt;
			/* Guard the absolute deadline against overflow. */
			if (asbt <= SBT_MAX - rsbt)
				asbt += rsbt;
			else
				asbt = -1;	/* Overflow: no timeout. */
		} else
			asbt = -1;
	} else
		asbt = -1;	/* NULL tvp: wait indefinitely. */
	seltdinit(td);
	/*
	 * Iterate until the timeout expires or the socket becomes ready.
	 */
	for (;;) {
		/* Preallocate selfds for the selrecord() inside sopoll(). */
		selfdalloc(td, NULL);
		error = sopoll(so, events, NULL, td);
		/* error here is actually the ready events. */
		if (error)
			return (0);
		error = seltdwait(td, asbt, precision);
		if (error)
			break;
	}
	seltdclear(td);
	/* XXX Duplicates ncp/smb behavior. */
	if (error == ERESTART)
		error = 0;
	return (error);
}
1661 
1662 /*
1663  * Preallocate two selfds associated with 'cookie'.  Some fo_poll routines
1664  * have two select sets, one for read and another for write.
1665  */
1666 static void
1667 selfdalloc(struct thread *td, void *cookie)
1668 {
1669 	struct seltd *stp;
1670 
1671 	stp = td->td_sel;
1672 	if (stp->st_free1 == NULL)
1673 		stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
1674 	stp->st_free1->sf_td = stp;
1675 	stp->st_free1->sf_cookie = cookie;
1676 	if (stp->st_free2 == NULL)
1677 		stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
1678 	stp->st_free2->sf_td = stp;
1679 	stp->st_free2->sf_cookie = cookie;
1680 }
1681 
static void
selfdfree(struct seltd *stp, struct selfd *sfp)
{
	/* Unlink from the owning thread's list of registered selfds. */
	STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link);
	/*
	 * doselwakeup() clears sf_si (under sf_mtx) when the event fires,
	 * so re-check it under the lock before unlinking from the
	 * selinfo's thread list.
	 */
	if (sfp->sf_si != NULL) {
		mtx_lock(sfp->sf_mtx);
		if (sfp->sf_si != NULL)
			TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads);
		mtx_unlock(sfp->sf_mtx);
	}
	uma_zfree(selfd_zone, sfp);
}
1694 
/* Drain the waiters tied to all the selfd belonging to the specified selinfo. */
void
seldrain(struct selinfo *sip)
{

	/*
	 * This feature is already provided by doselwakeup(), thus it is
	 * enough to go for it.  Eventually, the context should take care
	 * to avoid races between threads calling select()/poll() and file
	 * descriptor detaching, but, again, the races are just the same
	 * as for selwakeup().
	 */
	doselwakeup(sip, -1);
}
1711 
1712 /*
1713  * Record a select request.
1714  */
1715 void
1716 selrecord(selector, sip)
1717 	struct thread *selector;
1718 	struct selinfo *sip;
1719 {
1720 	struct selfd *sfp;
1721 	struct seltd *stp;
1722 	struct mtx *mtxp;
1723 
1724 	stp = selector->td_sel;
1725 	/*
1726 	 * Don't record when doing a rescan.
1727 	 */
1728 	if (stp->st_flags & SELTD_RESCAN)
1729 		return;
1730 	/*
1731 	 * Grab one of the preallocated descriptors.
1732 	 */
1733 	sfp = NULL;
1734 	if ((sfp = stp->st_free1) != NULL)
1735 		stp->st_free1 = NULL;
1736 	else if ((sfp = stp->st_free2) != NULL)
1737 		stp->st_free2 = NULL;
1738 	else
1739 		panic("selrecord: No free selfd on selq");
1740 	mtxp = sip->si_mtx;
1741 	if (mtxp == NULL)
1742 		mtxp = mtx_pool_find(mtxpool_select, sip);
1743 	/*
1744 	 * Initialize the sfp and queue it in the thread.
1745 	 */
1746 	sfp->sf_si = sip;
1747 	sfp->sf_mtx = mtxp;
1748 	STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link);
1749 	/*
1750 	 * Now that we've locked the sip, check for initialization.
1751 	 */
1752 	mtx_lock(mtxp);
1753 	if (sip->si_mtx == NULL) {
1754 		sip->si_mtx = mtxp;
1755 		TAILQ_INIT(&sip->si_tdlist);
1756 	}
1757 	/*
1758 	 * Add this thread to the list of selfds listening on this selinfo.
1759 	 */
1760 	TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads);
1761 	mtx_unlock(sip->si_mtx);
1762 }
1763 
/* Wake up a selecting thread. */
void
selwakeup(struct selinfo *sip)
{

	doselwakeup(sip, -1);
}
1771 
/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(struct selinfo *sip, int pri)
{

	doselwakeup(sip, pri);
}
1780 
1781 /*
1782  * Do a wakeup when a selectable event occurs.
1783  */
1784 static void
1785 doselwakeup(sip, pri)
1786 	struct selinfo *sip;
1787 	int pri;
1788 {
1789 	struct selfd *sfp;
1790 	struct selfd *sfn;
1791 	struct seltd *stp;
1792 
1793 	/* If it's not initialized there can't be any waiters. */
1794 	if (sip->si_mtx == NULL)
1795 		return;
1796 	/*
1797 	 * Locking the selinfo locks all selfds associated with it.
1798 	 */
1799 	mtx_lock(sip->si_mtx);
1800 	TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) {
1801 		/*
1802 		 * Once we remove this sfp from the list and clear the
1803 		 * sf_si seltdclear will know to ignore this si.
1804 		 */
1805 		TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads);
1806 		sfp->sf_si = NULL;
1807 		stp = sfp->sf_td;
1808 		mtx_lock(&stp->st_mtx);
1809 		stp->st_flags |= SELTD_PENDING;
1810 		cv_broadcastpri(&stp->st_wait, pri);
1811 		mtx_unlock(&stp->st_mtx);
1812 	}
1813 	mtx_unlock(sip->si_mtx);
1814 }
1815 
1816 static void
1817 seltdinit(struct thread *td)
1818 {
1819 	struct seltd *stp;
1820 
1821 	if ((stp = td->td_sel) != NULL)
1822 		goto out;
1823 	td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO);
1824 	mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF);
1825 	cv_init(&stp->st_wait, "select");
1826 out:
1827 	stp->st_flags = 0;
1828 	STAILQ_INIT(&stp->st_selq);
1829 }
1830 
static int
seltdwait(struct thread *td, sbintime_t sbt, sbintime_t precision)
{
	struct seltd *stp;
	int error;

	stp = td->td_sel;
	/*
	 * An event of interest may occur while we do not hold the seltd
	 * locked so check the pending flag before we sleep.
	 */
	mtx_lock(&stp->st_mtx);
	/*
	 * Any further calls to selrecord will be a rescan.
	 */
	stp->st_flags |= SELTD_RESCAN;
	if (stp->st_flags & SELTD_PENDING) {
		mtx_unlock(&stp->st_mtx);
		return (0);
	}
	/* sbt == 0: don't block; sbt == -1: no deadline; else absolute. */
	if (sbt == 0)
		error = EWOULDBLOCK;
	else if (sbt != -1)
		error = cv_timedwait_sig_sbt(&stp->st_wait, &stp->st_mtx,
		    sbt, precision, C_ABSOLUTE);
	else
		error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
	mtx_unlock(&stp->st_mtx);

	return (error);
}
1862 
1863 void
1864 seltdfini(struct thread *td)
1865 {
1866 	struct seltd *stp;
1867 
1868 	stp = td->td_sel;
1869 	if (stp == NULL)
1870 		return;
1871 	if (stp->st_free1)
1872 		uma_zfree(selfd_zone, stp->st_free1);
1873 	if (stp->st_free2)
1874 		uma_zfree(selfd_zone, stp->st_free2);
1875 	td->td_sel = NULL;
1876 	free(stp, M_SELECT);
1877 }
1878 
1879 /*
1880  * Remove the references to the thread from all of the objects we were
1881  * polling.
1882  */
1883 static void
1884 seltdclear(struct thread *td)
1885 {
1886 	struct seltd *stp;
1887 	struct selfd *sfp;
1888 	struct selfd *sfn;
1889 
1890 	stp = td->td_sel;
1891 	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn)
1892 		selfdfree(stp, sfp);
1893 	stp->st_flags = 0;
1894 }
1895 
static void selectinit(void *);
SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL);
/* One-time boot setup: the selfd UMA zone and the selinfo mutex pool. */
static void
selectinit(void *dummy __unused)
{

	selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF);
}
1906