xref: /freebsd/sys/kern/sys_generic.c (revision e4e9813eb92cd7c4d4b819a8fbed5cbd3d92f5d8)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_compat.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/sysproto.h>
46 #include <sys/filedesc.h>
47 #include <sys/filio.h>
48 #include <sys/fcntl.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/signalvar.h>
52 #include <sys/socketvar.h>
53 #include <sys/uio.h>
54 #include <sys/kernel.h>
55 #include <sys/limits.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/resourcevar.h>
59 #include <sys/selinfo.h>
60 #include <sys/sleepqueue.h>
61 #include <sys/syscallsubr.h>
62 #include <sys/sysctl.h>
63 #include <sys/sysent.h>
64 #include <sys/vnode.h>
65 #include <sys/bio.h>
66 #include <sys/buf.h>
67 #include <sys/condvar.h>
68 #ifdef KTRACE
69 #include <sys/ktrace.h>
70 #endif
71 #include <vm/vm.h>
72 #include <vm/vm_page.h>
73 
74 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
75 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
76 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
77 
78 static int	pollscan(struct thread *, struct pollfd *, u_int);
79 static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
80 static int	dofileread(struct thread *, int, struct file *, struct uio *,
81 		    off_t, int);
82 static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
83 		    off_t, int);
84 static void	doselwakeup(struct selinfo *, int);
85 
86 /*
87  * Read system call.
88  */
89 #ifndef _SYS_SYSPROTO_H_
90 struct read_args {
91 	int	fd;
92 	void	*buf;
93 	size_t	nbyte;
94 };
95 #endif
96 /*
97  * MPSAFE
98  */
99 int
100 read(td, uap)
101 	struct thread *td;
102 	struct read_args *uap;
103 {
104 	struct uio auio;
105 	struct iovec aiov;
106 	int error;
107 
108 	if (uap->nbyte > INT_MAX)
109 		return (EINVAL);
110 	aiov.iov_base = uap->buf;
111 	aiov.iov_len = uap->nbyte;
112 	auio.uio_iov = &aiov;
113 	auio.uio_iovcnt = 1;
114 	auio.uio_resid = uap->nbyte;
115 	auio.uio_segflg = UIO_USERSPACE;
116 	error = kern_readv(td, uap->fd, &auio);
117 	return(error);
118 }
119 
120 /*
121  * Positioned read system call
122  */
123 #ifndef _SYS_SYSPROTO_H_
124 struct pread_args {
125 	int	fd;
126 	void	*buf;
127 	size_t	nbyte;
128 	int	pad;
129 	off_t	offset;
130 };
131 #endif
132 /*
133  * MPSAFE
134  */
135 int
136 pread(td, uap)
137 	struct thread *td;
138 	struct pread_args *uap;
139 {
140 	struct uio auio;
141 	struct iovec aiov;
142 	int error;
143 
144 	if (uap->nbyte > INT_MAX)
145 		return (EINVAL);
146 	aiov.iov_base = uap->buf;
147 	aiov.iov_len = uap->nbyte;
148 	auio.uio_iov = &aiov;
149 	auio.uio_iovcnt = 1;
150 	auio.uio_resid = uap->nbyte;
151 	auio.uio_segflg = UIO_USERSPACE;
152 	error = kern_preadv(td, uap->fd, &auio, uap->offset);
153 	return(error);
154 }
155 
156 /*
157  * Scatter read system call.
158  */
159 #ifndef _SYS_SYSPROTO_H_
160 struct readv_args {
161 	int	fd;
162 	struct	iovec *iovp;
163 	u_int	iovcnt;
164 };
165 #endif
166 /*
167  * MPSAFE
168  */
169 int
170 readv(struct thread *td, struct readv_args *uap)
171 {
172 	struct uio *auio;
173 	int error;
174 
175 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
176 	if (error)
177 		return (error);
178 	error = kern_readv(td, uap->fd, auio);
179 	free(auio, M_IOV);
180 	return (error);
181 }
182 
183 int
184 kern_readv(struct thread *td, int fd, struct uio *auio)
185 {
186 	struct file *fp;
187 	int error;
188 
189 	error = fget_read(td, fd, &fp);
190 	if (error)
191 		return (error);
192 	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
193 	fdrop(fp, td);
194 	return (error);
195 }
196 
197 /*
198  * Scatter positioned read system call.
199  */
200 #ifndef _SYS_SYSPROTO_H_
201 struct preadv_args {
202 	int	fd;
203 	struct	iovec *iovp;
204 	u_int	iovcnt;
205 	off_t	offset;
206 };
207 #endif
208 /*
209  * MPSAFE
210  */
211 int
212 preadv(struct thread *td, struct preadv_args *uap)
213 {
214 	struct uio *auio;
215 	int error;
216 
217 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
218 	if (error)
219 		return (error);
220 	error = kern_preadv(td, uap->fd, auio, uap->offset);
221 	free(auio, M_IOV);
222 	return (error);
223 }
224 
225 int
226 kern_preadv(td, fd, auio, offset)
227 	struct thread *td;
228 	int fd;
229 	struct uio *auio;
230 	off_t offset;
231 {
232 	struct file *fp;
233 	int error;
234 
235 	error = fget_read(td, fd, &fp);
236 	if (error)
237 		return (error);
238 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
239 		error = ESPIPE;
240 	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
241 		error = EINVAL;
242 	else
243 		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
244 	fdrop(fp, td);
245 	return (error);
246 }
247 
248 /*
249  * Common code for readv and preadv that reads data in
250  * from a file using the passed in uio, offset, and flags.
251  */
252 static int
253 dofileread(td, fd, fp, auio, offset, flags)
254 	struct thread *td;
255 	int fd;
256 	struct file *fp;
257 	struct uio *auio;
258 	off_t offset;
259 	int flags;
260 {
261 	ssize_t cnt;
262 	int error;
263 #ifdef KTRACE
264 	struct uio *ktruio = NULL;
265 #endif
266 
267 	/* Finish zero length reads right here */
268 	if (auio->uio_resid == 0) {
269 		td->td_retval[0] = 0;
270 		return(0);
271 	}
272 	auio->uio_rw = UIO_READ;
273 	auio->uio_offset = offset;
274 	auio->uio_td = td;
275 #ifdef KTRACE
276 	if (KTRPOINT(td, KTR_GENIO))
277 		ktruio = cloneuio(auio);
278 #endif
279 	cnt = auio->uio_resid;
280 	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
281 		if (auio->uio_resid != cnt && (error == ERESTART ||
282 		    error == EINTR || error == EWOULDBLOCK))
283 			error = 0;
284 	}
285 	cnt -= auio->uio_resid;
286 #ifdef KTRACE
287 	if (ktruio != NULL) {
288 		ktruio->uio_resid = cnt;
289 		ktrgenio(fd, UIO_READ, ktruio, error);
290 	}
291 #endif
292 	td->td_retval[0] = cnt;
293 	return (error);
294 }
295 
296 /*
297  * Write system call
298  */
299 #ifndef _SYS_SYSPROTO_H_
300 struct write_args {
301 	int	fd;
302 	const void *buf;
303 	size_t	nbyte;
304 };
305 #endif
306 /*
307  * MPSAFE
308  */
309 int
310 write(td, uap)
311 	struct thread *td;
312 	struct write_args *uap;
313 {
314 	struct uio auio;
315 	struct iovec aiov;
316 	int error;
317 
318 	if (uap->nbyte > INT_MAX)
319 		return (EINVAL);
320 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
321 	aiov.iov_len = uap->nbyte;
322 	auio.uio_iov = &aiov;
323 	auio.uio_iovcnt = 1;
324 	auio.uio_resid = uap->nbyte;
325 	auio.uio_segflg = UIO_USERSPACE;
326 	error = kern_writev(td, uap->fd, &auio);
327 	return(error);
328 }
329 
330 /*
331  * Positioned write system call
332  */
333 #ifndef _SYS_SYSPROTO_H_
334 struct pwrite_args {
335 	int	fd;
336 	const void *buf;
337 	size_t	nbyte;
338 	int	pad;
339 	off_t	offset;
340 };
341 #endif
342 /*
343  * MPSAFE
344  */
345 int
346 pwrite(td, uap)
347 	struct thread *td;
348 	struct pwrite_args *uap;
349 {
350 	struct uio auio;
351 	struct iovec aiov;
352 	int error;
353 
354 	if (uap->nbyte > INT_MAX)
355 		return (EINVAL);
356 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
357 	aiov.iov_len = uap->nbyte;
358 	auio.uio_iov = &aiov;
359 	auio.uio_iovcnt = 1;
360 	auio.uio_resid = uap->nbyte;
361 	auio.uio_segflg = UIO_USERSPACE;
362 	error = kern_pwritev(td, uap->fd, &auio, uap->offset);
363 	return(error);
364 }
365 
366 /*
367  * Gather write system call
368  */
369 #ifndef _SYS_SYSPROTO_H_
370 struct writev_args {
371 	int	fd;
372 	struct	iovec *iovp;
373 	u_int	iovcnt;
374 };
375 #endif
376 /*
377  * MPSAFE
378  */
379 int
380 writev(struct thread *td, struct writev_args *uap)
381 {
382 	struct uio *auio;
383 	int error;
384 
385 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
386 	if (error)
387 		return (error);
388 	error = kern_writev(td, uap->fd, auio);
389 	free(auio, M_IOV);
390 	return (error);
391 }
392 
393 int
394 kern_writev(struct thread *td, int fd, struct uio *auio)
395 {
396 	struct file *fp;
397 	int error;
398 
399 	error = fget_write(td, fd, &fp);
400 	if (error)
401 		return (error);
402 	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
403 	fdrop(fp, td);
404 	return (error);
405 }
406 
407 /*
408  * Gather positioned write system call
409  */
410 #ifndef _SYS_SYSPROTO_H_
411 struct pwritev_args {
412 	int	fd;
413 	struct	iovec *iovp;
414 	u_int	iovcnt;
415 	off_t	offset;
416 };
417 #endif
418 /*
419  * MPSAFE
420  */
421 int
422 pwritev(struct thread *td, struct pwritev_args *uap)
423 {
424 	struct uio *auio;
425 	int error;
426 
427 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
428 	if (error)
429 		return (error);
430 	error = kern_pwritev(td, uap->fd, auio, uap->offset);
431 	free(auio, M_IOV);
432 	return (error);
433 }
434 
435 int
436 kern_pwritev(td, fd, auio, offset)
437 	struct thread *td;
438 	struct uio *auio;
439 	int fd;
440 	off_t offset;
441 {
442 	struct file *fp;
443 	int error;
444 
445 	error = fget_write(td, fd, &fp);
446 	if (error)
447 		return (error);
448 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
449 		error = ESPIPE;
450 	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
451 		error = EINVAL;
452 	else
453 		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
454 	fdrop(fp, td);
455 	return (error);
456 }
457 
458 /*
459  * Common code for writev and pwritev that writes data to
460  * a file using the passed in uio, offset, and flags.
461  */
462 static int
463 dofilewrite(td, fd, fp, auio, offset, flags)
464 	struct thread *td;
465 	int fd;
466 	struct file *fp;
467 	struct uio *auio;
468 	off_t offset;
469 	int flags;
470 {
471 	ssize_t cnt;
472 	int error;
473 #ifdef KTRACE
474 	struct uio *ktruio = NULL;
475 #endif
476 
477 	auio->uio_rw = UIO_WRITE;
478 	auio->uio_td = td;
479 	auio->uio_offset = offset;
480 #ifdef KTRACE
481 	if (KTRPOINT(td, KTR_GENIO))
482 		ktruio = cloneuio(auio);
483 #endif
484 	cnt = auio->uio_resid;
485 	if (fp->f_type == DTYPE_VNODE)
486 		bwillwrite();
487 	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
488 		if (auio->uio_resid != cnt && (error == ERESTART ||
489 		    error == EINTR || error == EWOULDBLOCK))
490 			error = 0;
491 		/* Socket layer is responsible for issuing SIGPIPE. */
492 		if (error == EPIPE) {
493 			PROC_LOCK(td->td_proc);
494 			psignal(td->td_proc, SIGPIPE);
495 			PROC_UNLOCK(td->td_proc);
496 		}
497 	}
498 	cnt -= auio->uio_resid;
499 #ifdef KTRACE
500 	if (ktruio != NULL) {
501 		ktruio->uio_resid = cnt;
502 		ktrgenio(fd, UIO_WRITE, ktruio, error);
503 	}
504 #endif
505 	td->td_retval[0] = cnt;
506 	return (error);
507 }
508 
509 /*
510  * Ioctl system call
511  */
512 #ifndef _SYS_SYSPROTO_H_
513 struct ioctl_args {
514 	int	fd;
515 	u_long	com;
516 	caddr_t	data;
517 };
518 #endif
519 /*
520  * MPSAFE
521  */
522 /* ARGSUSED */
int
ioctl(struct thread *td, struct ioctl_args *uap)
{
	u_long com;
	int error;
	u_int size;
	caddr_t data, memp;

	/*
	 * Some ABIs sign-extend the 32-bit ioctl command when it is
	 * widened to u_long; warn once and truncate back to 32 bits.
	 */
	if (uap->com > 0xffffffff) {
		printf(
		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
		    td->td_proc->p_pid, td->td_proc->p_comm, uap->com);
		uap->com &= 0xffffffff;
	}
	com = uap->com;

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID  | IOC_IN | IOC_OUT)) == 0) ||
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
	    ((com & IOC_OUT) && size == 0) ||
#else
	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
#endif
	    ((com & IOC_VOID) && size > 0))
		return (ENOTTY);

	if (size > 0) {
		/* Kernel bounce buffer for the in/out ioctl payload. */
		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		memp = NULL;
		/*
		 * No payload: pass the address of the user-supplied
		 * pointer word itself, so IOC_VOID commands can carry
		 * a small integer argument in-band.
		 */
		data = (void *)&uap->data;
	}
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error) {
			free(memp, M_IOCTLOPS);
			return (error);
		}
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	}

	error = kern_ioctl(td, uap->fd, com, data);

	/* Copy the (possibly updated) payload back out on success. */
	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

	if (memp != NULL)
		free(memp, M_IOCTLOPS);
	return (error);
}
584 
/*
 * Worker for ioctl(); "data" points at the kernel copy of the argument
 * (or at the raw argument word for IOC_VOID commands).  Generic
 * descriptor-level commands are handled here; everything else is
 * passed down to the file's fo_ioctl method.
 */
int
kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
{
	struct file *fp;
	struct filedesc *fdp;
	int error;
	int tmp;

	if ((error = fget(td, fd, &fp)) != 0)
		return (error);
	/* The descriptor must be open for reading or writing. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	switch (com) {
	case FIONCLEX:
		/* Close-on-exec lives in the descriptor table, not the
		 * file object, so there is nothing for fo_ioctl to do. */
		FILEDESC_LOCK_FAST(fdp);
		fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK_FAST(fdp);
		goto out;
	case FIOCLEX:
		FILEDESC_LOCK_FAST(fdp);
		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK_FAST(fdp);
		goto out;
	case FIONBIO:
		/*
		 * Update f_flag here, then fall through so the object
		 * also sees the ioctl and can adjust its own state.
		 */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
		break;
	case FIOASYNC:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
		break;
	}

	error = fo_ioctl(fp, com, data, td->td_ucred, td);
out:
	fdrop(fp, td);
	return (error);
}
636 
637 /*
638  * sellock and selwait are initialized in selectinit() via SYSINIT.
639  */
640 struct mtx	sellock;
641 struct cv	selwait;
642 u_int		nselcoll;	/* Select collisions since boot */
643 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
644 
645 /*
646  * Select system call.
647  */
648 #ifndef _SYS_SYSPROTO_H_
649 struct select_args {
650 	int	nd;
651 	fd_set	*in, *ou, *ex;
652 	struct	timeval *tv;
653 };
654 #endif
655 /*
656  * MPSAFE
657  */
658 int
659 select(td, uap)
660 	register struct thread *td;
661 	register struct select_args *uap;
662 {
663 	struct timeval tv, *tvp;
664 	int error;
665 
666 	if (uap->tv != NULL) {
667 		error = copyin(uap->tv, &tv, sizeof(tv));
668 		if (error)
669 			return (error);
670 		tvp = &tv;
671 	} else
672 		tvp = NULL;
673 
674 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
675 }
676 
/*
 * Worker for select(): copies in the descriptor sets, scans them, and
 * sleeps on selwait until a descriptor is ready, the timeout expires,
 * or a signal arrives.  Ready bits are copied back to the user sets.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;

	FILEDESC_LOCK_FAST(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK_FAST(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	/* Each non-null set needs an input copy plus an output copy. */
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/* Convert the relative timeout to an absolute uptime deadline. */
	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision counter so missed wakeups are detected. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Sleep at most 24 hours per pass; the loop re-checks. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	/* A plain wakeup (no error, no timeout, no signal) means rescan. */
	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}
834 
/*
 * Poll every descriptor named in ibits and set the corresponding bit
 * in obits for each ready one.  Returns 0 with the ready count in
 * td->td_retval[0], or EBADF if a selected descriptor is not open.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	/* msk indexes the read, write and except sets in turn. */
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
876 
877 /*
878  * Poll system call.
879  */
880 #ifndef _SYS_SYSPROTO_H_
881 struct poll_args {
882 	struct pollfd *fds;
883 	u_int	nfds;
884 	int	timeout;
885 };
886 #endif
887 /*
888  * MPSAFE
889  */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	struct pollfd *bits;
	struct pollfd smallbits[32];	/* stack buffer avoids malloc for small nfds */
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	/* Convert the millisecond timeout to an absolute uptime deadline. */
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision counter so missed wakeups are detected. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Sleep at most 24 hours per pass; the loop re-checks. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	/* A plain wakeup (no error, no timeout, no signal) means rescan. */
	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* Copy the updated revents fields back to the user. */
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	return (error);
}
1007 
1008 static int
1009 pollscan(td, fds, nfd)
1010 	struct thread *td;
1011 	struct pollfd *fds;
1012 	u_int nfd;
1013 {
1014 	register struct filedesc *fdp = td->td_proc->p_fd;
1015 	int i;
1016 	struct file *fp;
1017 	int n = 0;
1018 
1019 	FILEDESC_LOCK(fdp);
1020 	for (i = 0; i < nfd; i++, fds++) {
1021 		if (fds->fd >= fdp->fd_nfiles) {
1022 			fds->revents = POLLNVAL;
1023 			n++;
1024 		} else if (fds->fd < 0) {
1025 			fds->revents = 0;
1026 		} else {
1027 			fp = fdp->fd_ofiles[fds->fd];
1028 			if (fp == NULL) {
1029 				fds->revents = POLLNVAL;
1030 				n++;
1031 			} else {
1032 				/*
1033 				 * Note: backend also returns POLLHUP and
1034 				 * POLLERR if appropriate.
1035 				 */
1036 				fds->revents = fo_poll(fp, fds->events,
1037 				    td->td_ucred, td);
1038 				if (fds->revents != 0)
1039 					n++;
1040 			}
1041 		}
1042 	}
1043 	FILEDESC_UNLOCK(fdp);
1044 	td->td_retval[0] = n;
1045 	return (0);
1046 }
1047 
1048 /*
1049  * OpenBSD poll system call.
1050  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1051  */
1052 #ifndef _SYS_SYSPROTO_H_
1053 struct openbsd_poll_args {
1054 	struct pollfd *fds;
1055 	u_int	nfds;
1056 	int	timeout;
1057 };
1058 #endif
1059 /*
1060  * MPSAFE
1061  */
int
openbsd_poll(struct thread *td, struct openbsd_poll_args *uap)
{

	/* The argument layouts are identical; defer to poll(). */
	return (poll(td, (struct poll_args *)uap));
}
1069 
1070 /*
1071  * Remove the references to the thread from all of the objects
1072  * we were polling.
1073  *
1074  * This code assumes that the underlying owner of the selinfo
1075  * structure will hold sellock before it changes it, and that
1076  * it will unlink itself from our list if it goes away.
1077  */
1078 void
1079 clear_selinfo_list(td)
1080 	struct thread *td;
1081 {
1082 	struct selinfo *si;
1083 
1084 	mtx_assert(&sellock, MA_OWNED);
1085 	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1086 		si->si_thread = NULL;
1087 	TAILQ_INIT(&td->td_selq);
1088 }
1089 
1090 /*
1091  * Record a select request.
1092  */
1093 void
1094 selrecord(selector, sip)
1095 	struct thread *selector;
1096 	struct selinfo *sip;
1097 {
1098 
1099 	mtx_lock(&sellock);
1100 	/*
1101 	 * If the selinfo's thread pointer is NULL then take ownership of it.
1102 	 *
1103 	 * If the thread pointer is not NULL and it points to another
1104 	 * thread, then we have a collision.
1105 	 *
1106 	 * If the thread pointer is not NULL and points back to us then leave
1107 	 * it alone as we've already added pointed it at us and added it to
1108 	 * our list.
1109 	 */
1110 	if (sip->si_thread == NULL) {
1111 		sip->si_thread = selector;
1112 		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1113 	} else if (sip->si_thread != selector) {
1114 		sip->si_flags |= SI_COLL;
1115 	}
1116 
1117 	mtx_unlock(&sellock);
1118 }
1119 
1120 /* Wake up a selecting thread. */
void
selwakeup(struct selinfo *sip)
{

	/* Priority -1: do not adjust the woken thread's priority. */
	doselwakeup(sip, -1);
}
1127 
1128 /* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(struct selinfo *sip, int pri)
{

	doselwakeup(sip, pri);
}
1136 
1137 /*
1138  * Do a wakeup when a selectable event occurs.
1139  */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;	/* selinfo whose event fired */
	int pri;		/* priority for the wakeup, or -1 for none */
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		/*
		 * Several threads were interested in this selinfo;
		 * broadcast so that every selector rescans its
		 * descriptors, and account the collision.
		 */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		/* Nobody currently owns this selinfo; nothing to wake. */
		mtx_unlock(&sellock);
		return;
	}
	/* Detach the selinfo from the owning thread's select queue. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	/* Kick the thread out of its cv_wait on selwait, if sleeping. */
	sleepq_remove(td, &selwait);
	mtx_unlock(&sellock);
}
1166 
1167 static void selectinit(void *);
1168 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1169 
1170 /* ARGSUSED*/
1171 static void
1172 selectinit(dummy)
1173 	void *dummy;
1174 {
1175 	cv_init(&selwait, "select");
1176 	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1177 }
1178