xref: /freebsd/sys/kern/sys_generic.c (revision 7dfd9569a2f0637fb9a48157b1c1bfe5709faee3)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_compat.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/sysproto.h>
46 #include <sys/filedesc.h>
47 #include <sys/filio.h>
48 #include <sys/fcntl.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/signalvar.h>
52 #include <sys/socketvar.h>
53 #include <sys/uio.h>
54 #include <sys/kernel.h>
55 #include <sys/limits.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/resourcevar.h>
59 #include <sys/selinfo.h>
60 #include <sys/sleepqueue.h>
61 #include <sys/syscallsubr.h>
62 #include <sys/sysctl.h>
63 #include <sys/sysent.h>
64 #include <sys/vnode.h>
65 #include <sys/bio.h>
66 #include <sys/buf.h>
67 #include <sys/condvar.h>
68 #ifdef KTRACE
69 #include <sys/ktrace.h>
70 #endif
71 #include <vm/vm.h>
72 #include <vm/vm_page.h>
73 
74 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
75 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
76 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
77 
78 static int	pollscan(struct thread *, struct pollfd *, u_int);
79 static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
80 static int	dofileread(struct thread *, int, struct file *, struct uio *,
81 		    off_t, int);
82 static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
83 		    off_t, int);
84 static void	doselwakeup(struct selinfo *, int);
85 
86 /*
87  * Read system call.
88  */
89 #ifndef _SYS_SYSPROTO_H_
90 struct read_args {
91 	int	fd;
92 	void	*buf;
93 	size_t	nbyte;
94 };
95 #endif
96 /*
97  * MPSAFE
98  */
99 int
100 read(td, uap)
101 	struct thread *td;
102 	struct read_args *uap;
103 {
104 	struct uio auio;
105 	struct iovec aiov;
106 	int error;
107 
108 	if (uap->nbyte > INT_MAX)
109 		return (EINVAL);
110 	aiov.iov_base = uap->buf;
111 	aiov.iov_len = uap->nbyte;
112 	auio.uio_iov = &aiov;
113 	auio.uio_iovcnt = 1;
114 	auio.uio_resid = uap->nbyte;
115 	auio.uio_segflg = UIO_USERSPACE;
116 	error = kern_readv(td, uap->fd, &auio);
117 	return(error);
118 }
119 
120 /*
121  * Positioned read system call
122  */
123 #ifndef _SYS_SYSPROTO_H_
124 struct pread_args {
125 	int	fd;
126 	void	*buf;
127 	size_t	nbyte;
128 	int	pad;
129 	off_t	offset;
130 };
131 #endif
132 /*
133  * MPSAFE
134  */
135 int
136 pread(td, uap)
137 	struct thread *td;
138 	struct pread_args *uap;
139 {
140 	struct uio auio;
141 	struct iovec aiov;
142 	int error;
143 
144 	if (uap->nbyte > INT_MAX)
145 		return (EINVAL);
146 	aiov.iov_base = uap->buf;
147 	aiov.iov_len = uap->nbyte;
148 	auio.uio_iov = &aiov;
149 	auio.uio_iovcnt = 1;
150 	auio.uio_resid = uap->nbyte;
151 	auio.uio_segflg = UIO_USERSPACE;
152 	error = kern_preadv(td, uap->fd, &auio, uap->offset);
153 	return(error);
154 }
155 
156 /*
157  * Scatter read system call.
158  */
159 #ifndef _SYS_SYSPROTO_H_
160 struct readv_args {
161 	int	fd;
162 	struct	iovec *iovp;
163 	u_int	iovcnt;
164 };
165 #endif
166 /*
167  * MPSAFE
168  */
169 int
170 readv(struct thread *td, struct readv_args *uap)
171 {
172 	struct uio *auio;
173 	int error;
174 
175 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
176 	if (error)
177 		return (error);
178 	error = kern_readv(td, uap->fd, auio);
179 	free(auio, M_IOV);
180 	return (error);
181 }
182 
183 int
184 kern_readv(struct thread *td, int fd, struct uio *auio)
185 {
186 	struct file *fp;
187 	int error;
188 
189 	error = fget_read(td, fd, &fp);
190 	if (error)
191 		return (error);
192 	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
193 	fdrop(fp, td);
194 	return (error);
195 }
196 
197 /*
198  * Scatter positioned read system call.
199  */
200 #ifndef _SYS_SYSPROTO_H_
201 struct preadv_args {
202 	int	fd;
203 	struct	iovec *iovp;
204 	u_int	iovcnt;
205 	off_t	offset;
206 };
207 #endif
208 /*
209  * MPSAFE
210  */
211 int
212 preadv(struct thread *td, struct preadv_args *uap)
213 {
214 	struct uio *auio;
215 	int error;
216 
217 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
218 	if (error)
219 		return (error);
220 	error = kern_preadv(td, uap->fd, auio, uap->offset);
221 	free(auio, M_IOV);
222 	return (error);
223 }
224 
225 int
226 kern_preadv(td, fd, auio, offset)
227 	struct thread *td;
228 	int fd;
229 	struct uio *auio;
230 	off_t offset;
231 {
232 	struct file *fp;
233 	int error;
234 
235 	error = fget_read(td, fd, &fp);
236 	if (error)
237 		return (error);
238 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
239 		error = ESPIPE;
240 	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
241 		error = EINVAL;
242 	else
243 		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
244 	fdrop(fp, td);
245 	return (error);
246 }
247 
248 /*
249  * Common code for readv and preadv that reads data in
250  * from a file using the passed in uio, offset, and flags.
251  */
252 static int
253 dofileread(td, fd, fp, auio, offset, flags)
254 	struct thread *td;
255 	int fd;
256 	struct file *fp;
257 	struct uio *auio;
258 	off_t offset;
259 	int flags;
260 {
261 	ssize_t cnt;
262 	int error;
263 #ifdef KTRACE
264 	struct uio *ktruio = NULL;
265 #endif
266 
267 	/* Finish zero length reads right here */
268 	if (auio->uio_resid == 0) {
269 		td->td_retval[0] = 0;
270 		return(0);
271 	}
272 	auio->uio_rw = UIO_READ;
273 	auio->uio_offset = offset;
274 	auio->uio_td = td;
275 #ifdef KTRACE
276 	if (KTRPOINT(td, KTR_GENIO))
277 		ktruio = cloneuio(auio);
278 #endif
279 	cnt = auio->uio_resid;
280 	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
281 		if (auio->uio_resid != cnt && (error == ERESTART ||
282 		    error == EINTR || error == EWOULDBLOCK))
283 			error = 0;
284 	}
285 	cnt -= auio->uio_resid;
286 #ifdef KTRACE
287 	if (ktruio != NULL) {
288 		ktruio->uio_resid = cnt;
289 		ktrgenio(fd, UIO_READ, ktruio, error);
290 	}
291 #endif
292 	td->td_retval[0] = cnt;
293 	return (error);
294 }
295 
296 /*
297  * Write system call
298  */
299 #ifndef _SYS_SYSPROTO_H_
300 struct write_args {
301 	int	fd;
302 	const void *buf;
303 	size_t	nbyte;
304 };
305 #endif
306 /*
307  * MPSAFE
308  */
309 int
310 write(td, uap)
311 	struct thread *td;
312 	struct write_args *uap;
313 {
314 	struct uio auio;
315 	struct iovec aiov;
316 	int error;
317 
318 	if (uap->nbyte > INT_MAX)
319 		return (EINVAL);
320 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
321 	aiov.iov_len = uap->nbyte;
322 	auio.uio_iov = &aiov;
323 	auio.uio_iovcnt = 1;
324 	auio.uio_resid = uap->nbyte;
325 	auio.uio_segflg = UIO_USERSPACE;
326 	error = kern_writev(td, uap->fd, &auio);
327 	return(error);
328 }
329 
330 /*
331  * Positioned write system call
332  */
333 #ifndef _SYS_SYSPROTO_H_
334 struct pwrite_args {
335 	int	fd;
336 	const void *buf;
337 	size_t	nbyte;
338 	int	pad;
339 	off_t	offset;
340 };
341 #endif
342 /*
343  * MPSAFE
344  */
345 int
346 pwrite(td, uap)
347 	struct thread *td;
348 	struct pwrite_args *uap;
349 {
350 	struct uio auio;
351 	struct iovec aiov;
352 	int error;
353 
354 	if (uap->nbyte > INT_MAX)
355 		return (EINVAL);
356 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
357 	aiov.iov_len = uap->nbyte;
358 	auio.uio_iov = &aiov;
359 	auio.uio_iovcnt = 1;
360 	auio.uio_resid = uap->nbyte;
361 	auio.uio_segflg = UIO_USERSPACE;
362 	error = kern_pwritev(td, uap->fd, &auio, uap->offset);
363 	return(error);
364 }
365 
366 /*
367  * Gather write system call
368  */
369 #ifndef _SYS_SYSPROTO_H_
370 struct writev_args {
371 	int	fd;
372 	struct	iovec *iovp;
373 	u_int	iovcnt;
374 };
375 #endif
376 /*
377  * MPSAFE
378  */
379 int
380 writev(struct thread *td, struct writev_args *uap)
381 {
382 	struct uio *auio;
383 	int error;
384 
385 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
386 	if (error)
387 		return (error);
388 	error = kern_writev(td, uap->fd, auio);
389 	free(auio, M_IOV);
390 	return (error);
391 }
392 
393 int
394 kern_writev(struct thread *td, int fd, struct uio *auio)
395 {
396 	struct file *fp;
397 	int error;
398 
399 	error = fget_write(td, fd, &fp);
400 	if (error)
401 		return (error);
402 	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
403 	fdrop(fp, td);
404 	return (error);
405 }
406 
407 /*
408  * Gather positioned write system call
409  */
410 #ifndef _SYS_SYSPROTO_H_
411 struct pwritev_args {
412 	int	fd;
413 	struct	iovec *iovp;
414 	u_int	iovcnt;
415 	off_t	offset;
416 };
417 #endif
418 /*
419  * MPSAFE
420  */
421 int
422 pwritev(struct thread *td, struct pwritev_args *uap)
423 {
424 	struct uio *auio;
425 	int error;
426 
427 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
428 	if (error)
429 		return (error);
430 	error = kern_pwritev(td, uap->fd, auio, uap->offset);
431 	free(auio, M_IOV);
432 	return (error);
433 }
434 
435 int
436 kern_pwritev(td, fd, auio, offset)
437 	struct thread *td;
438 	struct uio *auio;
439 	int fd;
440 	off_t offset;
441 {
442 	struct file *fp;
443 	int error;
444 
445 	error = fget_write(td, fd, &fp);
446 	if (error)
447 		return (error);
448 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
449 		error = ESPIPE;
450 	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
451 		error = EINVAL;
452 	else
453 		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
454 	fdrop(fp, td);
455 	return (error);
456 }
457 
458 /*
459  * Common code for writev and pwritev that writes data to
460  * a file using the passed in uio, offset, and flags.
461  */
462 static int
463 dofilewrite(td, fd, fp, auio, offset, flags)
464 	struct thread *td;
465 	int fd;
466 	struct file *fp;
467 	struct uio *auio;
468 	off_t offset;
469 	int flags;
470 {
471 	ssize_t cnt;
472 	int error;
473 #ifdef KTRACE
474 	struct uio *ktruio = NULL;
475 #endif
476 
477 	auio->uio_rw = UIO_WRITE;
478 	auio->uio_td = td;
479 	auio->uio_offset = offset;
480 #ifdef KTRACE
481 	if (KTRPOINT(td, KTR_GENIO))
482 		ktruio = cloneuio(auio);
483 #endif
484 	cnt = auio->uio_resid;
485 	if (fp->f_type == DTYPE_VNODE)
486 		bwillwrite();
487 	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
488 		if (auio->uio_resid != cnt && (error == ERESTART ||
489 		    error == EINTR || error == EWOULDBLOCK))
490 			error = 0;
491 		/* Socket layer is responsible for issuing SIGPIPE. */
492 		if (error == EPIPE) {
493 			PROC_LOCK(td->td_proc);
494 			psignal(td->td_proc, SIGPIPE);
495 			PROC_UNLOCK(td->td_proc);
496 		}
497 	}
498 	cnt -= auio->uio_resid;
499 #ifdef KTRACE
500 	if (ktruio != NULL) {
501 		ktruio->uio_resid = cnt;
502 		ktrgenio(fd, UIO_WRITE, ktruio, error);
503 	}
504 #endif
505 	td->td_retval[0] = cnt;
506 	return (error);
507 }
508 
509 /*
510  * Ioctl system call
511  */
512 #ifndef _SYS_SYSPROTO_H_
513 struct ioctl_args {
514 	int	fd;
515 	u_long	com;
516 	caddr_t	data;
517 };
518 #endif
519 /*
520  * MPSAFE
521  */
522 /* ARGSUSED */
523 int
524 ioctl(struct thread *td, struct ioctl_args *uap)
525 {
526 	struct file *fp;
527 	struct filedesc *fdp;
528 	u_long com;
529 	int error = 0;
530 	u_int size;
531 	caddr_t data, memp;
532 	int tmp;
533 
534 	if (uap->com > 0xffffffff) {
535 		printf(
536 		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
537 		    td->td_proc->p_pid, td->td_proc->p_comm, uap->com);
538 		uap->com &= 0xffffffff;
539 	}
540 	if ((error = fget(td, uap->fd, &fp)) != 0)
541 		return (error);
542 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
543 		fdrop(fp, td);
544 		return (EBADF);
545 	}
546 	fdp = td->td_proc->p_fd;
547 	switch (com = uap->com) {
548 	case FIONCLEX:
549 		FILEDESC_LOCK_FAST(fdp);
550 		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
551 		FILEDESC_UNLOCK_FAST(fdp);
552 		fdrop(fp, td);
553 		return (0);
554 	case FIOCLEX:
555 		FILEDESC_LOCK_FAST(fdp);
556 		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
557 		FILEDESC_UNLOCK_FAST(fdp);
558 		fdrop(fp, td);
559 		return (0);
560 	}
561 
562 	/*
563 	 * Interpret high order word to find amount of data to be
564 	 * copied to/from the user's address space.
565 	 */
566 	size = IOCPARM_LEN(com);
567 	if ((size > IOCPARM_MAX) ||
568 	    ((com & (IOC_VOID  | IOC_IN | IOC_OUT)) == 0) ||
569 #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
570 	    ((com & IOC_OUT) && size == 0) ||
571 #else
572 	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
573 #endif
574 	    ((com & IOC_VOID) && size > 0)) {
575 		fdrop(fp, td);
576 		return (ENOTTY);
577 	}
578 
579 	if (size > 0) {
580 		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
581 		data = memp;
582 	} else {
583 		memp = NULL;
584 		data = (void *)&uap->data;
585 	}
586 	if (com & IOC_IN) {
587 		error = copyin(uap->data, data, (u_int)size);
588 		if (error) {
589 			free(memp, M_IOCTLOPS);
590 			fdrop(fp, td);
591 			return (error);
592 		}
593 	} else if (com & IOC_OUT) {
594 		/*
595 		 * Zero the buffer so the user always
596 		 * gets back something deterministic.
597 		 */
598 		bzero(data, size);
599 	}
600 
601 	if (com == FIONBIO) {
602 		FILE_LOCK(fp);
603 		if ((tmp = *(int *)data))
604 			fp->f_flag |= FNONBLOCK;
605 		else
606 			fp->f_flag &= ~FNONBLOCK;
607 		FILE_UNLOCK(fp);
608 		data = (void *)&tmp;
609 	} else if (com == FIOASYNC) {
610 		FILE_LOCK(fp);
611 		if ((tmp = *(int *)data))
612 			fp->f_flag |= FASYNC;
613 		else
614 			fp->f_flag &= ~FASYNC;
615 		FILE_UNLOCK(fp);
616 		data = (void *)&tmp;
617 	}
618 
619 	error = fo_ioctl(fp, com, data, td->td_ucred, td);
620 
621 	if (error == 0 && (com & IOC_OUT))
622 		error = copyout(data, uap->data, (u_int)size);
623 
624 	if (memp != NULL)
625 		free(memp, M_IOCTLOPS);
626 	fdrop(fp, td);
627 	return (error);
628 }
629 
630 /*
631  * sellock and selwait are initialized in selectinit() via SYSINIT.
632  */
633 struct mtx	sellock;
634 struct cv	selwait;
635 u_int		nselcoll;	/* Select collisions since boot */
636 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
637 
638 /*
639  * Select system call.
640  */
641 #ifndef _SYS_SYSPROTO_H_
642 struct select_args {
643 	int	nd;
644 	fd_set	*in, *ou, *ex;
645 	struct	timeval *tv;
646 };
647 #endif
648 /*
649  * MPSAFE
650  */
651 int
652 select(td, uap)
653 	register struct thread *td;
654 	register struct select_args *uap;
655 {
656 	struct timeval tv, *tvp;
657 	int error;
658 
659 	if (uap->tv != NULL) {
660 		error = copyin(uap->tv, &tv, sizeof(tv));
661 		if (error)
662 			return (error);
663 		tvp = &tv;
664 	} else
665 		tvp = NULL;
666 
667 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
668 }
669 
670 int
671 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
672     fd_set *fd_ex, struct timeval *tvp)
673 {
674 	struct filedesc *fdp;
675 	/*
676 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
677 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
678 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
679 	 * of 256.
680 	 */
681 	fd_mask s_selbits[howmany(2048, NFDBITS)];
682 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
683 	struct timeval atv, rtv, ttv;
684 	int error, timo;
685 	u_int ncoll, nbufbytes, ncpbytes, nfdbits;
686 
687 	if (nd < 0)
688 		return (EINVAL);
689 	fdp = td->td_proc->p_fd;
690 
691 	FILEDESC_LOCK_FAST(fdp);
692 
693 	if (nd > td->td_proc->p_fd->fd_nfiles)
694 		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
695 	FILEDESC_UNLOCK_FAST(fdp);
696 
697 	/*
698 	 * Allocate just enough bits for the non-null fd_sets.  Use the
699 	 * preallocated auto buffer if possible.
700 	 */
701 	nfdbits = roundup(nd, NFDBITS);
702 	ncpbytes = nfdbits / NBBY;
703 	nbufbytes = 0;
704 	if (fd_in != NULL)
705 		nbufbytes += 2 * ncpbytes;
706 	if (fd_ou != NULL)
707 		nbufbytes += 2 * ncpbytes;
708 	if (fd_ex != NULL)
709 		nbufbytes += 2 * ncpbytes;
710 	if (nbufbytes <= sizeof s_selbits)
711 		selbits = &s_selbits[0];
712 	else
713 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
714 
715 	/*
716 	 * Assign pointers into the bit buffers and fetch the input bits.
717 	 * Put the output buffers together so that they can be bzeroed
718 	 * together.
719 	 */
720 	sbp = selbits;
721 #define	getbits(name, x) \
722 	do {								\
723 		if (name == NULL)					\
724 			ibits[x] = NULL;				\
725 		else {							\
726 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
727 			obits[x] = sbp;					\
728 			sbp += ncpbytes / sizeof *sbp;			\
729 			error = copyin(name, ibits[x], ncpbytes);	\
730 			if (error != 0)					\
731 				goto done_nosellock;			\
732 		}							\
733 	} while (0)
734 	getbits(fd_in, 0);
735 	getbits(fd_ou, 1);
736 	getbits(fd_ex, 2);
737 #undef	getbits
738 	if (nbufbytes != 0)
739 		bzero(selbits, nbufbytes / 2);
740 
741 	if (tvp != NULL) {
742 		atv = *tvp;
743 		if (itimerfix(&atv)) {
744 			error = EINVAL;
745 			goto done_nosellock;
746 		}
747 		getmicrouptime(&rtv);
748 		timevaladd(&atv, &rtv);
749 	} else {
750 		atv.tv_sec = 0;
751 		atv.tv_usec = 0;
752 	}
753 	timo = 0;
754 	TAILQ_INIT(&td->td_selq);
755 	mtx_lock(&sellock);
756 retry:
757 	ncoll = nselcoll;
758 	mtx_lock_spin(&sched_lock);
759 	td->td_flags |= TDF_SELECT;
760 	mtx_unlock_spin(&sched_lock);
761 	mtx_unlock(&sellock);
762 
763 	error = selscan(td, ibits, obits, nd);
764 	mtx_lock(&sellock);
765 	if (error || td->td_retval[0])
766 		goto done;
767 	if (atv.tv_sec || atv.tv_usec) {
768 		getmicrouptime(&rtv);
769 		if (timevalcmp(&rtv, &atv, >=))
770 			goto done;
771 		ttv = atv;
772 		timevalsub(&ttv, &rtv);
773 		timo = ttv.tv_sec > 24 * 60 * 60 ?
774 		    24 * 60 * 60 * hz : tvtohz(&ttv);
775 	}
776 
777 	/*
778 	 * An event of interest may occur while we do not hold
779 	 * sellock, so check TDF_SELECT and the number of
780 	 * collisions and rescan the file descriptors if
781 	 * necessary.
782 	 */
783 	mtx_lock_spin(&sched_lock);
784 	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
785 		mtx_unlock_spin(&sched_lock);
786 		goto retry;
787 	}
788 	mtx_unlock_spin(&sched_lock);
789 
790 	if (timo > 0)
791 		error = cv_timedwait_sig(&selwait, &sellock, timo);
792 	else
793 		error = cv_wait_sig(&selwait, &sellock);
794 
795 	if (error == 0)
796 		goto retry;
797 
798 done:
799 	clear_selinfo_list(td);
800 	mtx_lock_spin(&sched_lock);
801 	td->td_flags &= ~TDF_SELECT;
802 	mtx_unlock_spin(&sched_lock);
803 	mtx_unlock(&sellock);
804 
805 done_nosellock:
806 	/* select is not restarted after signals... */
807 	if (error == ERESTART)
808 		error = EINTR;
809 	if (error == EWOULDBLOCK)
810 		error = 0;
811 #define	putbits(name, x) \
812 	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
813 		error = error2;
814 	if (error == 0) {
815 		int error2;
816 
817 		putbits(fd_in, 0);
818 		putbits(fd_ou, 1);
819 		putbits(fd_ex, 2);
820 #undef putbits
821 	}
822 	if (selbits != &s_selbits[0])
823 		free(selbits, M_SELECT);
824 
825 	return (error);
826 }
827 
828 static int
829 selscan(td, ibits, obits, nfd)
830 	struct thread *td;
831 	fd_mask **ibits, **obits;
832 	int nfd;
833 {
834 	int msk, i, fd;
835 	fd_mask bits;
836 	struct file *fp;
837 	int n = 0;
838 	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
839 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
840 	struct filedesc *fdp = td->td_proc->p_fd;
841 
842 	FILEDESC_LOCK(fdp);
843 	for (msk = 0; msk < 3; msk++) {
844 		if (ibits[msk] == NULL)
845 			continue;
846 		for (i = 0; i < nfd; i += NFDBITS) {
847 			bits = ibits[msk][i/NFDBITS];
848 			/* ffs(int mask) not portable, fd_mask is long */
849 			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
850 				if (!(bits & 1))
851 					continue;
852 				if ((fp = fget_locked(fdp, fd)) == NULL) {
853 					FILEDESC_UNLOCK(fdp);
854 					return (EBADF);
855 				}
856 				if (fo_poll(fp, flag[msk], td->td_ucred,
857 				    td)) {
858 					obits[msk][(fd)/NFDBITS] |=
859 					    ((fd_mask)1 << ((fd) % NFDBITS));
860 					n++;
861 				}
862 			}
863 		}
864 	}
865 	FILEDESC_UNLOCK(fdp);
866 	td->td_retval[0] = n;
867 	return (0);
868 }
869 
870 /*
871  * Poll system call.
872  */
873 #ifndef _SYS_SYSPROTO_H_
874 struct poll_args {
875 	struct pollfd *fds;
876 	u_int	nfds;
877 	int	timeout;
878 };
879 #endif
880 /*
881  * MPSAFE
882  */
883 int
884 poll(td, uap)
885 	struct thread *td;
886 	struct poll_args *uap;
887 {
888 	struct pollfd *bits;
889 	struct pollfd smallbits[32];
890 	struct timeval atv, rtv, ttv;
891 	int error = 0, timo;
892 	u_int ncoll, nfds;
893 	size_t ni;
894 
895 	nfds = uap->nfds;
896 
897 	/*
898 	 * This is kinda bogus.  We have fd limits, but that is not
899 	 * really related to the size of the pollfd array.  Make sure
900 	 * we let the process use at least FD_SETSIZE entries and at
901 	 * least enough for the current limits.  We want to be reasonably
902 	 * safe, but not overly restrictive.
903 	 */
904 	PROC_LOCK(td->td_proc);
905 	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
906 	    (nfds > FD_SETSIZE)) {
907 		PROC_UNLOCK(td->td_proc);
908 		error = EINVAL;
909 		goto done2;
910 	}
911 	PROC_UNLOCK(td->td_proc);
912 	ni = nfds * sizeof(struct pollfd);
913 	if (ni > sizeof(smallbits))
914 		bits = malloc(ni, M_TEMP, M_WAITOK);
915 	else
916 		bits = smallbits;
917 	error = copyin(uap->fds, bits, ni);
918 	if (error)
919 		goto done_nosellock;
920 	if (uap->timeout != INFTIM) {
921 		atv.tv_sec = uap->timeout / 1000;
922 		atv.tv_usec = (uap->timeout % 1000) * 1000;
923 		if (itimerfix(&atv)) {
924 			error = EINVAL;
925 			goto done_nosellock;
926 		}
927 		getmicrouptime(&rtv);
928 		timevaladd(&atv, &rtv);
929 	} else {
930 		atv.tv_sec = 0;
931 		atv.tv_usec = 0;
932 	}
933 	timo = 0;
934 	TAILQ_INIT(&td->td_selq);
935 	mtx_lock(&sellock);
936 retry:
937 	ncoll = nselcoll;
938 	mtx_lock_spin(&sched_lock);
939 	td->td_flags |= TDF_SELECT;
940 	mtx_unlock_spin(&sched_lock);
941 	mtx_unlock(&sellock);
942 
943 	error = pollscan(td, bits, nfds);
944 	mtx_lock(&sellock);
945 	if (error || td->td_retval[0])
946 		goto done;
947 	if (atv.tv_sec || atv.tv_usec) {
948 		getmicrouptime(&rtv);
949 		if (timevalcmp(&rtv, &atv, >=))
950 			goto done;
951 		ttv = atv;
952 		timevalsub(&ttv, &rtv);
953 		timo = ttv.tv_sec > 24 * 60 * 60 ?
954 		    24 * 60 * 60 * hz : tvtohz(&ttv);
955 	}
956 	/*
957 	 * An event of interest may occur while we do not hold
958 	 * sellock, so check TDF_SELECT and the number of collisions
959 	 * and rescan the file descriptors if necessary.
960 	 */
961 	mtx_lock_spin(&sched_lock);
962 	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
963 		mtx_unlock_spin(&sched_lock);
964 		goto retry;
965 	}
966 	mtx_unlock_spin(&sched_lock);
967 
968 	if (timo > 0)
969 		error = cv_timedwait_sig(&selwait, &sellock, timo);
970 	else
971 		error = cv_wait_sig(&selwait, &sellock);
972 
973 	if (error == 0)
974 		goto retry;
975 
976 done:
977 	clear_selinfo_list(td);
978 	mtx_lock_spin(&sched_lock);
979 	td->td_flags &= ~TDF_SELECT;
980 	mtx_unlock_spin(&sched_lock);
981 	mtx_unlock(&sellock);
982 
983 done_nosellock:
984 	/* poll is not restarted after signals... */
985 	if (error == ERESTART)
986 		error = EINTR;
987 	if (error == EWOULDBLOCK)
988 		error = 0;
989 	if (error == 0) {
990 		error = copyout(bits, uap->fds, ni);
991 		if (error)
992 			goto out;
993 	}
994 out:
995 	if (ni > sizeof(smallbits))
996 		free(bits, M_TEMP);
997 done2:
998 	return (error);
999 }
1000 
1001 static int
1002 pollscan(td, fds, nfd)
1003 	struct thread *td;
1004 	struct pollfd *fds;
1005 	u_int nfd;
1006 {
1007 	register struct filedesc *fdp = td->td_proc->p_fd;
1008 	int i;
1009 	struct file *fp;
1010 	int n = 0;
1011 
1012 	FILEDESC_LOCK(fdp);
1013 	for (i = 0; i < nfd; i++, fds++) {
1014 		if (fds->fd >= fdp->fd_nfiles) {
1015 			fds->revents = POLLNVAL;
1016 			n++;
1017 		} else if (fds->fd < 0) {
1018 			fds->revents = 0;
1019 		} else {
1020 			fp = fdp->fd_ofiles[fds->fd];
1021 			if (fp == NULL) {
1022 				fds->revents = POLLNVAL;
1023 				n++;
1024 			} else {
1025 				/*
1026 				 * Note: backend also returns POLLHUP and
1027 				 * POLLERR if appropriate.
1028 				 */
1029 				fds->revents = fo_poll(fp, fds->events,
1030 				    td->td_ucred, td);
1031 				if (fds->revents != 0)
1032 					n++;
1033 			}
1034 		}
1035 	}
1036 	FILEDESC_UNLOCK(fdp);
1037 	td->td_retval[0] = n;
1038 	return (0);
1039 }
1040 
1041 /*
1042  * OpenBSD poll system call.
1043  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1044  */
1045 #ifndef _SYS_SYSPROTO_H_
1046 struct openbsd_poll_args {
1047 	struct pollfd *fds;
1048 	u_int	nfds;
1049 	int	timeout;
1050 };
1051 #endif
1052 /*
1053  * MPSAFE
1054  */
1055 int
1056 openbsd_poll(td, uap)
1057 	register struct thread *td;
1058 	register struct openbsd_poll_args *uap;
1059 {
1060 	return (poll(td, (struct poll_args *)uap));
1061 }
1062 
1063 /*
1064  * Remove the references to the thread from all of the objects
1065  * we were polling.
1066  *
1067  * This code assumes that the underlying owner of the selinfo
1068  * structure will hold sellock before it changes it, and that
1069  * it will unlink itself from our list if it goes away.
1070  */
1071 void
1072 clear_selinfo_list(td)
1073 	struct thread *td;
1074 {
1075 	struct selinfo *si;
1076 
1077 	mtx_assert(&sellock, MA_OWNED);
1078 	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1079 		si->si_thread = NULL;
1080 	TAILQ_INIT(&td->td_selq);
1081 }
1082 
1083 /*
1084  * Record a select request.
1085  */
1086 void
1087 selrecord(selector, sip)
1088 	struct thread *selector;
1089 	struct selinfo *sip;
1090 {
1091 
1092 	mtx_lock(&sellock);
1093 	/*
1094 	 * If the selinfo's thread pointer is NULL then take ownership of it.
1095 	 *
1096 	 * If the thread pointer is not NULL and it points to another
1097 	 * thread, then we have a collision.
1098 	 *
1099 	 * If the thread pointer is not NULL and points back to us then leave
1100 	 * it alone as we've already added pointed it at us and added it to
1101 	 * our list.
1102 	 */
1103 	if (sip->si_thread == NULL) {
1104 		sip->si_thread = selector;
1105 		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1106 	} else if (sip->si_thread != selector) {
1107 		sip->si_flags |= SI_COLL;
1108 	}
1109 
1110 	mtx_unlock(&sellock);
1111 }
1112 
/*
 * Wake up a selecting thread.  Calls doselwakeup() with a priority
 * argument of -1.
 */
void
selwakeup(struct selinfo *sip)
{

	doselwakeup(sip, -1);
}
1120 
/*
 * Wake up a selecting thread, and set its priority.  The priority `pri'
 * is passed through to doselwakeup().
 */
void
selwakeuppri(struct selinfo *sip, int pri)
{

	doselwakeup(sip, pri);
}
1129 
1130 /*
1131  * Do a wakeup when a selectable event occurs.
1132  */
1133 static void
1134 doselwakeup(sip, pri)
1135 	struct selinfo *sip;
1136 	int pri;
1137 {
1138 	struct thread *td;
1139 
1140 	mtx_lock(&sellock);
1141 	td = sip->si_thread;
1142 	if ((sip->si_flags & SI_COLL) != 0) {
1143 		nselcoll++;
1144 		sip->si_flags &= ~SI_COLL;
1145 		cv_broadcastpri(&selwait, pri);
1146 	}
1147 	if (td == NULL) {
1148 		mtx_unlock(&sellock);
1149 		return;
1150 	}
1151 	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
1152 	sip->si_thread = NULL;
1153 	mtx_lock_spin(&sched_lock);
1154 	td->td_flags &= ~TDF_SELECT;
1155 	mtx_unlock_spin(&sched_lock);
1156 	sleepq_remove(td, &selwait);
1157 	mtx_unlock(&sellock);
1158 }
1159 
1160 static void selectinit(void *);
1161 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1162 
1163 /* ARGSUSED*/
1164 static void
1165 selectinit(dummy)
1166 	void *dummy;
1167 {
1168 	cv_init(&selwait, "select");
1169 	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1170 }
1171