xref: /freebsd/sys/kern/sys_generic.c (revision 6b3455a7665208c366849f0b2b3bc916fb97516e)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_ktrace.h"
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/sysproto.h>
45 #include <sys/filedesc.h>
46 #include <sys/filio.h>
47 #include <sys/fcntl.h>
48 #include <sys/file.h>
49 #include <sys/proc.h>
50 #include <sys/signalvar.h>
51 #include <sys/socketvar.h>
52 #include <sys/uio.h>
53 #include <sys/kernel.h>
54 #include <sys/limits.h>
55 #include <sys/malloc.h>
56 #include <sys/poll.h>
57 #include <sys/resourcevar.h>
58 #include <sys/selinfo.h>
59 #include <sys/sleepqueue.h>
60 #include <sys/syscallsubr.h>
61 #include <sys/sysctl.h>
62 #include <sys/sysent.h>
63 #include <sys/vnode.h>
64 #include <sys/bio.h>
65 #include <sys/buf.h>
66 #include <sys/condvar.h>
67 #ifdef KTRACE
68 #include <sys/ktrace.h>
69 #endif
70 #include <vm/vm.h>
71 #include <vm/vm_page.h>
72 
73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
75 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
76 
77 static int	pollscan(struct thread *, struct pollfd *, u_int);
78 static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
79 static int	dofileread(struct thread *, struct file *, int, void *,
80 		    size_t, off_t, int);
81 static int	dofilewrite(struct thread *, struct file *, int,
82 		    const void *, size_t, off_t, int);
83 static void	doselwakeup(struct selinfo *, int);
84 
85 /*
86  * Read system call.
87  */
88 #ifndef _SYS_SYSPROTO_H_
89 struct read_args {
90 	int	fd;
91 	void	*buf;
92 	size_t	nbyte;
93 };
94 #endif
95 /*
96  * MPSAFE
97  */
98 int
99 read(td, uap)
100 	struct thread *td;
101 	struct read_args *uap;
102 {
103 	struct file *fp;
104 	int error;
105 
106 	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
107 		error = dofileread(td, fp, uap->fd, uap->buf,
108 			    uap->nbyte, (off_t)-1, 0);
109 		fdrop(fp, td);
110 	}
111 	return(error);
112 }
113 
114 /*
115  * Pread system call
116  */
117 #ifndef _SYS_SYSPROTO_H_
118 struct pread_args {
119 	int	fd;
120 	void	*buf;
121 	size_t	nbyte;
122 	int	pad;
123 	off_t	offset;
124 };
125 #endif
126 /*
127  * MPSAFE
128  */
129 int
130 pread(td, uap)
131 	struct thread *td;
132 	struct pread_args *uap;
133 {
134 	struct file *fp;
135 	int error;
136 
137 	if ((error = fget_read(td, uap->fd, &fp)) != 0)
138 		return (error);
139 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
140 		error = ESPIPE;
141 	else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
142 		error = EINVAL;
143 	else {
144 		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
145 			    uap->offset, FOF_OFFSET);
146 	}
147 	fdrop(fp, td);
148 	return(error);
149 }
150 
151 /*
152  * Code common for read and pread
153  */
154 static int
155 dofileread(td, fp, fd, buf, nbyte, offset, flags)
156 	struct thread *td;
157 	struct file *fp;
158 	int fd, flags;
159 	void *buf;
160 	size_t nbyte;
161 	off_t offset;
162 {
163 	struct uio auio;
164 	struct iovec aiov;
165 	long cnt, error = 0;
166 #ifdef KTRACE
167 	struct uio *ktruio = NULL;
168 #endif
169 
170 	aiov.iov_base = buf;
171 	aiov.iov_len = nbyte;
172 	auio.uio_iov = &aiov;
173 	auio.uio_iovcnt = 1;
174 	auio.uio_offset = offset;
175 	if (nbyte > INT_MAX)
176 		return (EINVAL);
177 	auio.uio_resid = nbyte;
178 	auio.uio_rw = UIO_READ;
179 	auio.uio_segflg = UIO_USERSPACE;
180 	auio.uio_td = td;
181 #ifdef KTRACE
182 	if (KTRPOINT(td, KTR_GENIO))
183 		ktruio = cloneuio(&auio);
184 #endif
185 	cnt = nbyte;
186 
187 	if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
188 		if (auio.uio_resid != cnt && (error == ERESTART ||
189 		    error == EINTR || error == EWOULDBLOCK))
190 			error = 0;
191 	}
192 	cnt -= auio.uio_resid;
193 #ifdef KTRACE
194 	if (ktruio != NULL) {
195 		ktruio->uio_resid = cnt;
196 		ktrgenio(fd, UIO_READ, ktruio, error);
197 	}
198 #endif
199 	td->td_retval[0] = cnt;
200 	return (error);
201 }
202 
203 /*
204  * Scatter read system call.
205  */
206 #ifndef _SYS_SYSPROTO_H_
207 struct readv_args {
208 	int	fd;
209 	struct	iovec *iovp;
210 	u_int	iovcnt;
211 };
212 #endif
213 /*
214  * MPSAFE
215  */
/*
 * Scatter read: like read(), but the destination is described by an
 * array of iovecs copied in from userspace.  MPSAFE.
 */
int
readv(struct thread *td, struct readv_args *uap)
{
	struct file *fp;
	struct uio *auio = NULL;
	long cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	error = fget_read(td, uap->fd, &fp);
	if (error)
		return (error);
	/* copyinuio() validates iovcnt and allocates the uio from M_IOV. */
	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error) {
		fdrop(fp, td);
		return (error);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_td = td;
#ifdef KTRACE
	/* Clone the uio before fo_read() consumes it, for ktrace logging. */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if ((error = fo_read(fp, auio, td->td_ucred, 0, td))) {
		/*
		 * A transfer cut short by a signal or a would-block
		 * condition is reported as success with a short count.
		 */
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio->uio_resid;	/* bytes actually read */
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(uap->fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	free(auio, M_IOV);
	fdrop(fp, td);
	return (error);
}
259 
260 /*
261  * Write system call
262  */
263 #ifndef _SYS_SYSPROTO_H_
264 struct write_args {
265 	int	fd;
266 	const void *buf;
267 	size_t	nbyte;
268 };
269 #endif
270 /*
271  * MPSAFE
272  */
273 int
274 write(td, uap)
275 	struct thread *td;
276 	struct write_args *uap;
277 {
278 	struct file *fp;
279 	int error;
280 
281 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
282 		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
283 			    (off_t)-1, 0);
284 		fdrop(fp, td);
285 	} else {
286 		error = EBADF;	/* XXX this can't be right */
287 	}
288 	return(error);
289 }
290 
291 /*
292  * Pwrite system call
293  */
294 #ifndef _SYS_SYSPROTO_H_
295 struct pwrite_args {
296 	int	fd;
297 	const void *buf;
298 	size_t	nbyte;
299 	int	pad;
300 	off_t	offset;
301 };
302 #endif
303 /*
304  * MPSAFE
305  */
306 int
307 pwrite(td, uap)
308 	struct thread *td;
309 	struct pwrite_args *uap;
310 {
311 	struct file *fp;
312 	int error;
313 
314 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
315 		if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
316 			error = ESPIPE;
317 		else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
318 			error = EINVAL;
319 		else {
320 			error = dofilewrite(td, fp, uap->fd, uap->buf,
321 				    uap->nbyte, uap->offset, FOF_OFFSET);
322 		}
323 		fdrop(fp, td);
324 	} else {
325 		error = EBADF;	/* this can't be right */
326 	}
327 	return(error);
328 }
329 
/*
 * Code common for write and pwrite: build a single-segment uio around
 * the user buffer and hand it to the file's fo_write method.
 *
 * td/fp:   calling thread and (held) file; the caller drops fp.
 * fd:      descriptor number, used only for ktrace logging.
 * buf/nbyte: user-space source buffer and its length (<= INT_MAX).
 * offset:  explicit offset, honored only when FOF_OFFSET is in flags.
 *
 * On success td->td_retval[0] holds the byte count transferred.
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	/* Cast away const: the uio only reads from this buffer. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/* Clone the uio before fo_write() consumes it, for ktrace logging. */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	cnt = nbyte;
	/* Let the buffer cache flush before we dirty more pages. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
		/*
		 * A transfer cut short by a signal or a would-block
		 * condition is reported as success with a short count.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;	/* bytes actually written */
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
385 
386 /*
387  * Gather write system call
388  */
389 #ifndef _SYS_SYSPROTO_H_
390 struct writev_args {
391 	int	fd;
392 	struct	iovec *iovp;
393 	u_int	iovcnt;
394 };
395 #endif
396 /*
397  * MPSAFE
398  */
399 int
400 writev(struct thread *td, struct writev_args *uap)
401 {
402 	struct file *fp;
403 	struct uio *auio = NULL;
404 	long cnt;
405 	int error;
406 #ifdef KTRACE
407 	struct uio *ktruio = NULL;
408 #endif
409 
410 	error = fget_write(td, uap->fd, &fp);
411 	if (error)
412 		return (EBADF);
413 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
414 	if (error) {
415 		fdrop(fp, td);
416 		return (error);
417 	}
418 	auio->uio_rw = UIO_WRITE;
419 	auio->uio_td = td;
420 #ifdef KTRACE
421 	if (KTRPOINT(td, KTR_GENIO))
422 		ktruio = cloneuio(auio);
423 #endif
424 	cnt = auio->uio_resid;
425 	if (fp->f_type == DTYPE_VNODE)
426 		bwillwrite();
427 	if ((error = fo_write(fp, auio, td->td_ucred, 0, td))) {
428 		if (auio->uio_resid != cnt && (error == ERESTART ||
429 		    error == EINTR || error == EWOULDBLOCK))
430 			error = 0;
431 		if (error == EPIPE) {
432 			PROC_LOCK(td->td_proc);
433 			psignal(td->td_proc, SIGPIPE);
434 			PROC_UNLOCK(td->td_proc);
435 		}
436 	}
437 	cnt -= auio->uio_resid;
438 #ifdef KTRACE
439 	if (ktruio != NULL) {
440 		ktruio->uio_resid = cnt;
441 		ktrgenio(uap->fd, UIO_WRITE, ktruio, error);
442 	}
443 #endif
444 	td->td_retval[0] = cnt;
445 	fdrop(fp, td);
446 	free(auio, M_IOV);
447 	return (error);
448 }
449 
450 /*
451  * Ioctl system call
452  */
453 #ifndef _SYS_SYSPROTO_H_
454 struct ioctl_args {
455 	int	fd;
456 	u_long	com;
457 	caddr_t	data;
458 };
459 #endif
460 /*
461  * MPSAFE
462  */
463 /* ARGSUSED */
/*
 * Ioctl system call: perform a device-dependent control operation on
 * the object referenced by fd.  The argument size and copy direction
 * are encoded in the command word; argument data is staged through a
 * kernel buffer (stack-resident for requests up to STK_PARAMS bytes,
 * malloc'd otherwise).  A few descriptor-level commands are handled
 * here and never reach the file's fo_ioctl method.
 */
int
ioctl(td, uap)
	struct thread *td;
	register struct ioctl_args *uap;
{
	struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	int error = 0;
	register u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	union {
	    char stkbuf[STK_PARAMS];
	    long align;		/* forces worst-case alignment of stkbuf */
	} ubuf;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	mtx_lock(&Giant);
	/* The descriptor must be open for reading or writing. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	/*
	 * Close-on-exec requests operate on the descriptor table entry
	 * only and are handled entirely here.
	 */
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (ENOTTY);
	}

	memp = NULL;
	/* Large arguments are staged through a malloc'd buffer. */
	if (size > sizeof (ubuf.stkbuf)) {
		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com&IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				goto done;
			}
		} else {
			/* Zero-size IOC_IN: the argument is the pointer itself. */
			*(caddr_t *)data = uap->data;
		}
	} else if ((com&IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com&IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		/* Mirror the new state in f_flag, then notify the object. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		break;

	case FIOASYNC:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td->td_ucred, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
done:
	mtx_unlock(&Giant);
	return (error);
}
587 
588 /*
589  * sellock and selwait are initialized in selectinit() via SYSINIT.
590  */
591 struct mtx	sellock;
592 struct cv	selwait;
593 u_int		nselcoll;	/* Select collisions since boot */
594 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
595 
596 /*
597  * Select system call.
598  */
599 #ifndef _SYS_SYSPROTO_H_
600 struct select_args {
601 	int	nd;
602 	fd_set	*in, *ou, *ex;
603 	struct	timeval *tv;
604 };
605 #endif
606 /*
607  * MPSAFE
608  */
609 int
610 select(td, uap)
611 	register struct thread *td;
612 	register struct select_args *uap;
613 {
614 	struct timeval tv, *tvp;
615 	int error;
616 
617 	if (uap->tv != NULL) {
618 		error = copyin(uap->tv, &tv, sizeof(tv));
619 		if (error)
620 			return (error);
621 		tvp = &tv;
622 	} else
623 		tvp = NULL;
624 
625 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
626 }
627 
/*
 * Common code for select().  Copies in up to three descriptor bit
 * vectors, repeatedly scans them with selscan(), and sleeps on the
 * selwait condvar between scans until a descriptor is ready, the
 * timeout expires, or a signal arrives.  tvp == NULL blocks forever;
 * ready bits are written back to the caller's non-NULL sets.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	/*
	 * XXX: kern_select() currently requires that we acquire Giant
	 * even if none of the file descriptors we poll requires Giant.
	 */
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		/* Convert the relative timeout to an absolute deadline. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* A zero deadline means "no timeout" in the loop below. */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count; TDF_SELECT marks us as selecting. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp each wait to one day; we loop and re-check anyway. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}
790 
/*
 * Poll each descriptor whose bit is set in one of the three input bit
 * vectors (read/write/except) and set the corresponding bit in the
 * matching output vector for every descriptor that is ready.  The
 * count of ready descriptors is returned in td->td_retval[0]; a set
 * bit naming a closed descriptor yields EBADF.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
832 
833 /*
834  * Poll system call.
835  */
836 #ifndef _SYS_SYSPROTO_H_
837 struct poll_args {
838 	struct pollfd *fds;
839 	u_int	nfds;
840 	int	timeout;
841 };
842 #endif
843 /*
844  * MPSAFE
845  */
/*
 * Poll system call: wait for events on an array of pollfds.
 * uap->timeout is in milliseconds; INFTIM (-1) blocks forever.
 * MPSAFE.
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];	/* avoids malloc for small arrays */
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * XXX: poll() currently requires that we acquire Giant even if
	 * none of the file descriptors we poll requires Giant.
	 */
	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	if (uap->timeout != INFTIM) {
		/* Convert the ms timeout to an absolute uptime deadline. */
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* A zero deadline means "no timeout" in the loop below. */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count; TDF_SELECT marks us as selecting. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, (struct pollfd *)bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp each wait to one day; we loop and re-check anyway. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* Copy the revents results back to the user's array. */
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}
969 
/*
 * Check each pollfd in the array once: revents is set to POLLNVAL for
 * out-of-range or closed descriptors, to 0 for negative (ignored)
 * descriptors, and otherwise to whatever fo_poll() reports.  The count
 * of entries with non-zero revents is returned in td->td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative descriptors are ignored, per poll(2). */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
1009 
1010 /*
1011  * OpenBSD poll system call.
1012  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1013  */
1014 #ifndef _SYS_SYSPROTO_H_
1015 struct openbsd_poll_args {
1016 	struct pollfd *fds;
1017 	u_int	nfds;
1018 	int	timeout;
1019 };
1020 #endif
1021 /*
1022  * MPSAFE
1023  */
/*
 * OpenBSD-compatible poll entry point.  The argument layouts are
 * identical, so simply defer to the native poll().  MPSAFE.
 */
int
openbsd_poll(struct thread *td, struct openbsd_poll_args *uap)
{

	return (poll(td, (struct poll_args *)uap));
}
1031 
1032 /*
1033  * Remove the references to the thread from all of the objects
1034  * we were polling.
1035  *
1036  * This code assumes that the underlying owner of the selinfo
1037  * structure will hold sellock before it changes it, and that
1038  * it will unlink itself from our list if it goes away.
1039  */
1040 void
1041 clear_selinfo_list(td)
1042 	struct thread *td;
1043 {
1044 	struct selinfo *si;
1045 
1046 	mtx_assert(&sellock, MA_OWNED);
1047 	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1048 		si->si_thread = NULL;
1049 	TAILQ_INIT(&td->td_selq);
1050 }
1051 
1052 /*
1053  * Record a select request.
1054  */
1055 void
1056 selrecord(selector, sip)
1057 	struct thread *selector;
1058 	struct selinfo *sip;
1059 {
1060 
1061 	mtx_lock(&sellock);
1062 	/*
1063 	 * If the selinfo's thread pointer is NULL then take ownership of it.
1064 	 *
1065 	 * If the thread pointer is not NULL and it points to another
1066 	 * thread, then we have a collision.
1067 	 *
1068 	 * If the thread pointer is not NULL and points back to us then leave
1069 	 * it alone as we've already added pointed it at us and added it to
1070 	 * our list.
1071 	 */
1072 	if (sip->si_thread == NULL) {
1073 		sip->si_thread = selector;
1074 		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1075 	} else if (sip->si_thread != selector) {
1076 		sip->si_flags |= SI_COLL;
1077 	}
1078 
1079 	mtx_unlock(&sellock);
1080 }
1081 
1082 /* Wake up a selecting thread. */
/*
 * Wake up any thread selecting on this selinfo, leaving its priority
 * unchanged (-1 = no priority adjustment).
 */
void
selwakeup(struct selinfo *sip)
{

	doselwakeup(sip, -1);
}
1089 
1090 /* Wake up a selecting thread, and set its priority. */
/*
 * Wake up any thread selecting on this selinfo and set its priority
 * to pri.
 */
void
selwakeuppri(struct selinfo *sip, int pri)
{

	doselwakeup(sip, pri);
}
1098 
1099 /*
1100  * Do a wakeup when a selectable event occurs.
1101  */
/*
 * Do a wakeup when a selectable event occurs.  pri < 0 leaves the
 * woken thread's priority alone; otherwise the thread is woken at
 * priority pri.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		/*
		 * Several threads selected on this object: broadcast so
		 * they all wake and rescan.
		 */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	/* Detach the selinfo from the owning thread's select queue. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	/* Clearing TDF_SELECT makes the selector rescan instead of sleep. */
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	sleepq_remove(td, &selwait);
	mtx_unlock(&sellock);
}
1128 
1129 static void selectinit(void *);
1130 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1131 
1132 /* ARGSUSED*/
1133 static void
1134 selectinit(dummy)
1135 	void *dummy;
1136 {
1137 	cv_init(&selwait, "select");
1138 	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1139 }
1140