xref: /freebsd/sys/kern/sys_generic.c (revision 7afc53b8dfcc7d5897920ce6cc7e842fbb4ab813)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_ktrace.h"
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/sysproto.h>
45 #include <sys/filedesc.h>
46 #include <sys/filio.h>
47 #include <sys/fcntl.h>
48 #include <sys/file.h>
49 #include <sys/proc.h>
50 #include <sys/signalvar.h>
51 #include <sys/socketvar.h>
52 #include <sys/uio.h>
53 #include <sys/kernel.h>
54 #include <sys/limits.h>
55 #include <sys/malloc.h>
56 #include <sys/poll.h>
57 #include <sys/resourcevar.h>
58 #include <sys/selinfo.h>
59 #include <sys/sleepqueue.h>
60 #include <sys/syscallsubr.h>
61 #include <sys/sysctl.h>
62 #include <sys/sysent.h>
63 #include <sys/vnode.h>
64 #include <sys/bio.h>
65 #include <sys/buf.h>
66 #include <sys/condvar.h>
67 #ifdef KTRACE
68 #include <sys/ktrace.h>
69 #endif
70 #include <vm/vm.h>
71 #include <vm/vm_page.h>
72 
73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
75 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
76 
77 static int	pollscan(struct thread *, struct pollfd *, u_int);
78 static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
79 static int	dofileread(struct thread *, struct file *, int, void *,
80 		    size_t, off_t, int);
81 static int	dofilewrite(struct thread *, struct file *, int,
82 		    const void *, size_t, off_t, int);
83 static void	doselwakeup(struct selinfo *, int);
84 
85 /*
86  * Read system call.
87  */
88 #ifndef _SYS_SYSPROTO_H_
89 struct read_args {
90 	int	fd;
91 	void	*buf;
92 	size_t	nbyte;
93 };
94 #endif
95 /*
96  * MPSAFE
97  */
98 int
99 read(td, uap)
100 	struct thread *td;
101 	struct read_args *uap;
102 {
103 	struct file *fp;
104 	int error;
105 
106 	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
107 		error = dofileread(td, fp, uap->fd, uap->buf,
108 			    uap->nbyte, (off_t)-1, 0);
109 		fdrop(fp, td);
110 	}
111 	return(error);
112 }
113 
114 /*
115  * Pread system call
116  */
117 #ifndef _SYS_SYSPROTO_H_
118 struct pread_args {
119 	int	fd;
120 	void	*buf;
121 	size_t	nbyte;
122 	int	pad;
123 	off_t	offset;
124 };
125 #endif
126 /*
127  * MPSAFE
128  */
129 int
130 pread(td, uap)
131 	struct thread *td;
132 	struct pread_args *uap;
133 {
134 	struct file *fp;
135 	int error;
136 
137 	if ((error = fget_read(td, uap->fd, &fp)) != 0)
138 		return (error);
139 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
140 		error = ESPIPE;
141 	else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
142 		error = EINVAL;
143 	else {
144 		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
145 			    uap->offset, FOF_OFFSET);
146 	}
147 	fdrop(fp, td);
148 	return(error);
149 }
150 
151 /*
152  * Code common for read and pread
153  */
/*
 * Common backend for read() and pread(): wrap the user buffer in a
 * single-segment uio and hand it to the file's fo_read method.
 * "flags" is 0 (use and advance the file offset) or FOF_OFFSET
 * (positional read at "offset").  The byte count transferred is
 * returned in td->td_retval[0]; the return value is an errno.
 */
static int
dofileread(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	ssize_t cnt;
	long error = 0;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	/* Finish zero length reads right here */
	if (nbyte == 0) {
		td->td_retval[0] = 0;
		return(0);
	}
	aiov.iov_base = buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* Transfers are limited to INT_MAX bytes per call. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/* Snapshot the iovec now; fo_read consumes auio as it works. */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	cnt = nbyte;

	if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
		/*
		 * If the read was interrupted after transferring some
		 * data, report the partial transfer rather than the error.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio.uio_resid;		/* bytes actually transferred */
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
208 
209 /*
210  * Scatter read system call.
211  */
212 #ifndef _SYS_SYSPROTO_H_
213 struct readv_args {
214 	int	fd;
215 	struct	iovec *iovp;
216 	u_int	iovcnt;
217 };
218 #endif
219 /*
220  * MPSAFE
221  */
222 int
223 readv(struct thread *td, struct readv_args *uap)
224 {
225 	struct uio *auio;
226 	int error;
227 
228 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
229 	if (error)
230 		return (error);
231 	error = kern_readv(td, uap->fd, auio);
232 	free(auio, M_IOV);
233 	return (error);
234 }
235 
/*
 * Scattered-read backend for readv(): run "auio" through the
 * descriptor's fo_read method.  The caller retains ownership of and
 * frees auio.  Bytes read are returned in td->td_retval[0].
 */
int
kern_readv(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	long cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	error = fget_read(td, fd, &fp);
	if (error)
		return (error);
	/* Finish zero length reads right here */
	if (auio->uio_resid == 0) {
		td->td_retval[0] = 0;
		fdrop(fp, td);
		return(0);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_td = td;
#ifdef KTRACE
	/* Snapshot the iovec list now; fo_read consumes auio as it works. */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if ((error = fo_read(fp, auio, td->td_ucred, 0, td))) {
		/* Report a partial transfer instead of an interruption. */
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio->uio_resid;		/* bytes actually transferred */
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	fdrop(fp, td);
	return (error);
}
278 
279 /*
280  * Write system call
281  */
282 #ifndef _SYS_SYSPROTO_H_
283 struct write_args {
284 	int	fd;
285 	const void *buf;
286 	size_t	nbyte;
287 };
288 #endif
289 /*
290  * MPSAFE
291  */
292 int
293 write(td, uap)
294 	struct thread *td;
295 	struct write_args *uap;
296 {
297 	struct file *fp;
298 	int error;
299 
300 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
301 		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
302 			    (off_t)-1, 0);
303 		fdrop(fp, td);
304 	} else {
305 		error = EBADF;	/* XXX this can't be right */
306 	}
307 	return(error);
308 }
309 
310 /*
311  * Pwrite system call
312  */
313 #ifndef _SYS_SYSPROTO_H_
314 struct pwrite_args {
315 	int	fd;
316 	const void *buf;
317 	size_t	nbyte;
318 	int	pad;
319 	off_t	offset;
320 };
321 #endif
322 /*
323  * MPSAFE
324  */
325 int
326 pwrite(td, uap)
327 	struct thread *td;
328 	struct pwrite_args *uap;
329 {
330 	struct file *fp;
331 	int error;
332 
333 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
334 		if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
335 			error = ESPIPE;
336 		else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
337 			error = EINVAL;
338 		else {
339 			error = dofilewrite(td, fp, uap->fd, uap->buf,
340 				    uap->nbyte, uap->offset, FOF_OFFSET);
341 		}
342 		fdrop(fp, td);
343 	} else {
344 		error = EBADF;	/* this can't be right */
345 	}
346 	return(error);
347 }
348 
/*
 * Common backend for write() and pwrite(): wrap the user buffer in a
 * single-segment uio and hand it to the file's fo_write method.
 * "flags" is 0 (use and advance the file offset) or FOF_OFFSET
 * (positional write at "offset").  Bytes written are returned in
 * td->td_retval[0]; the return value is an errno.
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	ssize_t cnt;
	long error = 0;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	/* Cast away const; the uio is UIO_WRITE, so buf is only read. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* Transfers are limited to INT_MAX bytes per call. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/* Snapshot the iovec now; fo_write consumes auio as it works. */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	cnt = nbyte;
	/* Ease buffer-cache pressure before starting a vnode write. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
		/* Report a partial transfer instead of an interruption. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;		/* bytes actually transferred */
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
405 
406 /*
407  * Gather write system call
408  */
409 #ifndef _SYS_SYSPROTO_H_
410 struct writev_args {
411 	int	fd;
412 	struct	iovec *iovp;
413 	u_int	iovcnt;
414 };
415 #endif
416 /*
417  * MPSAFE
418  */
419 int
420 writev(struct thread *td, struct writev_args *uap)
421 {
422 	struct uio *auio;
423 	int error;
424 
425 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
426 	if (error)
427 		return (error);
428 	error = kern_writev(td, uap->fd, auio);
429 	free(auio, M_IOV);
430 	return (error);
431 }
432 
433 int
434 kern_writev(struct thread *td, int fd, struct uio *auio)
435 {
436 	struct file *fp;
437 	long cnt;
438 	int error;
439 #ifdef KTRACE
440 	struct uio *ktruio = NULL;
441 #endif
442 
443 	error = fget_write(td, fd, &fp);
444 	if (error)
445 		return (EBADF);
446 	auio->uio_rw = UIO_WRITE;
447 	auio->uio_td = td;
448 #ifdef KTRACE
449 	if (KTRPOINT(td, KTR_GENIO))
450 		ktruio = cloneuio(auio);
451 #endif
452 	cnt = auio->uio_resid;
453 	if (fp->f_type == DTYPE_VNODE)
454 		bwillwrite();
455 	if ((error = fo_write(fp, auio, td->td_ucred, 0, td))) {
456 		if (auio->uio_resid != cnt && (error == ERESTART ||
457 		    error == EINTR || error == EWOULDBLOCK))
458 			error = 0;
459 		if (error == EPIPE) {
460 			PROC_LOCK(td->td_proc);
461 			psignal(td->td_proc, SIGPIPE);
462 			PROC_UNLOCK(td->td_proc);
463 		}
464 	}
465 	cnt -= auio->uio_resid;
466 #ifdef KTRACE
467 	if (ktruio != NULL) {
468 		ktruio->uio_resid = cnt;
469 		ktrgenio(fd, UIO_WRITE, ktruio, error);
470 	}
471 #endif
472 	td->td_retval[0] = cnt;
473 	fdrop(fp, td);
474 	return (error);
475 }
476 
477 /*
478  * Ioctl system call
479  */
480 #ifndef _SYS_SYSPROTO_H_
481 struct ioctl_args {
482 	int	fd;
483 	u_long	com;
484 	caddr_t	data;
485 };
486 #endif
487 /*
488  * MPSAFE
489  */
490 /* ARGSUSED */
/*
 * Ioctl: marshal the argument data between user and kernel space as
 * directed by the IOC_IN/IOC_OUT/IOC_VOID encoding in the command
 * word, then dispatch to the descriptor's fo_ioctl method.
 */
int
ioctl(struct thread *td, struct ioctl_args *uap)
{
	struct file *fp;
	struct filedesc *fdp;
	u_long com;
	int error = 0;
	u_int size;
	caddr_t data, memp;
	int tmp;

	/*
	 * Some userlands sign-extend 32-bit commands on 64-bit
	 * platforms; warn and mask back down to 32 bits.
	 */
	if (uap->com > 0xffffffff) {
		printf(
		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
		    td->td_proc->p_pid, td->td_proc->p_comm, uap->com);
		uap->com &= 0xffffffff;
	}
	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	/*
	 * FIONCLEX/FIOCLEX operate on the descriptor table entry
	 * itself, not the underlying file; handle them here.
	 */
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK_FAST(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK_FAST(fdp);
		fdrop(fp, td);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK_FAST(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK_FAST(fdp);
		fdrop(fp, td);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID  | IOC_IN | IOC_OUT)) == 0) ||
	    ((com & IOC_VOID) && size > 0) ||
	    ((com & (IOC_IN | IOC_OUT)) && size == 0)) {
		fdrop(fp, td);
		return (ENOTTY);
	}

	if (size > 0) {
		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		memp = NULL;
		/* IOC_VOID: pass the argument word itself by reference. */
		data = (void *)&uap->data;
	}
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error) {
			free(memp, M_IOCTLOPS);
			fdrop(fp, td);
			return (error);
		}
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	}

	/*
	 * FIONBIO/FIOASYNC update f_flag here, then are still passed
	 * down so the backend can react; hand it a stable copy (tmp)
	 * of the value.
	 */
	if (com == FIONBIO) {
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
	} else if (com == FIOASYNC) {
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
	}

	error = fo_ioctl(fp, com, data, td->td_ucred, td);

	/* Copy result back to userland for "out" commands. */
	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

	if (memp != NULL)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
	return (error);
}
593 
594 /*
595  * sellock and selwait are initialized in selectinit() via SYSINIT.
596  */
597 struct mtx	sellock;
598 struct cv	selwait;
599 u_int		nselcoll;	/* Select collisions since boot */
600 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
601 
602 /*
603  * Select system call.
604  */
605 #ifndef _SYS_SYSPROTO_H_
606 struct select_args {
607 	int	nd;
608 	fd_set	*in, *ou, *ex;
609 	struct	timeval *tv;
610 };
611 #endif
612 /*
613  * MPSAFE
614  */
615 int
616 select(td, uap)
617 	register struct thread *td;
618 	register struct select_args *uap;
619 {
620 	struct timeval tv, *tvp;
621 	int error;
622 
623 	if (uap->tv != NULL) {
624 		error = copyin(uap->tv, &tv, sizeof(tv));
625 		if (error)
626 			return (error);
627 		tvp = &tv;
628 	} else
629 		tvp = NULL;
630 
631 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
632 }
633 
/*
 * Backend for select(): copy in the requested descriptor sets, scan
 * them via selscan(), and sleep on selwait until a descriptor is
 * ready, the timeout expires, or a signal arrives.  Ready bits are
 * copied back out; the count of ready descriptors is returned in
 * td->td_retval[0].
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;

	FILEDESC_LOCK_FAST(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK_FAST(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/* Convert the relative timeout into an absolute uptime deadline. */
	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* atv == 0 means "wait indefinitely" below. */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	/* Scan without sellock held; fo_poll may sleep. */
	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to 24h so tvtohz() cannot overflow. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}
791 
/*
 * Poll every descriptor whose bit is set in one of the three input
 * sets (read/write/except), setting the matching bit in the output
 * set for each ready descriptor.  The ready count is returned in
 * td->td_retval[0]; the return value is an errno.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				/* A set bit naming a closed fd is an error. */
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
833 
834 /*
835  * Poll system call.
836  */
837 #ifndef _SYS_SYSPROTO_H_
838 struct poll_args {
839 	struct pollfd *fds;
840 	u_int	nfds;
841 	int	timeout;
842 };
843 #endif
844 /*
845  * MPSAFE
846  */
/*
 * Poll: copy in the pollfd array, scan it via pollscan(), and sleep
 * on selwait until a descriptor is ready, the timeout expires, or a
 * signal arrives.  revents are copied back out; the count of ready
 * descriptors is returned in td->td_retval[0].
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	struct pollfd *bits;
	struct pollfd smallbits[32];	/* avoids malloc for small arrays */
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	/* Convert the relative timeout into an absolute uptime deadline. */
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* atv == 0 means "wait indefinitely" below. */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	/* Scan without sellock held; fo_poll may sleep. */
	error = pollscan(td, bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to 24h so tvtohz() cannot overflow. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* Copy the updated revents back to the user's array. */
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	return (error);
}
964 
965 static int
966 pollscan(td, fds, nfd)
967 	struct thread *td;
968 	struct pollfd *fds;
969 	u_int nfd;
970 {
971 	register struct filedesc *fdp = td->td_proc->p_fd;
972 	int i;
973 	struct file *fp;
974 	int n = 0;
975 
976 	FILEDESC_LOCK(fdp);
977 	for (i = 0; i < nfd; i++, fds++) {
978 		if (fds->fd >= fdp->fd_nfiles) {
979 			fds->revents = POLLNVAL;
980 			n++;
981 		} else if (fds->fd < 0) {
982 			fds->revents = 0;
983 		} else {
984 			fp = fdp->fd_ofiles[fds->fd];
985 			if (fp == NULL) {
986 				fds->revents = POLLNVAL;
987 				n++;
988 			} else {
989 				/*
990 				 * Note: backend also returns POLLHUP and
991 				 * POLLERR if appropriate.
992 				 */
993 				fds->revents = fo_poll(fp, fds->events,
994 				    td->td_ucred, td);
995 				if (fds->revents != 0)
996 					n++;
997 			}
998 		}
999 	}
1000 	FILEDESC_UNLOCK(fdp);
1001 	td->td_retval[0] = n;
1002 	return (0);
1003 }
1004 
1005 /*
1006  * OpenBSD poll system call.
1007  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1008  */
1009 #ifndef _SYS_SYSPROTO_H_
1010 struct openbsd_poll_args {
1011 	struct pollfd *fds;
1012 	u_int	nfds;
1013 	int	timeout;
1014 };
1015 #endif
1016 /*
1017  * MPSAFE
1018  */
int
openbsd_poll(struct thread *td, struct openbsd_poll_args *uap)
{

	/* Argument layouts match; hand off to the native poll(). */
	return (poll(td, (struct poll_args *)uap));
}
1026 
1027 /*
1028  * Remove the references to the thread from all of the objects
1029  * we were polling.
1030  *
1031  * This code assumes that the underlying owner of the selinfo
1032  * structure will hold sellock before it changes it, and that
1033  * it will unlink itself from our list if it goes away.
1034  */
1035 void
1036 clear_selinfo_list(td)
1037 	struct thread *td;
1038 {
1039 	struct selinfo *si;
1040 
1041 	mtx_assert(&sellock, MA_OWNED);
1042 	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1043 		si->si_thread = NULL;
1044 	TAILQ_INIT(&td->td_selq);
1045 }
1046 
1047 /*
1048  * Record a select request.
1049  */
1050 void
1051 selrecord(selector, sip)
1052 	struct thread *selector;
1053 	struct selinfo *sip;
1054 {
1055 
1056 	mtx_lock(&sellock);
1057 	/*
1058 	 * If the selinfo's thread pointer is NULL then take ownership of it.
1059 	 *
1060 	 * If the thread pointer is not NULL and it points to another
1061 	 * thread, then we have a collision.
1062 	 *
1063 	 * If the thread pointer is not NULL and points back to us then leave
1064 	 * it alone as we've already added pointed it at us and added it to
1065 	 * our list.
1066 	 */
1067 	if (sip->si_thread == NULL) {
1068 		sip->si_thread = selector;
1069 		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1070 	} else if (sip->si_thread != selector) {
1071 		sip->si_flags |= SI_COLL;
1072 	}
1073 
1074 	mtx_unlock(&sellock);
1075 }
1076 
1077 /* Wake up a selecting thread. */
void
selwakeup(struct selinfo *sip)
{

	/* -1 leaves the awakened thread's priority unchanged. */
	doselwakeup(sip, -1);
}
1084 
1085 /* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(struct selinfo *sip, int pri)
{

	doselwakeup(sip, pri);
}
1093 
1094 /*
1095  * Do a wakeup when a selectable event occurs.
1096  */
/*
 * "pri" is the priority to hand to cv_broadcastpri(), or -1 to leave
 * thread priorities alone.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		/*
		 * Multiple threads were selecting on this object;
		 * broadcast so every waiter rescans its descriptors.
		 */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	/* Unlink this record and wake the owning thread directly. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	sleepq_remove(td, &selwait);
	mtx_unlock(&sellock);
}
1123 
1124 static void selectinit(void *);
1125 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1126 
1127 /* ARGSUSED*/
1128 static void
1129 selectinit(dummy)
1130 	void *dummy;
1131 {
1132 	cv_init(&selwait, "select");
1133 	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1134 }
1135