xref: /freebsd/sys/kern/sys_generic.c (revision ebfbcb8bec539ff98a9a0cf36397776540ea59d0)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_compat.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/sysproto.h>
46 #include <sys/filedesc.h>
47 #include <sys/filio.h>
48 #include <sys/fcntl.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/signalvar.h>
52 #include <sys/socketvar.h>
53 #include <sys/uio.h>
54 #include <sys/kernel.h>
55 #include <sys/limits.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/resourcevar.h>
59 #include <sys/selinfo.h>
60 #include <sys/sleepqueue.h>
61 #include <sys/syscallsubr.h>
62 #include <sys/sysctl.h>
63 #include <sys/sysent.h>
64 #include <sys/vnode.h>
65 #include <sys/bio.h>
66 #include <sys/buf.h>
67 #include <sys/condvar.h>
68 #ifdef KTRACE
69 #include <sys/ktrace.h>
70 #endif
71 #include <vm/vm.h>
72 #include <vm/vm_page.h>
73 
/*
 * Malloc types used in this file:
 *   M_IOCTLOPS - staging buffer for ioctl() argument data.
 *   M_SELECT   - descriptor bit buffers in kern_select() when the
 *                on-stack buffer is too small.
 *   M_IOV      - uio structures produced by copyinuio(); not static, so
 *                visible outside this file.
 */
static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

/* Forward declarations of the file-local helpers defined below. */
static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	dofileread(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static void	doselwakeup(struct selinfo *, int);
85 
86 /*
87  * Read system call.
88  */
89 #ifndef _SYS_SYSPROTO_H_
90 struct read_args {
91 	int	fd;
92 	void	*buf;
93 	size_t	nbyte;
94 };
95 #endif
96 /*
97  * MPSAFE
98  */
99 int
100 read(td, uap)
101 	struct thread *td;
102 	struct read_args *uap;
103 {
104 	struct uio auio;
105 	struct iovec aiov;
106 	int error;
107 
108 	if (uap->nbyte > INT_MAX)
109 		return (EINVAL);
110 	aiov.iov_base = uap->buf;
111 	aiov.iov_len = uap->nbyte;
112 	auio.uio_iov = &aiov;
113 	auio.uio_iovcnt = 1;
114 	auio.uio_resid = uap->nbyte;
115 	auio.uio_segflg = UIO_USERSPACE;
116 	error = kern_readv(td, uap->fd, &auio);
117 	return(error);
118 }
119 
120 /*
121  * Positioned read system call
122  */
123 #ifndef _SYS_SYSPROTO_H_
124 struct pread_args {
125 	int	fd;
126 	void	*buf;
127 	size_t	nbyte;
128 	int	pad;
129 	off_t	offset;
130 };
131 #endif
132 /*
133  * MPSAFE
134  */
135 int
136 pread(td, uap)
137 	struct thread *td;
138 	struct pread_args *uap;
139 {
140 	struct uio auio;
141 	struct iovec aiov;
142 	int error;
143 
144 	if (uap->nbyte > INT_MAX)
145 		return (EINVAL);
146 	aiov.iov_base = uap->buf;
147 	aiov.iov_len = uap->nbyte;
148 	auio.uio_iov = &aiov;
149 	auio.uio_iovcnt = 1;
150 	auio.uio_resid = uap->nbyte;
151 	auio.uio_segflg = UIO_USERSPACE;
152 	error = kern_preadv(td, uap->fd, &auio, uap->offset);
153 	return(error);
154 }
155 
156 /*
157  * Scatter read system call.
158  */
159 #ifndef _SYS_SYSPROTO_H_
160 struct readv_args {
161 	int	fd;
162 	struct	iovec *iovp;
163 	u_int	iovcnt;
164 };
165 #endif
166 /*
167  * MPSAFE
168  */
169 int
170 readv(struct thread *td, struct readv_args *uap)
171 {
172 	struct uio *auio;
173 	int error;
174 
175 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
176 	if (error)
177 		return (error);
178 	error = kern_readv(td, uap->fd, auio);
179 	free(auio, M_IOV);
180 	return (error);
181 }
182 
183 int
184 kern_readv(struct thread *td, int fd, struct uio *auio)
185 {
186 	struct file *fp;
187 	int error;
188 
189 	error = fget_read(td, fd, &fp);
190 	if (error)
191 		return (error);
192 	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
193 	fdrop(fp, td);
194 	return (error);
195 }
196 
197 /*
198  * Scatter positioned read system call.
199  */
200 #ifndef _SYS_SYSPROTO_H_
201 struct preadv_args {
202 	int	fd;
203 	struct	iovec *iovp;
204 	u_int	iovcnt;
205 	off_t	offset;
206 };
207 #endif
208 /*
209  * MPSAFE
210  */
211 int
212 preadv(struct thread *td, struct preadv_args *uap)
213 {
214 	struct uio *auio;
215 	int error;
216 
217 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
218 	if (error)
219 		return (error);
220 	error = kern_preadv(td, uap->fd, auio, uap->offset);
221 	free(auio, M_IOV);
222 	return (error);
223 }
224 
225 int
226 kern_preadv(td, fd, auio, offset)
227 	struct thread *td;
228 	int fd;
229 	struct uio *auio;
230 	off_t offset;
231 {
232 	struct file *fp;
233 	int error;
234 
235 	error = fget_read(td, fd, &fp);
236 	if (error)
237 		return (error);
238 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
239 		error = ESPIPE;
240 	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
241 		error = EINVAL;
242 	else
243 		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
244 	fdrop(fp, td);
245 	return (error);
246 }
247 
248 /*
249  * Common code for readv and preadv that reads data in
250  * from a file using the passed in uio, offset, and flags.
251  */
252 static int
253 dofileread(td, fd, fp, auio, offset, flags)
254 	struct thread *td;
255 	int fd;
256 	struct file *fp;
257 	struct uio *auio;
258 	off_t offset;
259 	int flags;
260 {
261 	ssize_t cnt;
262 	int error;
263 #ifdef KTRACE
264 	struct uio *ktruio = NULL;
265 #endif
266 
267 	/* Finish zero length reads right here */
268 	if (auio->uio_resid == 0) {
269 		td->td_retval[0] = 0;
270 		return(0);
271 	}
272 	auio->uio_rw = UIO_READ;
273 	auio->uio_offset = offset;
274 	auio->uio_td = td;
275 #ifdef KTRACE
276 	if (KTRPOINT(td, KTR_GENIO))
277 		ktruio = cloneuio(auio);
278 #endif
279 	cnt = auio->uio_resid;
280 	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
281 		if (auio->uio_resid != cnt && (error == ERESTART ||
282 		    error == EINTR || error == EWOULDBLOCK))
283 			error = 0;
284 	}
285 	cnt -= auio->uio_resid;
286 #ifdef KTRACE
287 	if (ktruio != NULL) {
288 		ktruio->uio_resid = cnt;
289 		ktrgenio(fd, UIO_READ, ktruio, error);
290 	}
291 #endif
292 	td->td_retval[0] = cnt;
293 	return (error);
294 }
295 
296 /*
297  * Write system call
298  */
299 #ifndef _SYS_SYSPROTO_H_
300 struct write_args {
301 	int	fd;
302 	const void *buf;
303 	size_t	nbyte;
304 };
305 #endif
306 /*
307  * MPSAFE
308  */
309 int
310 write(td, uap)
311 	struct thread *td;
312 	struct write_args *uap;
313 {
314 	struct uio auio;
315 	struct iovec aiov;
316 	int error;
317 
318 	if (uap->nbyte > INT_MAX)
319 		return (EINVAL);
320 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
321 	aiov.iov_len = uap->nbyte;
322 	auio.uio_iov = &aiov;
323 	auio.uio_iovcnt = 1;
324 	auio.uio_resid = uap->nbyte;
325 	auio.uio_segflg = UIO_USERSPACE;
326 	error = kern_writev(td, uap->fd, &auio);
327 	return(error);
328 }
329 
330 /*
331  * Positioned write system call
332  */
333 #ifndef _SYS_SYSPROTO_H_
334 struct pwrite_args {
335 	int	fd;
336 	const void *buf;
337 	size_t	nbyte;
338 	int	pad;
339 	off_t	offset;
340 };
341 #endif
342 /*
343  * MPSAFE
344  */
345 int
346 pwrite(td, uap)
347 	struct thread *td;
348 	struct pwrite_args *uap;
349 {
350 	struct uio auio;
351 	struct iovec aiov;
352 	int error;
353 
354 	if (uap->nbyte > INT_MAX)
355 		return (EINVAL);
356 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
357 	aiov.iov_len = uap->nbyte;
358 	auio.uio_iov = &aiov;
359 	auio.uio_iovcnt = 1;
360 	auio.uio_resid = uap->nbyte;
361 	auio.uio_segflg = UIO_USERSPACE;
362 	error = kern_pwritev(td, uap->fd, &auio, uap->offset);
363 	return(error);
364 }
365 
366 /*
367  * Gather write system call
368  */
369 #ifndef _SYS_SYSPROTO_H_
370 struct writev_args {
371 	int	fd;
372 	struct	iovec *iovp;
373 	u_int	iovcnt;
374 };
375 #endif
376 /*
377  * MPSAFE
378  */
379 int
380 writev(struct thread *td, struct writev_args *uap)
381 {
382 	struct uio *auio;
383 	int error;
384 
385 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
386 	if (error)
387 		return (error);
388 	error = kern_writev(td, uap->fd, auio);
389 	free(auio, M_IOV);
390 	return (error);
391 }
392 
393 int
394 kern_writev(struct thread *td, int fd, struct uio *auio)
395 {
396 	struct file *fp;
397 	int error;
398 
399 	error = fget_write(td, fd, &fp);
400 	if (error)
401 		return (error);
402 	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
403 	fdrop(fp, td);
404 	return (error);
405 }
406 
407 /*
408  * Gather positioned write system call
409  */
410 #ifndef _SYS_SYSPROTO_H_
411 struct pwritev_args {
412 	int	fd;
413 	struct	iovec *iovp;
414 	u_int	iovcnt;
415 	off_t	offset;
416 };
417 #endif
418 /*
419  * MPSAFE
420  */
421 int
422 pwritev(struct thread *td, struct pwritev_args *uap)
423 {
424 	struct uio *auio;
425 	int error;
426 
427 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
428 	if (error)
429 		return (error);
430 	error = kern_pwritev(td, uap->fd, auio, uap->offset);
431 	free(auio, M_IOV);
432 	return (error);
433 }
434 
435 int
436 kern_pwritev(td, fd, auio, offset)
437 	struct thread *td;
438 	struct uio *auio;
439 	int fd;
440 	off_t offset;
441 {
442 	struct file *fp;
443 	int error;
444 
445 	error = fget_write(td, fd, &fp);
446 	if (error)
447 		return (error);
448 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
449 		error = ESPIPE;
450 	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
451 		error = EINVAL;
452 	else
453 		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
454 	fdrop(fp, td);
455 	return (error);
456 }
457 
458 /*
459  * Common code for writev and pwritev that writes data to
460  * a file using the passed in uio, offset, and flags.
461  */
462 static int
463 dofilewrite(td, fd, fp, auio, offset, flags)
464 	struct thread *td;
465 	int fd;
466 	struct file *fp;
467 	struct uio *auio;
468 	off_t offset;
469 	int flags;
470 {
471 	ssize_t cnt;
472 	int error;
473 #ifdef KTRACE
474 	struct uio *ktruio = NULL;
475 #endif
476 
477 	auio->uio_rw = UIO_WRITE;
478 	auio->uio_td = td;
479 	auio->uio_offset = offset;
480 #ifdef KTRACE
481 	if (KTRPOINT(td, KTR_GENIO))
482 		ktruio = cloneuio(auio);
483 #endif
484 	cnt = auio->uio_resid;
485 	if (fp->f_type == DTYPE_VNODE)
486 		bwillwrite();
487 	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
488 		if (auio->uio_resid != cnt && (error == ERESTART ||
489 		    error == EINTR || error == EWOULDBLOCK))
490 			error = 0;
491 		/* Socket layer is responsible for issuing SIGPIPE. */
492 		if (error == EPIPE) {
493 			PROC_LOCK(td->td_proc);
494 			psignal(td->td_proc, SIGPIPE);
495 			PROC_UNLOCK(td->td_proc);
496 		}
497 	}
498 	cnt -= auio->uio_resid;
499 #ifdef KTRACE
500 	if (ktruio != NULL) {
501 		ktruio->uio_resid = cnt;
502 		ktrgenio(fd, UIO_WRITE, ktruio, error);
503 	}
504 #endif
505 	td->td_retval[0] = cnt;
506 	return (error);
507 }
508 
509 /*
510  * Ioctl system call
511  */
512 #ifndef _SYS_SYSPROTO_H_
513 struct ioctl_args {
514 	int	fd;
515 	u_long	com;
516 	caddr_t	data;
517 };
518 #endif
519 /*
520  * MPSAFE
521  */
522 /* ARGSUSED */
523 int
524 ioctl(struct thread *td, struct ioctl_args *uap)
525 {
526 	u_long com;
527 	int arg, error;
528 	u_int size;
529 	caddr_t data;
530 
531 	if (uap->com > 0xffffffff) {
532 		printf(
533 		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
534 		    td->td_proc->p_pid, td->td_proc->p_comm, uap->com);
535 		uap->com &= 0xffffffff;
536 	}
537 	com = uap->com;
538 
539 	/*
540 	 * Interpret high order word to find amount of data to be
541 	 * copied to/from the user's address space.
542 	 */
543 	size = IOCPARM_LEN(com);
544 	if ((size > IOCPARM_MAX) ||
545 	    ((com & (IOC_VOID  | IOC_IN | IOC_OUT)) == 0) ||
546 #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
547 	    ((com & IOC_OUT) && size == 0) ||
548 #else
549 	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
550 #endif
551 	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
552 		return (ENOTTY);
553 
554 	if (size > 0) {
555 		if (!(com & IOC_VOID))
556 			data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
557 		else {
558 			/* Integer argument. */
559 			arg = (intptr_t)uap->data;
560 			data = (void *)&arg;
561 			size = 0;
562 		}
563 	} else
564 		data = (void *)&uap->data;
565 	if (com & IOC_IN) {
566 		error = copyin(uap->data, data, (u_int)size);
567 		if (error) {
568 			free(data, M_IOCTLOPS);
569 			return (error);
570 		}
571 	} else if (com & IOC_OUT) {
572 		/*
573 		 * Zero the buffer so the user always
574 		 * gets back something deterministic.
575 		 */
576 		bzero(data, size);
577 	}
578 
579 	error = kern_ioctl(td, uap->fd, com, data);
580 
581 	if (error == 0 && (com & IOC_OUT))
582 		error = copyout(data, uap->data, (u_int)size);
583 
584 	if (size > 0)
585 		free(data, M_IOCTLOPS);
586 	return (error);
587 }
588 
589 int
590 kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
591 {
592 	struct file *fp;
593 	struct filedesc *fdp;
594 	int error;
595 	int tmp;
596 
597 	if ((error = fget(td, fd, &fp)) != 0)
598 		return (error);
599 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
600 		fdrop(fp, td);
601 		return (EBADF);
602 	}
603 	fdp = td->td_proc->p_fd;
604 	switch (com) {
605 	case FIONCLEX:
606 		FILEDESC_LOCK_FAST(fdp);
607 		fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
608 		FILEDESC_UNLOCK_FAST(fdp);
609 		goto out;
610 	case FIOCLEX:
611 		FILEDESC_LOCK_FAST(fdp);
612 		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
613 		FILEDESC_UNLOCK_FAST(fdp);
614 		goto out;
615 	case FIONBIO:
616 		FILE_LOCK(fp);
617 		if ((tmp = *(int *)data))
618 			fp->f_flag |= FNONBLOCK;
619 		else
620 			fp->f_flag &= ~FNONBLOCK;
621 		FILE_UNLOCK(fp);
622 		data = (void *)&tmp;
623 		break;
624 	case FIOASYNC:
625 		FILE_LOCK(fp);
626 		if ((tmp = *(int *)data))
627 			fp->f_flag |= FASYNC;
628 		else
629 			fp->f_flag &= ~FASYNC;
630 		FILE_UNLOCK(fp);
631 		data = (void *)&tmp;
632 		break;
633 	}
634 
635 	error = fo_ioctl(fp, com, data, td->td_ucred, td);
636 out:
637 	fdrop(fp, td);
638 	return (error);
639 }
640 
/*
 * State shared by the select/poll code in this file.
 *
 * sellock and selwait are initialized in selectinit() via SYSINIT.
 * nselcoll is incremented in doselwakeup() whenever a selinfo with
 * SI_COLL set is woken, and is exported read-only as kern.nselcoll.
 */
struct mtx	sellock;	/* taken around selinfo/td_selq updates */
struct cv	selwait;	/* select()/poll() callers sleep on this */
u_int		nselcoll;	/* Select collisions since boot */
SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
648 
649 /*
650  * Select system call.
651  */
652 #ifndef _SYS_SYSPROTO_H_
653 struct select_args {
654 	int	nd;
655 	fd_set	*in, *ou, *ex;
656 	struct	timeval *tv;
657 };
658 #endif
659 /*
660  * MPSAFE
661  */
662 int
663 select(td, uap)
664 	register struct thread *td;
665 	register struct select_args *uap;
666 {
667 	struct timeval tv, *tvp;
668 	int error;
669 
670 	if (uap->tv != NULL) {
671 		error = copyin(uap->tv, &tv, sizeof(tv));
672 		if (error)
673 			return (error);
674 		tvp = &tv;
675 	} else
676 		tvp = NULL;
677 
678 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
679 }
680 
/*
 * Common implementation of select().
 *
 * nd is the number of descriptors to examine; fd_in, fd_ou and fd_ex
 * are user-space descriptor sets (each may be NULL); tvp is a timeout
 * already in kernel memory, or NULL for no timeout.
 *
 * Returns EINVAL for a negative nd or a timeout rejected by
 * itimerfix(), a copyin()/copyout() error, an error from selscan(), or
 * EINTR when the sleep is interrupted.  On success the result sets are
 * copied back out and td->td_retval[0] holds the count stored by
 * selscan().
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;

	FILEDESC_LOCK_FAST(fdp);

	/* Clamp nd to the size of the process's descriptor table. */
	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK_FAST(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	/* Each non-NULL set needs an input copy and an output copy. */
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 *
	 * Resulting layout: the first half of selbits holds the output
	 * sets, the second half holds the input sets copied from the user.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	/* Clear the output half only. */
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/*
	 * Convert the relative timeout into an absolute uptime deadline in
	 * atv.  An all-zero atv means "no deadline" in the loop below.
	 */
	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count and mark this thread as selecting. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	/* The scan runs without sellock held. */
	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		/* Deadline already passed: return with nothing ready. */
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Sleep for at most 24 hours' worth of ticks at a time. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	/* A normal wakeup rescans; any other result falls through. */
	if (error == 0)
		goto retry;

done:
	/* Reached with sellock held. */
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	/* EWOULDBLOCK from the timed sleep is reported as success. */
	if (error == EWOULDBLOCK)
		error = 0;
	/* Copy one result set back to the user; keeps the latest failure. */
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	/* Free the bit buffer only if it was not the on-stack one. */
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}
838 
839 static int
840 selscan(td, ibits, obits, nfd)
841 	struct thread *td;
842 	fd_mask **ibits, **obits;
843 	int nfd;
844 {
845 	int msk, i, fd;
846 	fd_mask bits;
847 	struct file *fp;
848 	int n = 0;
849 	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
850 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
851 	struct filedesc *fdp = td->td_proc->p_fd;
852 
853 	FILEDESC_LOCK(fdp);
854 	for (msk = 0; msk < 3; msk++) {
855 		if (ibits[msk] == NULL)
856 			continue;
857 		for (i = 0; i < nfd; i += NFDBITS) {
858 			bits = ibits[msk][i/NFDBITS];
859 			/* ffs(int mask) not portable, fd_mask is long */
860 			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
861 				if (!(bits & 1))
862 					continue;
863 				if ((fp = fget_locked(fdp, fd)) == NULL) {
864 					FILEDESC_UNLOCK(fdp);
865 					return (EBADF);
866 				}
867 				if (fo_poll(fp, flag[msk], td->td_ucred,
868 				    td)) {
869 					obits[msk][(fd)/NFDBITS] |=
870 					    ((fd_mask)1 << ((fd) % NFDBITS));
871 					n++;
872 				}
873 			}
874 		}
875 	}
876 	FILEDESC_UNLOCK(fdp);
877 	td->td_retval[0] = n;
878 	return (0);
879 }
880 
/*
 * Poll system call.
 *
 * Copies in the caller's pollfd array, repeatedly scans it with
 * pollscan() until something is ready, the timeout expires or the
 * sleep is interrupted, and copies the array (with revents filled in)
 * back out.  A timeout of INFTIM means no deadline.
 *
 * Returns EINVAL when nfds exceeds both the RLIMIT_NOFILE soft limit
 * and FD_SETSIZE, or when the timeout is rejected by itimerfix(); a
 * copyin()/copyout() error; or EINTR when interrupted.  On success
 * td->td_retval[0] holds the count stored by pollscan().
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	struct pollfd *bits;
	struct pollfd smallbits[32];	/* on-stack buffer for small arrays */
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	/* Fall back to the heap only when the array does not fit on-stack. */
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	/*
	 * Convert the millisecond timeout into an absolute uptime deadline
	 * in atv.  An all-zero atv means "no deadline" in the loop below.
	 */
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count and mark this thread as selecting. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	/* The scan runs without sellock held. */
	error = pollscan(td, bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		/* Deadline already passed: return with nothing ready. */
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Sleep for at most 24 hours' worth of ticks at a time. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	/* A normal wakeup rescans; any other result falls through. */
	if (error == 0)
		goto retry;

done:
	/* Reached with sellock held. */
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	/* EWOULDBLOCK from the timed sleep is reported as success. */
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	/* Free the array only if it was heap-allocated above. */
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	return (error);
}
1011 
1012 static int
1013 pollscan(td, fds, nfd)
1014 	struct thread *td;
1015 	struct pollfd *fds;
1016 	u_int nfd;
1017 {
1018 	register struct filedesc *fdp = td->td_proc->p_fd;
1019 	int i;
1020 	struct file *fp;
1021 	int n = 0;
1022 
1023 	FILEDESC_LOCK(fdp);
1024 	for (i = 0; i < nfd; i++, fds++) {
1025 		if (fds->fd >= fdp->fd_nfiles) {
1026 			fds->revents = POLLNVAL;
1027 			n++;
1028 		} else if (fds->fd < 0) {
1029 			fds->revents = 0;
1030 		} else {
1031 			fp = fdp->fd_ofiles[fds->fd];
1032 			if (fp == NULL) {
1033 				fds->revents = POLLNVAL;
1034 				n++;
1035 			} else {
1036 				/*
1037 				 * Note: backend also returns POLLHUP and
1038 				 * POLLERR if appropriate.
1039 				 */
1040 				fds->revents = fo_poll(fp, fds->events,
1041 				    td->td_ucred, td);
1042 				if (fds->revents != 0)
1043 					n++;
1044 			}
1045 		}
1046 	}
1047 	FILEDESC_UNLOCK(fdp);
1048 	td->td_retval[0] = n;
1049 	return (0);
1050 }
1051 
1052 /*
1053  * OpenBSD poll system call.
1054  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1055  */
1056 #ifndef _SYS_SYSPROTO_H_
1057 struct openbsd_poll_args {
1058 	struct pollfd *fds;
1059 	u_int	nfds;
1060 	int	timeout;
1061 };
1062 #endif
1063 /*
1064  * MPSAFE
1065  */
1066 int
1067 openbsd_poll(td, uap)
1068 	register struct thread *td;
1069 	register struct openbsd_poll_args *uap;
1070 {
1071 	return (poll(td, (struct poll_args *)uap));
1072 }
1073 
1074 /*
1075  * Remove the references to the thread from all of the objects
1076  * we were polling.
1077  *
1078  * This code assumes that the underlying owner of the selinfo
1079  * structure will hold sellock before it changes it, and that
1080  * it will unlink itself from our list if it goes away.
1081  */
1082 void
1083 clear_selinfo_list(td)
1084 	struct thread *td;
1085 {
1086 	struct selinfo *si;
1087 
1088 	mtx_assert(&sellock, MA_OWNED);
1089 	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1090 		si->si_thread = NULL;
1091 	TAILQ_INIT(&td->td_selq);
1092 }
1093 
1094 /*
1095  * Record a select request.
1096  */
1097 void
1098 selrecord(selector, sip)
1099 	struct thread *selector;
1100 	struct selinfo *sip;
1101 {
1102 
1103 	mtx_lock(&sellock);
1104 	/*
1105 	 * If the selinfo's thread pointer is NULL then take ownership of it.
1106 	 *
1107 	 * If the thread pointer is not NULL and it points to another
1108 	 * thread, then we have a collision.
1109 	 *
1110 	 * If the thread pointer is not NULL and points back to us then leave
1111 	 * it alone as we've already added pointed it at us and added it to
1112 	 * our list.
1113 	 */
1114 	if (sip->si_thread == NULL) {
1115 		sip->si_thread = selector;
1116 		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1117 	} else if (sip->si_thread != selector) {
1118 		sip->si_flags |= SI_COLL;
1119 	}
1120 
1121 	mtx_unlock(&sellock);
1122 }
1123 
/*
 * Wake up a selecting thread.  Forwards to doselwakeup() with a
 * priority argument of -1.
 */
void
selwakeup(struct selinfo *sip)
{

	doselwakeup(sip, -1);
}
1131 
/*
 * Wake up a selecting thread, and set its priority.  Forwards to
 * doselwakeup() with the caller's pri.
 */
void
selwakeuppri(struct selinfo *sip, int pri)
{

	doselwakeup(sip, pri);
}
1140 
1141 /*
1142  * Do a wakeup when a selectable event occurs.
1143  */
1144 static void
1145 doselwakeup(sip, pri)
1146 	struct selinfo *sip;
1147 	int pri;
1148 {
1149 	struct thread *td;
1150 
1151 	mtx_lock(&sellock);
1152 	td = sip->si_thread;
1153 	if ((sip->si_flags & SI_COLL) != 0) {
1154 		nselcoll++;
1155 		sip->si_flags &= ~SI_COLL;
1156 		cv_broadcastpri(&selwait, pri);
1157 	}
1158 	if (td == NULL) {
1159 		mtx_unlock(&sellock);
1160 		return;
1161 	}
1162 	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
1163 	sip->si_thread = NULL;
1164 	mtx_lock_spin(&sched_lock);
1165 	td->td_flags &= ~TDF_SELECT;
1166 	mtx_unlock_spin(&sched_lock);
1167 	sleepq_remove(td, &selwait);
1168 	mtx_unlock(&sellock);
1169 }
1170 
1171 static void selectinit(void *);
1172 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1173 
1174 /* ARGSUSED*/
1175 static void
1176 selectinit(dummy)
1177 	void *dummy;
1178 {
1179 	cv_init(&selwait, "select");
1180 	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1181 }
1182