xref: /freebsd/sys/kern/sys_generic.c (revision 35a04710d7286aa9538917fd7f8e417dbee95b82)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_compat.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/sysproto.h>
46 #include <sys/filedesc.h>
47 #include <sys/filio.h>
48 #include <sys/fcntl.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/signalvar.h>
52 #include <sys/socketvar.h>
53 #include <sys/uio.h>
54 #include <sys/kernel.h>
55 #include <sys/limits.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/resourcevar.h>
59 #include <sys/selinfo.h>
60 #include <sys/sleepqueue.h>
61 #include <sys/syscallsubr.h>
62 #include <sys/sysctl.h>
63 #include <sys/sysent.h>
64 #include <sys/vnode.h>
65 #include <sys/bio.h>
66 #include <sys/buf.h>
67 #include <sys/condvar.h>
68 #ifdef KTRACE
69 #include <sys/ktrace.h>
70 #endif
71 
72 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
73 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
74 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
75 
76 static int	pollscan(struct thread *, struct pollfd *, u_int);
77 static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
78 static int	dofileread(struct thread *, int, struct file *, struct uio *,
79 		    off_t, int);
80 static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
81 		    off_t, int);
82 static void	doselwakeup(struct selinfo *, int);
83 
84 #ifndef _SYS_SYSPROTO_H_
85 struct read_args {
86 	int	fd;
87 	void	*buf;
88 	size_t	nbyte;
89 };
90 #endif
91 int
92 read(td, uap)
93 	struct thread *td;
94 	struct read_args *uap;
95 {
96 	struct uio auio;
97 	struct iovec aiov;
98 	int error;
99 
100 	if (uap->nbyte > INT_MAX)
101 		return (EINVAL);
102 	aiov.iov_base = uap->buf;
103 	aiov.iov_len = uap->nbyte;
104 	auio.uio_iov = &aiov;
105 	auio.uio_iovcnt = 1;
106 	auio.uio_resid = uap->nbyte;
107 	auio.uio_segflg = UIO_USERSPACE;
108 	error = kern_readv(td, uap->fd, &auio);
109 	return (error);
110 }
111 
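/*
 * Usage note (userland sketch, not part of this file): read(2) is the
 * single-buffer case of readv(2); both end up in kern_readv() below.
 * Assumes <unistd.h> and <sys/uio.h>:
 *
 *	char buf[128];
 *	ssize_t n1 = read(fd, buf, sizeof(buf));
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *	ssize_t n2 = readv(fd, &iov, 1);	(equivalent one-segment read)
 */
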
112 /*
113  * Positioned read system call
114  */
115 #ifndef _SYS_SYSPROTO_H_
116 struct pread_args {
117 	int	fd;
118 	void	*buf;
119 	size_t	nbyte;
120 	int	pad;
121 	off_t	offset;
122 };
123 #endif
124 int
125 pread(td, uap)
126 	struct thread *td;
127 	struct pread_args *uap;
128 {
129 	struct uio auio;
130 	struct iovec aiov;
131 	int error;
132 
133 	if (uap->nbyte > INT_MAX)
134 		return (EINVAL);
135 	aiov.iov_base = uap->buf;
136 	aiov.iov_len = uap->nbyte;
137 	auio.uio_iov = &aiov;
138 	auio.uio_iovcnt = 1;
139 	auio.uio_resid = uap->nbyte;
140 	auio.uio_segflg = UIO_USERSPACE;
141 	error = kern_preadv(td, uap->fd, &auio, uap->offset);
142 	return (error);
143 }
144 
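/*
 * Usage note (userland sketch): pread(2) reads at an explicit offset and
 * leaves the descriptor's seek position untouched, which makes it safe for
 * concurrent readers sharing one descriptor; non-seekable descriptors fail
 * with ESPIPE (see kern_preadv() below).  Assumes <unistd.h> and <err.h>:
 *
 *	char hdr[64];
 *	if (pread(fd, hdr, sizeof(hdr), (off_t)0) == -1)
 *		warn("pread");		(reads at offset 0, lseek position kept)
 */
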
145 int
146 freebsd6_pread(td, uap)
147 	struct thread *td;
148 	struct freebsd6_pread_args *uap;
149 {
150 	struct pread_args oargs;
151 
152 	oargs.fd = uap->fd;
153 	oargs.buf = uap->buf;
154 	oargs.nbyte = uap->nbyte;
155 	oargs.offset = uap->offset;
156 	return (pread(td, &oargs));
157 }
158 
159 /*
160  * Scatter read system call.
161  */
162 #ifndef _SYS_SYSPROTO_H_
163 struct readv_args {
164 	int	fd;
165 	struct	iovec *iovp;
166 	u_int	iovcnt;
167 };
168 #endif
169 int
170 readv(struct thread *td, struct readv_args *uap)
171 {
172 	struct uio *auio;
173 	int error;
174 
175 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
176 	if (error)
177 		return (error);
178 	error = kern_readv(td, uap->fd, auio);
179 	free(auio, M_IOV);
180 	return (error);
181 }
182 
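/*
 * Usage note (userland sketch): a scatter read fills the iovec segments in
 * order; the kernel copies the iovec array in via copyinuio() and drives a
 * single struct uio from it.  The struct hdr type is illustrative only.
 * Assumes <sys/uio.h>:
 *
 *	struct hdr h;
 *	char payload[512];
 *	struct iovec iov[2] = {
 *		{ .iov_base = &h,      .iov_len = sizeof(h) },
 *		{ .iov_base = payload, .iov_len = sizeof(payload) },
 *	};
 *	ssize_t n = readv(fd, iov, 2);		(fills h first, then payload)
 */
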
183 int
184 kern_readv(struct thread *td, int fd, struct uio *auio)
185 {
186 	struct file *fp;
187 	int error;
188 
189 	error = fget_read(td, fd, &fp);
190 	if (error)
191 		return (error);
192 	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
193 	fdrop(fp, td);
194 	return (error);
195 }
196 
197 /*
198  * Scatter positioned read system call.
199  */
200 #ifndef _SYS_SYSPROTO_H_
201 struct preadv_args {
202 	int	fd;
203 	struct	iovec *iovp;
204 	u_int	iovcnt;
205 	off_t	offset;
206 };
207 #endif
208 int
209 preadv(struct thread *td, struct preadv_args *uap)
210 {
211 	struct uio *auio;
212 	int error;
213 
214 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
215 	if (error)
216 		return (error);
217 	error = kern_preadv(td, uap->fd, auio, uap->offset);
218 	free(auio, M_IOV);
219 	return (error);
220 }
221 
222 int
223 kern_preadv(td, fd, auio, offset)
224 	struct thread *td;
225 	int fd;
226 	struct uio *auio;
227 	off_t offset;
228 {
229 	struct file *fp;
230 	int error;
231 
232 	error = fget_read(td, fd, &fp);
233 	if (error)
234 		return (error);
235 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
236 		error = ESPIPE;
237 	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
238 		error = EINVAL;
239 	else
240 		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
241 	fdrop(fp, td);
242 	return (error);
243 }
244 
245 /*
246  * Common code for readv and preadv that reads data in
247  * from a file using the passed in uio, offset, and flags.
248  */
249 static int
250 dofileread(td, fd, fp, auio, offset, flags)
251 	struct thread *td;
252 	int fd;
253 	struct file *fp;
254 	struct uio *auio;
255 	off_t offset;
256 	int flags;
257 {
258 	ssize_t cnt;
259 	int error;
260 #ifdef KTRACE
261 	struct uio *ktruio = NULL;
262 #endif
263 
264 	/* Finish zero-length reads right here. */
265 	if (auio->uio_resid == 0) {
266 		td->td_retval[0] = 0;
267 		return (0);
268 	}
269 	auio->uio_rw = UIO_READ;
270 	auio->uio_offset = offset;
271 	auio->uio_td = td;
272 #ifdef KTRACE
273 	if (KTRPOINT(td, KTR_GENIO))
274 		ktruio = cloneuio(auio);
275 #endif
276 	cnt = auio->uio_resid;
277 	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
278 		if (auio->uio_resid != cnt && (error == ERESTART ||
279 		    error == EINTR || error == EWOULDBLOCK))
280 			error = 0;
281 	}
282 	cnt -= auio->uio_resid;
283 #ifdef KTRACE
284 	if (ktruio != NULL) {
285 		ktruio->uio_resid = cnt;
286 		ktrgenio(fd, UIO_READ, ktruio, error);
287 	}
288 #endif
289 	td->td_retval[0] = cnt;
290 	return (error);
291 }
292 
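/*
 * Note on partial transfers: if the read is interrupted (ERESTART, EINTR or
 * EWOULDBLOCK) after some bytes have already been copied, dofileread()
 * discards the error and returns the short count, so callers must expect
 * short reads.  A minimal userland sketch (assumes <unistd.h>, <errno.h>
 * and <err.h>):
 *
 *	size_t done = 0;
 *	while (done < len) {
 *		ssize_t n = read(fd, buf + done, len - done);
 *		if (n == 0)
 *			break;			(end of file)
 *		if (n == -1) {
 *			if (errno == EINTR)
 *				continue;
 *			err(1, "read");
 *		}
 *		done += (size_t)n;
 *	}
 */
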
293 #ifndef _SYS_SYSPROTO_H_
294 struct write_args {
295 	int	fd;
296 	const void *buf;
297 	size_t	nbyte;
298 };
299 #endif
300 int
301 write(td, uap)
302 	struct thread *td;
303 	struct write_args *uap;
304 {
305 	struct uio auio;
306 	struct iovec aiov;
307 	int error;
308 
309 	if (uap->nbyte > INT_MAX)
310 		return (EINVAL);
311 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
312 	aiov.iov_len = uap->nbyte;
313 	auio.uio_iov = &aiov;
314 	auio.uio_iovcnt = 1;
315 	auio.uio_resid = uap->nbyte;
316 	auio.uio_segflg = UIO_USERSPACE;
317 	error = kern_writev(td, uap->fd, &auio);
318 	return (error);
319 }
320 
321 /*
322  * Positioned write system call.
323  */
324 #ifndef _SYS_SYSPROTO_H_
325 struct pwrite_args {
326 	int	fd;
327 	const void *buf;
328 	size_t	nbyte;
329 	int	pad;
330 	off_t	offset;
331 };
332 #endif
333 int
334 pwrite(td, uap)
335 	struct thread *td;
336 	struct pwrite_args *uap;
337 {
338 	struct uio auio;
339 	struct iovec aiov;
340 	int error;
341 
342 	if (uap->nbyte > INT_MAX)
343 		return (EINVAL);
344 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
345 	aiov.iov_len = uap->nbyte;
346 	auio.uio_iov = &aiov;
347 	auio.uio_iovcnt = 1;
348 	auio.uio_resid = uap->nbyte;
349 	auio.uio_segflg = UIO_USERSPACE;
350 	error = kern_pwritev(td, uap->fd, &auio, uap->offset);
351 	return (error);
352 }
353 
354 int
355 freebsd6_pwrite(td, uap)
356 	struct thread *td;
357 	struct freebsd6_pwrite_args *uap;
358 {
359 	struct pwrite_args oargs;
360 
361 	oargs.fd = uap->fd;
362 	oargs.buf = uap->buf;
363 	oargs.nbyte = uap->nbyte;
364 	oargs.offset = uap->offset;
365 	return (pwrite(td, &oargs));
366 }
367 
368 /*
369  * Gather write system call.
370  */
371 #ifndef _SYS_SYSPROTO_H_
372 struct writev_args {
373 	int	fd;
374 	struct	iovec *iovp;
375 	u_int	iovcnt;
376 };
377 #endif
378 int
379 writev(struct thread *td, struct writev_args *uap)
380 {
381 	struct uio *auio;
382 	int error;
383 
384 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
385 	if (error)
386 		return (error);
387 	error = kern_writev(td, uap->fd, auio);
388 	free(auio, M_IOV);
389 	return (error);
390 }
391 
392 int
393 kern_writev(struct thread *td, int fd, struct uio *auio)
394 {
395 	struct file *fp;
396 	int error;
397 
398 	error = fget_write(td, fd, &fp);
399 	if (error)
400 		return (error);
401 	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
402 	fdrop(fp, td);
403 	return (error);
404 }
405 
406 /*
407  * Gather positioned write system call.
408  */
409 #ifndef _SYS_SYSPROTO_H_
410 struct pwritev_args {
411 	int	fd;
412 	struct	iovec *iovp;
413 	u_int	iovcnt;
414 	off_t	offset;
415 };
416 #endif
417 int
418 pwritev(struct thread *td, struct pwritev_args *uap)
419 {
420 	struct uio *auio;
421 	int error;
422 
423 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
424 	if (error)
425 		return (error);
426 	error = kern_pwritev(td, uap->fd, auio, uap->offset);
427 	free(auio, M_IOV);
428 	return (error);
429 }
430 
431 int
432 kern_pwritev(td, fd, auio, offset)
433 	struct thread *td;
434 	int fd;
435 	struct uio *auio;
436 	off_t offset;
437 {
438 	struct file *fp;
439 	int error;
440 
441 	error = fget_write(td, fd, &fp);
442 	if (error)
443 		return (error);
444 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
445 		error = ESPIPE;
446 	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
447 		error = EINVAL;
448 	else
449 		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
450 	fdrop(fp, td);
451 	return (error);
452 }
453 
454 /*
455  * Common code for writev and pwritev that writes data to
456  * a file using the passed in uio, offset, and flags.
457  */
458 static int
459 dofilewrite(td, fd, fp, auio, offset, flags)
460 	struct thread *td;
461 	int fd;
462 	struct file *fp;
463 	struct uio *auio;
464 	off_t offset;
465 	int flags;
466 {
467 	ssize_t cnt;
468 	int error;
469 #ifdef KTRACE
470 	struct uio *ktruio = NULL;
471 #endif
472 
473 	auio->uio_rw = UIO_WRITE;
474 	auio->uio_td = td;
475 	auio->uio_offset = offset;
476 #ifdef KTRACE
477 	if (KTRPOINT(td, KTR_GENIO))
478 		ktruio = cloneuio(auio);
479 #endif
480 	cnt = auio->uio_resid;
481 	if (fp->f_type == DTYPE_VNODE)
482 		bwillwrite();
483 	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
484 		if (auio->uio_resid != cnt && (error == ERESTART ||
485 		    error == EINTR || error == EWOULDBLOCK))
486 			error = 0;
487 		/* Socket layer is responsible for issuing SIGPIPE. */
488 		if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
489 			PROC_LOCK(td->td_proc);
490 			psignal(td->td_proc, SIGPIPE);
491 			PROC_UNLOCK(td->td_proc);
492 		}
493 	}
494 	cnt -= auio->uio_resid;
495 #ifdef KTRACE
496 	if (ktruio != NULL) {
497 		ktruio->uio_resid = cnt;
498 		ktrgenio(fd, UIO_WRITE, ktruio, error);
499 	}
500 #endif
501 	td->td_retval[0] = cnt;
502 	return (error);
503 }
504 
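/*
 * Note on EPIPE: for non-socket descriptors dofilewrite() raises SIGPIPE in
 * addition to returning EPIPE; sockets are left to the socket layer.  A
 * userland writer that prefers the errno over the signal typically ignores
 * SIGPIPE first (handle_closed_peer() is an illustrative handler; assumes
 * <signal.h>, <errno.h> and <unistd.h>):
 *
 *	signal(SIGPIPE, SIG_IGN);
 *	if (write(fd, buf, len) == -1 && errno == EPIPE)
 *		handle_closed_peer();
 */
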
505 #ifndef _SYS_SYSPROTO_H_
506 struct ioctl_args {
507 	int	fd;
508 	u_long	com;
509 	caddr_t	data;
510 };
511 #endif
512 /* ARGSUSED */
513 int
514 ioctl(struct thread *td, struct ioctl_args *uap)
515 {
516 	u_long com;
517 	int arg, error;
518 	u_int size;
519 	caddr_t data;
520 
521 	if (uap->com > 0xffffffff) {
522 		printf(
523 		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
524 		    td->td_proc->p_pid, td->td_name, uap->com);
525 		uap->com &= 0xffffffff;
526 	}
527 	com = uap->com;
528 
529 	/*
530 	 * Interpret high order word to find amount of data to be
531 	 * copied to/from the user's address space.
532 	 */
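	/*
	 * The command word is built by the _IO*() macros in <sys/ioccom.h>:
	 * bits 0-7 carry the command number, bits 8-15 the group, bits 16-28
	 * the parameter length (what IOCPARM_LEN() extracts), and the top
	 * bits the IOC_VOID/IOC_IN/IOC_OUT direction flags.  For example,
	 * FIONBIO is _IOW('f', 126, int): IOC_IN with a sizeof(int) argument
	 * copied in from user space.
	 */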
533 	size = IOCPARM_LEN(com);
534 	if ((size > IOCPARM_MAX) ||
535 	    ((com & (IOC_VOID  | IOC_IN | IOC_OUT)) == 0) ||
536 #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
537 	    ((com & IOC_OUT) && size == 0) ||
538 #else
539 	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
540 #endif
541 	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
542 		return (ENOTTY);
543 
544 	if (size > 0) {
545 		if (!(com & IOC_VOID))
546 			data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
547 		else {
548 			/* Integer argument. */
549 			arg = (intptr_t)uap->data;
550 			data = (void *)&arg;
551 			size = 0;
552 		}
553 	} else
554 		data = (void *)&uap->data;
555 	if (com & IOC_IN) {
556 		error = copyin(uap->data, data, (u_int)size);
557 		if (error) {
558 			if (size > 0)
559 				free(data, M_IOCTLOPS);
560 			return (error);
561 		}
562 	} else if (com & IOC_OUT) {
563 		/*
564 		 * Zero the buffer so the user always
565 		 * gets back something deterministic.
566 		 */
567 		bzero(data, size);
568 	}
569 
570 	error = kern_ioctl(td, uap->fd, com, data);
571 
572 	if (error == 0 && (com & IOC_OUT))
573 		error = copyout(data, uap->data, (u_int)size);
574 
575 	if (size > 0)
576 		free(data, M_IOCTLOPS);
577 	return (error);
578 }
579 
580 int
581 kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
582 {
583 	struct file *fp;
584 	struct filedesc *fdp;
585 	int error;
586 	int tmp;
587 
588 	if ((error = fget(td, fd, &fp)) != 0)
589 		return (error);
590 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
591 		fdrop(fp, td);
592 		return (EBADF);
593 	}
594 	fdp = td->td_proc->p_fd;
595 	switch (com) {
596 	case FIONCLEX:
597 		FILEDESC_XLOCK(fdp);
598 		fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
599 		FILEDESC_XUNLOCK(fdp);
600 		goto out;
601 	case FIOCLEX:
602 		FILEDESC_XLOCK(fdp);
603 		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
604 		FILEDESC_XUNLOCK(fdp);
605 		goto out;
606 	case FIONBIO:
607 		FILE_LOCK(fp);
608 		if ((tmp = *(int *)data))
609 			fp->f_flag |= FNONBLOCK;
610 		else
611 			fp->f_flag &= ~FNONBLOCK;
612 		FILE_UNLOCK(fp);
613 		data = (void *)&tmp;
614 		break;
615 	case FIOASYNC:
616 		FILE_LOCK(fp);
617 		if ((tmp = *(int *)data))
618 			fp->f_flag |= FASYNC;
619 		else
620 			fp->f_flag &= ~FASYNC;
621 		FILE_UNLOCK(fp);
622 		data = (void *)&tmp;
623 		break;
624 	}
625 
626 	error = fo_ioctl(fp, com, data, td->td_ucred, td);
627 out:
628 	fdrop(fp, td);
629 	return (error);
630 }
631 
632 /*
633  * sellock and selwait are initialized in selectinit() via SYSINIT.
634  */
635 struct mtx	sellock;
636 struct cv	selwait;
637 u_int		nselcoll;	/* Select collisions since boot */
638 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
639 
640 #ifndef _SYS_SYSPROTO_H_
641 struct select_args {
642 	int	nd;
643 	fd_set	*in, *ou, *ex;
644 	struct	timeval *tv;
645 };
646 #endif
647 int
648 select(td, uap)
649 	register struct thread *td;
650 	register struct select_args *uap;
651 {
652 	struct timeval tv, *tvp;
653 	int error;
654 
655 	if (uap->tv != NULL) {
656 		error = copyin(uap->tv, &tv, sizeof(tv));
657 		if (error)
658 			return (error);
659 		tvp = &tv;
660 	} else
661 		tvp = NULL;
662 
663 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
664 }
665 
666 int
667 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
668     fd_set *fd_ex, struct timeval *tvp)
669 {
670 	struct filedesc *fdp;
671 	/*
672 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
673 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
674 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
675 	 * of 256.
676 	 */
677 	fd_mask s_selbits[howmany(2048, NFDBITS)];
678 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
679 	struct timeval atv, rtv, ttv;
680 	int error, timo;
681 	u_int ncoll, nbufbytes, ncpbytes, nfdbits;
682 
683 	if (nd < 0)
684 		return (EINVAL);
685 	fdp = td->td_proc->p_fd;
686 
687 	FILEDESC_SLOCK(fdp);
688 	if (nd > fdp->fd_nfiles)
689 		nd = fdp->fd_nfiles;   /* forgiving; slightly wrong */
690 	FILEDESC_SUNLOCK(fdp);
691 
692 	/*
693 	 * Allocate just enough bits for the non-null fd_sets.  Use the
694 	 * preallocated auto buffer if possible.
695 	 */
696 	nfdbits = roundup(nd, NFDBITS);
697 	ncpbytes = nfdbits / NBBY;
698 	nbufbytes = 0;
699 	if (fd_in != NULL)
700 		nbufbytes += 2 * ncpbytes;
701 	if (fd_ou != NULL)
702 		nbufbytes += 2 * ncpbytes;
703 	if (fd_ex != NULL)
704 		nbufbytes += 2 * ncpbytes;
705 	if (nbufbytes <= sizeof s_selbits)
706 		selbits = &s_selbits[0];
707 	else
708 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
709 
710 	/*
711 	 * Assign pointers into the bit buffers and fetch the input bits.
712 	 * Put the output buffers together so that they can be bzeroed
713 	 * together.
714 	 */
715 	sbp = selbits;
716 #define	getbits(name, x) \
717 	do {								\
718 		if (name == NULL)					\
719 			ibits[x] = NULL;				\
720 		else {							\
721 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
722 			obits[x] = sbp;					\
723 			sbp += ncpbytes / sizeof *sbp;			\
724 			error = copyin(name, ibits[x], ncpbytes);	\
725 			if (error != 0)					\
726 				goto done_nosellock;			\
727 		}							\
728 	} while (0)
729 	getbits(fd_in, 0);
730 	getbits(fd_ou, 1);
731 	getbits(fd_ex, 2);
732 #undef	getbits
733 	if (nbufbytes != 0)
734 		bzero(selbits, nbufbytes / 2);
735 
736 	if (tvp != NULL) {
737 		atv = *tvp;
738 		if (itimerfix(&atv)) {
739 			error = EINVAL;
740 			goto done_nosellock;
741 		}
742 		getmicrouptime(&rtv);
743 		timevaladd(&atv, &rtv);
744 	} else {
745 		atv.tv_sec = 0;
746 		atv.tv_usec = 0;
747 	}
748 	timo = 0;
749 	TAILQ_INIT(&td->td_selq);
750 	mtx_lock(&sellock);
751 retry:
752 	ncoll = nselcoll;
753 	thread_lock(td);
754 	td->td_flags |= TDF_SELECT;
755 	thread_unlock(td);
756 	mtx_unlock(&sellock);
757 
758 	error = selscan(td, ibits, obits, nd);
759 	mtx_lock(&sellock);
760 	if (error || td->td_retval[0])
761 		goto done;
762 	if (atv.tv_sec || atv.tv_usec) {
763 		getmicrouptime(&rtv);
764 		if (timevalcmp(&rtv, &atv, >=))
765 			goto done;
766 		ttv = atv;
767 		timevalsub(&ttv, &rtv);
768 		timo = ttv.tv_sec > 24 * 60 * 60 ?
769 		    24 * 60 * 60 * hz : tvtohz(&ttv);
770 	}
771 
772 	/*
773 	 * An event of interest may occur while we do not hold
774 	 * sellock, so check TDF_SELECT and the number of
775 	 * collisions and rescan the file descriptors if
776 	 * necessary.
777 	 */
778 	thread_lock(td);
779 	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
780 		thread_unlock(td);
781 		goto retry;
782 	}
783 	thread_unlock(td);
784 
785 	if (timo > 0)
786 		error = cv_timedwait_sig(&selwait, &sellock, timo);
787 	else
788 		error = cv_wait_sig(&selwait, &sellock);
789 
790 	if (error == 0)
791 		goto retry;
792 
793 done:
794 	clear_selinfo_list(td);
795 	thread_lock(td);
796 	td->td_flags &= ~TDF_SELECT;
797 	thread_unlock(td);
798 	mtx_unlock(&sellock);
799 
800 done_nosellock:
801 	/* select is not restarted after signals... */
802 	if (error == ERESTART)
803 		error = EINTR;
804 	if (error == EWOULDBLOCK)
805 		error = 0;
806 #define	putbits(name, x) \
807 	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
808 		error = error2;
809 	if (error == 0) {
810 		int error2;
811 
812 		putbits(fd_in, 0);
813 		putbits(fd_ou, 1);
814 		putbits(fd_ex, 2);
815 #undef putbits
816 	}
817 	if (selbits != &s_selbits[0])
818 		free(selbits, M_SELECT);
819 
820 	return (error);
821 }
822 
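/*
 * Usage note (userland sketch): only the bits covering the first nd
 * descriptors are copied in and out, and the timeout is copied in by
 * value, so select(2) does not update the caller's timeval on return.
 * handle_readable() is illustrative; assumes <sys/select.h> and <err.h>:
 *
 *	fd_set rfds;
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *
 *	FD_ZERO(&rfds);
 *	FD_SET(fd, &rfds);
 *	if (select(fd + 1, &rfds, NULL, NULL, &tv) == -1)
 *		err(1, "select");
 *	if (FD_ISSET(fd, &rfds))
 *		handle_readable(fd);
 */
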
823 static int
824 selscan(td, ibits, obits, nfd)
825 	struct thread *td;
826 	fd_mask **ibits, **obits;
827 	int nfd;
828 {
829 	int msk, i, fd;
830 	fd_mask bits;
831 	struct file *fp;
832 	int n = 0;
833 	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
834 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
835 	struct filedesc *fdp = td->td_proc->p_fd;
836 
837 	FILEDESC_SLOCK(fdp);
838 	for (msk = 0; msk < 3; msk++) {
839 		if (ibits[msk] == NULL)
840 			continue;
841 		for (i = 0; i < nfd; i += NFDBITS) {
842 			bits = ibits[msk][i/NFDBITS];
843 			/* ffs(int mask) not portable, fd_mask is long */
844 			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
845 				if (!(bits & 1))
846 					continue;
847 				if ((fp = fget_locked(fdp, fd)) == NULL) {
848 					FILEDESC_SUNLOCK(fdp);
849 					return (EBADF);
850 				}
851 				if (fo_poll(fp, flag[msk], td->td_ucred,
852 				    td)) {
853 					obits[msk][(fd)/NFDBITS] |=
854 					    ((fd_mask)1 << ((fd) % NFDBITS));
855 					n++;
856 				}
857 			}
858 		}
859 	}
860 	FILEDESC_SUNLOCK(fdp);
861 	td->td_retval[0] = n;
862 	return (0);
863 }
864 
865 #ifndef _SYS_SYSPROTO_H_
866 struct poll_args {
867 	struct pollfd *fds;
868 	u_int	nfds;
869 	int	timeout;
870 };
871 #endif
872 int
873 poll(td, uap)
874 	struct thread *td;
875 	struct poll_args *uap;
876 {
877 	struct pollfd *bits;
878 	struct pollfd smallbits[32];
879 	struct timeval atv, rtv, ttv;
880 	int error = 0, timo;
881 	u_int ncoll, nfds;
882 	size_t ni;
883 
884 	nfds = uap->nfds;
885 
886 	/*
887 	 * This is kinda bogus.  We have fd limits, but that is not
888 	 * really related to the size of the pollfd array.  Make sure
889 	 * we let the process use at least FD_SETSIZE entries and at
890 	 * least enough for the current limits.  We want to be reasonably
891 	 * safe, but not overly restrictive.
892 	 */
893 	PROC_LOCK(td->td_proc);
894 	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
895 	    (nfds > FD_SETSIZE)) {
896 		PROC_UNLOCK(td->td_proc);
897 		error = EINVAL;
898 		goto done2;
899 	}
900 	PROC_UNLOCK(td->td_proc);
901 	ni = nfds * sizeof(struct pollfd);
902 	if (ni > sizeof(smallbits))
903 		bits = malloc(ni, M_TEMP, M_WAITOK);
904 	else
905 		bits = smallbits;
906 	error = copyin(uap->fds, bits, ni);
907 	if (error)
908 		goto done_nosellock;
909 	if (uap->timeout != INFTIM) {
910 		atv.tv_sec = uap->timeout / 1000;
911 		atv.tv_usec = (uap->timeout % 1000) * 1000;
912 		if (itimerfix(&atv)) {
913 			error = EINVAL;
914 			goto done_nosellock;
915 		}
916 		getmicrouptime(&rtv);
917 		timevaladd(&atv, &rtv);
918 	} else {
919 		atv.tv_sec = 0;
920 		atv.tv_usec = 0;
921 	}
922 	timo = 0;
923 	TAILQ_INIT(&td->td_selq);
924 	mtx_lock(&sellock);
925 retry:
926 	ncoll = nselcoll;
927 	thread_lock(td);
928 	td->td_flags |= TDF_SELECT;
929 	thread_unlock(td);
930 	mtx_unlock(&sellock);
931 
932 	error = pollscan(td, bits, nfds);
933 	mtx_lock(&sellock);
934 	if (error || td->td_retval[0])
935 		goto done;
936 	if (atv.tv_sec || atv.tv_usec) {
937 		getmicrouptime(&rtv);
938 		if (timevalcmp(&rtv, &atv, >=))
939 			goto done;
940 		ttv = atv;
941 		timevalsub(&ttv, &rtv);
942 		timo = ttv.tv_sec > 24 * 60 * 60 ?
943 		    24 * 60 * 60 * hz : tvtohz(&ttv);
944 	}
945 	/*
946 	 * An event of interest may occur while we do not hold
947 	 * sellock, so check TDF_SELECT and the number of collisions
948 	 * and rescan the file descriptors if necessary.
949 	 */
950 	thread_lock(td);
951 	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
952 		thread_unlock(td);
953 		goto retry;
954 	}
955 	thread_unlock(td);
956 
957 	if (timo > 0)
958 		error = cv_timedwait_sig(&selwait, &sellock, timo);
959 	else
960 		error = cv_wait_sig(&selwait, &sellock);
961 
962 	if (error == 0)
963 		goto retry;
964 
965 done:
966 	clear_selinfo_list(td);
967 	thread_lock(td);
968 	td->td_flags &= ~TDF_SELECT;
969 	thread_unlock(td);
970 	mtx_unlock(&sellock);
971 
972 done_nosellock:
973 	/* poll is not restarted after signals... */
974 	if (error == ERESTART)
975 		error = EINTR;
976 	if (error == EWOULDBLOCK)
977 		error = 0;
978 	if (error == 0) {
979 		error = copyout(bits, uap->fds, ni);
980 		if (error)
981 			goto out;
982 	}
983 out:
984 	if (ni > sizeof(smallbits))
985 		free(bits, M_TEMP);
986 done2:
987 	return (error);
988 }
989 
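/*
 * Usage note (userland sketch): an entry with a negative fd is ignored (its
 * revents is cleared), while an fd that is not open comes back as POLLNVAL
 * and still counts toward the return value (see pollscan() below).
 * handle_readable() is illustrative; assumes <poll.h>:
 *
 *	struct pollfd pfd[2] = {
 *		{ .fd = sock, .events = POLLIN },
 *		{ .fd = -1 },			(disabled slot, skipped)
 *	};
 *	if (poll(pfd, 2, 5000) > 0 &&		(5 second timeout)
 *	    (pfd[0].revents & POLLIN))
 *		handle_readable(sock);
 */
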
990 static int
991 pollscan(td, fds, nfd)
992 	struct thread *td;
993 	struct pollfd *fds;
994 	u_int nfd;
995 {
996 	register struct filedesc *fdp = td->td_proc->p_fd;
997 	int i;
998 	struct file *fp;
999 	int n = 0;
1000 
1001 	FILEDESC_SLOCK(fdp);
1002 	for (i = 0; i < nfd; i++, fds++) {
1003 		if (fds->fd >= fdp->fd_nfiles) {
1004 			fds->revents = POLLNVAL;
1005 			n++;
1006 		} else if (fds->fd < 0) {
1007 			fds->revents = 0;
1008 		} else {
1009 			fp = fdp->fd_ofiles[fds->fd];
1010 			if (fp == NULL) {
1011 				fds->revents = POLLNVAL;
1012 				n++;
1013 			} else {
1014 				/*
1015 				 * Note: backend also returns POLLHUP and
1016 				 * POLLERR if appropriate.
1017 				 */
1018 				fds->revents = fo_poll(fp, fds->events,
1019 				    td->td_ucred, td);
1020 				if (fds->revents != 0)
1021 					n++;
1022 			}
1023 		}
1024 	}
1025 	FILEDESC_SUNLOCK(fdp);
1026 	td->td_retval[0] = n;
1027 	return (0);
1028 }
1029 
1030 /*
1031  * OpenBSD poll system call.
1032  *
1033  * XXX this isn't quite a true representation; OpenBSD uses select ops.
1034  */
1035 #ifndef _SYS_SYSPROTO_H_
1036 struct openbsd_poll_args {
1037 	struct pollfd *fds;
1038 	u_int	nfds;
1039 	int	timeout;
1040 };
1041 #endif
1042 int
1043 openbsd_poll(td, uap)
1044 	register struct thread *td;
1045 	register struct openbsd_poll_args *uap;
1046 {
1047 	return (poll(td, (struct poll_args *)uap));
1048 }
1049 
1050 /*
1051  * Remove the references to the thread from all of the objects we were
1052  * polling.
1053  *
1054  * This code assumes that the underlying owner of the selinfo structure will
1055  * hold sellock before it changes it, and that it will unlink itself from our
1056  * list if it goes away.
1057  */
1058 void
1059 clear_selinfo_list(td)
1060 	struct thread *td;
1061 {
1062 	struct selinfo *si;
1063 
1064 	mtx_assert(&sellock, MA_OWNED);
1065 	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1066 		si->si_thread = NULL;
1067 	TAILQ_INIT(&td->td_selq);
1068 }
1069 
1070 /*
1071  * Record a select request.
1072  */
1073 void
1074 selrecord(selector, sip)
1075 	struct thread *selector;
1076 	struct selinfo *sip;
1077 {
1078 
1079 	mtx_lock(&sellock);
1080 	/*
1081 	 * If the selinfo's thread pointer is NULL then take ownership of it.
1082 	 *
1083 	 * If the thread pointer is not NULL and it points to another
1084 	 * thread, then we have a collision.
1085 	 *
1086 	 * If the thread pointer is not NULL and points back to us then leave
1087 	 * it alone as we've already pointed it at us and added it to
1088 	 * our list.
1089 	 */
1090 	if (sip->si_thread == NULL) {
1091 		sip->si_thread = selector;
1092 		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1093 	} else if (sip->si_thread != selector) {
1094 		sip->si_flags |= SI_COLL;
1095 	}
1096 
1097 	mtx_unlock(&sellock);
1098 }
1099 
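/*
 * A minimal sketch of how a driver's poll method typically uses selrecord()
 * (foo_softc, sc_have_data and sc_rsel are illustrative names; sc_rsel is a
 * struct selinfo embedded in the softc):
 *
 *	static int
 *	foo_poll(struct cdev *dev, int events, struct thread *td)
 *	{
 *		struct foo_softc *sc = dev->si_drv1;
 *		int revents = 0;
 *
 *		if (events & (POLLIN | POLLRDNORM)) {
 *			if (sc->sc_have_data)
 *				revents |= events & (POLLIN | POLLRDNORM);
 *			else
 *				selrecord(td, &sc->sc_rsel);
 *		}
 *		return (revents);
 *	}
 */
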
1100 /* Wake up a selecting thread. */
1101 void
1102 selwakeup(sip)
1103 	struct selinfo *sip;
1104 {
1105 	doselwakeup(sip, -1);
1106 }
1107 
1108 /* Wake up a selecting thread, and set its priority. */
1109 void
1110 selwakeuppri(sip, pri)
1111 	struct selinfo *sip;
1112 	int pri;
1113 {
1114 	doselwakeup(sip, pri);
1115 }
1116 
1117 /*
1118  * Do a wakeup when a selectable event occurs.
1119  */
1120 static void
1121 doselwakeup(sip, pri)
1122 	struct selinfo *sip;
1123 	int pri;
1124 {
1125 	struct thread *td;
1126 
1127 	mtx_lock(&sellock);
1128 	td = sip->si_thread;
1129 	if ((sip->si_flags & SI_COLL) != 0) {
1130 		nselcoll++;
1131 		sip->si_flags &= ~SI_COLL;
1132 		cv_broadcastpri(&selwait, pri);
1133 	}
1134 	if (td == NULL) {
1135 		mtx_unlock(&sellock);
1136 		return;
1137 	}
1138 	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
1139 	sip->si_thread = NULL;
1140 	thread_lock(td);
1141 	td->td_flags &= ~TDF_SELECT;
1142 	thread_unlock(td);
1143 	sleepq_remove(td, &selwait);
1144 	mtx_unlock(&sellock);
1145 }
1146 
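/*
 * The producer side of the same pattern: when data becomes available, the
 * driver wakes any recorded selector (sc and sc_rsel as in the selrecord()
 * sketch above; the priority argument is illustrative):
 *
 *	sc->sc_have_data = 1;
 *	selwakeuppri(&sc->sc_rsel, PZERO + 1);
 */
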
1147 static void selectinit(void *);
1148 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL);
1149 
1150 /* ARGSUSED*/
1151 static void
1152 selectinit(dummy)
1153 	void *dummy;
1154 {
1155 	cv_init(&selwait, "select");
1156 	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1157 }
1158