xref: /freebsd/sys/kern/sys_generic.c (revision f3bb407b7c1b3faa88d0580541f01a8e6fb6cc68)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_compat.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/sysproto.h>
46 #include <sys/filedesc.h>
47 #include <sys/filio.h>
48 #include <sys/fcntl.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/signalvar.h>
52 #include <sys/socketvar.h>
53 #include <sys/uio.h>
54 #include <sys/kernel.h>
55 #include <sys/limits.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/resourcevar.h>
59 #include <sys/selinfo.h>
60 #include <sys/sleepqueue.h>
61 #include <sys/syscallsubr.h>
62 #include <sys/sysctl.h>
63 #include <sys/sysent.h>
64 #include <sys/vnode.h>
65 #include <sys/bio.h>
66 #include <sys/buf.h>
67 #include <sys/condvar.h>
68 #ifdef KTRACE
69 #include <sys/ktrace.h>
70 #endif
71 #include <vm/vm.h>
72 #include <vm/vm_page.h>
73 
74 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
75 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
76 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
77 
78 static int	pollscan(struct thread *, struct pollfd *, u_int);
79 static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
80 static int	dofileread(struct thread *, int, struct file *, struct uio *,
81 		    off_t, int);
82 static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
83 		    off_t, int);
84 static void	doselwakeup(struct selinfo *, int);
85 
86 #ifndef _SYS_SYSPROTO_H_
87 struct read_args {
88 	int	fd;
89 	void	*buf;
90 	size_t	nbyte;
91 };
92 #endif
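/*
 * Non-positioned read: wrap the user buffer in a single-entry iovec/uio
 * and hand it to kern_readv(), which reads at the descriptor's current
 * offset.
 */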
93 int
94 read(td, uap)
95 	struct thread *td;
96 	struct read_args *uap;
97 {
98 	struct uio auio;
99 	struct iovec aiov;
100 	int error;
101 
102 	if (uap->nbyte > INT_MAX)
103 		return (EINVAL);
104 	aiov.iov_base = uap->buf;
105 	aiov.iov_len = uap->nbyte;
106 	auio.uio_iov = &aiov;
107 	auio.uio_iovcnt = 1;
108 	auio.uio_resid = uap->nbyte;
109 	auio.uio_segflg = UIO_USERSPACE;
110 	error = kern_readv(td, uap->fd, &auio);
111 	return(error);
112 }
113 
114 /*
115  * Positioned read system call.
116  */
117 #ifndef _SYS_SYSPROTO_H_
118 struct pread_args {
119 	int	fd;
120 	void	*buf;
121 	size_t	nbyte;
122 	int	pad;
123 	off_t	offset;
124 };
125 #endif
126 int
127 pread(td, uap)
128 	struct thread *td;
129 	struct pread_args *uap;
130 {
131 	struct uio auio;
132 	struct iovec aiov;
133 	int error;
134 
135 	if (uap->nbyte > INT_MAX)
136 		return (EINVAL);
137 	aiov.iov_base = uap->buf;
138 	aiov.iov_len = uap->nbyte;
139 	auio.uio_iov = &aiov;
140 	auio.uio_iovcnt = 1;
141 	auio.uio_resid = uap->nbyte;
142 	auio.uio_segflg = UIO_USERSPACE;
143 	error = kern_preadv(td, uap->fd, &auio, uap->offset);
144 	return(error);
145 }
146 
147 /*
148  * Scatter read system call.
149  */
150 #ifndef _SYS_SYSPROTO_H_
151 struct readv_args {
152 	int	fd;
153 	struct	iovec *iovp;
154 	u_int	iovcnt;
155 };
156 #endif
157 int
158 readv(struct thread *td, struct readv_args *uap)
159 {
160 	struct uio *auio;
161 	int error;
162 
163 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
164 	if (error)
165 		return (error);
166 	error = kern_readv(td, uap->fd, auio);
167 	free(auio, M_IOV);
168 	return (error);
169 }
170 
171 int
172 kern_readv(struct thread *td, int fd, struct uio *auio)
173 {
174 	struct file *fp;
175 	int error;
176 
177 	error = fget_read(td, fd, &fp);
178 	if (error)
179 		return (error);
180 	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
181 	fdrop(fp, td);
182 	return (error);
183 }
184 
185 /*
186  * Scatter positioned read system call.
187  */
188 #ifndef _SYS_SYSPROTO_H_
189 struct preadv_args {
190 	int	fd;
191 	struct	iovec *iovp;
192 	u_int	iovcnt;
193 	off_t	offset;
194 };
195 #endif
196 int
197 preadv(struct thread *td, struct preadv_args *uap)
198 {
199 	struct uio *auio;
200 	int error;
201 
202 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
203 	if (error)
204 		return (error);
205 	error = kern_preadv(td, uap->fd, auio, uap->offset);
206 	free(auio, M_IOV);
207 	return (error);
208 }
209 
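/*
 * kern_preadv() backs pread() and preadv(): positioned I/O is refused
 * with ESPIPE on a non-seekable file and with EINVAL for a negative
 * offset on anything but a character device; otherwise the request is
 * handed to dofileread() with FOF_OFFSET set.
 */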
210 int
211 kern_preadv(td, fd, auio, offset)
212 	struct thread *td;
213 	int fd;
214 	struct uio *auio;
215 	off_t offset;
216 {
217 	struct file *fp;
218 	int error;
219 
220 	error = fget_read(td, fd, &fp);
221 	if (error)
222 		return (error);
223 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
224 		error = ESPIPE;
225 	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
226 		error = EINVAL;
227 	else
228 		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
229 	fdrop(fp, td);
230 	return (error);
231 }
232 
233 /*
234  * Common code for read(2), readv(2), pread(2) and preadv(2): reads
235  * data in from a file using the passed-in uio, offset, and flags.
236  */
237 static int
238 dofileread(td, fd, fp, auio, offset, flags)
239 	struct thread *td;
240 	int fd;
241 	struct file *fp;
242 	struct uio *auio;
243 	off_t offset;
244 	int flags;
245 {
246 	ssize_t cnt;
247 	int error;
248 #ifdef KTRACE
249 	struct uio *ktruio = NULL;
250 #endif
251 
252 	/* Finish zero-length reads right here */
253 	if (auio->uio_resid == 0) {
254 		td->td_retval[0] = 0;
255 		return(0);
256 	}
257 	auio->uio_rw = UIO_READ;
258 	auio->uio_offset = offset;
259 	auio->uio_td = td;
260 #ifdef KTRACE
261 	if (KTRPOINT(td, KTR_GENIO))
262 		ktruio = cloneuio(auio);
263 #endif
264 	cnt = auio->uio_resid;
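	/*
	 * If the transfer is cut short by a signal or a would-block
	 * condition after some data has already been moved, report the
	 * partial count rather than the error.
	 */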
265 	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
266 		if (auio->uio_resid != cnt && (error == ERESTART ||
267 		    error == EINTR || error == EWOULDBLOCK))
268 			error = 0;
269 	}
270 	cnt -= auio->uio_resid;
271 #ifdef KTRACE
272 	if (ktruio != NULL) {
273 		ktruio->uio_resid = cnt;
274 		ktrgenio(fd, UIO_READ, ktruio, error);
275 	}
276 #endif
277 	td->td_retval[0] = cnt;
278 	return (error);
279 }
280 
281 #ifndef _SYS_SYSPROTO_H_
282 struct write_args {
283 	int	fd;
284 	const void *buf;
285 	size_t	nbyte;
286 };
287 #endif
288 int
289 write(td, uap)
290 	struct thread *td;
291 	struct write_args *uap;
292 {
293 	struct uio auio;
294 	struct iovec aiov;
295 	int error;
296 
297 	if (uap->nbyte > INT_MAX)
298 		return (EINVAL);
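	/*
	 * The cast below only strips the const qualifier from the user
	 * pointer; a UIO_WRITE transfer reads from the buffer and never
	 * writes through it.
	 */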
299 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
300 	aiov.iov_len = uap->nbyte;
301 	auio.uio_iov = &aiov;
302 	auio.uio_iovcnt = 1;
303 	auio.uio_resid = uap->nbyte;
304 	auio.uio_segflg = UIO_USERSPACE;
305 	error = kern_writev(td, uap->fd, &auio);
306 	return(error);
307 }
308 
309 /*
310  * Positioned write system call.
311  */
312 #ifndef _SYS_SYSPROTO_H_
313 struct pwrite_args {
314 	int	fd;
315 	const void *buf;
316 	size_t	nbyte;
317 	int	pad;
318 	off_t	offset;
319 };
320 #endif
321 int
322 pwrite(td, uap)
323 	struct thread *td;
324 	struct pwrite_args *uap;
325 {
326 	struct uio auio;
327 	struct iovec aiov;
328 	int error;
329 
330 	if (uap->nbyte > INT_MAX)
331 		return (EINVAL);
332 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
333 	aiov.iov_len = uap->nbyte;
334 	auio.uio_iov = &aiov;
335 	auio.uio_iovcnt = 1;
336 	auio.uio_resid = uap->nbyte;
337 	auio.uio_segflg = UIO_USERSPACE;
338 	error = kern_pwritev(td, uap->fd, &auio, uap->offset);
339 	return(error);
340 }
341 
342 /*
343  * Gather write system call.
344  */
345 #ifndef _SYS_SYSPROTO_H_
346 struct writev_args {
347 	int	fd;
348 	struct	iovec *iovp;
349 	u_int	iovcnt;
350 };
351 #endif
352 int
353 writev(struct thread *td, struct writev_args *uap)
354 {
355 	struct uio *auio;
356 	int error;
357 
358 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
359 	if (error)
360 		return (error);
361 	error = kern_writev(td, uap->fd, auio);
362 	free(auio, M_IOV);
363 	return (error);
364 }
365 
366 int
367 kern_writev(struct thread *td, int fd, struct uio *auio)
368 {
369 	struct file *fp;
370 	int error;
371 
372 	error = fget_write(td, fd, &fp);
373 	if (error)
374 		return (error);
375 	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
376 	fdrop(fp, td);
377 	return (error);
378 }
379 
380 /*
381  * Gather positioned write system call.
382  */
383 #ifndef _SYS_SYSPROTO_H_
384 struct pwritev_args {
385 	int	fd;
386 	struct	iovec *iovp;
387 	u_int	iovcnt;
388 	off_t	offset;
389 };
390 #endif
391 int
392 pwritev(struct thread *td, struct pwritev_args *uap)
393 {
394 	struct uio *auio;
395 	int error;
396 
397 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
398 	if (error)
399 		return (error);
400 	error = kern_pwritev(td, uap->fd, auio, uap->offset);
401 	free(auio, M_IOV);
402 	return (error);
403 }
404 
405 int
406 kern_pwritev(td, fd, auio, offset)
407 	struct thread *td;
408 	int fd;
409 	struct uio *auio;
410 	off_t offset;
411 {
412 	struct file *fp;
413 	int error;
414 
415 	error = fget_write(td, fd, &fp);
416 	if (error)
417 		return (error);
418 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
419 		error = ESPIPE;
420 	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
421 		error = EINVAL;
422 	else
423 		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
424 	fdrop(fp, td);
425 	return (error);
426 }
427 
428 /*
429  * Common code for write(2), writev(2), pwrite(2) and pwritev(2) that
430  * writes data to a file using the passed-in uio, offset, and flags.
431  */
432 static int
433 dofilewrite(td, fd, fp, auio, offset, flags)
434 	struct thread *td;
435 	int fd;
436 	struct file *fp;
437 	struct uio *auio;
438 	off_t offset;
439 	int flags;
440 {
441 	ssize_t cnt;
442 	int error;
443 #ifdef KTRACE
444 	struct uio *ktruio = NULL;
445 #endif
446 
447 	auio->uio_rw = UIO_WRITE;
448 	auio->uio_td = td;
449 	auio->uio_offset = offset;
450 #ifdef KTRACE
451 	if (KTRPOINT(td, KTR_GENIO))
452 		ktruio = cloneuio(auio);
453 #endif
454 	cnt = auio->uio_resid;
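	/*
	 * For vnode-backed files bwillwrite() may block the caller until
	 * the buffer cache has flushed enough dirty buffers to safely
	 * accept more write activity.
	 */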
455 	if (fp->f_type == DTYPE_VNODE)
456 		bwillwrite();
457 	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
458 		if (auio->uio_resid != cnt && (error == ERESTART ||
459 		    error == EINTR || error == EWOULDBLOCK))
460 			error = 0;
461 		/* Socket layer is responsible for issuing SIGPIPE. */
462 		if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
463 			PROC_LOCK(td->td_proc);
464 			psignal(td->td_proc, SIGPIPE);
465 			PROC_UNLOCK(td->td_proc);
466 		}
467 	}
468 	cnt -= auio->uio_resid;
469 #ifdef KTRACE
470 	if (ktruio != NULL) {
471 		ktruio->uio_resid = cnt;
472 		ktrgenio(fd, UIO_WRITE, ktruio, error);
473 	}
474 #endif
475 	td->td_retval[0] = cnt;
476 	return (error);
477 }
478 
479 #ifndef _SYS_SYSPROTO_H_
480 struct ioctl_args {
481 	int	fd;
482 	u_long	com;
483 	caddr_t	data;
484 };
485 #endif
486 /* ARGSUSED */
487 int
488 ioctl(struct thread *td, struct ioctl_args *uap)
489 {
490 	u_long com;
491 	int arg, error;
492 	u_int size;
493 	caddr_t data;
494 
495 	if (uap->com > 0xffffffff) {
496 		printf(
497 		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
498 		    td->td_proc->p_pid, td->td_proc->p_comm, uap->com);
499 		uap->com &= 0xffffffff;
500 	}
501 	com = uap->com;
502 
503 	/*
504 	 * Interpret high order word to find amount of data to be
505 	 * copied to/from the user's address space.
506 	 */
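	/*
	 * The high-order word of the command encodes the parameter size
	 * (IOCPARM_LEN) and the top bits encode the transfer direction
	 * (IOC_VOID, IOC_IN, IOC_OUT); see <sys/ioccom.h>.  Commands
	 * with an inconsistent encoding are rejected with ENOTTY below.
	 */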
507 	size = IOCPARM_LEN(com);
508 	if ((size > IOCPARM_MAX) ||
509 	    ((com & (IOC_VOID  | IOC_IN | IOC_OUT)) == 0) ||
510 #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
511 	    ((com & IOC_OUT) && size == 0) ||
512 #else
513 	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
514 #endif
515 	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
516 		return (ENOTTY);
517 
518 	if (size > 0) {
519 		if (!(com & IOC_VOID))
520 			data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
521 		else {
522 			/* Integer argument. */
523 			arg = (intptr_t)uap->data;
524 			data = (void *)&arg;
525 			size = 0;
526 		}
527 	} else
528 		data = (void *)&uap->data;
529 	if (com & IOC_IN) {
530 		error = copyin(uap->data, data, (u_int)size);
531 		if (error) {
532 			if (size > 0)
533 				free(data, M_IOCTLOPS);
534 			return (error);
535 		}
536 	} else if (com & IOC_OUT) {
537 		/*
538 		 * Zero the buffer so the user always
539 		 * gets back something deterministic.
540 		 */
541 		bzero(data, size);
542 	}
543 
544 	error = kern_ioctl(td, uap->fd, com, data);
545 
546 	if (error == 0 && (com & IOC_OUT))
547 		error = copyout(data, uap->data, (u_int)size);
548 
549 	if (size > 0)
550 		free(data, M_IOCTLOPS);
551 	return (error);
552 }
553 
554 int
555 kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
556 {
557 	struct file *fp;
558 	struct filedesc *fdp;
559 	int error;
560 	int tmp;
561 
562 	if ((error = fget(td, fd, &fp)) != 0)
563 		return (error);
564 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
565 		fdrop(fp, td);
566 		return (EBADF);
567 	}
568 	fdp = td->td_proc->p_fd;
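	/*
	 * Handle a few generic commands here: FIONCLEX and FIOCLEX only
	 * toggle the descriptor's close-on-exec flag, while FIONBIO and
	 * FIOASYNC update f_flag and then still fall through to
	 * fo_ioctl() so the backing object can adjust its own state.
	 */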
569 	switch (com) {
570 	case FIONCLEX:
571 		FILEDESC_LOCK_FAST(fdp);
572 		fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
573 		FILEDESC_UNLOCK_FAST(fdp);
574 		goto out;
575 	case FIOCLEX:
576 		FILEDESC_LOCK_FAST(fdp);
577 		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
578 		FILEDESC_UNLOCK_FAST(fdp);
579 		goto out;
580 	case FIONBIO:
581 		FILE_LOCK(fp);
582 		if ((tmp = *(int *)data))
583 			fp->f_flag |= FNONBLOCK;
584 		else
585 			fp->f_flag &= ~FNONBLOCK;
586 		FILE_UNLOCK(fp);
587 		data = (void *)&tmp;
588 		break;
589 	case FIOASYNC:
590 		FILE_LOCK(fp);
591 		if ((tmp = *(int *)data))
592 			fp->f_flag |= FASYNC;
593 		else
594 			fp->f_flag &= ~FASYNC;
595 		FILE_UNLOCK(fp);
596 		data = (void *)&tmp;
597 		break;
598 	}
599 
600 	error = fo_ioctl(fp, com, data, td->td_ucred, td);
601 out:
602 	fdrop(fp, td);
603 	return (error);
604 }
605 
606 /*
607  * sellock and selwait are initialized in selectinit() via SYSINIT.
608  */
609 struct mtx	sellock;
610 struct cv	selwait;
611 u_int		nselcoll;	/* Select collisions since boot */
612 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
613 
614 #ifndef _SYS_SYSPROTO_H_
615 struct select_args {
616 	int	nd;
617 	fd_set	*in, *ou, *ex;
618 	struct	timeval *tv;
619 };
620 #endif
621 int
622 select(td, uap)
623 	register struct thread *td;
624 	register struct select_args *uap;
625 {
626 	struct timeval tv, *tvp;
627 	int error;
628 
629 	if (uap->tv != NULL) {
630 		error = copyin(uap->tv, &tv, sizeof(tv));
631 		if (error)
632 			return (error);
633 		tvp = &tv;
634 	} else
635 		tvp = NULL;
636 
637 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
638 }
639 
640 int
641 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
642     fd_set *fd_ex, struct timeval *tvp)
643 {
644 	struct filedesc *fdp;
645 	/*
646 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
647 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
648 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
649 	 * of 256.
650 	 */
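	/*
	 * The on-stack buffer below always holds 2048 bits (256 bytes)
	 * regardless of NFDBITS.  For example, one non-NULL set covering
	 * 1024 descriptors needs 2 * 128 bytes, and three sets covering
	 * 256 descriptors need 6 * 32 bytes; both fit.
	 */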
651 	fd_mask s_selbits[howmany(2048, NFDBITS)];
652 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
653 	struct timeval atv, rtv, ttv;
654 	int error, timo;
655 	u_int ncoll, nbufbytes, ncpbytes, nfdbits;
656 
657 	if (nd < 0)
658 		return (EINVAL);
659 	fdp = td->td_proc->p_fd;
660 
661 	FILEDESC_LOCK_FAST(fdp);
662 
663 	if (nd > td->td_proc->p_fd->fd_nfiles)
664 		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
665 	FILEDESC_UNLOCK_FAST(fdp);
666 
667 	/*
668 	 * Allocate just enough bits for the non-null fd_sets.  Use the
669 	 * preallocated auto buffer if possible.
670 	 */
671 	nfdbits = roundup(nd, NFDBITS);
672 	ncpbytes = nfdbits / NBBY;
673 	nbufbytes = 0;
674 	if (fd_in != NULL)
675 		nbufbytes += 2 * ncpbytes;
676 	if (fd_ou != NULL)
677 		nbufbytes += 2 * ncpbytes;
678 	if (fd_ex != NULL)
679 		nbufbytes += 2 * ncpbytes;
680 	if (nbufbytes <= sizeof s_selbits)
681 		selbits = &s_selbits[0];
682 	else
683 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
684 
685 	/*
686 	 * Assign pointers into the bit buffers and fetch the input bits.
687 	 * Put the output buffers together so that they can be bzeroed
688 	 * together.
689 	 */
690 	sbp = selbits;
691 #define	getbits(name, x) \
692 	do {								\
693 		if (name == NULL)					\
694 			ibits[x] = NULL;				\
695 		else {							\
696 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
697 			obits[x] = sbp;					\
698 			sbp += ncpbytes / sizeof *sbp;			\
699 			error = copyin(name, ibits[x], ncpbytes);	\
700 			if (error != 0)					\
701 				goto done_nosellock;			\
702 		}							\
703 	} while (0)
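	/*
	 * Each getbits() call claims ncpbytes from both halves of
	 * selbits: obits[x] points into the low half, which the bzero()
	 * below clears in one pass, and ibits[x] points into the high
	 * half, which copyin() fills from the user's fd_set.
	 */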
704 	getbits(fd_in, 0);
705 	getbits(fd_ou, 1);
706 	getbits(fd_ex, 2);
707 #undef	getbits
708 	if (nbufbytes != 0)
709 		bzero(selbits, nbufbytes / 2);
710 
711 	if (tvp != NULL) {
712 		atv = *tvp;
713 		if (itimerfix(&atv)) {
714 			error = EINVAL;
715 			goto done_nosellock;
716 		}
717 		getmicrouptime(&rtv);
718 		timevaladd(&atv, &rtv);
719 	} else {
720 		atv.tv_sec = 0;
721 		atv.tv_usec = 0;
722 	}
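	/*
	 * A zero atv means no timeout was supplied; timo then stays 0
	 * and the wait below uses cv_wait_sig(), blocking until a
	 * selwakeup() or a signal arrives.
	 */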
723 	timo = 0;
724 	TAILQ_INIT(&td->td_selq);
725 	mtx_lock(&sellock);
726 retry:
727 	ncoll = nselcoll;
728 	mtx_lock_spin(&sched_lock);
729 	td->td_flags |= TDF_SELECT;
730 	mtx_unlock_spin(&sched_lock);
731 	mtx_unlock(&sellock);
732 
733 	error = selscan(td, ibits, obits, nd);
734 	mtx_lock(&sellock);
735 	if (error || td->td_retval[0])
736 		goto done;
737 	if (atv.tv_sec || atv.tv_usec) {
738 		getmicrouptime(&rtv);
739 		if (timevalcmp(&rtv, &atv, >=))
740 			goto done;
741 		ttv = atv;
742 		timevalsub(&ttv, &rtv);
743 		timo = ttv.tv_sec > 24 * 60 * 60 ?
744 		    24 * 60 * 60 * hz : tvtohz(&ttv);
745 	}
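	/*
	 * timo is recomputed on every pass from the absolute deadline
	 * kept in atv; each individual sleep is capped at 24 hours'
	 * worth of ticks.
	 */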
746 
747 	/*
748 	 * An event of interest may occur while we do not hold
749 	 * sellock, so check TDF_SELECT and the number of
750 	 * collisions and rescan the file descriptors if
751 	 * necessary.
752 	 */
753 	mtx_lock_spin(&sched_lock);
754 	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
755 		mtx_unlock_spin(&sched_lock);
756 		goto retry;
757 	}
758 	mtx_unlock_spin(&sched_lock);
759 
760 	if (timo > 0)
761 		error = cv_timedwait_sig(&selwait, &sellock, timo);
762 	else
763 		error = cv_wait_sig(&selwait, &sellock);
764 
765 	if (error == 0)
766 		goto retry;
767 
768 done:
769 	clear_selinfo_list(td);
770 	mtx_lock_spin(&sched_lock);
771 	td->td_flags &= ~TDF_SELECT;
772 	mtx_unlock_spin(&sched_lock);
773 	mtx_unlock(&sellock);
774 
775 done_nosellock:
776 	/* select is not restarted after signals... */
777 	if (error == ERESTART)
778 		error = EINTR;
779 	if (error == EWOULDBLOCK)
780 		error = 0;
781 #define	putbits(name, x) \
782 	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
783 		error = error2;
784 	if (error == 0) {
785 		int error2;
786 
787 		putbits(fd_in, 0);
788 		putbits(fd_ou, 1);
789 		putbits(fd_ex, 2);
790 #undef putbits
791 	}
792 	if (selbits != &s_selbits[0])
793 		free(selbits, M_SELECT);
794 
795 	return (error);
796 }
797 
798 static int
799 selscan(td, ibits, obits, nfd)
800 	struct thread *td;
801 	fd_mask **ibits, **obits;
802 	int nfd;
803 {
804 	int msk, i, fd;
805 	fd_mask bits;
806 	struct file *fp;
807 	int n = 0;
808 	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
809 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
810 	struct filedesc *fdp = td->td_proc->p_fd;
811 
812 	FILEDESC_LOCK(fdp);
813 	for (msk = 0; msk < 3; msk++) {
814 		if (ibits[msk] == NULL)
815 			continue;
816 		for (i = 0; i < nfd; i += NFDBITS) {
817 			bits = ibits[msk][i/NFDBITS];
818 			/* ffs(int mask) not portable, fd_mask is long */
819 			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
820 				if (!(bits & 1))
821 					continue;
822 				if ((fp = fget_locked(fdp, fd)) == NULL) {
823 					FILEDESC_UNLOCK(fdp);
824 					return (EBADF);
825 				}
826 				if (fo_poll(fp, flag[msk], td->td_ucred,
827 				    td)) {
828 					obits[msk][(fd)/NFDBITS] |=
829 					    ((fd_mask)1 << ((fd) % NFDBITS));
830 					n++;
831 				}
832 			}
833 		}
834 	}
835 	FILEDESC_UNLOCK(fdp);
836 	td->td_retval[0] = n;
837 	return (0);
838 }
839 
840 #ifndef _SYS_SYSPROTO_H_
841 struct poll_args {
842 	struct pollfd *fds;
843 	u_int	nfds;
844 	int	timeout;
845 };
846 #endif
847 int
848 poll(td, uap)
849 	struct thread *td;
850 	struct poll_args *uap;
851 {
852 	struct pollfd *bits;
853 	struct pollfd smallbits[32];
854 	struct timeval atv, rtv, ttv;
855 	int error = 0, timo;
856 	u_int ncoll, nfds;
857 	size_t ni;
858 
859 	nfds = uap->nfds;
860 
861 	/*
862 	 * This is kinda bogus.  We have fd limits, but that is not
863 	 * really related to the size of the pollfd array.  Make sure
864 	 * we let the process use at least FD_SETSIZE entries and at
865 	 * least enough for the current limits.  We want to be reasonably
866 	 * safe, but not overly restrictive.
867 	 */
868 	PROC_LOCK(td->td_proc);
869 	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
870 	    (nfds > FD_SETSIZE)) {
871 		PROC_UNLOCK(td->td_proc);
872 		error = EINVAL;
873 		goto done2;
874 	}
875 	PROC_UNLOCK(td->td_proc);
876 	ni = nfds * sizeof(struct pollfd);
877 	if (ni > sizeof(smallbits))
878 		bits = malloc(ni, M_TEMP, M_WAITOK);
879 	else
880 		bits = smallbits;
881 	error = copyin(uap->fds, bits, ni);
882 	if (error)
883 		goto done_nosellock;
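	/* uap->timeout is in milliseconds; INFTIM (-1) means wait forever. */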
884 	if (uap->timeout != INFTIM) {
885 		atv.tv_sec = uap->timeout / 1000;
886 		atv.tv_usec = (uap->timeout % 1000) * 1000;
887 		if (itimerfix(&atv)) {
888 			error = EINVAL;
889 			goto done_nosellock;
890 		}
891 		getmicrouptime(&rtv);
892 		timevaladd(&atv, &rtv);
893 	} else {
894 		atv.tv_sec = 0;
895 		atv.tv_usec = 0;
896 	}
897 	timo = 0;
898 	TAILQ_INIT(&td->td_selq);
899 	mtx_lock(&sellock);
900 retry:
901 	ncoll = nselcoll;
902 	mtx_lock_spin(&sched_lock);
903 	td->td_flags |= TDF_SELECT;
904 	mtx_unlock_spin(&sched_lock);
905 	mtx_unlock(&sellock);
906 
907 	error = pollscan(td, bits, nfds);
908 	mtx_lock(&sellock);
909 	if (error || td->td_retval[0])
910 		goto done;
911 	if (atv.tv_sec || atv.tv_usec) {
912 		getmicrouptime(&rtv);
913 		if (timevalcmp(&rtv, &atv, >=))
914 			goto done;
915 		ttv = atv;
916 		timevalsub(&ttv, &rtv);
917 		timo = ttv.tv_sec > 24 * 60 * 60 ?
918 		    24 * 60 * 60 * hz : tvtohz(&ttv);
919 	}
920 	/*
921 	 * An event of interest may occur while we do not hold
922 	 * sellock, so check TDF_SELECT and the number of collisions
923 	 * and rescan the file descriptors if necessary.
924 	 */
925 	mtx_lock_spin(&sched_lock);
926 	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
927 		mtx_unlock_spin(&sched_lock);
928 		goto retry;
929 	}
930 	mtx_unlock_spin(&sched_lock);
931 
932 	if (timo > 0)
933 		error = cv_timedwait_sig(&selwait, &sellock, timo);
934 	else
935 		error = cv_wait_sig(&selwait, &sellock);
936 
937 	if (error == 0)
938 		goto retry;
939 
940 done:
941 	clear_selinfo_list(td);
942 	mtx_lock_spin(&sched_lock);
943 	td->td_flags &= ~TDF_SELECT;
944 	mtx_unlock_spin(&sched_lock);
945 	mtx_unlock(&sellock);
946 
947 done_nosellock:
948 	/* poll is not restarted after signals... */
949 	if (error == ERESTART)
950 		error = EINTR;
951 	if (error == EWOULDBLOCK)
952 		error = 0;
953 	if (error == 0) {
954 		error = copyout(bits, uap->fds, ni);
955 		if (error)
956 			goto out;
957 	}
958 out:
959 	if (ni > sizeof(smallbits))
960 		free(bits, M_TEMP);
961 done2:
962 	return (error);
963 }
964 
965 static int
966 pollscan(td, fds, nfd)
967 	struct thread *td;
968 	struct pollfd *fds;
969 	u_int nfd;
970 {
971 	register struct filedesc *fdp = td->td_proc->p_fd;
972 	int i;
973 	struct file *fp;
974 	int n = 0;
975 
976 	FILEDESC_LOCK(fdp);
977 	for (i = 0; i < nfd; i++, fds++) {
978 		if (fds->fd >= fdp->fd_nfiles) {
979 			fds->revents = POLLNVAL;
980 			n++;
981 		} else if (fds->fd < 0) {
982 			fds->revents = 0;
983 		} else {
984 			fp = fdp->fd_ofiles[fds->fd];
985 			if (fp == NULL) {
986 				fds->revents = POLLNVAL;
987 				n++;
988 			} else {
989 				/*
990 				 * Note: backend also returns POLLHUP and
991 				 * POLLERR if appropriate.
992 				 */
993 				fds->revents = fo_poll(fp, fds->events,
994 				    td->td_ucred, td);
995 				if (fds->revents != 0)
996 					n++;
997 			}
998 		}
999 	}
1000 	FILEDESC_UNLOCK(fdp);
1001 	td->td_retval[0] = n;
1002 	return (0);
1003 }
1004 
1005 /*
1006  * OpenBSD poll system call.
1007  *
1008  * XXX this isn't quite a true representation; OpenBSD uses select ops.
1009  */
1010 #ifndef _SYS_SYSPROTO_H_
1011 struct openbsd_poll_args {
1012 	struct pollfd *fds;
1013 	u_int	nfds;
1014 	int	timeout;
1015 };
1016 #endif
1017 int
1018 openbsd_poll(td, uap)
1019 	register struct thread *td;
1020 	register struct openbsd_poll_args *uap;
1021 {
1022 	return (poll(td, (struct poll_args *)uap));
1023 }
1024 
1025 /*
1026  * Remove the references to the thread from all of the objects we were
1027  * polling.
1028  *
1029  * This code assumes that the underlying owner of the selinfo structure will
1030  * hold sellock before it changes it, and that it will unlink itself from our
1031  * list if it goes away.
1032  */
1033 void
1034 clear_selinfo_list(td)
1035 	struct thread *td;
1036 {
1037 	struct selinfo *si;
1038 
1039 	mtx_assert(&sellock, MA_OWNED);
1040 	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1041 		si->si_thread = NULL;
1042 	TAILQ_INIT(&td->td_selq);
1043 }
1044 
1045 /*
1046  * Record a select request.
1047  */
1048 void
1049 selrecord(selector, sip)
1050 	struct thread *selector;
1051 	struct selinfo *sip;
1052 {
1053 
1054 	mtx_lock(&sellock);
1055 	/*
1056 	 * If the selinfo's thread pointer is NULL then take ownership of it.
1057 	 *
1058 	 * If the thread pointer is not NULL and it points to another
1059 	 * thread, then we have a collision.
1060 	 *
1061  * If the thread pointer is not NULL and points back to us, then leave
1062  * it alone: we have already pointed it at us and added it to
1063  * our list.
1064 	 */
1065 	if (sip->si_thread == NULL) {
1066 		sip->si_thread = selector;
1067 		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1068 	} else if (sip->si_thread != selector) {
1069 		sip->si_flags |= SI_COLL;
1070 	}
1071 
1072 	mtx_unlock(&sellock);
1073 }
1074 
1075 /* Wake up a selecting thread. */
1076 void
1077 selwakeup(sip)
1078 	struct selinfo *sip;
1079 {
1080 	doselwakeup(sip, -1);
1081 }
1082 
1083 /* Wake up a selecting thread, and set its priority. */
1084 void
1085 selwakeuppri(sip, pri)
1086 	struct selinfo *sip;
1087 	int pri;
1088 {
1089 	doselwakeup(sip, pri);
1090 }
1091 
1092 /*
1093  * Do a wakeup when a selectable event occurs.
1094  */
1095 static void
1096 doselwakeup(sip, pri)
1097 	struct selinfo *sip;
1098 	int pri;
1099 {
1100 	struct thread *td;
1101 
1102 	mtx_lock(&sellock);
1103 	td = sip->si_thread;
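	/*
	 * SI_COLL means more than one thread selected on this selinfo.
	 * Only one of them can be recorded in si_thread, so wake every
	 * waiter with cv_broadcastpri() and let them rescan.
	 */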
1104 	if ((sip->si_flags & SI_COLL) != 0) {
1105 		nselcoll++;
1106 		sip->si_flags &= ~SI_COLL;
1107 		cv_broadcastpri(&selwait, pri);
1108 	}
1109 	if (td == NULL) {
1110 		mtx_unlock(&sellock);
1111 		return;
1112 	}
1113 	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
1114 	sip->si_thread = NULL;
1115 	mtx_lock_spin(&sched_lock);
1116 	td->td_flags &= ~TDF_SELECT;
1117 	mtx_unlock_spin(&sched_lock);
1118 	sleepq_remove(td, &selwait);
1119 	mtx_unlock(&sellock);
1120 }
1121 
1122 static void selectinit(void *);
1123 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
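/*
 * selectinit() runs at SI_SUB_LOCK so that sellock and selwait are set
 * up early in boot, before other subsystems start calling selrecord()
 * and selwakeup().
 */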
1124 
1125 /* ARGSUSED*/
1126 static void
1127 selectinit(dummy)
1128 	void *dummy;
1129 {
1130 	cv_init(&selwait, "select");
1131 	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1132 }
1133