xref: /freebsd/sys/kern/sys_generic.c (revision 0c927cdd8e6e05387fc5a9ffcb5dbe128d4ad749)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_compat.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/sysproto.h>
46 #include <sys/filedesc.h>
47 #include <sys/filio.h>
48 #include <sys/fcntl.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/signalvar.h>
52 #include <sys/socketvar.h>
53 #include <sys/uio.h>
54 #include <sys/kernel.h>
55 #include <sys/limits.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/resourcevar.h>
59 #include <sys/selinfo.h>
60 #include <sys/sleepqueue.h>
61 #include <sys/syscallsubr.h>
62 #include <sys/sysctl.h>
63 #include <sys/sysent.h>
64 #include <sys/vnode.h>
65 #include <sys/bio.h>
66 #include <sys/buf.h>
67 #include <sys/condvar.h>
68 #ifdef KTRACE
69 #include <sys/ktrace.h>
70 #endif
71 
72 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
73 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
74 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
75 
76 static int	pollscan(struct thread *, struct pollfd *, u_int);
77 static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
78 static int	dofileread(struct thread *, int, struct file *, struct uio *,
79 		    off_t, int);
80 static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
81 		    off_t, int);
82 static void	doselwakeup(struct selinfo *, int);
83 
84 #ifndef _SYS_SYSPROTO_H_
85 struct read_args {
86 	int	fd;
87 	void	*buf;
88 	size_t	nbyte;
89 };
90 #endif
91 int
92 read(td, uap)
93 	struct thread *td;
94 	struct read_args *uap;
95 {
96 	struct uio auio;
97 	struct iovec aiov;
98 	int error;
99 
100 	if (uap->nbyte > INT_MAX)
101 		return (EINVAL);
102 	aiov.iov_base = uap->buf;
103 	aiov.iov_len = uap->nbyte;
104 	auio.uio_iov = &aiov;
105 	auio.uio_iovcnt = 1;
106 	auio.uio_resid = uap->nbyte;
107 	auio.uio_segflg = UIO_USERSPACE;
108 	error = kern_readv(td, uap->fd, &auio);
109 	return (error);
110 }
111 
112 /*
113  * Positioned read system call
114  */
115 #ifndef _SYS_SYSPROTO_H_
116 struct pread_args {
117 	int	fd;
118 	void	*buf;
119 	size_t	nbyte;
120 	int	pad;
121 	off_t	offset;
122 };
123 #endif
124 int
125 pread(td, uap)
126 	struct thread *td;
127 	struct pread_args *uap;
128 {
129 	struct uio auio;
130 	struct iovec aiov;
131 	int error;
132 
133 	if (uap->nbyte > INT_MAX)
134 		return (EINVAL);
135 	aiov.iov_base = uap->buf;
136 	aiov.iov_len = uap->nbyte;
137 	auio.uio_iov = &aiov;
138 	auio.uio_iovcnt = 1;
139 	auio.uio_resid = uap->nbyte;
140 	auio.uio_segflg = UIO_USERSPACE;
141 	error = kern_preadv(td, uap->fd, &auio, uap->offset);
142 	return (error);
143 }
144 
145 /*
146  * Scatter read system call.
147  */
148 #ifndef _SYS_SYSPROTO_H_
149 struct readv_args {
150 	int	fd;
151 	struct	iovec *iovp;
152 	u_int	iovcnt;
153 };
154 #endif
155 int
156 readv(struct thread *td, struct readv_args *uap)
157 {
158 	struct uio *auio;
159 	int error;
160 
161 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
162 	if (error)
163 		return (error);
164 	error = kern_readv(td, uap->fd, auio);
165 	free(auio, M_IOV);
166 	return (error);
167 }
168 
169 int
170 kern_readv(struct thread *td, int fd, struct uio *auio)
171 {
172 	struct file *fp;
173 	int error;
174 
175 	error = fget_read(td, fd, &fp);
176 	if (error)
177 		return (error);
178 	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
179 	fdrop(fp, td);
180 	return (error);
181 }
182 
183 /*
184  * Scatter positioned read system call.
185  */
186 #ifndef _SYS_SYSPROTO_H_
187 struct preadv_args {
188 	int	fd;
189 	struct	iovec *iovp;
190 	u_int	iovcnt;
191 	off_t	offset;
192 };
193 #endif
194 int
195 preadv(struct thread *td, struct preadv_args *uap)
196 {
197 	struct uio *auio;
198 	int error;
199 
200 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
201 	if (error)
202 		return (error);
203 	error = kern_preadv(td, uap->fd, auio, uap->offset);
204 	free(auio, M_IOV);
205 	return (error);
206 }
207 
208 int
209 kern_preadv(td, fd, auio, offset)
210 	struct thread *td;
211 	int fd;
212 	struct uio *auio;
213 	off_t offset;
214 {
215 	struct file *fp;
216 	int error;
217 
218 	error = fget_read(td, fd, &fp);
219 	if (error)
220 		return (error);
221 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
222 		error = ESPIPE;
223 	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
224 		error = EINVAL;
225 	else
226 		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
227 	fdrop(fp, td);
228 	return (error);
229 }
230 
231 /*
232  * Common code for readv and preadv that reads data in
233  * from a file using the passed in uio, offset, and flags.
234  */
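/*
 * Roughly how the offset/flags pair is used below (based on FOF_OFFSET in
 * sys/file.h rather than on the original comment): plain read(2) and
 * readv(2) arrive here via kern_readv() with offset -1 and flags 0, so
 * fo_read() consults and advances the descriptor's own file offset, while
 * pread(2) and preadv(2) pass FOF_OFFSET so the offset carried in the uio
 * is honored instead and the seek pointer is left alone.
 */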
235 static int
236 dofileread(td, fd, fp, auio, offset, flags)
237 	struct thread *td;
238 	int fd;
239 	struct file *fp;
240 	struct uio *auio;
241 	off_t offset;
242 	int flags;
243 {
244 	ssize_t cnt;
245 	int error;
246 #ifdef KTRACE
247 	struct uio *ktruio = NULL;
248 #endif
249 
250 	/* Finish zero length reads right here */
251 	if (auio->uio_resid == 0) {
252 		td->td_retval[0] = 0;
253 		return (0);
254 	}
255 	auio->uio_rw = UIO_READ;
256 	auio->uio_offset = offset;
257 	auio->uio_td = td;
258 #ifdef KTRACE
259 	if (KTRPOINT(td, KTR_GENIO))
260 		ktruio = cloneuio(auio);
261 #endif
262 	cnt = auio->uio_resid;
263 	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
264 		if (auio->uio_resid != cnt && (error == ERESTART ||
265 		    error == EINTR || error == EWOULDBLOCK))
266 			error = 0;
267 	}
268 	cnt -= auio->uio_resid;
269 #ifdef KTRACE
270 	if (ktruio != NULL) {
271 		ktruio->uio_resid = cnt;
272 		ktrgenio(fd, UIO_READ, ktruio, error);
273 	}
274 #endif
275 	td->td_retval[0] = cnt;
276 	return (error);
277 }
278 
279 #ifndef _SYS_SYSPROTO_H_
280 struct write_args {
281 	int	fd;
282 	const void *buf;
283 	size_t	nbyte;
284 };
285 #endif
286 int
287 write(td, uap)
288 	struct thread *td;
289 	struct write_args *uap;
290 {
291 	struct uio auio;
292 	struct iovec aiov;
293 	int error;
294 
295 	if (uap->nbyte > INT_MAX)
296 		return (EINVAL);
297 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
298 	aiov.iov_len = uap->nbyte;
299 	auio.uio_iov = &aiov;
300 	auio.uio_iovcnt = 1;
301 	auio.uio_resid = uap->nbyte;
302 	auio.uio_segflg = UIO_USERSPACE;
303 	error = kern_writev(td, uap->fd, &auio);
304 	return (error);
305 }
306 
307 /*
308  * Positioned write system call.
309  */
310 #ifndef _SYS_SYSPROTO_H_
311 struct pwrite_args {
312 	int	fd;
313 	const void *buf;
314 	size_t	nbyte;
315 	int	pad;
316 	off_t	offset;
317 };
318 #endif
319 int
320 pwrite(td, uap)
321 	struct thread *td;
322 	struct pwrite_args *uap;
323 {
324 	struct uio auio;
325 	struct iovec aiov;
326 	int error;
327 
328 	if (uap->nbyte > INT_MAX)
329 		return (EINVAL);
330 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
331 	aiov.iov_len = uap->nbyte;
332 	auio.uio_iov = &aiov;
333 	auio.uio_iovcnt = 1;
334 	auio.uio_resid = uap->nbyte;
335 	auio.uio_segflg = UIO_USERSPACE;
336 	error = kern_pwritev(td, uap->fd, &auio, uap->offset);
337 	return (error);
338 }
339 
340 /*
341  * Gather write system call.
342  */
343 #ifndef _SYS_SYSPROTO_H_
344 struct writev_args {
345 	int	fd;
346 	struct	iovec *iovp;
347 	u_int	iovcnt;
348 };
349 #endif
350 int
351 writev(struct thread *td, struct writev_args *uap)
352 {
353 	struct uio *auio;
354 	int error;
355 
356 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
357 	if (error)
358 		return (error);
359 	error = kern_writev(td, uap->fd, auio);
360 	free(auio, M_IOV);
361 	return (error);
362 }
363 
364 int
365 kern_writev(struct thread *td, int fd, struct uio *auio)
366 {
367 	struct file *fp;
368 	int error;
369 
370 	error = fget_write(td, fd, &fp);
371 	if (error)
372 		return (error);
373 	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
374 	fdrop(fp, td);
375 	return (error);
376 }
377 
378 /*
379  * Gather positioned write system call.
380  */
381 #ifndef _SYS_SYSPROTO_H_
382 struct pwritev_args {
383 	int	fd;
384 	struct	iovec *iovp;
385 	u_int	iovcnt;
386 	off_t	offset;
387 };
388 #endif
389 int
390 pwritev(struct thread *td, struct pwritev_args *uap)
391 {
392 	struct uio *auio;
393 	int error;
394 
395 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
396 	if (error)
397 		return (error);
398 	error = kern_pwritev(td, uap->fd, auio, uap->offset);
399 	free(auio, M_IOV);
400 	return (error);
401 }
402 
403 int
404 kern_pwritev(td, fd, auio, offset)
405 	struct thread *td;
406 	int fd;
407 	struct uio *auio;
408 	off_t offset;
409 {
410 	struct file *fp;
411 	int error;
412 
413 	error = fget_write(td, fd, &fp);
414 	if (error)
415 		return (error);
416 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
417 		error = ESPIPE;
418 	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
419 		error = EINVAL;
420 	else
421 		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
422 	fdrop(fp, td);
423 	return (error);
424 }
425 
426 /*
427  * Common code for writev and pwritev that writes data to
428  * a file using the passed in uio, offset, and flags.
429  */
430 static int
431 dofilewrite(td, fd, fp, auio, offset, flags)
432 	struct thread *td;
433 	int fd;
434 	struct file *fp;
435 	struct uio *auio;
436 	off_t offset;
437 	int flags;
438 {
439 	ssize_t cnt;
440 	int error;
441 #ifdef KTRACE
442 	struct uio *ktruio = NULL;
443 #endif
444 
445 	auio->uio_rw = UIO_WRITE;
446 	auio->uio_td = td;
447 	auio->uio_offset = offset;
448 #ifdef KTRACE
449 	if (KTRPOINT(td, KTR_GENIO))
450 		ktruio = cloneuio(auio);
451 #endif
452 	cnt = auio->uio_resid;
453 	if (fp->f_type == DTYPE_VNODE)
454 		bwillwrite();
455 	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
456 		if (auio->uio_resid != cnt && (error == ERESTART ||
457 		    error == EINTR || error == EWOULDBLOCK))
458 			error = 0;
459 		/* Socket layer is responsible for issuing SIGPIPE. */
460 		if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
461 			PROC_LOCK(td->td_proc);
462 			psignal(td->td_proc, SIGPIPE);
463 			PROC_UNLOCK(td->td_proc);
464 		}
465 	}
466 	cnt -= auio->uio_resid;
467 #ifdef KTRACE
468 	if (ktruio != NULL) {
469 		ktruio->uio_resid = cnt;
470 		ktrgenio(fd, UIO_WRITE, ktruio, error);
471 	}
472 #endif
473 	td->td_retval[0] = cnt;
474 	return (error);
475 }
476 
477 #ifndef _SYS_SYSPROTO_H_
478 struct ioctl_args {
479 	int	fd;
480 	u_long	com;
481 	caddr_t	data;
482 };
483 #endif
484 /* ARGSUSED */
485 int
486 ioctl(struct thread *td, struct ioctl_args *uap)
487 {
488 	u_long com;
489 	int arg, error;
490 	u_int size;
491 	caddr_t data;
492 
493 	if (uap->com > 0xffffffff) {
494 		printf(
495 		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
496 		    td->td_proc->p_pid, td->td_proc->p_comm, uap->com);
497 		uap->com &= 0xffffffff;
498 	}
499 	com = uap->com;
500 
501 	/*
502 	 * Interpret high order word to find amount of data to be
503 	 * copied to/from the user's address space.
504 	 */
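	/*
	 * A worked example of that encoding (taken from sys/ioccom.h rather
	 * than from anything in this file): FIONBIO is _IOW('f', 126, int),
	 * so bits 16..28 of the command hold sizeof(int), recovered by
	 * IOCPARM_LEN() below, and IOC_IN is set, which routes the request
	 * through malloc() and copyin() before kern_ioctl() is reached.
	 */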
505 	size = IOCPARM_LEN(com);
506 	if ((size > IOCPARM_MAX) ||
507 	    ((com & (IOC_VOID  | IOC_IN | IOC_OUT)) == 0) ||
508 #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
509 	    ((com & IOC_OUT) && size == 0) ||
510 #else
511 	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
512 #endif
513 	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
514 		return (ENOTTY);
515 
516 	if (size > 0) {
517 		if (!(com & IOC_VOID))
518 			data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
519 		else {
520 			/* Integer argument. */
521 			arg = (intptr_t)uap->data;
522 			data = (void *)&arg;
523 			size = 0;
524 		}
525 	} else
526 		data = (void *)&uap->data;
527 	if (com & IOC_IN) {
528 		error = copyin(uap->data, data, (u_int)size);
529 		if (error) {
530 			if (size > 0)
531 				free(data, M_IOCTLOPS);
532 			return (error);
533 		}
534 	} else if (com & IOC_OUT) {
535 		/*
536 		 * Zero the buffer so the user always
537 		 * gets back something deterministic.
538 		 */
539 		bzero(data, size);
540 	}
541 
542 	error = kern_ioctl(td, uap->fd, com, data);
543 
544 	if (error == 0 && (com & IOC_OUT))
545 		error = copyout(data, uap->data, (u_int)size);
546 
547 	if (size > 0)
548 		free(data, M_IOCTLOPS);
549 	return (error);
550 }
551 
552 int
553 kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
554 {
555 	struct file *fp;
556 	struct filedesc *fdp;
557 	int error;
558 	int tmp;
559 
560 	if ((error = fget(td, fd, &fp)) != 0)
561 		return (error);
562 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
563 		fdrop(fp, td);
564 		return (EBADF);
565 	}
566 	fdp = td->td_proc->p_fd;
567 	switch (com) {
568 	case FIONCLEX:
569 		FILEDESC_XLOCK(fdp);
570 		fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
571 		FILEDESC_XUNLOCK(fdp);
572 		goto out;
573 	case FIOCLEX:
574 		FILEDESC_XLOCK(fdp);
575 		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
576 		FILEDESC_XUNLOCK(fdp);
577 		goto out;
578 	case FIONBIO:
579 		FILE_LOCK(fp);
580 		if ((tmp = *(int *)data))
581 			fp->f_flag |= FNONBLOCK;
582 		else
583 			fp->f_flag &= ~FNONBLOCK;
584 		FILE_UNLOCK(fp);
585 		data = (void *)&tmp;
586 		break;
587 	case FIOASYNC:
588 		FILE_LOCK(fp);
589 		if ((tmp = *(int *)data))
590 			fp->f_flag |= FASYNC;
591 		else
592 			fp->f_flag &= ~FASYNC;
593 		FILE_UNLOCK(fp);
594 		data = (void *)&tmp;
595 		break;
596 	}
597 
598 	error = fo_ioctl(fp, com, data, td->td_ucred, td);
599 out:
600 	fdrop(fp, td);
601 	return (error);
602 }
603 
604 /*
605  * sellock and selwait are initialized in selectinit() via SYSINIT.
606  */
607 struct mtx	sellock;
608 struct cv	selwait;
609 u_int		nselcoll;	/* Select collisions since boot */
610 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
611 
612 #ifndef _SYS_SYSPROTO_H_
613 struct select_args {
614 	int	nd;
615 	fd_set	*in, *ou, *ex;
616 	struct	timeval *tv;
617 };
618 #endif
619 int
620 select(td, uap)
621 	register struct thread *td;
622 	register struct select_args *uap;
623 {
624 	struct timeval tv, *tvp;
625 	int error;
626 
627 	if (uap->tv != NULL) {
628 		error = copyin(uap->tv, &tv, sizeof(tv));
629 		if (error)
630 			return (error);
631 		tvp = &tv;
632 	} else
633 		tvp = NULL;
634 
635 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
636 }
637 
638 int
639 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
640     fd_set *fd_ex, struct timeval *tvp)
641 {
642 	struct filedesc *fdp;
643 	/*
644 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
645 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
646 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
647 	 * of 256.
648 	 */
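	/*
	 * (A quick check of that claim, not part of the original comment:
	 * with only an input set and nd == 1024, nbufbytes works out below
	 * to 2 * (1024 / NBBY) == 256 bytes, which is exactly the 2048 bits
	 * reserved here.)
	 */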
649 	fd_mask s_selbits[howmany(2048, NFDBITS)];
650 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
651 	struct timeval atv, rtv, ttv;
652 	int error, timo;
653 	u_int ncoll, nbufbytes, ncpbytes, nfdbits;
654 
655 	if (nd < 0)
656 		return (EINVAL);
657 	fdp = td->td_proc->p_fd;
658 
659 	FILEDESC_SLOCK(fdp);
660 	if (nd > td->td_proc->p_fd->fd_nfiles)
661 		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
662 	FILEDESC_SUNLOCK(fdp);
663 
664 	/*
665 	 * Allocate just enough bits for the non-null fd_sets.  Use the
666 	 * preallocated auto buffer if possible.
667 	 */
668 	nfdbits = roundup(nd, NFDBITS);
669 	ncpbytes = nfdbits / NBBY;
670 	nbufbytes = 0;
671 	if (fd_in != NULL)
672 		nbufbytes += 2 * ncpbytes;
673 	if (fd_ou != NULL)
674 		nbufbytes += 2 * ncpbytes;
675 	if (fd_ex != NULL)
676 		nbufbytes += 2 * ncpbytes;
677 	if (nbufbytes <= sizeof s_selbits)
678 		selbits = &s_selbits[0];
679 	else
680 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
681 
682 	/*
683 	 * Assign pointers into the bit buffers and fetch the input bits.
684 	 * Put the output buffers together so that they can be bzeroed
685 	 * together.
686 	 */
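	/*
	 * Layout built by getbits() below, sketched for the case where all
	 * three sets are supplied: the output copies are packed back to back
	 * in the first half of selbits and the input copies sit nbufbytes / 2
	 * further on,
	 *
	 *	| obits[0] | obits[1] | obits[2] | ibits[0] | ibits[1] | ibits[2] |
	 *
	 * which is why a single bzero() of the first half clears every
	 * output set at once.
	 */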
687 	sbp = selbits;
688 #define	getbits(name, x) \
689 	do {								\
690 		if (name == NULL)					\
691 			ibits[x] = NULL;				\
692 		else {							\
693 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
694 			obits[x] = sbp;					\
695 			sbp += ncpbytes / sizeof *sbp;			\
696 			error = copyin(name, ibits[x], ncpbytes);	\
697 			if (error != 0)					\
698 				goto done_nosellock;			\
699 		}							\
700 	} while (0)
701 	getbits(fd_in, 0);
702 	getbits(fd_ou, 1);
703 	getbits(fd_ex, 2);
704 #undef	getbits
705 	if (nbufbytes != 0)
706 		bzero(selbits, nbufbytes / 2);
707 
708 	if (tvp != NULL) {
709 		atv = *tvp;
710 		if (itimerfix(&atv)) {
711 			error = EINVAL;
712 			goto done_nosellock;
713 		}
714 		getmicrouptime(&rtv);
715 		timevaladd(&atv, &rtv);
716 	} else {
717 		atv.tv_sec = 0;
718 		atv.tv_usec = 0;
719 	}
720 	timo = 0;
721 	TAILQ_INIT(&td->td_selq);
722 	mtx_lock(&sellock);
723 retry:
724 	ncoll = nselcoll;
725 	thread_lock(td);
726 	td->td_flags |= TDF_SELECT;
727 	thread_unlock(td);
728 	mtx_unlock(&sellock);
729 
730 	error = selscan(td, ibits, obits, nd);
731 	mtx_lock(&sellock);
732 	if (error || td->td_retval[0])
733 		goto done;
734 	if (atv.tv_sec || atv.tv_usec) {
735 		getmicrouptime(&rtv);
736 		if (timevalcmp(&rtv, &atv, >=))
737 			goto done;
738 		ttv = atv;
739 		timevalsub(&ttv, &rtv);
740 		timo = ttv.tv_sec > 24 * 60 * 60 ?
741 		    24 * 60 * 60 * hz : tvtohz(&ttv);
742 	}
743 
744 	/*
745 	 * An event of interest may occur while we do not hold
746 	 * sellock, so check TDF_SELECT and the number of
747 	 * collisions and rescan the file descriptors if
748 	 * necessary.
749 	 */
750 	thread_lock(td);
751 	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
752 		thread_unlock(td);
753 		goto retry;
754 	}
755 	thread_unlock(td);
756 
757 	if (timo > 0)
758 		error = cv_timedwait_sig(&selwait, &sellock, timo);
759 	else
760 		error = cv_wait_sig(&selwait, &sellock);
761 
762 	if (error == 0)
763 		goto retry;
764 
765 done:
766 	clear_selinfo_list(td);
767 	thread_lock(td);
768 	td->td_flags &= ~TDF_SELECT;
769 	thread_unlock(td);
770 	mtx_unlock(&sellock);
771 
772 done_nosellock:
773 	/* select is not restarted after signals... */
774 	if (error == ERESTART)
775 		error = EINTR;
776 	if (error == EWOULDBLOCK)
777 		error = 0;
778 #define	putbits(name, x) \
779 	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
780 		error = error2;
781 	if (error == 0) {
782 		int error2;
783 
784 		putbits(fd_in, 0);
785 		putbits(fd_ou, 1);
786 		putbits(fd_ex, 2);
787 #undef putbits
788 	}
789 	if (selbits != &s_selbits[0])
790 		free(selbits, M_SELECT);
791 
792 	return (error);
793 }
794 
795 static int
796 selscan(td, ibits, obits, nfd)
797 	struct thread *td;
798 	fd_mask **ibits, **obits;
799 	int nfd;
800 {
801 	int msk, i, fd;
802 	fd_mask bits;
803 	struct file *fp;
804 	int n = 0;
805 	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
806 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
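	/*
	 * The msk index 0, 1 and 2 below corresponds to the in, out and
	 * except sets assembled by getbits() in kern_select().
	 */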
807 	struct filedesc *fdp = td->td_proc->p_fd;
808 
809 	FILEDESC_SLOCK(fdp);
810 	for (msk = 0; msk < 3; msk++) {
811 		if (ibits[msk] == NULL)
812 			continue;
813 		for (i = 0; i < nfd; i += NFDBITS) {
814 			bits = ibits[msk][i/NFDBITS];
815 			/* ffs(int mask) not portable, fd_mask is long */
816 			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
817 				if (!(bits & 1))
818 					continue;
819 				if ((fp = fget_locked(fdp, fd)) == NULL) {
820 					FILEDESC_SUNLOCK(fdp);
821 					return (EBADF);
822 				}
823 				if (fo_poll(fp, flag[msk], td->td_ucred,
824 				    td)) {
825 					obits[msk][(fd)/NFDBITS] |=
826 					    ((fd_mask)1 << ((fd) % NFDBITS));
827 					n++;
828 				}
829 			}
830 		}
831 	}
832 	FILEDESC_SUNLOCK(fdp);
833 	td->td_retval[0] = n;
834 	return (0);
835 }
836 
837 #ifndef _SYS_SYSPROTO_H_
838 struct poll_args {
839 	struct pollfd *fds;
840 	u_int	nfds;
841 	int	timeout;
842 };
843 #endif
844 int
845 poll(td, uap)
846 	struct thread *td;
847 	struct poll_args *uap;
848 {
849 	struct pollfd *bits;
850 	struct pollfd smallbits[32];
851 	struct timeval atv, rtv, ttv;
852 	int error = 0, timo;
853 	u_int ncoll, nfds;
854 	size_t ni;
855 
856 	nfds = uap->nfds;
857 
858 	/*
859 	 * This is kinda bogus.  We have fd limits, but that is not
860 	 * really related to the size of the pollfd array.  Make sure
861 	 * we let the process use at least FD_SETSIZE entries and at
862 	 * least enough for the current limits.  We want to be reasonably
863 	 * safe, but not overly restrictive.
864 	 */
865 	PROC_LOCK(td->td_proc);
866 	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
867 	    (nfds > FD_SETSIZE)) {
868 		PROC_UNLOCK(td->td_proc);
869 		error = EINVAL;
870 		goto done2;
871 	}
872 	PROC_UNLOCK(td->td_proc);
873 	ni = nfds * sizeof(struct pollfd);
874 	if (ni > sizeof(smallbits))
875 		bits = malloc(ni, M_TEMP, M_WAITOK);
876 	else
877 		bits = smallbits;
878 	error = copyin(uap->fds, bits, ni);
879 	if (error)
880 		goto done_nosellock;
881 	if (uap->timeout != INFTIM) {
882 		atv.tv_sec = uap->timeout / 1000;
883 		atv.tv_usec = (uap->timeout % 1000) * 1000;
884 		if (itimerfix(&atv)) {
885 			error = EINVAL;
886 			goto done_nosellock;
887 		}
888 		getmicrouptime(&rtv);
889 		timevaladd(&atv, &rtv);
890 	} else {
891 		atv.tv_sec = 0;
892 		atv.tv_usec = 0;
893 	}
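	/*
	 * (Example of the conversion above, for illustration only: a timeout
	 * of 1500 ms yields atv = { 1, 500000 }, which getmicrouptime() and
	 * timevaladd() then turn into an absolute uptime deadline.)
	 */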
894 	timo = 0;
895 	TAILQ_INIT(&td->td_selq);
896 	mtx_lock(&sellock);
897 retry:
898 	ncoll = nselcoll;
899 	thread_lock(td);
900 	td->td_flags |= TDF_SELECT;
901 	thread_unlock(td);
902 	mtx_unlock(&sellock);
903 
904 	error = pollscan(td, bits, nfds);
905 	mtx_lock(&sellock);
906 	if (error || td->td_retval[0])
907 		goto done;
908 	if (atv.tv_sec || atv.tv_usec) {
909 		getmicrouptime(&rtv);
910 		if (timevalcmp(&rtv, &atv, >=))
911 			goto done;
912 		ttv = atv;
913 		timevalsub(&ttv, &rtv);
914 		timo = ttv.tv_sec > 24 * 60 * 60 ?
915 		    24 * 60 * 60 * hz : tvtohz(&ttv);
916 	}
917 	/*
918 	 * An event of interest may occur while we do not hold
919 	 * sellock, so check TDF_SELECT and the number of collisions
920 	 * and rescan the file descriptors if necessary.
921 	 */
922 	thread_lock(td);
923 	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
924 		thread_unlock(td);
925 		goto retry;
926 	}
927 	thread_unlock(td);
928 
929 	if (timo > 0)
930 		error = cv_timedwait_sig(&selwait, &sellock, timo);
931 	else
932 		error = cv_wait_sig(&selwait, &sellock);
933 
934 	if (error == 0)
935 		goto retry;
936 
937 done:
938 	clear_selinfo_list(td);
939 	thread_lock(td);
940 	td->td_flags &= ~TDF_SELECT;
941 	thread_unlock(td);
942 	mtx_unlock(&sellock);
943 
944 done_nosellock:
945 	/* poll is not restarted after signals... */
946 	if (error == ERESTART)
947 		error = EINTR;
948 	if (error == EWOULDBLOCK)
949 		error = 0;
950 	if (error == 0) {
951 		error = copyout(bits, uap->fds, ni);
952 		if (error)
953 			goto out;
954 	}
955 out:
956 	if (ni > sizeof(smallbits))
957 		free(bits, M_TEMP);
958 done2:
959 	return (error);
960 }
961 
962 static int
963 pollscan(td, fds, nfd)
964 	struct thread *td;
965 	struct pollfd *fds;
966 	u_int nfd;
967 {
968 	register struct filedesc *fdp = td->td_proc->p_fd;
969 	int i;
970 	struct file *fp;
971 	int n = 0;
972 
973 	FILEDESC_SLOCK(fdp);
974 	for (i = 0; i < nfd; i++, fds++) {
975 		if (fds->fd >= fdp->fd_nfiles) {
976 			fds->revents = POLLNVAL;
977 			n++;
978 		} else if (fds->fd < 0) {
979 			fds->revents = 0;
980 		} else {
981 			fp = fdp->fd_ofiles[fds->fd];
982 			if (fp == NULL) {
983 				fds->revents = POLLNVAL;
984 				n++;
985 			} else {
986 				/*
987 				 * Note: backend also returns POLLHUP and
988 				 * POLLERR if appropriate.
989 				 */
990 				fds->revents = fo_poll(fp, fds->events,
991 				    td->td_ucred, td);
992 				if (fds->revents != 0)
993 					n++;
994 			}
995 		}
996 	}
997 	FILEDESC_SUNLOCK(fdp);
998 	td->td_retval[0] = n;
999 	return (0);
1000 }
1001 
1002 /*
1003  * OpenBSD poll system call.
1004  *
1005  * XXX this isn't quite a true representation; OpenBSD uses select ops.
1006  */
1007 #ifndef _SYS_SYSPROTO_H_
1008 struct openbsd_poll_args {
1009 	struct pollfd *fds;
1010 	u_int	nfds;
1011 	int	timeout;
1012 };
1013 #endif
1014 int
1015 openbsd_poll(td, uap)
1016 	register struct thread *td;
1017 	register struct openbsd_poll_args *uap;
1018 {
1019 	return (poll(td, (struct poll_args *)uap));
1020 }
1021 
1022 /*
1023  * Remove the references to the thread from all of the objects we were
1024  * polling.
1025  *
1026  * This code assumes that the underlying owner of the selinfo structure will
1027  * hold sellock before it changes it, and that it will unlink itself from our
1028  * list if it goes away.
1029  */
1030 void
1031 clear_selinfo_list(td)
1032 	struct thread *td;
1033 {
1034 	struct selinfo *si;
1035 
1036 	mtx_assert(&sellock, MA_OWNED);
1037 	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1038 		si->si_thread = NULL;
1039 	TAILQ_INIT(&td->td_selq);
1040 }
1041 
1042 /*
1043  * Record a select request.
1044  */
1045 void
1046 selrecord(selector, sip)
1047 	struct thread *selector;
1048 	struct selinfo *sip;
1049 {
1050 
1051 	mtx_lock(&sellock);
1052 	/*
1053 	 * If the selinfo's thread pointer is NULL then take ownership of it.
1054 	 *
1055 	 * If the thread pointer is not NULL and it points to another
1056 	 * thread, then we have a collision.
1057 	 *
1058 	 * If the thread pointer is not NULL and points back to us then leave
1059 	 * it alone, as we've already pointed it at us and added it to our
1060 	 * list.
1061 	 */
1062 	if (sip->si_thread == NULL) {
1063 		sip->si_thread = selector;
1064 		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1065 	} else if (sip->si_thread != selector) {
1066 		sip->si_flags |= SI_COLL;
1067 	}
1068 
1069 	mtx_unlock(&sellock);
1070 }
1071 
1072 /* Wake up a selecting thread. */
1073 void
1074 selwakeup(sip)
1075 	struct selinfo *sip;
1076 {
1077 	doselwakeup(sip, -1);
1078 }
1079 
1080 /* Wake up a selecting thread, and set its priority. */
1081 void
1082 selwakeuppri(sip, pri)
1083 	struct selinfo *sip;
1084 	int pri;
1085 {
1086 	doselwakeup(sip, pri);
1087 }
1088 
1089 /*
1090  * Do a wakeup when a selectable event occurs.
1091  */
1092 static void
1093 doselwakeup(sip, pri)
1094 	struct selinfo *sip;
1095 	int pri;
1096 {
1097 	struct thread *td;
1098 
1099 	mtx_lock(&sellock);
1100 	td = sip->si_thread;
1101 	if ((sip->si_flags & SI_COLL) != 0) {
1102 		nselcoll++;
1103 		sip->si_flags &= ~SI_COLL;
1104 		cv_broadcastpri(&selwait, pri);
1105 	}
1106 	if (td == NULL) {
1107 		mtx_unlock(&sellock);
1108 		return;
1109 	}
1110 	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
1111 	sip->si_thread = NULL;
1112 	thread_lock(td);
1113 	td->td_flags &= ~TDF_SELECT;
1114 	thread_unlock(td);
1115 	sleepq_remove(td, &selwait);
1116 	mtx_unlock(&sellock);
1117 }
1118 
1119 static void selectinit(void *);
1120 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1121 
1122 /* ARGSUSED*/
1123 static void
1124 selectinit(dummy)
1125 	void *dummy;
1126 {
1127 	cv_init(&selwait, "select");
1128 	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1129 }
1130