xref: /freebsd/sys/kern/sys_generic.c (revision d2387d42b8da231a5b95cbc313825fb2aadf26f6)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 #include "opt_ktrace.h"
45 
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/sysproto.h>
49 #include <sys/filedesc.h>
50 #include <sys/filio.h>
51 #include <sys/fcntl.h>
52 #include <sys/file.h>
53 #include <sys/proc.h>
54 #include <sys/signalvar.h>
55 #include <sys/socketvar.h>
56 #include <sys/uio.h>
57 #include <sys/kernel.h>
58 #include <sys/limits.h>
59 #include <sys/malloc.h>
60 #include <sys/poll.h>
61 #include <sys/resourcevar.h>
62 #include <sys/selinfo.h>
63 #include <sys/sleepqueue.h>
64 #include <sys/syscallsubr.h>
65 #include <sys/sysctl.h>
66 #include <sys/sysent.h>
67 #include <sys/vnode.h>
68 #include <sys/bio.h>
69 #include <sys/buf.h>
70 #include <sys/condvar.h>
71 #ifdef KTRACE
72 #include <sys/ktrace.h>
73 #endif
74 #include <vm/vm.h>
75 #include <vm/vm_page.h>
76 
77 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
78 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
79 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
80 
81 static int	pollscan(struct thread *, struct pollfd *, u_int);
82 static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
83 static int	dofileread(struct thread *, struct file *, int, void *,
84 		    size_t, off_t, int);
85 static int	dofilewrite(struct thread *, struct file *, int,
86 		    const void *, size_t, off_t, int);
87 static void	doselwakeup(struct selinfo *, int);
88 
89 /*
90  * Read system call.
91  */
92 #ifndef _SYS_SYSPROTO_H_
93 struct read_args {
94 	int	fd;
95 	void	*buf;
96 	size_t	nbyte;
97 };
98 #endif
99 /*
100  * MPSAFE
101  */
102 int
103 read(td, uap)
104 	struct thread *td;
105 	struct read_args *uap;
106 {
107 	struct file *fp;
108 	int error;
109 
110 	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
111 		error = dofileread(td, fp, uap->fd, uap->buf,
112 			    uap->nbyte, (off_t)-1, 0);
113 		fdrop(fp, td);
114 	}
115 	return(error);
116 }
117 
118 /*
119  * Pread system call
120  */
121 #ifndef _SYS_SYSPROTO_H_
122 struct pread_args {
123 	int	fd;
124 	void	*buf;
125 	size_t	nbyte;
126 	int	pad;
127 	off_t	offset;
128 };
129 #endif
130 /*
131  * MPSAFE
132  */
133 int
134 pread(td, uap)
135 	struct thread *td;
136 	struct pread_args *uap;
137 {
138 	struct file *fp;
139 	int error;
140 
141 	if ((error = fget_read(td, uap->fd, &fp)) != 0)
142 		return (error);
143 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
144 		error = ESPIPE;
145 	else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
146 		error = EINVAL;
147 	else {
148 		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
149 			    uap->offset, FOF_OFFSET);
150 	}
151 	fdrop(fp, td);
152 	return(error);
153 }
154 
155 /*
156  * Code common for read and pread
157  */
/*
 * Common back end for read() and pread(): build a single-segment uio
 * describing the user buffer and hand it to the file's fo_read method.
 *
 * fd is used only for the ktrace record; flags is either 0 or FOF_OFFSET
 * (honor the supplied offset instead of the file's current offset).
 * On return td->td_retval[0] holds the byte count actually transferred.
 */
static int
dofileread(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	aiov.iov_base = buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* Single transfers are limited to INT_MAX bytes. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec (fo_read consumes auio/aiov,
	 * so the snapshot is taken before the call)
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;

	if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
		/*
		 * If some data was transferred before the interruption,
		 * report the partial read as success.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	/* cnt becomes the number of bytes actually read. */
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_READ, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
215 
216 /*
217  * Scatter read system call.
218  */
219 #ifndef _SYS_SYSPROTO_H_
220 struct readv_args {
221 	int	fd;
222 	struct	iovec *iovp;
223 	u_int	iovcnt;
224 };
225 #endif
226 /*
227  * MPSAFE
228  */
/*
 * Scatter read: copy in the user's iovec array (using a stack buffer
 * for small counts, a malloc'd one otherwise), total and validate the
 * transfer size, then issue a single fo_read at the current offset.
 */
int
readv(td, uap)
	struct thread *td;
	struct readv_args *uap;
{
	struct file *fp;
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;		/* non-NULL iff iov was malloc'd */
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt;
	int error;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	if ((error = fget_read(td, uap->fd, &fp)) != 0)
		return (error);
	needfree = NULL;
	/* note: can't use iovlen until iovcnt is validated */
	iovlen = uap->iovcnt * sizeof (struct iovec);
	if (uap->iovcnt > UIO_SMALLIOV) {
		if (uap->iovcnt > UIO_MAXIOV) {
			error = EINVAL;
			goto done;
		}
		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
		needfree = iov;
	} else
		iov = aiov;
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
	auio.uio_offset = -1;		/* read at the current file offset */
	if ((error = copyin(uap->iovp, iov, iovlen)))
		goto done;
	/* Sum the segment lengths, rejecting totals past INT_MAX. */
	auio.uio_resid = 0;
	for (i = 0; i < uap->iovcnt; i++) {
		if (iov->iov_len > INT_MAX - auio.uio_resid) {
			error = EINVAL;
			goto done;
		}
		auio.uio_resid += iov->iov_len;
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(td, KTR_GENIO))  {
		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy(auio.uio_iov, ktriov, iovlen);
		ktruio = auio;
	}
#endif
	cnt = auio.uio_resid;
	if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) {
		/* A partial transfer cut short by a signal still succeeds. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = cnt;
			ktrgenio(uap->fd, UIO_READ, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	td->td_retval[0] = cnt;
done:
	fdrop(fp, td);
	if (needfree)
		FREE(needfree, M_IOV);
	return (error);
}
312 
313 /*
314  * Write system call
315  */
316 #ifndef _SYS_SYSPROTO_H_
317 struct write_args {
318 	int	fd;
319 	const void *buf;
320 	size_t	nbyte;
321 };
322 #endif
323 /*
324  * MPSAFE
325  */
326 int
327 write(td, uap)
328 	struct thread *td;
329 	struct write_args *uap;
330 {
331 	struct file *fp;
332 	int error;
333 
334 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
335 		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
336 			    (off_t)-1, 0);
337 		fdrop(fp, td);
338 	} else {
339 		error = EBADF;	/* XXX this can't be right */
340 	}
341 	return(error);
342 }
343 
344 /*
345  * Pwrite system call
346  */
347 #ifndef _SYS_SYSPROTO_H_
348 struct pwrite_args {
349 	int	fd;
350 	const void *buf;
351 	size_t	nbyte;
352 	int	pad;
353 	off_t	offset;
354 };
355 #endif
356 /*
357  * MPSAFE
358  */
359 int
360 pwrite(td, uap)
361 	struct thread *td;
362 	struct pwrite_args *uap;
363 {
364 	struct file *fp;
365 	int error;
366 
367 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
368 		if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
369 			error = ESPIPE;
370 		else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
371 			error = EINVAL;
372 		else {
373 			error = dofilewrite(td, fp, uap->fd, uap->buf,
374 				    uap->nbyte, uap->offset, FOF_OFFSET);
375 		}
376 		fdrop(fp, td);
377 	} else {
378 		error = EBADF;	/* this can't be right */
379 	}
380 	return(error);
381 }
382 
/*
 * Common back end for write() and pwrite(): build a single-segment uio
 * describing the user buffer and hand it to the file's fo_write method.
 *
 * fd is used only for the ktrace record; flags is either 0 or FOF_OFFSET.
 * On return td->td_retval[0] holds the byte count actually transferred.
 * Delivers SIGPIPE on EPIPE for non-socket files; the socket layer
 * issues its own SIGPIPE.
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Cast away const: the uio machinery takes a non-const base. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* Single transfers are limited to INT_MAX bytes. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	/* Give the buffer cache a chance to flush before a vnode write. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
		/*
		 * If some data was transferred before the interruption,
		 * report the partial write as success.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	/* cnt becomes the number of bytes actually written. */
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
447 
448 /*
449  * Gather write system call
450  */
451 #ifndef _SYS_SYSPROTO_H_
452 struct writev_args {
453 	int	fd;
454 	struct	iovec *iovp;
455 	u_int	iovcnt;
456 };
457 #endif
458 /*
459  * MPSAFE
460  */
461 int
462 writev(td, uap)
463 	struct thread *td;
464 	register struct writev_args *uap;
465 {
466 	struct file *fp;
467 	struct uio auio;
468 	register struct iovec *iov;
469 	struct iovec *needfree;
470 	struct iovec aiov[UIO_SMALLIOV];
471 	long i, cnt, error = 0;
472 	u_int iovlen;
473 #ifdef KTRACE
474 	struct iovec *ktriov = NULL;
475 	struct uio ktruio;
476 #endif
477 
478 	if ((error = fget_write(td, uap->fd, &fp)) != 0)
479 		return (EBADF);
480 	needfree = NULL;
481 	/* note: can't use iovlen until iovcnt is validated */
482 	iovlen = uap->iovcnt * sizeof (struct iovec);
483 	if (uap->iovcnt > UIO_SMALLIOV) {
484 		if (uap->iovcnt > UIO_MAXIOV) {
485 			error = EINVAL;
486 			goto done;
487 		}
488 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
489 		needfree = iov;
490 	} else
491 		iov = aiov;
492 	auio.uio_iov = iov;
493 	auio.uio_iovcnt = uap->iovcnt;
494 	auio.uio_rw = UIO_WRITE;
495 	auio.uio_segflg = UIO_USERSPACE;
496 	auio.uio_td = td;
497 	auio.uio_offset = -1;
498 	if ((error = copyin(uap->iovp, iov, iovlen)))
499 		goto done;
500 	auio.uio_resid = 0;
501 	for (i = 0; i < uap->iovcnt; i++) {
502 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
503 			error = EINVAL;
504 			goto done;
505 		}
506 		auio.uio_resid += iov->iov_len;
507 		iov++;
508 	}
509 #ifdef KTRACE
510 	/*
511 	 * if tracing, save a copy of iovec and uio
512 	 */
513 	if (KTRPOINT(td, KTR_GENIO))  {
514 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
515 		bcopy(auio.uio_iov, ktriov, iovlen);
516 		ktruio = auio;
517 	}
518 #endif
519 	cnt = auio.uio_resid;
520 	if (fp->f_type == DTYPE_VNODE)
521 		bwillwrite();
522 	if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) {
523 		if (auio.uio_resid != cnt && (error == ERESTART ||
524 		    error == EINTR || error == EWOULDBLOCK))
525 			error = 0;
526 		if (error == EPIPE) {
527 			PROC_LOCK(td->td_proc);
528 			psignal(td->td_proc, SIGPIPE);
529 			PROC_UNLOCK(td->td_proc);
530 		}
531 	}
532 	cnt -= auio.uio_resid;
533 #ifdef KTRACE
534 	if (ktriov != NULL) {
535 		if (error == 0) {
536 			ktruio.uio_iov = ktriov;
537 			ktruio.uio_resid = cnt;
538 			ktrgenio(uap->fd, UIO_WRITE, &ktruio, error);
539 		}
540 		FREE(ktriov, M_TEMP);
541 	}
542 #endif
543 	td->td_retval[0] = cnt;
544 done:
545 	fdrop(fp, td);
546 	if (needfree)
547 		FREE(needfree, M_IOV);
548 	return (error);
549 }
550 
551 /*
552  * Ioctl system call
553  */
554 #ifndef _SYS_SYSPROTO_H_
555 struct ioctl_args {
556 	int	fd;
557 	u_long	com;
558 	caddr_t	data;
559 };
560 #endif
561 /*
562  * MPSAFE
563  */
564 /* ARGSUSED */
/*
 * Ioctl system call: look up the descriptor, marshal the ioctl
 * argument between user and kernel space as directed by the IOC_IN /
 * IOC_OUT / IOC_VOID encoding in the command word, and dispatch to
 * the file's fo_ioctl method.  FIONCLEX/FIOCLEX and the flag-setting
 * commands FIONBIO/FIOASYNC are handled specially here.
 *
 * Giant is held across the operation; every return path must drop
 * both the file reference and Giant.
 */
int
ioctl(td, uap)
	struct thread *td;
	register struct ioctl_args *uap;
{
	struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	int error = 0;
	register u_int size;
	caddr_t data, memp;		/* memp non-NULL iff data was malloc'd */
	int tmp;
#define STK_PARAMS	128
	/* Small argument buffer on the stack, aligned via the union. */
	union {
	    char stkbuf[STK_PARAMS];
	    long align;
	} ubuf;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	mtx_lock(&Giant);
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	/* Close-on-exec manipulation touches only the descriptor table. */
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (ENOTTY);
	}

	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com&IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				goto done;
			}
		} else {
			/* Zero-size IOC_IN: the argument is the pointer itself. */
			*(caddr_t *)data = uap->data;
		}
	} else if ((com&IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com&IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		/* Keep f_flag in sync, then inform the file's backend. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		break;

	case FIOASYNC:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td->td_ucred, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
done:
	mtx_unlock(&Giant);
	return (error);
}
688 
689 /*
690  * sellock and selwait are initialized in selectinit() via SYSINIT.
691  */
struct mtx	sellock;	/* Protects td_selq lists and selinfo ownership. */
struct cv	selwait;	/* select()/poll() sleepers wait on this CV. */
u_int		nselcoll;	/* Select collisions since boot */
SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
696 
697 /*
698  * Select system call.
699  */
700 #ifndef _SYS_SYSPROTO_H_
701 struct select_args {
702 	int	nd;
703 	fd_set	*in, *ou, *ex;
704 	struct	timeval *tv;
705 };
706 #endif
707 /*
708  * MPSAFE
709  */
710 int
711 select(td, uap)
712 	register struct thread *td;
713 	register struct select_args *uap;
714 {
715 	struct timeval tv, *tvp;
716 	int error;
717 
718 	if (uap->tv != NULL) {
719 		error = copyin(uap->tv, &tv, sizeof(tv));
720 		if (error)
721 			return (error);
722 		tvp = &tv;
723 	} else
724 		tvp = NULL;
725 
726 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
727 }
728 
/*
 * Guts of select(2).  Copies in up to three fd_sets, repeatedly scans
 * the descriptors (selscan) and sleeps on selwait until an event,
 * timeout, or signal, then copies the result bits back out.
 *
 * tvp is NULL for "block forever"; otherwise it was validated by the
 * caller-facing wrapper and is converted to an absolute uptime below.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	/*
	 * XXX: kern_select() currently requires that we acquire Giant
	 * even if none of the file descriptors we poll requires Giant.
	 */
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	/* Output bits all live in the first half; clear them in one go. */
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/* Convert the relative timeout to an absolute uptime deadline. */
	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Record the collision count so we can detect missed wakeups. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to one day to keep tvtohz in range. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}
891 
/*
 * Scan the three input bit vectors, polling each set descriptor with
 * the corresponding event mask and setting the matching bit in the
 * output vectors on readiness.  The ready count is returned via
 * td->td_retval[0]; the function's return value is an errno.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
933 
934 /*
935  * Poll system call.
936  */
937 #ifndef _SYS_SYSPROTO_H_
938 struct poll_args {
939 	struct pollfd *fds;
940 	u_int	nfds;
941 	int	timeout;
942 };
943 #endif
944 /*
945  * MPSAFE
946  */
/*
 * Poll system call: copy in the pollfd array, repeatedly scan it
 * (pollscan) and sleep on selwait until an event, timeout, or signal,
 * then copy the revents back out.  Structure parallels kern_select().
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];	/* avoids malloc for small nfds */
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * XXX: poll() currently requires that we acquire Giant even if
	 * none of the file descriptors we poll requires Giant.
	 */
	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	/* Convert the millisecond timeout to an absolute uptime deadline. */
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Record the collision count so we can detect missed wakeups. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, (struct pollfd *)bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to one day to keep tvtohz in range. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}
1070 
/*
 * Poll each entry of the pollfd array once, filling in revents.
 * Invalid descriptors yield POLLNVAL (and count as "ready" per
 * poll(2)); negative fds are skipped with revents = 0.  The ready
 * count is returned via td->td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
1110 
1111 /*
1112  * OpenBSD poll system call.
1113  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1114  */
1115 #ifndef _SYS_SYSPROTO_H_
1116 struct openbsd_poll_args {
1117 	struct pollfd *fds;
1118 	u_int	nfds;
1119 	int	timeout;
1120 };
1121 #endif
1122 /*
1123  * MPSAFE
1124  */
int
openbsd_poll(struct thread *td, struct openbsd_poll_args *uap)
{

	/* Argument layouts are identical; hand off to native poll(). */
	return (poll(td, (struct poll_args *)uap));
}
1132 
1133 /*
1134  * Remove the references to the thread from all of the objects
1135  * we were polling.
1136  *
1137  * This code assumes that the underlying owner of the selinfo
1138  * structure will hold sellock before it changes it, and that
1139  * it will unlink itself from our list if it goes away.
1140  */
1141 void
1142 clear_selinfo_list(td)
1143 	struct thread *td;
1144 {
1145 	struct selinfo *si;
1146 
1147 	mtx_assert(&sellock, MA_OWNED);
1148 	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1149 		si->si_thread = NULL;
1150 	TAILQ_INIT(&td->td_selq);
1151 }
1152 
1153 /*
1154  * Record a select request.
1155  */
/*
 * Record that `selector` is waiting for an event on the object that
 * owns `sip`.  Called from fo_poll backends.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{

	mtx_lock(&sellock);
	/*
	 * If the selinfo's thread pointer is NULL then take ownership of it.
	 *
	 * If the thread pointer is not NULL and it points to another
	 * thread, then we have a collision.
	 *
	 * If the thread pointer is not NULL and points back to us then leave
	 * it alone as we've already added pointed it at us and added it to
	 * our list.
	 */
	if (sip->si_thread == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (sip->si_thread != selector) {
		/* SI_COLL makes doselwakeup() broadcast instead of
		   waking just the owner. */
		sip->si_flags |= SI_COLL;
	}

	mtx_unlock(&sellock);
}
1182 
1183 /* Wake up a selecting thread. */
1184 void
1185 selwakeup(sip)
1186 	struct selinfo *sip;
1187 {
1188 	doselwakeup(sip, -1);
1189 }
1190 
1191 /* Wake up a selecting thread, and set its priority. */
1192 void
1193 selwakeuppri(sip, pri)
1194 	struct selinfo *sip;
1195 	int pri;
1196 {
1197 	doselwakeup(sip, pri);
1198 }
1199 
1200 /*
1201  * Do a wakeup when a selectable event occurs.
1202  */
/*
 * Do a wakeup when a selectable event occurs.  Under sellock: if a
 * collision was recorded, broadcast to every sleeper on selwait; then
 * detach the owning thread (if any) from this selinfo, clear its
 * TDF_SELECT flag so it will rescan, and pull it off the sleep queue.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		/* Multiple threads selected on this object: wake them all. */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	sleepq_remove(td, &selwait);
	mtx_unlock(&sellock);
}
1229 
static void selectinit(void *);
/* Initialize select machinery early, at lock-setup time. */
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	/* Set up the global condition variable and mutex used by
	   select(), poll(), selrecord() and doselwakeup(). */
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}
1241