xref: /freebsd/sys/kern/sys_generic.c (revision 7660b554bc59a07be0431c17e0e33815818baa69)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 #include "opt_ktrace.h"
45 
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/sysproto.h>
49 #include <sys/filedesc.h>
50 #include <sys/filio.h>
51 #include <sys/fcntl.h>
52 #include <sys/file.h>
53 #include <sys/proc.h>
54 #include <sys/signalvar.h>
55 #include <sys/socketvar.h>
56 #include <sys/uio.h>
57 #include <sys/kernel.h>
58 #include <sys/limits.h>
59 #include <sys/malloc.h>
60 #include <sys/poll.h>
61 #include <sys/resourcevar.h>
62 #include <sys/selinfo.h>
63 #include <sys/syscallsubr.h>
64 #include <sys/sysctl.h>
65 #include <sys/sysent.h>
66 #include <sys/bio.h>
67 #include <sys/buf.h>
68 #include <sys/condvar.h>
69 #ifdef KTRACE
70 #include <sys/ktrace.h>
71 #endif
72 #include <vm/vm.h>
73 #include <vm/vm_page.h>
74 
75 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
76 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
77 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
78 
79 static int	pollscan(struct thread *, struct pollfd *, u_int);
80 static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
81 static int	dofileread(struct thread *, struct file *, int, void *,
82 		    size_t, off_t, int);
83 static int	dofilewrite(struct thread *, struct file *, int,
84 		    const void *, size_t, off_t, int);
85 
86 /*
87  * Read system call.
88  */
89 #ifndef _SYS_SYSPROTO_H_
90 struct read_args {
91 	int	fd;
92 	void	*buf;
93 	size_t	nbyte;
94 };
95 #endif
96 /*
97  * MPSAFE
98  */
99 int
100 read(td, uap)
101 	struct thread *td;
102 	struct read_args *uap;
103 {
104 	struct file *fp;
105 	int error;
106 
107 	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
108 		error = dofileread(td, fp, uap->fd, uap->buf,
109 			    uap->nbyte, (off_t)-1, 0);
110 		fdrop(fp, td);
111 	}
112 	return(error);
113 }
114 
115 /*
116  * Pread system call
117  */
118 #ifndef _SYS_SYSPROTO_H_
119 struct pread_args {
120 	int	fd;
121 	void	*buf;
122 	size_t	nbyte;
123 	int	pad;
124 	off_t	offset;
125 };
126 #endif
127 /*
128  * MPSAFE
129  */
130 int
131 pread(td, uap)
132 	struct thread *td;
133 	struct pread_args *uap;
134 {
135 	struct file *fp;
136 	int error;
137 
138 	if ((error = fget_read(td, uap->fd, &fp)) != 0)
139 		return (error);
140 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) {
141 		error = ESPIPE;
142 	} else {
143 		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
144 			    uap->offset, FOF_OFFSET);
145 	}
146 	fdrop(fp, td);
147 	return(error);
148 }
149 
150 /*
151  * Code common for read and pread
152  */
153 static int
154 dofileread(td, fp, fd, buf, nbyte, offset, flags)
155 	struct thread *td;
156 	struct file *fp;
157 	int fd, flags;
158 	void *buf;
159 	size_t nbyte;
160 	off_t offset;
161 {
162 	struct uio auio;
163 	struct iovec aiov;
164 	long cnt, error = 0;
165 #ifdef KTRACE
166 	struct iovec ktriov;
167 	struct uio ktruio;
168 	int didktr = 0;
169 #endif
170 
171 	aiov.iov_base = buf;
172 	aiov.iov_len = nbyte;
173 	auio.uio_iov = &aiov;
174 	auio.uio_iovcnt = 1;
175 	auio.uio_offset = offset;
176 	if (nbyte > INT_MAX)
177 		return (EINVAL);
178 	auio.uio_resid = nbyte;
179 	auio.uio_rw = UIO_READ;
180 	auio.uio_segflg = UIO_USERSPACE;
181 	auio.uio_td = td;
182 #ifdef KTRACE
183 	/*
184 	 * if tracing, save a copy of iovec
185 	 */
186 	if (KTRPOINT(td, KTR_GENIO)) {
187 		ktriov = aiov;
188 		ktruio = auio;
189 		didktr = 1;
190 	}
191 #endif
192 	cnt = nbyte;
193 
194 	if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
195 		if (auio.uio_resid != cnt && (error == ERESTART ||
196 		    error == EINTR || error == EWOULDBLOCK))
197 			error = 0;
198 	}
199 	cnt -= auio.uio_resid;
200 #ifdef KTRACE
201 	if (didktr && error == 0) {
202 		ktruio.uio_iov = &ktriov;
203 		ktruio.uio_resid = cnt;
204 		ktrgenio(fd, UIO_READ, &ktruio, error);
205 	}
206 #endif
207 	td->td_retval[0] = cnt;
208 	return (error);
209 }
210 
211 /*
212  * Scatter read system call.
213  */
214 #ifndef _SYS_SYSPROTO_H_
215 struct readv_args {
216 	int	fd;
217 	struct	iovec *iovp;
218 	u_int	iovcnt;
219 };
220 #endif
221 /*
222  * MPSAFE
223  */
/*
 * Scatter read: copy in the user's iovec array (heap-allocating it when
 * it exceeds UIO_SMALLIOV entries), validate the total length, and issue
 * a single fo_read() covering all segments.
 */
int
readv(td, uap)
	struct thread *td;
	struct readv_args *uap;
{
	struct file *fp;
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt;
	int error;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	if ((error = fget_read(td, uap->fd, &fp)) != 0)
		return (error);
	needfree = NULL;
	/* note: can't use iovlen until iovcnt is validated */
	iovlen = uap->iovcnt * sizeof (struct iovec);
	if (uap->iovcnt > UIO_SMALLIOV) {
		if (uap->iovcnt > UIO_MAXIOV) {
			error = EINVAL;
			goto done;
		}
		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
		needfree = iov;
	} else
		iov = aiov;
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
	auio.uio_offset = -1;
	if ((error = copyin(uap->iovp, iov, iovlen)))
		goto done;
	auio.uio_resid = 0;
	/* Sum segment lengths; reject a total exceeding INT_MAX. */
	for (i = 0; i < uap->iovcnt; i++) {
		if (iov->iov_len > INT_MAX - auio.uio_resid) {
			error = EINVAL;
			goto done;
		}
		auio.uio_resid += iov->iov_len;
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec (the transfer consumes the
	 * live copy)
	 */
	if (KTRPOINT(td, KTR_GENIO))  {
		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy(auio.uio_iov, ktriov, iovlen);
		ktruio = auio;
	}
#endif
	cnt = auio.uio_resid;
	if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) {
		/* Partial transfers interrupted by signals still succeed. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	/* Bytes actually transferred. */
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = cnt;
			ktrgenio(uap->fd, UIO_READ, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	td->td_retval[0] = cnt;
done:
	fdrop(fp, td);
	if (needfree)
		FREE(needfree, M_IOV);
	return (error);
}
307 
308 /*
309  * Write system call
310  */
311 #ifndef _SYS_SYSPROTO_H_
312 struct write_args {
313 	int	fd;
314 	const void *buf;
315 	size_t	nbyte;
316 };
317 #endif
318 /*
319  * MPSAFE
320  */
321 int
322 write(td, uap)
323 	struct thread *td;
324 	struct write_args *uap;
325 {
326 	struct file *fp;
327 	int error;
328 
329 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
330 		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
331 			    (off_t)-1, 0);
332 		fdrop(fp, td);
333 	} else {
334 		error = EBADF;	/* XXX this can't be right */
335 	}
336 	return(error);
337 }
338 
339 /*
340  * Pwrite system call
341  */
342 #ifndef _SYS_SYSPROTO_H_
343 struct pwrite_args {
344 	int	fd;
345 	const void *buf;
346 	size_t	nbyte;
347 	int	pad;
348 	off_t	offset;
349 };
350 #endif
351 /*
352  * MPSAFE
353  */
354 int
355 pwrite(td, uap)
356 	struct thread *td;
357 	struct pwrite_args *uap;
358 {
359 	struct file *fp;
360 	int error;
361 
362 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
363 		if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) {
364 			error = ESPIPE;
365 		} else {
366 			error = dofilewrite(td, fp, uap->fd, uap->buf,
367 				    uap->nbyte, uap->offset, FOF_OFFSET);
368 		}
369 		fdrop(fp, td);
370 	} else {
371 		error = EBADF;	/* this can't be right */
372 	}
373 	return(error);
374 }
375 
/*
 * Code common for write and pwrite: build a single-segment uio over the
 * user buffer, issue one fo_write() and report the byte count in
 * td->td_retval[0].  Sends SIGPIPE on EPIPE for non-socket descriptors
 * (the socket layer raises its own).
 *
 * Parameters match dofileread(); flags is FOF_OFFSET for pwrite, 0 for
 * write.
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Cast away const; iov_base is non-const but UIO_WRITE only reads. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	/* Give the buffer cache a chance to flush before dirtying more. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
		/* A partial transfer interrupted by a signal still succeeds. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	/* Bytes actually written. */
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
440 
441 /*
442  * Gather write system call
443  */
444 #ifndef _SYS_SYSPROTO_H_
445 struct writev_args {
446 	int	fd;
447 	struct	iovec *iovp;
448 	u_int	iovcnt;
449 };
450 #endif
451 /*
452  * MPSAFE
453  */
454 int
455 writev(td, uap)
456 	struct thread *td;
457 	register struct writev_args *uap;
458 {
459 	struct file *fp;
460 	struct uio auio;
461 	register struct iovec *iov;
462 	struct iovec *needfree;
463 	struct iovec aiov[UIO_SMALLIOV];
464 	long i, cnt, error = 0;
465 	u_int iovlen;
466 #ifdef KTRACE
467 	struct iovec *ktriov = NULL;
468 	struct uio ktruio;
469 #endif
470 
471 	if ((error = fget_write(td, uap->fd, &fp)) != 0)
472 		return (EBADF);
473 	needfree = NULL;
474 	/* note: can't use iovlen until iovcnt is validated */
475 	iovlen = uap->iovcnt * sizeof (struct iovec);
476 	if (uap->iovcnt > UIO_SMALLIOV) {
477 		if (uap->iovcnt > UIO_MAXIOV) {
478 			error = EINVAL;
479 			goto done;
480 		}
481 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
482 		needfree = iov;
483 	} else
484 		iov = aiov;
485 	auio.uio_iov = iov;
486 	auio.uio_iovcnt = uap->iovcnt;
487 	auio.uio_rw = UIO_WRITE;
488 	auio.uio_segflg = UIO_USERSPACE;
489 	auio.uio_td = td;
490 	auio.uio_offset = -1;
491 	if ((error = copyin(uap->iovp, iov, iovlen)))
492 		goto done;
493 	auio.uio_resid = 0;
494 	for (i = 0; i < uap->iovcnt; i++) {
495 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
496 			error = EINVAL;
497 			goto done;
498 		}
499 		auio.uio_resid += iov->iov_len;
500 		iov++;
501 	}
502 #ifdef KTRACE
503 	/*
504 	 * if tracing, save a copy of iovec and uio
505 	 */
506 	if (KTRPOINT(td, KTR_GENIO))  {
507 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
508 		bcopy(auio.uio_iov, ktriov, iovlen);
509 		ktruio = auio;
510 	}
511 #endif
512 	cnt = auio.uio_resid;
513 	if (fp->f_type == DTYPE_VNODE)
514 		bwillwrite();
515 	if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) {
516 		if (auio.uio_resid != cnt && (error == ERESTART ||
517 		    error == EINTR || error == EWOULDBLOCK))
518 			error = 0;
519 		if (error == EPIPE) {
520 			PROC_LOCK(td->td_proc);
521 			psignal(td->td_proc, SIGPIPE);
522 			PROC_UNLOCK(td->td_proc);
523 		}
524 	}
525 	cnt -= auio.uio_resid;
526 #ifdef KTRACE
527 	if (ktriov != NULL) {
528 		if (error == 0) {
529 			ktruio.uio_iov = ktriov;
530 			ktruio.uio_resid = cnt;
531 			ktrgenio(uap->fd, UIO_WRITE, &ktruio, error);
532 		}
533 		FREE(ktriov, M_TEMP);
534 	}
535 #endif
536 	td->td_retval[0] = cnt;
537 done:
538 	fdrop(fp, td);
539 	if (needfree)
540 		FREE(needfree, M_IOV);
541 	return (error);
542 }
543 
544 /*
545  * Ioctl system call
546  */
547 #ifndef _SYS_SYSPROTO_H_
548 struct ioctl_args {
549 	int	fd;
550 	u_long	com;
551 	caddr_t	data;
552 };
553 #endif
554 /*
555  * MPSAFE
556  */
557 /* ARGSUSED */
int
ioctl(td, uap)
	struct thread *td;
	register struct ioctl_args *uap;
{
	struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	int error = 0;
	register u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	/* Small argument buffer on the stack; union forces alignment. */
	union {
	    char stkbuf[STK_PARAMS];
	    long align;
	} ubuf;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	mtx_lock(&Giant);
	/* The descriptor must be open for reading or writing. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	/*
	 * FIONCLEX/FIOCLEX manipulate the per-descriptor close-on-exec
	 * flag directly and never reach the file's ioctl method.
	 */
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (ENOTTY);
	}

	/* Use the stack buffer when the argument fits, else heap. */
	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com&IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				goto done;
			}
		} else {
			/* Zero-size IOC_IN: the "data" IS the argument. */
			*(caddr_t *)data = uap->data;
		}
	} else if ((com&IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com&IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		/* Mirror the request into f_flag, then notify the backend. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		break;

	case FIOASYNC:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td->td_ucred, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
done:
	mtx_unlock(&Giant);
	return (error);
}
681 
682 /*
683  * sellock and selwait are initialized in selectinit() via SYSINIT.
684  */
685 struct mtx	sellock;
686 struct cv	selwait;
687 u_int		nselcoll;	/* Select collisions since boot */
688 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
689 
690 /*
691  * Select system call.
692  */
693 #ifndef _SYS_SYSPROTO_H_
694 struct select_args {
695 	int	nd;
696 	fd_set	*in, *ou, *ex;
697 	struct	timeval *tv;
698 };
699 #endif
700 /*
701  * MPSAFE
702  */
703 int
704 select(td, uap)
705 	register struct thread *td;
706 	register struct select_args *uap;
707 {
708 	struct timeval tv, *tvp;
709 	int error;
710 
711 	if (uap->tv != NULL) {
712 		error = copyin(uap->tv, &tv, sizeof(tv));
713 		if (error)
714 			return (error);
715 		tvp = &tv;
716 	} else
717 		tvp = NULL;
718 
719 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
720 }
721 
/*
 * Backend for select(): copy in the requested fd_sets, repeatedly scan
 * the descriptors (sleeping on selwait between scans) until an event,
 * timeout, or signal, then copy the result sets back out.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	/* Clamp nd to the table size rather than failing. */
	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	/* Each present set needs an input copy plus an output copy. */
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	/* First half of the buffer holds the (zeroed) output sets. */
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/* Convert the relative timeout into an absolute uptime deadline. */
	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count so we can detect missed wakeups. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Cap the sleep at 24 hours to keep tvtohz() in range. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}
880 
/*
 * Poll every descriptor whose bit is set in the input fd_sets; set the
 * matching bit in the output sets for each ready descriptor and return
 * the ready count via td->td_retval[0].  Returns EBADF if a set bit
 * names a closed descriptor.
 *
 * ibits/obits - arrays of 3 bitmaps (read, write, except); entries may
 *               be NULL for sets the caller did not supply
 * nfd         - number of descriptors to examine
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
922 
923 /*
924  * Poll system call.
925  */
926 #ifndef _SYS_SYSPROTO_H_
927 struct poll_args {
928 	struct pollfd *fds;
929 	u_int	nfds;
930 	int	timeout;
931 };
932 #endif
933 /*
934  * MPSAFE
935  */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	/* Stack buffer for up to 32 pollfds; larger arrays go to malloc. */
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
	    (nfds > FD_SETSIZE)) {
		error = EINVAL;
		goto done2;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	/* Convert the millisecond timeout into an absolute uptime deadline. */
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count so missed wakeups force a rescan. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, (struct pollfd *)bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Cap the sleep at 24 hours to keep tvtohz() in range. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	/* Copy the revents fields back to the user's array. */
	if (error == 0) {
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}
1052 
/*
 * Poll each pollfd entry once, filling in its revents field; the total
 * number of entries with non-zero revents (including POLLNVAL for bad
 * descriptors) is reported via td->td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			/* Out-of-range descriptor: report, don't fail. */
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fds are ignored per poll() convention. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
1092 
1093 /*
1094  * OpenBSD poll system call.
1095  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1096  */
1097 #ifndef _SYS_SYSPROTO_H_
1098 struct openbsd_poll_args {
1099 	struct pollfd *fds;
1100 	u_int	nfds;
1101 	int	timeout;
1102 };
1103 #endif
1104 /*
1105  * MPSAFE
1106  */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	/* Argument layouts are identical; defer to the native poll(). */
	return (poll(td, (struct poll_args *)uap));
}
1114 
1115 /*
1116  * Remove the references to the thread from all of the objects
1117  * we were polling.
1118  *
1119  * This code assumes that the underlying owner of the selinfo
1120  * structure will hold sellock before it changes it, and that
1121  * it will unlink itself from our list if it goes away.
1122  */
1123 void
1124 clear_selinfo_list(td)
1125 	struct thread *td;
1126 {
1127 	struct selinfo *si;
1128 
1129 	mtx_assert(&sellock, MA_OWNED);
1130 	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1131 		si->si_thread = NULL;
1132 	TAILQ_INIT(&td->td_selq);
1133 }
1134 
1135 /*ARGSUSED*/
1136 int
1137 seltrue(dev, events, td)
1138 	dev_t dev;
1139 	int events;
1140 	struct thread *td;
1141 {
1142 
1143 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1144 }
1145 
1146 /*
1147  * Record a select request.
1148  */
1149 void
1150 selrecord(selector, sip)
1151 	struct thread *selector;
1152 	struct selinfo *sip;
1153 {
1154 
1155 	mtx_lock(&sellock);
1156 	/*
1157 	 * If the selinfo's thread pointer is NULL then take ownership of it.
1158 	 *
1159 	 * If the thread pointer is not NULL and it points to another
1160 	 * thread, then we have a collision.
1161 	 *
1162 	 * If the thread pointer is not NULL and points back to us then leave
1163 	 * it alone as we've already added pointed it at us and added it to
1164 	 * our list.
1165 	 */
1166 	if (sip->si_thread == NULL) {
1167 		sip->si_thread = selector;
1168 		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1169 	} else if (sip->si_thread != selector) {
1170 		sip->si_flags |= SI_COLL;
1171 	}
1172 
1173 	mtx_unlock(&sellock);
1174 }
1175 
1176 /*
1177  * Do a wakeup when a selectable event occurs.
1178  */
void
selwakeup(sip)
	struct selinfo *sip;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	/*
	 * On a collision several threads may be interested in this
	 * selinfo; broadcast so each rescans its descriptors, and bump
	 * the collision counter so in-progress scans notice.
	 */
	if ((sip->si_flags & SI_COLL) != 0) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcast(&selwait);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	/* Detach the selinfo from its owning thread. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	/*
	 * Wake the owner if it is asleep on selwait; otherwise clear
	 * TDF_SELECT so its scan loop retries instead of sleeping.
	 */
	if (td->td_wchan == &selwait) {
		cv_waitq_remove(td);
		TD_CLR_SLEEPING(td);
		setrunnable(td);
	} else
		td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);
}
1208 
1209 static void selectinit(void *);
1210 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1211 
1212 /* ARGSUSED*/
1213 static void
1214 selectinit(dummy)
1215 	void *dummy;
1216 {
1217 	cv_init(&selwait, "select");
1218 	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1219 }
1220