xref: /freebsd/sys/kern/sys_generic.c (revision 52ec752989b2e6d4e9a59a8ff25d8ff596d85e62)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 #include "opt_ktrace.h"
45 
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/sysproto.h>
49 #include <sys/filedesc.h>
50 #include <sys/filio.h>
51 #include <sys/fcntl.h>
52 #include <sys/file.h>
53 #include <sys/proc.h>
54 #include <sys/signalvar.h>
55 #include <sys/socketvar.h>
56 #include <sys/uio.h>
57 #include <sys/kernel.h>
58 #include <sys/limits.h>
59 #include <sys/malloc.h>
60 #include <sys/poll.h>
61 #include <sys/resourcevar.h>
62 #include <sys/selinfo.h>
63 #include <sys/syscallsubr.h>
64 #include <sys/sysctl.h>
65 #include <sys/sysent.h>
66 #include <sys/bio.h>
67 #include <sys/buf.h>
68 #include <sys/condvar.h>
69 #ifdef KTRACE
70 #include <sys/ktrace.h>
71 #endif
72 #include <vm/vm.h>
73 #include <vm/vm_page.h>
74 
75 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
76 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
77 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
78 
79 static int	pollscan(struct thread *, struct pollfd *, u_int);
80 static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
81 static int	dofileread(struct thread *, struct file *, int, void *,
82 		    size_t, off_t, int);
83 static int	dofilewrite(struct thread *, struct file *, int,
84 		    const void *, size_t, off_t, int);
85 static void	doselwakeup(struct selinfo *, int);
86 
87 /*
88  * Read system call.
89  */
90 #ifndef _SYS_SYSPROTO_H_
91 struct read_args {
92 	int	fd;
93 	void	*buf;
94 	size_t	nbyte;
95 };
96 #endif
97 /*
98  * MPSAFE
99  */
100 int
101 read(td, uap)
102 	struct thread *td;
103 	struct read_args *uap;
104 {
105 	struct file *fp;
106 	int error;
107 
108 	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
109 		error = dofileread(td, fp, uap->fd, uap->buf,
110 			    uap->nbyte, (off_t)-1, 0);
111 		fdrop(fp, td);
112 	}
113 	return(error);
114 }
115 
116 /*
117  * Pread system call
118  */
119 #ifndef _SYS_SYSPROTO_H_
120 struct pread_args {
121 	int	fd;
122 	void	*buf;
123 	size_t	nbyte;
124 	int	pad;
125 	off_t	offset;
126 };
127 #endif
128 /*
129  * MPSAFE
130  */
131 int
132 pread(td, uap)
133 	struct thread *td;
134 	struct pread_args *uap;
135 {
136 	struct file *fp;
137 	int error;
138 
139 	if ((error = fget_read(td, uap->fd, &fp)) != 0)
140 		return (error);
141 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) {
142 		error = ESPIPE;
143 	} else {
144 		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
145 			    uap->offset, FOF_OFFSET);
146 	}
147 	fdrop(fp, td);
148 	return(error);
149 }
150 
151 /*
152  * Code common for read and pread
153  */
/*
 * Common backend for read() and pread(): build a single-segment uio
 * describing the user buffer and hand it to the file's fo_read method.
 *
 * fd is used only for ktrace records.  flags is 0 or FOF_OFFSET (use
 * the supplied offset rather than the file's current one).  The number
 * of bytes actually read is returned via td->td_retval[0].
 */
static int
dofileread(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	aiov.iov_base = buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* Reject transfers whose residual count would not fit in an int. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;

	if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
		/*
		 * An interrupted partial read still succeeds; the short
		 * byte count is reported to the caller.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	/* uio_resid holds what was NOT transferred. */
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_READ, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
211 
212 /*
213  * Scatter read system call.
214  */
215 #ifndef _SYS_SYSPROTO_H_
216 struct readv_args {
217 	int	fd;
218 	struct	iovec *iovp;
219 	u_int	iovcnt;
220 };
221 #endif
222 /*
223  * MPSAFE
224  */
/*
 * Scatter read: like read(), but the destination is described by an
 * array of iovecs copied in from userland.  Small iovec arrays are
 * staged on the stack; larger ones (up to UIO_MAXIOV) are temporarily
 * malloc'd from M_IOV.
 */
int
readv(td, uap)
	struct thread *td;
	struct readv_args *uap;
{
	struct file *fp;
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt;
	int error;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	if ((error = fget_read(td, uap->fd, &fp)) != 0)
		return (error);
	needfree = NULL;
	/* note: can't use iovlen until iovcnt is validated */
	iovlen = uap->iovcnt * sizeof (struct iovec);
	if (uap->iovcnt > UIO_SMALLIOV) {
		if (uap->iovcnt > UIO_MAXIOV) {
			error = EINVAL;
			goto done;
		}
		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
		needfree = iov;
	} else
		iov = aiov;
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
	auio.uio_offset = -1;
	if ((error = copyin(uap->iovp, iov, iovlen)))
		goto done;
	auio.uio_resid = 0;
	/* Total the transfer size, rejecting sums that overflow an int. */
	for (i = 0; i < uap->iovcnt; i++) {
		if (iov->iov_len > INT_MAX - auio.uio_resid) {
			error = EINVAL;
			goto done;
		}
		auio.uio_resid += iov->iov_len;
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(td, KTR_GENIO))  {
		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy(auio.uio_iov, ktriov, iovlen);
		ktruio = auio;
	}
#endif
	cnt = auio.uio_resid;
	if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) {
		/* An interrupted partial read still reports the short count. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = cnt;
			ktrgenio(uap->fd, UIO_READ, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	td->td_retval[0] = cnt;
done:
	fdrop(fp, td);
	if (needfree)
		FREE(needfree, M_IOV);
	return (error);
}
308 
309 /*
310  * Write system call
311  */
312 #ifndef _SYS_SYSPROTO_H_
313 struct write_args {
314 	int	fd;
315 	const void *buf;
316 	size_t	nbyte;
317 };
318 #endif
319 /*
320  * MPSAFE
321  */
322 int
323 write(td, uap)
324 	struct thread *td;
325 	struct write_args *uap;
326 {
327 	struct file *fp;
328 	int error;
329 
330 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
331 		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
332 			    (off_t)-1, 0);
333 		fdrop(fp, td);
334 	} else {
335 		error = EBADF;	/* XXX this can't be right */
336 	}
337 	return(error);
338 }
339 
340 /*
341  * Pwrite system call
342  */
343 #ifndef _SYS_SYSPROTO_H_
344 struct pwrite_args {
345 	int	fd;
346 	const void *buf;
347 	size_t	nbyte;
348 	int	pad;
349 	off_t	offset;
350 };
351 #endif
352 /*
353  * MPSAFE
354  */
355 int
356 pwrite(td, uap)
357 	struct thread *td;
358 	struct pwrite_args *uap;
359 {
360 	struct file *fp;
361 	int error;
362 
363 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
364 		if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) {
365 			error = ESPIPE;
366 		} else {
367 			error = dofilewrite(td, fp, uap->fd, uap->buf,
368 				    uap->nbyte, uap->offset, FOF_OFFSET);
369 		}
370 		fdrop(fp, td);
371 	} else {
372 		error = EBADF;	/* this can't be right */
373 	}
374 	return(error);
375 }
376 
/*
 * Common backend for write() and pwrite(): build a single-segment uio
 * over the user buffer and pass it to the file's fo_write method.
 *
 * fd is used only for ktrace records.  flags is 0 or FOF_OFFSET.  The
 * number of bytes actually written is returned via td->td_retval[0].
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Cast away const; the uio is UIO_WRITE so the buffer is only read. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* Reject transfers whose residual count would not fit in an int. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	/* Give the buffer cache a chance to flush before dirtying more. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
		/* An interrupted partial write still reports the short count. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
441 
442 /*
443  * Gather write system call
444  */
445 #ifndef _SYS_SYSPROTO_H_
446 struct writev_args {
447 	int	fd;
448 	struct	iovec *iovp;
449 	u_int	iovcnt;
450 };
451 #endif
452 /*
453  * MPSAFE
454  */
455 int
456 writev(td, uap)
457 	struct thread *td;
458 	register struct writev_args *uap;
459 {
460 	struct file *fp;
461 	struct uio auio;
462 	register struct iovec *iov;
463 	struct iovec *needfree;
464 	struct iovec aiov[UIO_SMALLIOV];
465 	long i, cnt, error = 0;
466 	u_int iovlen;
467 #ifdef KTRACE
468 	struct iovec *ktriov = NULL;
469 	struct uio ktruio;
470 #endif
471 
472 	if ((error = fget_write(td, uap->fd, &fp)) != 0)
473 		return (EBADF);
474 	needfree = NULL;
475 	/* note: can't use iovlen until iovcnt is validated */
476 	iovlen = uap->iovcnt * sizeof (struct iovec);
477 	if (uap->iovcnt > UIO_SMALLIOV) {
478 		if (uap->iovcnt > UIO_MAXIOV) {
479 			error = EINVAL;
480 			goto done;
481 		}
482 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
483 		needfree = iov;
484 	} else
485 		iov = aiov;
486 	auio.uio_iov = iov;
487 	auio.uio_iovcnt = uap->iovcnt;
488 	auio.uio_rw = UIO_WRITE;
489 	auio.uio_segflg = UIO_USERSPACE;
490 	auio.uio_td = td;
491 	auio.uio_offset = -1;
492 	if ((error = copyin(uap->iovp, iov, iovlen)))
493 		goto done;
494 	auio.uio_resid = 0;
495 	for (i = 0; i < uap->iovcnt; i++) {
496 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
497 			error = EINVAL;
498 			goto done;
499 		}
500 		auio.uio_resid += iov->iov_len;
501 		iov++;
502 	}
503 #ifdef KTRACE
504 	/*
505 	 * if tracing, save a copy of iovec and uio
506 	 */
507 	if (KTRPOINT(td, KTR_GENIO))  {
508 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
509 		bcopy(auio.uio_iov, ktriov, iovlen);
510 		ktruio = auio;
511 	}
512 #endif
513 	cnt = auio.uio_resid;
514 	if (fp->f_type == DTYPE_VNODE)
515 		bwillwrite();
516 	if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) {
517 		if (auio.uio_resid != cnt && (error == ERESTART ||
518 		    error == EINTR || error == EWOULDBLOCK))
519 			error = 0;
520 		if (error == EPIPE) {
521 			PROC_LOCK(td->td_proc);
522 			psignal(td->td_proc, SIGPIPE);
523 			PROC_UNLOCK(td->td_proc);
524 		}
525 	}
526 	cnt -= auio.uio_resid;
527 #ifdef KTRACE
528 	if (ktriov != NULL) {
529 		if (error == 0) {
530 			ktruio.uio_iov = ktriov;
531 			ktruio.uio_resid = cnt;
532 			ktrgenio(uap->fd, UIO_WRITE, &ktruio, error);
533 		}
534 		FREE(ktriov, M_TEMP);
535 	}
536 #endif
537 	td->td_retval[0] = cnt;
538 done:
539 	fdrop(fp, td);
540 	if (needfree)
541 		FREE(needfree, M_IOV);
542 	return (error);
543 }
544 
545 /*
546  * Ioctl system call
547  */
548 #ifndef _SYS_SYSPROTO_H_
549 struct ioctl_args {
550 	int	fd;
551 	u_long	com;
552 	caddr_t	data;
553 };
554 #endif
555 /*
556  * MPSAFE
557  */
558 /* ARGSUSED */
/*
 * Ioctl: dispatch a device control request on a descriptor.
 *
 * The high bits of the command word encode the direction (IOC_IN /
 * IOC_OUT / IOC_VOID) and the size of the argument, which is staged in
 * a stack buffer (or a malloc'd one when larger than STK_PARAMS bytes)
 * around the fo_ioctl call.
 */
int
ioctl(td, uap)
	struct thread *td;
	register struct ioctl_args *uap;
{
	struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	int error = 0;
	register u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	union {
	    char stkbuf[STK_PARAMS];
	    long align;
	} ubuf;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	mtx_lock(&Giant);
	/* The descriptor must be open for at least one of read/write. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	/*
	 * FIONCLEX/FIOCLEX manipulate the per-descriptor close-on-exec
	 * flag and never reach the file's ioctl method.
	 */
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (ENOTTY);
	}

	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com&IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				goto done;
			}
		} else {
			/* Zero-size IOC_IN: the argument is the pointer itself. */
			*(caddr_t *)data = uap->data;
		}
	} else if ((com&IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com&IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		/* Keep f_flag's non-blocking bit in sync, then tell the file. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		break;

	case FIOASYNC:
		/* Same pattern for the async-I/O flag. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td->td_ucred, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
done:
	mtx_unlock(&Giant);
	return (error);
}
682 
683 /*
684  * sellock and selwait are initialized in selectinit() via SYSINIT.
685  */
686 struct mtx	sellock;
687 struct cv	selwait;
688 u_int		nselcoll;	/* Select collisions since boot */
689 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
690 
691 /*
692  * Select system call.
693  */
694 #ifndef _SYS_SYSPROTO_H_
695 struct select_args {
696 	int	nd;
697 	fd_set	*in, *ou, *ex;
698 	struct	timeval *tv;
699 };
700 #endif
701 /*
702  * MPSAFE
703  */
704 int
705 select(td, uap)
706 	register struct thread *td;
707 	register struct select_args *uap;
708 {
709 	struct timeval tv, *tvp;
710 	int error;
711 
712 	if (uap->tv != NULL) {
713 		error = copyin(uap->tv, &tv, sizeof(tv));
714 		if (error)
715 			return (error);
716 		tvp = &tv;
717 	} else
718 		tvp = NULL;
719 
720 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
721 }
722 
/*
 * Backend for select(): scan the given descriptor sets, sleeping on the
 * global selwait condition variable until a descriptor becomes ready,
 * the timeout expires or a signal arrives.  tvp points at a timeout
 * already copied into kernel space, or is NULL for an indefinite wait.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/* Convert the timeout, if any, to an absolute uptime deadline. */
	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/*
	 * Snapshot the collision count and set TDF_SELECT before
	 * dropping sellock to scan; doselwakeup() manipulates both so
	 * we can detect events that fire while we are not looking.
	 */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp very long sleeps to one day's worth of ticks. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}
881 
/*
 * Scan the read/write/except input bit vectors, calling fo_poll() on
 * each selected descriptor and setting the matching output bit when the
 * descriptor is ready.  The ready count is returned via
 * td->td_retval[0]; EBADF is returned if any selected fd is not open.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
923 
924 /*
925  * Poll system call.
926  */
927 #ifndef _SYS_SYSPROTO_H_
928 struct poll_args {
929 	struct pollfd *fds;
930 	u_int	nfds;
931 	int	timeout;
932 };
933 #endif
934 /*
935  * MPSAFE
936  */
/*
 * Poll system call: wait for events on an array of pollfd structures
 * copied in from userland.  Uses the same scan/sleep/rescan protocol as
 * kern_select(), sharing sellock, selwait and the collision counter.
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
	    (nfds > FD_SETSIZE)) {
		error = EINVAL;
		goto done2;
	}
	/* Stage the array on the stack if it fits, else in a temp buffer. */
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	/* Convert the millisecond timeout to an absolute uptime deadline. */
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/*
	 * Snapshot the collision count and set TDF_SELECT before
	 * dropping sellock to scan; doselwakeup() manipulates both so
	 * we can detect events that fire while we are not looking.
	 */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, (struct pollfd *)bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp very long sleeps to one day's worth of ticks. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* Copy the whole array back out so revents reach the caller. */
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}
1053 
/*
 * Poll each pollfd entry once, recording the returned event mask in
 * revents.  Out-of-range or closed descriptors get POLLNVAL; negative
 * fds are skipped with revents cleared.  The count of entries with
 * non-zero revents is returned via td->td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
1093 
1094 /*
1095  * OpenBSD poll system call.
1096  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1097  */
1098 #ifndef _SYS_SYSPROTO_H_
1099 struct openbsd_poll_args {
1100 	struct pollfd *fds;
1101 	u_int	nfds;
1102 	int	timeout;
1103 };
1104 #endif
1105 /*
1106  * MPSAFE
1107  */
/*
 * OpenBSD-compatible poll entry point.  The argument structures have
 * identical layout, so simply forward to the native poll().
 */
int
openbsd_poll(struct thread *td, struct openbsd_poll_args *uap)
{

	return (poll(td, (struct poll_args *)uap));
}
1115 
1116 /*
1117  * Remove the references to the thread from all of the objects
1118  * we were polling.
1119  *
1120  * This code assumes that the underlying owner of the selinfo
1121  * structure will hold sellock before it changes it, and that
1122  * it will unlink itself from our list if it goes away.
1123  */
1124 void
1125 clear_selinfo_list(td)
1126 	struct thread *td;
1127 {
1128 	struct selinfo *si;
1129 
1130 	mtx_assert(&sellock, MA_OWNED);
1131 	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1132 		si->si_thread = NULL;
1133 	TAILQ_INIT(&td->td_selq);
1134 }
1135 
1136 /*
1137  * Record a select request.
1138  */
1139 void
1140 selrecord(selector, sip)
1141 	struct thread *selector;
1142 	struct selinfo *sip;
1143 {
1144 
1145 	mtx_lock(&sellock);
1146 	/*
1147 	 * If the selinfo's thread pointer is NULL then take ownership of it.
1148 	 *
1149 	 * If the thread pointer is not NULL and it points to another
1150 	 * thread, then we have a collision.
1151 	 *
1152 	 * If the thread pointer is not NULL and points back to us then leave
1153 	 * it alone as we've already added pointed it at us and added it to
1154 	 * our list.
1155 	 */
1156 	if (sip->si_thread == NULL) {
1157 		sip->si_thread = selector;
1158 		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1159 	} else if (sip->si_thread != selector) {
1160 		sip->si_flags |= SI_COLL;
1161 	}
1162 
1163 	mtx_unlock(&sellock);
1164 }
1165 
1166 /* Wake up a selecting thread. */
/*
 * Wake up a thread sleeping in select()/poll() on this selinfo without
 * adjusting its priority.
 */
void
selwakeup(struct selinfo *sip)
{

	doselwakeup(sip, -1);
}
1173 
1174 /* Wake up a selecting thread, and set its priority. */
/*
 * Wake up a thread sleeping in select()/poll() on this selinfo,
 * boosting its priority to at least `pri'.
 */
void
selwakeuppri(struct selinfo *sip, int pri)
{

	doselwakeup(sip, pri);
}
1182 
1183 /*
1184  * Do a wakeup when a selectable event occurs.
1185  */
/*
 * Do a wakeup when a selectable event occurs.
 *
 * pri is a scheduling priority to boost the woken thread to, or -1 for
 * no boost.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		/*
		 * Several threads were interested in this selinfo: bump
		 * the collision counter (forcing rescans) and wake every
		 * thread sleeping in select()/poll().
		 */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	/* Detach the selinfo from the owning thread's polled list. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	if (td->td_wchan == &selwait) {
		/* Asleep on selwait: pull it off the queue and run it. */
		cv_waitq_remove(td);
		TD_CLR_SLEEPING(td);
		if (pri >= PRI_MIN && pri <= PRI_MAX && td->td_priority > pri)
			td->td_priority = pri;
		setrunnable(td);
	} else
		/* Still scanning: clearing TDF_SELECT forces a rescan. */
		td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);
}
1218 
1219 static void selectinit(void *);
1220 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1221 
1222 /* ARGSUSED*/
1223 static void
1224 selectinit(dummy)
1225 	void *dummy;
1226 {
1227 	cv_init(&selwait, "select");
1228 	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1229 }
1230