xref: /freebsd/sys/kern/sys_generic.c (revision d37ea99837e6ad50837fd9fe1771ddf1c3ba6002)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_ktrace.h"
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/sysproto.h>
45 #include <sys/filedesc.h>
46 #include <sys/filio.h>
47 #include <sys/fcntl.h>
48 #include <sys/file.h>
49 #include <sys/proc.h>
50 #include <sys/signalvar.h>
51 #include <sys/socketvar.h>
52 #include <sys/uio.h>
53 #include <sys/kernel.h>
54 #include <sys/limits.h>
55 #include <sys/malloc.h>
56 #include <sys/poll.h>
57 #include <sys/resourcevar.h>
58 #include <sys/selinfo.h>
59 #include <sys/sleepqueue.h>
60 #include <sys/syscallsubr.h>
61 #include <sys/sysctl.h>
62 #include <sys/sysent.h>
63 #include <sys/vnode.h>
64 #include <sys/bio.h>
65 #include <sys/buf.h>
66 #include <sys/condvar.h>
67 #ifdef KTRACE
68 #include <sys/ktrace.h>
69 #endif
70 #include <vm/vm.h>
71 #include <vm/vm_page.h>
72 
73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
75 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
76 
77 static int	pollscan(struct thread *, struct pollfd *, u_int);
78 static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
79 static int	dofileread(struct thread *, struct file *, int, void *,
80 		    size_t, off_t, int);
81 static int	dofilewrite(struct thread *, struct file *, int,
82 		    const void *, size_t, off_t, int);
83 static void	doselwakeup(struct selinfo *, int);
84 
85 /*
86  * Read system call.
87  */
88 #ifndef _SYS_SYSPROTO_H_
89 struct read_args {
90 	int	fd;
91 	void	*buf;
92 	size_t	nbyte;
93 };
94 #endif
95 /*
96  * MPSAFE
97  */
98 int
99 read(td, uap)
100 	struct thread *td;
101 	struct read_args *uap;
102 {
103 	struct file *fp;
104 	int error;
105 
106 	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
107 		error = dofileread(td, fp, uap->fd, uap->buf,
108 			    uap->nbyte, (off_t)-1, 0);
109 		fdrop(fp, td);
110 	}
111 	return(error);
112 }
113 
114 /*
115  * Pread system call
116  */
117 #ifndef _SYS_SYSPROTO_H_
118 struct pread_args {
119 	int	fd;
120 	void	*buf;
121 	size_t	nbyte;
122 	int	pad;
123 	off_t	offset;
124 };
125 #endif
126 /*
127  * MPSAFE
128  */
129 int
130 pread(td, uap)
131 	struct thread *td;
132 	struct pread_args *uap;
133 {
134 	struct file *fp;
135 	int error;
136 
137 	if ((error = fget_read(td, uap->fd, &fp)) != 0)
138 		return (error);
139 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
140 		error = ESPIPE;
141 	else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
142 		error = EINVAL;
143 	else {
144 		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
145 			    uap->offset, FOF_OFFSET);
146 	}
147 	fdrop(fp, td);
148 	return(error);
149 }
150 
/*
 * Code common for read and pread.
 *
 * Builds a single-segment uio over the user buffer and hands it to the
 * file's fo_read method.  On success td->td_retval[0] holds the number
 * of bytes actually transferred.  A transfer interrupted after partial
 * progress (ERESTART/EINTR/EWOULDBLOCK) is reported as a short read
 * rather than an error.
 */
static int
dofileread(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Describe the user buffer as a one-element scatter list. */
	aiov.iov_base = buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* uio_resid is signed int-sized in practice; reject huge requests. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;

	if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
		/*
		 * If data was transferred before the interruption,
		 * report the partial count instead of the error.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	/* uio_resid is what remains; cnt becomes bytes actually read. */
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_READ, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
211 
212 /*
213  * Scatter read system call.
214  */
215 #ifndef _SYS_SYSPROTO_H_
216 struct readv_args {
217 	int	fd;
218 	struct	iovec *iovp;
219 	u_int	iovcnt;
220 };
221 #endif
/*
 * readv() system call: scatter read.  Copies the user's iovec array
 * in, validates the total length, and performs one fo_read over all
 * segments.  Small iovec arrays use a stack buffer; larger ones (up
 * to UIO_MAXIOV) are heap-allocated and freed on exit.
 * MPSAFE
 */
int
readv(td, uap)
	struct thread *td;
	struct readv_args *uap;
{
	struct file *fp;
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt;
	int error;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	if ((error = fget_read(td, uap->fd, &fp)) != 0)
		return (error);
	needfree = NULL;
	/* note: can't use iovlen until iovcnt is validated */
	iovlen = uap->iovcnt * sizeof (struct iovec);
	if (uap->iovcnt > UIO_SMALLIOV) {
		if (uap->iovcnt > UIO_MAXIOV) {
			error = EINVAL;
			goto done;
		}
		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
		needfree = iov;
	} else
		iov = aiov;
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
	/* -1: use (and advance) the descriptor's current file offset. */
	auio.uio_offset = -1;
	if ((error = copyin(uap->iovp, iov, iovlen)))
		goto done;
	auio.uio_resid = 0;
	for (i = 0; i < uap->iovcnt; i++) {
		/* Reject totals that would overflow the signed resid. */
		if (iov->iov_len > INT_MAX - auio.uio_resid) {
			error = EINVAL;
			goto done;
		}
		auio.uio_resid += iov->iov_len;
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(td, KTR_GENIO))  {
		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy(auio.uio_iov, ktriov, iovlen);
		ktruio = auio;
	}
#endif
	cnt = auio.uio_resid;
	if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) {
		/* Partial transfer before interruption is a short read. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = cnt;
			ktrgenio(uap->fd, UIO_READ, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	td->td_retval[0] = cnt;
done:
	fdrop(fp, td);
	if (needfree)
		FREE(needfree, M_IOV);
	return (error);
}
308 
309 /*
310  * Write system call
311  */
312 #ifndef _SYS_SYSPROTO_H_
313 struct write_args {
314 	int	fd;
315 	const void *buf;
316 	size_t	nbyte;
317 };
318 #endif
319 /*
320  * MPSAFE
321  */
322 int
323 write(td, uap)
324 	struct thread *td;
325 	struct write_args *uap;
326 {
327 	struct file *fp;
328 	int error;
329 
330 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
331 		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
332 			    (off_t)-1, 0);
333 		fdrop(fp, td);
334 	} else {
335 		error = EBADF;	/* XXX this can't be right */
336 	}
337 	return(error);
338 }
339 
340 /*
341  * Pwrite system call
342  */
343 #ifndef _SYS_SYSPROTO_H_
344 struct pwrite_args {
345 	int	fd;
346 	const void *buf;
347 	size_t	nbyte;
348 	int	pad;
349 	off_t	offset;
350 };
351 #endif
352 /*
353  * MPSAFE
354  */
355 int
356 pwrite(td, uap)
357 	struct thread *td;
358 	struct pwrite_args *uap;
359 {
360 	struct file *fp;
361 	int error;
362 
363 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
364 		if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
365 			error = ESPIPE;
366 		else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
367 			error = EINVAL;
368 		else {
369 			error = dofilewrite(td, fp, uap->fd, uap->buf,
370 				    uap->nbyte, uap->offset, FOF_OFFSET);
371 		}
372 		fdrop(fp, td);
373 	} else {
374 		error = EBADF;	/* this can't be right */
375 	}
376 	return(error);
377 }
378 
/*
 * Code common for write and pwrite.
 *
 * Builds a single-segment uio over the user buffer and hands it to the
 * file's fo_write method.  On success td->td_retval[0] holds the bytes
 * actually written.  A write interrupted after partial progress
 * (ERESTART/EINTR/EWOULDBLOCK) is reported as a short write.  EPIPE on
 * non-sockets raises SIGPIPE here; the socket layer posts its own.
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Cast away const: uio is a shared read/write descriptor type. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* uio_resid is signed int-sized in practice; reject huge requests. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	/* Give the buffer cache a chance to flush before dirtying more. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
		/*
		 * If data was transferred before the interruption,
		 * report the partial count instead of the error.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
443 
444 /*
445  * Gather write system call
446  */
447 #ifndef _SYS_SYSPROTO_H_
448 struct writev_args {
449 	int	fd;
450 	struct	iovec *iovp;
451 	u_int	iovcnt;
452 };
453 #endif
454 /*
455  * MPSAFE
456  */
457 int
458 writev(td, uap)
459 	struct thread *td;
460 	register struct writev_args *uap;
461 {
462 	struct file *fp;
463 	struct uio auio;
464 	register struct iovec *iov;
465 	struct iovec *needfree;
466 	struct iovec aiov[UIO_SMALLIOV];
467 	long i, cnt, error = 0;
468 	u_int iovlen;
469 #ifdef KTRACE
470 	struct iovec *ktriov = NULL;
471 	struct uio ktruio;
472 #endif
473 
474 	if ((error = fget_write(td, uap->fd, &fp)) != 0)
475 		return (EBADF);
476 	needfree = NULL;
477 	/* note: can't use iovlen until iovcnt is validated */
478 	iovlen = uap->iovcnt * sizeof (struct iovec);
479 	if (uap->iovcnt > UIO_SMALLIOV) {
480 		if (uap->iovcnt > UIO_MAXIOV) {
481 			error = EINVAL;
482 			goto done;
483 		}
484 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
485 		needfree = iov;
486 	} else
487 		iov = aiov;
488 	auio.uio_iov = iov;
489 	auio.uio_iovcnt = uap->iovcnt;
490 	auio.uio_rw = UIO_WRITE;
491 	auio.uio_segflg = UIO_USERSPACE;
492 	auio.uio_td = td;
493 	auio.uio_offset = -1;
494 	if ((error = copyin(uap->iovp, iov, iovlen)))
495 		goto done;
496 	auio.uio_resid = 0;
497 	for (i = 0; i < uap->iovcnt; i++) {
498 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
499 			error = EINVAL;
500 			goto done;
501 		}
502 		auio.uio_resid += iov->iov_len;
503 		iov++;
504 	}
505 #ifdef KTRACE
506 	/*
507 	 * if tracing, save a copy of iovec and uio
508 	 */
509 	if (KTRPOINT(td, KTR_GENIO))  {
510 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
511 		bcopy(auio.uio_iov, ktriov, iovlen);
512 		ktruio = auio;
513 	}
514 #endif
515 	cnt = auio.uio_resid;
516 	if (fp->f_type == DTYPE_VNODE)
517 		bwillwrite();
518 	if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) {
519 		if (auio.uio_resid != cnt && (error == ERESTART ||
520 		    error == EINTR || error == EWOULDBLOCK))
521 			error = 0;
522 		if (error == EPIPE) {
523 			PROC_LOCK(td->td_proc);
524 			psignal(td->td_proc, SIGPIPE);
525 			PROC_UNLOCK(td->td_proc);
526 		}
527 	}
528 	cnt -= auio.uio_resid;
529 #ifdef KTRACE
530 	if (ktriov != NULL) {
531 		if (error == 0) {
532 			ktruio.uio_iov = ktriov;
533 			ktruio.uio_resid = cnt;
534 			ktrgenio(uap->fd, UIO_WRITE, &ktruio, error);
535 		}
536 		FREE(ktriov, M_TEMP);
537 	}
538 #endif
539 	td->td_retval[0] = cnt;
540 done:
541 	fdrop(fp, td);
542 	if (needfree)
543 		FREE(needfree, M_IOV);
544 	return (error);
545 }
546 
547 /*
548  * Ioctl system call
549  */
550 #ifndef _SYS_SYSPROTO_H_
551 struct ioctl_args {
552 	int	fd;
553 	u_long	com;
554 	caddr_t	data;
555 };
556 #endif
/*
 * ioctl() system call: perform a device/file-specific control request
 * on descriptor fd.  The high-order bits of "com" encode the transfer
 * direction (IOC_IN/IOC_OUT/IOC_VOID) and the size of the argument to
 * copy between user and kernel space.
 * MPSAFE
 */
/* ARGSUSED */
int
ioctl(td, uap)
	struct thread *td;
	register struct ioctl_args *uap;
{
	struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	int error = 0;
	register u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	/* Stack buffer for small arguments; "align" forces alignment. */
	union {
	    char stkbuf[STK_PARAMS];
	    long align;
	} ubuf;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	mtx_lock(&Giant);
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	/* FIONCLEX/FIOCLEX only touch per-descriptor flags: handle here. */
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (ENOTTY);
	}

	/* Use the stack buffer when the argument fits, else allocate. */
	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com&IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				goto done;
			}
		} else {
			/* Zero-size IOC_IN: pass the user pointer itself. */
			*(caddr_t *)data = uap->data;
		}
	} else if ((com&IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com&IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		/* Mirror the request into f_flag, then notify the backend. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		break;

	case FIOASYNC:
		/* Mirror the request into f_flag, then notify the backend. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td->td_ucred, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
done:
	mtx_unlock(&Giant);
	return (error);
}
684 
685 /*
686  * sellock and selwait are initialized in selectinit() via SYSINIT.
687  */
688 struct mtx	sellock;
689 struct cv	selwait;
690 u_int		nselcoll;	/* Select collisions since boot */
691 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
692 
693 /*
694  * Select system call.
695  */
696 #ifndef _SYS_SYSPROTO_H_
697 struct select_args {
698 	int	nd;
699 	fd_set	*in, *ou, *ex;
700 	struct	timeval *tv;
701 };
702 #endif
703 /*
704  * MPSAFE
705  */
706 int
707 select(td, uap)
708 	register struct thread *td;
709 	register struct select_args *uap;
710 {
711 	struct timeval tv, *tvp;
712 	int error;
713 
714 	if (uap->tv != NULL) {
715 		error = copyin(uap->tv, &tv, sizeof(tv));
716 		if (error)
717 			return (error);
718 		tvp = &tv;
719 	} else
720 		tvp = NULL;
721 
722 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
723 }
724 
/*
 * Backend for select().  Copies the requested fd_sets into kernel
 * bit buffers, scans the descriptors, and sleeps on selwait until an
 * event, timeout, or signal.  Output bits are copied back on success.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	/*
	 * XXX: kern_select() currently requires that we acquire Giant
	 * even if none of the file descriptors we poll requires Giant.
	 */
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	/* Each present set needs an input copy and an output copy. */
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
	/* Note: obits[x] is only set when ibits[x] is; putbits guards on
	 * the same name pointer, so the unset slots are never read. */
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	/* First half of the buffer holds the (zeroed) output bits. */
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		/* Convert the relative timeout to an absolute deadline. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count before dropping sellock. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp to one day to keep tvtohz() in range. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}
887 
/*
 * Scan the three input bit vectors (read/write/except), polling each
 * selected descriptor.  Ready descriptors set the corresponding bit in
 * the output vectors; the ready count goes to td->td_retval[0].
 * Returns EBADF if a selected descriptor is not open.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	/* Hold the descriptor table lock across the whole scan. */
	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
929 
930 /*
931  * Poll system call.
932  */
933 #ifndef _SYS_SYSPROTO_H_
934 struct poll_args {
935 	struct pollfd *fds;
936 	u_int	nfds;
937 	int	timeout;
938 };
939 #endif
/*
 * poll() system call: copy in the pollfd array, scan the descriptors,
 * and sleep on selwait until an event, timeout, or signal.  Revents
 * are copied back to the user array on success.
 * MPSAFE
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * XXX: poll() currently requires that we acquire Giant even if
	 * none of the file descriptors we poll requires Giant.
	 */
	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	/* Use the stack buffer when the array fits, else allocate. */
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	if (uap->timeout != INFTIM) {
		/* Convert milliseconds to an absolute deadline. */
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count before dropping sellock. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, (struct pollfd *)bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp to one day to keep tvtohz() in range. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* Copy the revents back out to the user's array. */
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}
1066 
/*
 * Poll each entry of the pollfd array once.  Per POSIX, negative fds
 * are skipped (revents = 0) and unopened fds report POLLNVAL.  The
 * count of entries with nonzero revents goes to td->td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	/* Hold the descriptor table lock across the whole scan. */
	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fd: ignore this entry. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
1106 
1107 /*
1108  * OpenBSD poll system call.
1109  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1110  */
1111 #ifndef _SYS_SYSPROTO_H_
1112 struct openbsd_poll_args {
1113 	struct pollfd *fds;
1114 	u_int	nfds;
1115 	int	timeout;
1116 };
1117 #endif
/*
 * OpenBSD-compatible poll entry point.  The argument layouts match,
 * so simply forward to the native poll() implementation.
 * MPSAFE
 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{

	return (poll(td, (struct poll_args *)uap));
}
1128 
1129 /*
1130  * Remove the references to the thread from all of the objects
1131  * we were polling.
1132  *
1133  * This code assumes that the underlying owner of the selinfo
1134  * structure will hold sellock before it changes it, and that
1135  * it will unlink itself from our list if it goes away.
1136  */
1137 void
1138 clear_selinfo_list(td)
1139 	struct thread *td;
1140 {
1141 	struct selinfo *si;
1142 
1143 	mtx_assert(&sellock, MA_OWNED);
1144 	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1145 		si->si_thread = NULL;
1146 	TAILQ_INIT(&td->td_selq);
1147 }
1148 
1149 /*
1150  * Record a select request.
1151  */
1152 void
1153 selrecord(selector, sip)
1154 	struct thread *selector;
1155 	struct selinfo *sip;
1156 {
1157 
1158 	mtx_lock(&sellock);
1159 	/*
1160 	 * If the selinfo's thread pointer is NULL then take ownership of it.
1161 	 *
1162 	 * If the thread pointer is not NULL and it points to another
1163 	 * thread, then we have a collision.
1164 	 *
1165 	 * If the thread pointer is not NULL and points back to us then leave
1166 	 * it alone as we've already added pointed it at us and added it to
1167 	 * our list.
1168 	 */
1169 	if (sip->si_thread == NULL) {
1170 		sip->si_thread = selector;
1171 		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1172 	} else if (sip->si_thread != selector) {
1173 		sip->si_flags |= SI_COLL;
1174 	}
1175 
1176 	mtx_unlock(&sellock);
1177 }
1178 
/* Wake up a selecting thread; -1 means "leave priority unchanged". */
void
selwakeup(sip)
	struct selinfo *sip;
{

	doselwakeup(sip, -1);
}
1186 
/* Wake up a selecting thread, raising it to the given priority. */
void
selwakeuppri(sip, pri)
	struct selinfo *sip;
	int pri;
{

	doselwakeup(sip, pri);
}
1195 
/*
 * Do a wakeup when a selectable event occurs.
 *
 * Broadcasts to all waiters if a collision was recorded, then wakes
 * the owning thread (if any), clearing its TDF_SELECT flag and its
 * back-reference so a rescan starts fresh.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		/* Multiple threads selected on this object: wake them all. */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	/* Detach the selinfo from the owner before waking it. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	sleepq_remove(td, &selwait);
	mtx_unlock(&sellock);
}
1225 
1226 static void selectinit(void *);
1227 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1228 
1229 /* ARGSUSED*/
1230 static void
1231 selectinit(dummy)
1232 	void *dummy;
1233 {
1234 	cv_init(&selwait, "select");
1235 	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1236 }
1237