xref: /freebsd/sys/kern/sys_generic.c (revision c17d43407fe04133a94055b0dbc7ea8965654a9f)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $FreeBSD$
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/signalvar.h>
53 #include <sys/socketvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/resourcevar.h>
59 #include <sys/selinfo.h>
60 #include <sys/sysctl.h>
61 #include <sys/sysent.h>
62 #include <sys/bio.h>
63 #include <sys/buf.h>
64 #include <sys/condvar.h>
65 #ifdef KTRACE
66 #include <sys/ktrace.h>
67 #endif
68 #include <vm/vm.h>
69 #include <vm/vm_page.h>
70 
71 #include <machine/limits.h>
72 
/* Malloc types used by this file's transient buffers. */
static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

/* Forward declarations for the file-local helpers defined below. */
static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	dofileread(struct thread *, struct file *, int, void *,
		    size_t, off_t, int);
static int	dofilewrite(struct thread *, struct file *, int,
		    const void *, size_t, off_t, int);
83 
84 /*
85  * Read system call.
86  */
87 #ifndef _SYS_SYSPROTO_H_
88 struct read_args {
89 	int	fd;
90 	void	*buf;
91 	size_t	nbyte;
92 };
93 #endif
94 /*
95  * MPSAFE
96  */
97 int
98 read(td, uap)
99 	struct thread *td;
100 	struct read_args *uap;
101 {
102 	struct file *fp;
103 	int error;
104 
105 	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
106 		error = dofileread(td, fp, uap->fd, uap->buf,
107 			    uap->nbyte, (off_t)-1, 0);
108 		fdrop(fp, td);
109 	}
110 	return(error);
111 }
112 
113 /*
114  * Pread system call
115  */
116 #ifndef _SYS_SYSPROTO_H_
117 struct pread_args {
118 	int	fd;
119 	void	*buf;
120 	size_t	nbyte;
121 	int	pad;
122 	off_t	offset;
123 };
124 #endif
125 /*
126  * MPSAFE
127  */
128 int
129 pread(td, uap)
130 	struct thread *td;
131 	struct pread_args *uap;
132 {
133 	struct file *fp;
134 	int error;
135 
136 	if ((error = fget_read(td, uap->fd, &fp)) != 0)
137 		return (error);
138 	if (fp->f_type != DTYPE_VNODE) {
139 		error = ESPIPE;
140 	} else {
141 		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
142 			    uap->offset, FOF_OFFSET);
143 	}
144 	fdrop(fp, td);
145 	return(error);
146 }
147 
148 /*
149  * Code common for read and pread
150  */
151 int
152 dofileread(td, fp, fd, buf, nbyte, offset, flags)
153 	struct thread *td;
154 	struct file *fp;
155 	int fd, flags;
156 	void *buf;
157 	size_t nbyte;
158 	off_t offset;
159 {
160 	struct uio auio;
161 	struct iovec aiov;
162 	long cnt, error = 0;
163 #ifdef KTRACE
164 	struct iovec ktriov;
165 	struct uio ktruio;
166 	int didktr = 0;
167 #endif
168 
169 	aiov.iov_base = (caddr_t)buf;
170 	aiov.iov_len = nbyte;
171 	auio.uio_iov = &aiov;
172 	auio.uio_iovcnt = 1;
173 	auio.uio_offset = offset;
174 	if (nbyte > INT_MAX)
175 		return (EINVAL);
176 	auio.uio_resid = nbyte;
177 	auio.uio_rw = UIO_READ;
178 	auio.uio_segflg = UIO_USERSPACE;
179 	auio.uio_td = td;
180 #ifdef KTRACE
181 	/*
182 	 * if tracing, save a copy of iovec
183 	 */
184 	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
185 		ktriov = aiov;
186 		ktruio = auio;
187 		didktr = 1;
188 	}
189 #endif
190 	cnt = nbyte;
191 
192 	if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) {
193 		if (auio.uio_resid != cnt && (error == ERESTART ||
194 		    error == EINTR || error == EWOULDBLOCK))
195 			error = 0;
196 	}
197 	cnt -= auio.uio_resid;
198 #ifdef KTRACE
199 	if (didktr && error == 0) {
200 		ktruio.uio_iov = &ktriov;
201 		ktruio.uio_resid = cnt;
202 		ktrgenio(td->td_proc->p_tracep, fd, UIO_READ, &ktruio, error);
203 	}
204 #endif
205 	td->td_retval[0] = cnt;
206 	return (error);
207 }
208 
209 /*
210  * Scatter read system call.
211  */
212 #ifndef _SYS_SYSPROTO_H_
213 struct readv_args {
214 	int	fd;
215 	struct	iovec *iovp;
216 	u_int	iovcnt;
217 };
218 #endif
219 /*
220  * MPSAFE
221  */
222 int
223 readv(td, uap)
224 	struct thread *td;
225 	struct readv_args *uap;
226 {
227 	struct file *fp;
228 	struct uio auio;
229 	struct iovec *iov;
230 	struct iovec *needfree;
231 	struct iovec aiov[UIO_SMALLIOV];
232 	long i, cnt, error = 0;
233 	u_int iovlen;
234 #ifdef KTRACE
235 	struct iovec *ktriov = NULL;
236 	struct uio ktruio;
237 #endif
238 	mtx_lock(&Giant);
239 
240 	if ((error = fget_read(td, uap->fd, &fp)) != 0)
241 		goto done2;
242 	/* note: can't use iovlen until iovcnt is validated */
243 	iovlen = uap->iovcnt * sizeof (struct iovec);
244 	if (uap->iovcnt > UIO_SMALLIOV) {
245 		if (uap->iovcnt > UIO_MAXIOV) {
246 			error = EINVAL;
247 			goto done2;
248 		}
249 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
250 		needfree = iov;
251 	} else {
252 		iov = aiov;
253 		needfree = NULL;
254 	}
255 	auio.uio_iov = iov;
256 	auio.uio_iovcnt = uap->iovcnt;
257 	auio.uio_rw = UIO_READ;
258 	auio.uio_segflg = UIO_USERSPACE;
259 	auio.uio_td = td;
260 	auio.uio_offset = -1;
261 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
262 		goto done;
263 	auio.uio_resid = 0;
264 	for (i = 0; i < uap->iovcnt; i++) {
265 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
266 			error = EINVAL;
267 			goto done;
268 		}
269 		auio.uio_resid += iov->iov_len;
270 		iov++;
271 	}
272 #ifdef KTRACE
273 	/*
274 	 * if tracing, save a copy of iovec
275 	 */
276 	if (KTRPOINT(td->td_proc, KTR_GENIO))  {
277 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
278 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
279 		ktruio = auio;
280 	}
281 #endif
282 	cnt = auio.uio_resid;
283 	if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) {
284 		if (auio.uio_resid != cnt && (error == ERESTART ||
285 		    error == EINTR || error == EWOULDBLOCK))
286 			error = 0;
287 	}
288 	cnt -= auio.uio_resid;
289 #ifdef KTRACE
290 	if (ktriov != NULL) {
291 		if (error == 0) {
292 			ktruio.uio_iov = ktriov;
293 			ktruio.uio_resid = cnt;
294 			ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_READ, &ktruio,
295 			    error);
296 		}
297 		FREE(ktriov, M_TEMP);
298 	}
299 #endif
300 	td->td_retval[0] = cnt;
301 done:
302 	fdrop(fp, td);
303 	if (needfree)
304 		FREE(needfree, M_IOV);
305 done2:
306 	mtx_unlock(&Giant);
307 	return (error);
308 }
309 
310 /*
311  * Write system call
312  */
313 #ifndef _SYS_SYSPROTO_H_
314 struct write_args {
315 	int	fd;
316 	const void *buf;
317 	size_t	nbyte;
318 };
319 #endif
320 /*
321  * MPSAFE
322  */
323 int
324 write(td, uap)
325 	struct thread *td;
326 	struct write_args *uap;
327 {
328 	struct file *fp;
329 	int error;
330 
331 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
332 		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
333 			    (off_t)-1, 0);
334 		fdrop(fp, td);
335 	} else {
336 		error = EBADF;	/* XXX this can't be right */
337 	}
338 	return(error);
339 }
340 
341 /*
342  * Pwrite system call
343  */
344 #ifndef _SYS_SYSPROTO_H_
345 struct pwrite_args {
346 	int	fd;
347 	const void *buf;
348 	size_t	nbyte;
349 	int	pad;
350 	off_t	offset;
351 };
352 #endif
353 /*
354  * MPSAFE
355  */
356 int
357 pwrite(td, uap)
358 	struct thread *td;
359 	struct pwrite_args *uap;
360 {
361 	struct file *fp;
362 	int error;
363 
364 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
365 		if (fp->f_type == DTYPE_VNODE) {
366 			error = dofilewrite(td, fp, uap->fd, uap->buf,
367 				    uap->nbyte, uap->offset, FOF_OFFSET);
368 		} else {
369 			error = ESPIPE;
370 		}
371 		fdrop(fp, td);
372 	} else {
373 		error = EBADF;	/* this can't be right */
374 	}
375 	return(error);
376 }
377 
/*
 * Code common for write and pwrite: perform a single-segment uio
 * write on an already-referenced file, report the byte count through
 * td->td_retval[0], and post SIGPIPE on EPIPE.
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Cast away const: the uio machinery takes a non-const base. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* uio_resid holds an int-sized count; reject oversized requests. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	/* Throttle vnode writes when the buffer cache is overcommitted. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) {
		/* An interrupted but partially completed write succeeds. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Broken pipe: deliver SIGPIPE as well as returning EPIPE. */
		if (error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(td->td_proc->p_tracep, fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
441 
442 /*
443  * Gather write system call
444  */
445 #ifndef _SYS_SYSPROTO_H_
446 struct writev_args {
447 	int	fd;
448 	struct	iovec *iovp;
449 	u_int	iovcnt;
450 };
451 #endif
452 /*
453  * MPSAFE
454  */
455 int
456 writev(td, uap)
457 	struct thread *td;
458 	register struct writev_args *uap;
459 {
460 	struct file *fp;
461 	struct uio auio;
462 	register struct iovec *iov;
463 	struct iovec *needfree;
464 	struct iovec aiov[UIO_SMALLIOV];
465 	long i, cnt, error = 0;
466 	u_int iovlen;
467 #ifdef KTRACE
468 	struct iovec *ktriov = NULL;
469 	struct uio ktruio;
470 #endif
471 
472 	mtx_lock(&Giant);
473 	if ((error = fget_write(td, uap->fd, &fp)) != 0) {
474 		error = EBADF;
475 		goto done2;
476 	}
477 	/* note: can't use iovlen until iovcnt is validated */
478 	iovlen = uap->iovcnt * sizeof (struct iovec);
479 	if (uap->iovcnt > UIO_SMALLIOV) {
480 		if (uap->iovcnt > UIO_MAXIOV) {
481 			needfree = NULL;
482 			error = EINVAL;
483 			goto done;
484 		}
485 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
486 		needfree = iov;
487 	} else {
488 		iov = aiov;
489 		needfree = NULL;
490 	}
491 	auio.uio_iov = iov;
492 	auio.uio_iovcnt = uap->iovcnt;
493 	auio.uio_rw = UIO_WRITE;
494 	auio.uio_segflg = UIO_USERSPACE;
495 	auio.uio_td = td;
496 	auio.uio_offset = -1;
497 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
498 		goto done;
499 	auio.uio_resid = 0;
500 	for (i = 0; i < uap->iovcnt; i++) {
501 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
502 			error = EINVAL;
503 			goto done;
504 		}
505 		auio.uio_resid += iov->iov_len;
506 		iov++;
507 	}
508 #ifdef KTRACE
509 	/*
510 	 * if tracing, save a copy of iovec and uio
511 	 */
512 	if (KTRPOINT(td->td_proc, KTR_GENIO))  {
513 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
514 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
515 		ktruio = auio;
516 	}
517 #endif
518 	cnt = auio.uio_resid;
519 	if (fp->f_type == DTYPE_VNODE)
520 		bwillwrite();
521 	if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) {
522 		if (auio.uio_resid != cnt && (error == ERESTART ||
523 		    error == EINTR || error == EWOULDBLOCK))
524 			error = 0;
525 		if (error == EPIPE) {
526 			PROC_LOCK(td->td_proc);
527 			psignal(td->td_proc, SIGPIPE);
528 			PROC_UNLOCK(td->td_proc);
529 		}
530 	}
531 	cnt -= auio.uio_resid;
532 #ifdef KTRACE
533 	if (ktriov != NULL) {
534 		if (error == 0) {
535 			ktruio.uio_iov = ktriov;
536 			ktruio.uio_resid = cnt;
537 			ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_WRITE, &ktruio,
538 			    error);
539 		}
540 		FREE(ktriov, M_TEMP);
541 	}
542 #endif
543 	td->td_retval[0] = cnt;
544 done:
545 	fdrop(fp, td);
546 	if (needfree)
547 		FREE(needfree, M_IOV);
548 done2:
549 	mtx_unlock(&Giant);
550 	return (error);
551 }
552 
/*
 * Ioctl system call: handle the close-on-exec controls locally,
 * marshal argument data between user and kernel space according to
 * the IOC_IN/IOC_OUT/IOC_VOID encoding in the command word, and
 * dispatch everything else to the file's fo_ioctl method.
 */
#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/*
 * MPSAFE
 */
/* ARGSUSED */
int
ioctl(td, uap)
	struct thread *td;
	register struct ioctl_args *uap;
{
	struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	int error = 0;
	register u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	/* Small argument payloads are staged in this on-stack buffer;
	 * the union forces suitable alignment. */
	union {
	    char stkbuf[STK_PARAMS];
	    long align;
	} ubuf;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	mtx_lock(&Giant);
	/* The descriptor must be open for reading or writing. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	/*
	 * FIONCLEX/FIOCLEX only toggle the close-on-exec flag in the
	 * descriptor table; they never reach the file's ioctl method.
	 */
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (ENOTTY);
	}

	/* Argument larger than the stack buffer: heap-allocate it. */
	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com&IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				goto done;
			}
		} else {
			/* No payload: pass the user pointer itself. */
			*(caddr_t *)data = uap->data;
		}
	} else if ((com&IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com&IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	/* FIONBIO/FIOASYNC update f_flag here before notifying the file. */
	case FIONBIO:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
		break;

	case FIOASYNC:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
done:
	mtx_unlock(&Giant);
	return (error);
}
690 
/*
 * sellock and selwait are initialized in selectinit() via SYSINIT.
 * sellock serializes select/poll bookkeeping; selwait is the condition
 * variable that sleeping selectors wait on.
 */
struct mtx	sellock;
struct cv	selwait;
int	nselcoll;	/* Select collisions since boot */
SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
698 
/*
 * Select system call: copy in the three fd_sets, scan them with
 * selscan(), and sleep on selwait (under sellock) until a descriptor
 * is ready, the timeout expires, or a signal arrives.
 */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
/*
 * MPSAFE
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int ncoll, error, timo;
	u_int nbufbytes, ncpbytes, nfdbits;

	if (uap->nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	/* Clamp nd to the size of the descriptor table. */
	if (uap->nd > td->td_proc->p_fd->fd_nfiles)
		uap->nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(uap->nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (uap->in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (uap->name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(uap->name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
	/* The first half of the buffer holds the (zeroed) output sets. */
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (uap->tv) {
		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
			sizeof (atv));
		if (error)
			goto done_nosellock;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		/* Convert the relative timeout to an absolute deadline. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* No timeout supplied: block indefinitely. */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count to detect missed wakeups. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	/* XXX Is there a better place for this? */
	TAILQ_INIT(&td->td_selq);
	error = selscan(td, ibits, obits, uap->nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Cap a single sleep at 24 hours worth of ticks. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	/* A plain wakeup (no error) means something may be ready: rescan. */
	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}
875 
/*
 * Poll every descriptor set in the input fd_sets and record the ready
 * ones in the output sets; the ready count goes in td->td_retval[0].
 * Returns EBADF if a set names a descriptor that is not open.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	/* msk indexes the read/write/except sets in order. */
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
916 
/*
 * Poll system call: copy in the pollfd array, scan it with pollscan(),
 * and sleep on selwait (under sellock) until a descriptor is ready,
 * the timeout expires, or a signal arrives.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	/* On-stack staging for up to 32 pollfd entries. */
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int ncoll, error = 0, timo;
	u_int nfds;
	size_t ni;

	nfds = SCARG(uap, nfds);

	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
	    (nfds > FD_SETSIZE)) {
		error = EINVAL;
		goto done2;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(SCARG(uap, fds), bits, ni);
	if (error)
		goto done_nosellock;
	if (SCARG(uap, timeout) != INFTIM) {
		/* Convert the millisecond timeout to an absolute deadline. */
		atv.tv_sec = SCARG(uap, timeout) / 1000;
		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* INFTIM: block indefinitely. */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count to detect missed wakeups. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	/* XXX Is there a better place for this? */
	TAILQ_INIT(&td->td_selq);
	error = pollscan(td, (struct pollfd *)bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Cap a single sleep at 24 hours worth of ticks. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	/* A plain wakeup (no error) means something may be ready: rescan. */
	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* Copy the revents back out to the user's array. */
		error = copyout(bits, SCARG(uap, fds), ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}
1047 
/*
 * Poll each entry of the pollfd array once, filling in revents and
 * counting the entries with any event set; the count goes in
 * td->td_retval[0].  Bad descriptors get POLLNVAL rather than an error.
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			/* Out-of-range descriptor: flag it but keep going. */
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fds are ignored per poll(2) semantics. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    fp->f_cred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
1087 
/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 *
 * The argument structures are layout-compatible, so this simply
 * forwards to poll() above.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	return (poll(td, (struct poll_args *)uap));
}
1109 
/*
 * Remove the references to the thread from all of the objects
 * we were polling.
 *
 * This code assumes that the underlying owner of the selinfo
 * structure will hold sellock before it changes it, and that
 * it will unlink itself from our list if it goes away.
 */
void
clear_selinfo_list(td)
	struct thread *td;
{
	struct selinfo *si;

	mtx_assert(&sellock, MA_OWNED);
	/* Detach each selinfo from this thread, then empty the queue. */
	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
		si->si_thread = NULL;
	TAILQ_INIT(&td->td_selq);
}
1129 
/*
 * Generic poll handler for devices that are always ready: report the
 * requested read/write events as immediately available.
 */
/*ARGSUSED*/
int
seltrue(dev, events, td)
	dev_t dev;
	int events;
	struct thread *td;
{

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
1140 
/*
 * Record a select request: called by fo_poll backends to associate the
 * selecting thread with a selinfo so that selwakeup() can find it.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{

	mtx_lock(&sellock);
	/*
	 * If the thread is NULL then take ownership of selinfo
	 * however if the thread is not NULL and the thread points to
	 * someone else, then we have a collision, otherwise leave it alone
	 * as we've owned it in a previous selrecord on this selinfo.
	 */
	if (sip->si_thread == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (sip->si_thread != selector) {
		/* Multiple selectors on one selinfo: mark the collision. */
		sip->si_flags |= SI_COLL;
	}

	mtx_unlock(&sellock);
}
1166 
/*
 * Do a wakeup when a selectable event occurs: broadcast to all waiters
 * on a collision, otherwise wake just the owning thread.
 */
void
selwakeup(sip)
	struct selinfo *sip;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		/* Several threads selected here: wake them all. */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcast(&selwait);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	/* Disassociate the selinfo from its owner before waking it. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	if (td->td_wchan == (caddr_t)&selwait) {
		if (td->td_proc->p_stat == SSLEEP)
			setrunnable(td);
		else
			cv_waitq_remove(td);
	} else
		/* Not asleep yet: clearing TDF_SELECT forces a rescan. */
		td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);
}
1200 
static void selectinit(void *);
/* Run at lock-setup time so sellock/selwait exist before first use. */
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/* ARGSUSED*/
/* One-time initialization of the select machinery's lock and condvar. */
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", MTX_DEF);
}
1212