xref: /freebsd/sys/kern/sys_generic.c (revision 6990ffd8a95caaba6858ad44ff1b3157d1efba8f)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $FreeBSD$
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/signalvar.h>
53 #include <sys/socketvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/resourcevar.h>
59 #include <sys/selinfo.h>
60 #include <sys/sysctl.h>
61 #include <sys/sysent.h>
62 #include <sys/bio.h>
63 #include <sys/buf.h>
64 #include <sys/condvar.h>
65 #ifdef KTRACE
66 #include <sys/ktrace.h>
67 #endif
68 #include <vm/vm.h>
69 #include <vm/vm_page.h>
70 
71 #include <machine/limits.h>
72 
73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
75 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
76 
77 static int	pollscan __P((struct thread *, struct pollfd *, u_int));
78 static int	pollholddrop __P((struct thread *, struct pollfd *, u_int, int));
79 static int	selscan __P((struct thread *, fd_mask **, fd_mask **, int));
80 static int	selholddrop __P((struct thread *, fd_mask *, fd_mask *, int, int));
81 static int	dofileread __P((struct thread *, struct file *, int, void *,
82 		    size_t, off_t, int));
83 static int	dofilewrite __P((struct thread *, struct file *, int,
84 		    const void *, size_t, off_t, int));
85 
86 struct file*
87 holdfp(fdp, fd, flag)
88 	struct filedesc* fdp;
89 	int fd, flag;
90 {
91 	struct file* fp;
92 
93 	if (((u_int)fd) >= fdp->fd_nfiles ||
94 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
95 	    (fp->f_flag & flag) == 0) {
96 		return (NULL);
97 	}
98 	fhold(fp);
99 	return (fp);
100 }
101 
102 /*
103  * Read system call.
104  */
105 #ifndef _SYS_SYSPROTO_H_
106 struct read_args {
107 	int	fd;
108 	void	*buf;
109 	size_t	nbyte;
110 };
111 #endif
112 /*
113  * MPSAFE
114  */
115 int
116 read(td, uap)
117 	struct thread *td;
118 	register struct read_args *uap;
119 {
120 	register struct file *fp;
121 	int error;
122 
123 	mtx_lock(&Giant);
124 	if ((fp = holdfp(td->td_proc->p_fd, uap->fd, FREAD)) != NULL) {
125 		error = dofileread(td, fp, uap->fd, uap->buf,
126 			    uap->nbyte, (off_t)-1, 0);
127 		fdrop(fp, td);
128 	} else {
129 		error = EBADF;
130 	}
131 	mtx_unlock(&Giant);
132 	return(error);
133 }
134 
135 /*
136  * Pread system call
137  */
138 #ifndef _SYS_SYSPROTO_H_
139 struct pread_args {
140 	int	fd;
141 	void	*buf;
142 	size_t	nbyte;
143 	int	pad;
144 	off_t	offset;
145 };
146 #endif
147 /*
148  * MPSAFE
149  */
150 int
151 pread(td, uap)
152 	struct thread *td;
153 	register struct pread_args *uap;
154 {
155 	register struct file *fp;
156 	int error;
157 
158 	mtx_lock(&Giant);
159 	if ((fp = holdfp(td->td_proc->p_fd, uap->fd, FREAD)) == NULL) {
160 		error = EBADF;
161 	} else if (fp->f_type != DTYPE_VNODE) {
162 		error = ESPIPE;
163 		fdrop(fp, td);
164 	} else {
165 		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
166 			    uap->offset, FOF_OFFSET);
167 		fdrop(fp, td);
168 	}
169 	mtx_unlock(&Giant);
170 	return(error);
171 }
172 
173 /*
174  * Code common for read and pread
175  */
176 int
177 dofileread(td, fp, fd, buf, nbyte, offset, flags)
178 	struct thread *td;
179 	struct file *fp;
180 	int fd, flags;
181 	void *buf;
182 	size_t nbyte;
183 	off_t offset;
184 {
185 	struct uio auio;
186 	struct iovec aiov;
187 	long cnt, error = 0;
188 #ifdef KTRACE
189 	struct iovec ktriov;
190 	struct uio ktruio;
191 	int didktr = 0;
192 #endif
193 
194 	aiov.iov_base = (caddr_t)buf;
195 	aiov.iov_len = nbyte;
196 	auio.uio_iov = &aiov;
197 	auio.uio_iovcnt = 1;
198 	auio.uio_offset = offset;
199 	if (nbyte > INT_MAX)
200 		return (EINVAL);
201 	auio.uio_resid = nbyte;
202 	auio.uio_rw = UIO_READ;
203 	auio.uio_segflg = UIO_USERSPACE;
204 	auio.uio_td = td;
205 #ifdef KTRACE
206 	/*
207 	 * if tracing, save a copy of iovec
208 	 */
209 	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
210 		ktriov = aiov;
211 		ktruio = auio;
212 		didktr = 1;
213 	}
214 #endif
215 	cnt = nbyte;
216 
217 	if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) {
218 		if (auio.uio_resid != cnt && (error == ERESTART ||
219 		    error == EINTR || error == EWOULDBLOCK))
220 			error = 0;
221 	}
222 	cnt -= auio.uio_resid;
223 #ifdef KTRACE
224 	if (didktr && error == 0) {
225 		ktruio.uio_iov = &ktriov;
226 		ktruio.uio_resid = cnt;
227 		ktrgenio(td->td_proc->p_tracep, fd, UIO_READ, &ktruio, error);
228 	}
229 #endif
230 	td->td_retval[0] = cnt;
231 	return (error);
232 }
233 
234 /*
235  * Scatter read system call.
236  */
237 #ifndef _SYS_SYSPROTO_H_
238 struct readv_args {
239 	int	fd;
240 	struct	iovec *iovp;
241 	u_int	iovcnt;
242 };
243 #endif
244 /*
245  * MPSAFE
246  */
247 int
248 readv(td, uap)
249 	struct thread *td;
250 	register struct readv_args *uap;
251 {
252 	register struct file *fp;
253 	register struct filedesc *fdp;
254 	struct uio auio;
255 	register struct iovec *iov;
256 	struct iovec *needfree;
257 	struct iovec aiov[UIO_SMALLIOV];
258 	long i, cnt, error = 0;
259 	u_int iovlen;
260 #ifdef KTRACE
261 	struct iovec *ktriov = NULL;
262 	struct uio ktruio;
263 #endif
264 	mtx_lock(&Giant);
265 	fdp = td->td_proc->p_fd;
266 
267 	if ((fp = holdfp(fdp, uap->fd, FREAD)) == NULL) {
268 		error = EBADF;
269 		goto done2;
270 	}
271 	/* note: can't use iovlen until iovcnt is validated */
272 	iovlen = uap->iovcnt * sizeof (struct iovec);
273 	if (uap->iovcnt > UIO_SMALLIOV) {
274 		if (uap->iovcnt > UIO_MAXIOV) {
275 			error = EINVAL;
276 			goto done2;
277 		}
278 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
279 		needfree = iov;
280 	} else {
281 		iov = aiov;
282 		needfree = NULL;
283 	}
284 	auio.uio_iov = iov;
285 	auio.uio_iovcnt = uap->iovcnt;
286 	auio.uio_rw = UIO_READ;
287 	auio.uio_segflg = UIO_USERSPACE;
288 	auio.uio_td = td;
289 	auio.uio_offset = -1;
290 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
291 		goto done;
292 	auio.uio_resid = 0;
293 	for (i = 0; i < uap->iovcnt; i++) {
294 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
295 			error = EINVAL;
296 			goto done;
297 		}
298 		auio.uio_resid += iov->iov_len;
299 		iov++;
300 	}
301 #ifdef KTRACE
302 	/*
303 	 * if tracing, save a copy of iovec
304 	 */
305 	if (KTRPOINT(td->td_proc, KTR_GENIO))  {
306 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
307 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
308 		ktruio = auio;
309 	}
310 #endif
311 	cnt = auio.uio_resid;
312 	if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) {
313 		if (auio.uio_resid != cnt && (error == ERESTART ||
314 		    error == EINTR || error == EWOULDBLOCK))
315 			error = 0;
316 	}
317 	cnt -= auio.uio_resid;
318 #ifdef KTRACE
319 	if (ktriov != NULL) {
320 		if (error == 0) {
321 			ktruio.uio_iov = ktriov;
322 			ktruio.uio_resid = cnt;
323 			ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_READ, &ktruio,
324 			    error);
325 		}
326 		FREE(ktriov, M_TEMP);
327 	}
328 #endif
329 	td->td_retval[0] = cnt;
330 done:
331 	fdrop(fp, td);
332 	if (needfree)
333 		FREE(needfree, M_IOV);
334 done2:
335 	mtx_unlock(&Giant);
336 	return (error);
337 }
338 
339 /*
340  * Write system call
341  */
342 #ifndef _SYS_SYSPROTO_H_
343 struct write_args {
344 	int	fd;
345 	const void *buf;
346 	size_t	nbyte;
347 };
348 #endif
349 /*
350  * MPSAFE
351  */
352 int
353 write(td, uap)
354 	struct thread *td;
355 	register struct write_args *uap;
356 {
357 	register struct file *fp;
358 	int error;
359 
360 	mtx_lock(&Giant);
361 	if ((fp = holdfp(td->td_proc->p_fd, uap->fd, FWRITE)) != NULL) {
362 		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
363 			    (off_t)-1, 0);
364 		fdrop(fp, td);
365 	} else {
366 		error = EBADF;
367 	}
368 	mtx_unlock(&Giant);
369 	return(error);
370 }
371 
372 /*
373  * Pwrite system call
374  */
375 #ifndef _SYS_SYSPROTO_H_
376 struct pwrite_args {
377 	int	fd;
378 	const void *buf;
379 	size_t	nbyte;
380 	int	pad;
381 	off_t	offset;
382 };
383 #endif
384 /*
385  * MPSAFE
386  */
387 int
388 pwrite(td, uap)
389 	struct thread *td;
390 	register struct pwrite_args *uap;
391 {
392 	register struct file *fp;
393 	int error;
394 
395 	mtx_lock(&Giant);
396 	if ((fp = holdfp(td->td_proc->p_fd, uap->fd, FWRITE)) == NULL) {
397 		error = EBADF;
398 	} else if (fp->f_type != DTYPE_VNODE) {
399 		error = ESPIPE;
400 		fdrop(fp, td);
401 	} else {
402 		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
403 			    uap->offset, FOF_OFFSET);
404 		fdrop(fp, td);
405 	}
406 	mtx_unlock(&Giant);
407 	return(error);
408 }
409 
410 static int
411 dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
412 	struct thread *td;
413 	struct file *fp;
414 	int fd, flags;
415 	const void *buf;
416 	size_t nbyte;
417 	off_t offset;
418 {
419 	struct uio auio;
420 	struct iovec aiov;
421 	long cnt, error = 0;
422 #ifdef KTRACE
423 	struct iovec ktriov;
424 	struct uio ktruio;
425 	int didktr = 0;
426 #endif
427 
428 	aiov.iov_base = (void *)(uintptr_t)buf;
429 	aiov.iov_len = nbyte;
430 	auio.uio_iov = &aiov;
431 	auio.uio_iovcnt = 1;
432 	auio.uio_offset = offset;
433 	if (nbyte > INT_MAX)
434 		return (EINVAL);
435 	auio.uio_resid = nbyte;
436 	auio.uio_rw = UIO_WRITE;
437 	auio.uio_segflg = UIO_USERSPACE;
438 	auio.uio_td = td;
439 #ifdef KTRACE
440 	/*
441 	 * if tracing, save a copy of iovec and uio
442 	 */
443 	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
444 		ktriov = aiov;
445 		ktruio = auio;
446 		didktr = 1;
447 	}
448 #endif
449 	cnt = nbyte;
450 	if (fp->f_type == DTYPE_VNODE)
451 		bwillwrite();
452 	if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) {
453 		if (auio.uio_resid != cnt && (error == ERESTART ||
454 		    error == EINTR || error == EWOULDBLOCK))
455 			error = 0;
456 		if (error == EPIPE) {
457 			PROC_LOCK(td->td_proc);
458 			psignal(td->td_proc, SIGPIPE);
459 			PROC_UNLOCK(td->td_proc);
460 		}
461 	}
462 	cnt -= auio.uio_resid;
463 #ifdef KTRACE
464 	if (didktr && error == 0) {
465 		ktruio.uio_iov = &ktriov;
466 		ktruio.uio_resid = cnt;
467 		ktrgenio(td->td_proc->p_tracep, fd, UIO_WRITE, &ktruio, error);
468 	}
469 #endif
470 	td->td_retval[0] = cnt;
471 	return (error);
472 }
473 
474 /*
475  * Gather write system call
476  */
477 #ifndef _SYS_SYSPROTO_H_
478 struct writev_args {
479 	int	fd;
480 	struct	iovec *iovp;
481 	u_int	iovcnt;
482 };
483 #endif
484 /*
485  * MPSAFE
486  */
487 int
488 writev(td, uap)
489 	struct thread *td;
490 	register struct writev_args *uap;
491 {
492 	register struct file *fp;
493 	register struct filedesc *fdp;
494 	struct uio auio;
495 	register struct iovec *iov;
496 	struct iovec *needfree;
497 	struct iovec aiov[UIO_SMALLIOV];
498 	long i, cnt, error = 0;
499 	u_int iovlen;
500 #ifdef KTRACE
501 	struct iovec *ktriov = NULL;
502 	struct uio ktruio;
503 #endif
504 
505 	mtx_lock(&Giant);
506 	fdp = td->td_proc->p_fd;
507 	if ((fp = holdfp(fdp, uap->fd, FWRITE)) == NULL) {
508 		error = EBADF;
509 		goto done2;
510 	}
511 	/* note: can't use iovlen until iovcnt is validated */
512 	iovlen = uap->iovcnt * sizeof (struct iovec);
513 	if (uap->iovcnt > UIO_SMALLIOV) {
514 		if (uap->iovcnt > UIO_MAXIOV) {
515 			needfree = NULL;
516 			error = EINVAL;
517 			goto done;
518 		}
519 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
520 		needfree = iov;
521 	} else {
522 		iov = aiov;
523 		needfree = NULL;
524 	}
525 	auio.uio_iov = iov;
526 	auio.uio_iovcnt = uap->iovcnt;
527 	auio.uio_rw = UIO_WRITE;
528 	auio.uio_segflg = UIO_USERSPACE;
529 	auio.uio_td = td;
530 	auio.uio_offset = -1;
531 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
532 		goto done;
533 	auio.uio_resid = 0;
534 	for (i = 0; i < uap->iovcnt; i++) {
535 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
536 			error = EINVAL;
537 			goto done;
538 		}
539 		auio.uio_resid += iov->iov_len;
540 		iov++;
541 	}
542 #ifdef KTRACE
543 	/*
544 	 * if tracing, save a copy of iovec and uio
545 	 */
546 	if (KTRPOINT(td->td_proc, KTR_GENIO))  {
547 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
548 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
549 		ktruio = auio;
550 	}
551 #endif
552 	cnt = auio.uio_resid;
553 	if (fp->f_type == DTYPE_VNODE)
554 		bwillwrite();
555 	if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) {
556 		if (auio.uio_resid != cnt && (error == ERESTART ||
557 		    error == EINTR || error == EWOULDBLOCK))
558 			error = 0;
559 		if (error == EPIPE) {
560 			PROC_LOCK(td->td_proc);
561 			psignal(td->td_proc, SIGPIPE);
562 			PROC_UNLOCK(td->td_proc);
563 		}
564 	}
565 	cnt -= auio.uio_resid;
566 #ifdef KTRACE
567 	if (ktriov != NULL) {
568 		if (error == 0) {
569 			ktruio.uio_iov = ktriov;
570 			ktruio.uio_resid = cnt;
571 			ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_WRITE, &ktruio,
572 			    error);
573 		}
574 		FREE(ktriov, M_TEMP);
575 	}
576 #endif
577 	td->td_retval[0] = cnt;
578 done:
579 	fdrop(fp, td);
580 	if (needfree)
581 		FREE(needfree, M_IOV);
582 done2:
583 	mtx_unlock(&Giant);
584 	return (error);
585 }
586 
587 /*
588  * Ioctl system call
589  */
590 #ifndef _SYS_SYSPROTO_H_
591 struct ioctl_args {
592 	int	fd;
593 	u_long	com;
594 	caddr_t	data;
595 };
596 #endif
597 /*
598  * MPSAFE
599  */
600 /* ARGSUSED */
601 int
602 ioctl(td, uap)
603 	struct thread *td;
604 	register struct ioctl_args *uap;
605 {
606 	register struct file *fp;
607 	register struct filedesc *fdp;
608 	register u_long com;
609 	int error = 0;
610 	register u_int size;
611 	caddr_t data, memp;
612 	int tmp;
613 #define STK_PARAMS	128
614 	union {
615 	    char stkbuf[STK_PARAMS];
616 	    long align;
617 	} ubuf;
618 
619 	mtx_lock(&Giant);
620 	fdp = td->td_proc->p_fd;
621 	if ((u_int)uap->fd >= fdp->fd_nfiles ||
622 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL) {
623 		error = EBADF;
624 		goto done2;
625 	}
626 
627 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
628 		error = EBADF;
629 		goto done2;
630 	}
631 
632 	switch (com = uap->com) {
633 	case FIONCLEX:
634 		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
635 		goto done2;
636 	case FIOCLEX:
637 		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
638 		goto done2;
639 	}
640 
641 	/*
642 	 * Interpret high order word to find amount of data to be
643 	 * copied to/from the user's address space.
644 	 */
645 	size = IOCPARM_LEN(com);
646 	if (size > IOCPARM_MAX) {
647 		error = ENOTTY;
648 		goto done2;
649 	}
650 
651 	fhold(fp);
652 
653 	memp = NULL;
654 	if (size > sizeof (ubuf.stkbuf)) {
655 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
656 		data = memp;
657 	} else {
658 		data = ubuf.stkbuf;
659 	}
660 	if (com&IOC_IN) {
661 		if (size) {
662 			error = copyin(uap->data, data, (u_int)size);
663 			if (error) {
664 				if (memp)
665 					free(memp, M_IOCTLOPS);
666 				fdrop(fp, td);
667 				goto done2;
668 			}
669 		} else {
670 			*(caddr_t *)data = uap->data;
671 		}
672 	} else if ((com&IOC_OUT) && size) {
673 		/*
674 		 * Zero the buffer so the user always
675 		 * gets back something deterministic.
676 		 */
677 		bzero(data, size);
678 	} else if (com&IOC_VOID) {
679 		*(caddr_t *)data = uap->data;
680 	}
681 
682 	switch (com) {
683 
684 	case FIONBIO:
685 		if ((tmp = *(int *)data))
686 			fp->f_flag |= FNONBLOCK;
687 		else
688 			fp->f_flag &= ~FNONBLOCK;
689 		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
690 		break;
691 
692 	case FIOASYNC:
693 		if ((tmp = *(int *)data))
694 			fp->f_flag |= FASYNC;
695 		else
696 			fp->f_flag &= ~FASYNC;
697 		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
698 		break;
699 
700 	default:
701 		error = fo_ioctl(fp, com, data, td);
702 		/*
703 		 * Copy any data to user, size was
704 		 * already set and checked above.
705 		 */
706 		if (error == 0 && (com&IOC_OUT) && size)
707 			error = copyout(data, uap->data, (u_int)size);
708 		break;
709 	}
710 	if (memp)
711 		free(memp, M_IOCTLOPS);
712 	fdrop(fp, td);
713 done2:
714 	mtx_unlock(&Giant);
715 	return (error);
716 }
717 
718 static int	nselcoll;	/* Select collisions since boot */
719 struct cv	selwait;
720 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
721 
722 /*
723  * Select system call.
724  */
725 #ifndef _SYS_SYSPROTO_H_
726 struct select_args {
727 	int	nd;
728 	fd_set	*in, *ou, *ex;
729 	struct	timeval *tv;
730 };
731 #endif
732 /*
733  * MPSAFE
734  */
735 int
736 select(td, uap)
737 	register struct thread *td;
738 	register struct select_args *uap;
739 {
740 	/*
741 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
742 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
743 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
744 	 * of 256.
745 	 */
746 	fd_mask s_selbits[howmany(2048, NFDBITS)];
747 	fd_mask s_heldbits[howmany(2048, NFDBITS)];
748 	fd_mask *ibits[3], *obits[3], *selbits, *sbp, *heldbits, *hibits, *hobits;
749 	struct timeval atv, rtv, ttv;
750 	int ncoll, error, timo, i;
751 	u_int nbufbytes, ncpbytes, nfdbits;
752 
753 	if (uap->nd < 0)
754 		return (EINVAL);
755 
756 	mtx_lock(&Giant);
757 
758 	if (uap->nd > td->td_proc->p_fd->fd_nfiles)
759 		uap->nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
760 
761 	/*
762 	 * Allocate just enough bits for the non-null fd_sets.  Use the
763 	 * preallocated auto buffer if possible.
764 	 */
765 	nfdbits = roundup(uap->nd, NFDBITS);
766 	ncpbytes = nfdbits / NBBY;
767 	nbufbytes = 0;
768 	if (uap->in != NULL)
769 		nbufbytes += 2 * ncpbytes;
770 	if (uap->ou != NULL)
771 		nbufbytes += 2 * ncpbytes;
772 	if (uap->ex != NULL)
773 		nbufbytes += 2 * ncpbytes;
774 	if (nbufbytes <= sizeof s_selbits)
775 		selbits = &s_selbits[0];
776 	else
777 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
778 	if (2 * ncpbytes <= sizeof s_heldbits) {
779 		bzero(s_heldbits, sizeof(s_heldbits));
780 		heldbits = &s_heldbits[0];
781 	} else
782 		heldbits = malloc(2 * ncpbytes, M_SELECT, M_WAITOK | M_ZERO);
783 
784 	/*
785 	 * Assign pointers into the bit buffers and fetch the input bits.
786 	 * Put the output buffers together so that they can be bzeroed
787 	 * together.
788 	 */
789 	sbp = selbits;
790 	hibits = heldbits + ncpbytes / sizeof *heldbits;
791 	hobits = heldbits;
792 #define	getbits(name, x) \
793 	do {								\
794 		if (uap->name == NULL)					\
795 			ibits[x] = NULL;				\
796 		else {							\
797 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
798 			obits[x] = sbp;					\
799 			sbp += ncpbytes / sizeof *sbp;			\
800 			error = copyin(uap->name, ibits[x], ncpbytes);	\
801 			if (error != 0)					\
802 				goto done_noproclock;			\
803 			for (i = 0;					\
804 			     i < ncpbytes / sizeof ibits[i][0];		\
805 			     i++)					\
806 				hibits[i] |= ibits[x][i];		\
807 		}							\
808 	} while (0)
809 	getbits(in, 0);
810 	getbits(ou, 1);
811 	getbits(ex, 2);
812 #undef	getbits
813 	if (nbufbytes != 0)
814 		bzero(selbits, nbufbytes / 2);
815 
816 	if (uap->tv) {
817 		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
818 			sizeof (atv));
819 		if (error)
820 			goto done_noproclock;
821 		if (itimerfix(&atv)) {
822 			error = EINVAL;
823 			goto done_noproclock;
824 		}
825 		getmicrouptime(&rtv);
826 		timevaladd(&atv, &rtv);
827 	} else {
828 		atv.tv_sec = 0;
829 		atv.tv_usec = 0;
830 	}
831 	selholddrop(td, hibits, hobits, uap->nd, 1);
832 	timo = 0;
833 	PROC_LOCK(td->td_proc);
834 retry:
835 	ncoll = nselcoll;
836 	td->td_flags |= TDF_SELECT;
837 	PROC_UNLOCK(td->td_proc);
838 	error = selscan(td, ibits, obits, uap->nd);
839 	PROC_LOCK(td->td_proc);
840 	if (error || td->td_retval[0])
841 		goto done;
842 	if (atv.tv_sec || atv.tv_usec) {
843 		getmicrouptime(&rtv);
844 		if (timevalcmp(&rtv, &atv, >=)) {
845 			/*
846 			 * An event of our interest may occur during locking a process.
847 			 * In order to avoid missing the event that occured during locking
848 			 * the process, test TDF_SELECT and rescan file descriptors if
849 			 * necessary.
850 			 */
851 			if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
852 				ncoll = nselcoll;
853 				td->td_flags |= TDF_SELECT;
854 				PROC_UNLOCK(td->td_proc);
855 				error = selscan(td, ibits, obits, uap->nd);
856 				PROC_LOCK(td->td_proc);
857 			}
858 			goto done;
859 		}
860 		ttv = atv;
861 		timevalsub(&ttv, &rtv);
862 		timo = ttv.tv_sec > 24 * 60 * 60 ?
863 		    24 * 60 * 60 * hz : tvtohz(&ttv);
864 	}
865 	td->td_flags &= ~TDF_SELECT;
866 
867 	if (timo > 0)
868 		error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo);
869 	else
870 		error = cv_wait_sig(&selwait, &td->td_proc->p_mtx);
871 
872 	if (error == 0)
873 		goto retry;
874 
875 done:
876 	td->td_flags &= ~TDF_SELECT;
877 	PROC_UNLOCK(td->td_proc);
878 	selholddrop(td, hibits, hobits, uap->nd, 0);
879 done_noproclock:
880 	/* select is not restarted after signals... */
881 	if (error == ERESTART)
882 		error = EINTR;
883 	if (error == EWOULDBLOCK)
884 		error = 0;
885 #define	putbits(name, x) \
886 	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
887 		error = error2;
888 	if (error == 0) {
889 		int error2;
890 
891 		putbits(in, 0);
892 		putbits(ou, 1);
893 		putbits(ex, 2);
894 #undef putbits
895 	}
896 	if (selbits != &s_selbits[0])
897 		free(selbits, M_SELECT);
898 	if (heldbits != &s_heldbits[0])
899 		free(heldbits, M_SELECT);
900 
901 	mtx_unlock(&Giant);
902 	return (error);
903 }
904 
905 static int
906 selholddrop(td, ibits, obits, nfd, hold)
907 	struct thread *td;
908 	fd_mask *ibits, *obits;
909 	int nfd, hold;
910 {
911 	struct filedesc *fdp = td->td_proc->p_fd;
912 	int i, fd;
913 	fd_mask bits;
914 	struct file *fp;
915 
916 	for (i = 0; i < nfd; i += NFDBITS) {
917 		if (hold)
918 			bits = ibits[i/NFDBITS];
919 		else
920 			bits = obits[i/NFDBITS];
921 		/* ffs(int mask) not portable, fd_mask is long */
922 		for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
923 			if (!(bits & 1))
924 				continue;
925 			fp = fdp->fd_ofiles[fd];
926 			if (fp == NULL)
927 				return (EBADF);
928 			if (hold) {
929 				fhold(fp);
930 				obits[(fd)/NFDBITS] |=
931 				    ((fd_mask)1 << ((fd) % NFDBITS));
932 			} else
933 				fdrop(fp, td);
934 		}
935 	}
936 	return (0);
937 }
938 
939 static int
940 selscan(td, ibits, obits, nfd)
941 	struct thread *td;
942 	fd_mask **ibits, **obits;
943 	int nfd;
944 {
945 	struct filedesc *fdp = td->td_proc->p_fd;
946 	int msk, i, fd;
947 	fd_mask bits;
948 	struct file *fp;
949 	int n = 0;
950 	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
951 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
952 
953 	for (msk = 0; msk < 3; msk++) {
954 		if (ibits[msk] == NULL)
955 			continue;
956 		for (i = 0; i < nfd; i += NFDBITS) {
957 			bits = ibits[msk][i/NFDBITS];
958 			/* ffs(int mask) not portable, fd_mask is long */
959 			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
960 				if (!(bits & 1))
961 					continue;
962 				fp = fdp->fd_ofiles[fd];
963 				if (fp == NULL)
964 					return (EBADF);
965 				if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
966 					obits[msk][(fd)/NFDBITS] |=
967 					    ((fd_mask)1 << ((fd) % NFDBITS));
968 					n++;
969 				}
970 			}
971 		}
972 	}
973 	td->td_retval[0] = n;
974 	return (0);
975 }
976 
977 /*
978  * Poll system call.
979  */
980 #ifndef _SYS_SYSPROTO_H_
981 struct poll_args {
982 	struct pollfd *fds;
983 	u_int	nfds;
984 	int	timeout;
985 };
986 #endif
987 /*
988  * MPSAFE
989  */
990 int
991 poll(td, uap)
992 	struct thread *td;
993 	struct poll_args *uap;
994 {
995 	caddr_t bits;
996 	char smallbits[32 * sizeof(struct pollfd)];
997 	struct timeval atv, rtv, ttv;
998 	int ncoll, error = 0, timo;
999 	u_int nfds;
1000 	size_t ni;
1001 	struct pollfd p_heldbits[32];
1002 	struct pollfd *heldbits;
1003 
1004 	nfds = SCARG(uap, nfds);
1005 
1006 	mtx_lock(&Giant);
1007 	/*
1008 	 * This is kinda bogus.  We have fd limits, but that is not
1009 	 * really related to the size of the pollfd array.  Make sure
1010 	 * we let the process use at least FD_SETSIZE entries and at
1011 	 * least enough for the current limits.  We want to be reasonably
1012 	 * safe, but not overly restrictive.
1013 	 */
1014 	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
1015 	    (nfds > FD_SETSIZE)) {
1016 		error = EINVAL;
1017 		goto done2;
1018 	}
1019 	ni = nfds * sizeof(struct pollfd);
1020 	if (ni > sizeof(smallbits))
1021 		bits = malloc(ni, M_TEMP, M_WAITOK);
1022 	else
1023 		bits = smallbits;
1024 	if (ni > sizeof(p_heldbits))
1025 		heldbits = malloc(ni, M_TEMP, M_WAITOK);
1026 	else {
1027 		bzero(p_heldbits, sizeof(p_heldbits));
1028 		heldbits = p_heldbits;
1029 	}
1030 	error = copyin(SCARG(uap, fds), bits, ni);
1031 	if (error)
1032 		goto done_noproclock;
1033 	bcopy(bits, heldbits, ni);
1034 	if (SCARG(uap, timeout) != INFTIM) {
1035 		atv.tv_sec = SCARG(uap, timeout) / 1000;
1036 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
1037 		if (itimerfix(&atv)) {
1038 			error = EINVAL;
1039 			goto done_noproclock;
1040 		}
1041 		getmicrouptime(&rtv);
1042 		timevaladd(&atv, &rtv);
1043 	} else {
1044 		atv.tv_sec = 0;
1045 		atv.tv_usec = 0;
1046 	}
1047 	pollholddrop(td, heldbits, nfds, 1);
1048 	timo = 0;
1049 	PROC_LOCK(td->td_proc);
1050 retry:
1051 	ncoll = nselcoll;
1052 	td->td_flags |= TDF_SELECT;
1053 	PROC_UNLOCK(td->td_proc);
1054 	error = pollscan(td, (struct pollfd *)bits, nfds);
1055 	PROC_LOCK(td->td_proc);
1056 	if (error || td->td_retval[0])
1057 		goto done;
1058 	if (atv.tv_sec || atv.tv_usec) {
1059 		getmicrouptime(&rtv);
1060 		if (timevalcmp(&rtv, &atv, >=)) {
1061 			/*
1062 			 * An event of our interest may occur during locking a process.
1063 			 * In order to avoid missing the event that occured during locking
1064 			 * the process, test TDF_SELECT and rescan file descriptors if
1065 			 * necessary.
1066 			 */
1067 			if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
1068 				ncoll = nselcoll;
1069 				td->td_flags |= TDF_SELECT;
1070 				PROC_UNLOCK(td->td_proc);
1071 				error = pollscan(td, (struct pollfd *)bits, nfds);
1072 				PROC_LOCK(td->td_proc);
1073 			}
1074 			goto done;
1075 		}
1076 		ttv = atv;
1077 		timevalsub(&ttv, &rtv);
1078 		timo = ttv.tv_sec > 24 * 60 * 60 ?
1079 		    24 * 60 * 60 * hz : tvtohz(&ttv);
1080 	}
1081 	td->td_flags &= ~TDF_SELECT;
1082 	if (timo > 0)
1083 		error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo);
1084 	else
1085 		error = cv_wait_sig(&selwait, &td->td_proc->p_mtx);
1086 	if (error == 0)
1087 		goto retry;
1088 
1089 done:
1090 	td->td_flags &= ~TDF_SELECT;
1091 	PROC_UNLOCK(td->td_proc);
1092 	pollholddrop(td, heldbits, nfds, 0);
1093 done_noproclock:
1094 	/* poll is not restarted after signals... */
1095 	if (error == ERESTART)
1096 		error = EINTR;
1097 	if (error == EWOULDBLOCK)
1098 		error = 0;
1099 	if (error == 0) {
1100 		error = copyout(bits, SCARG(uap, fds), ni);
1101 		if (error)
1102 			goto out;
1103 	}
1104 out:
1105 	if (ni > sizeof(smallbits))
1106 		free(bits, M_TEMP);
1107 	if (ni > sizeof(p_heldbits))
1108 		free(heldbits, M_TEMP);
1109 done2:
1110 	mtx_unlock(&Giant);
1111 	return (error);
1112 }
1113 
1114 static int
1115 pollholddrop(td, fds, nfd, hold)
1116 	struct thread *td;
1117 	struct pollfd *fds;
1118 	u_int nfd;
1119 	int hold;
1120 {
1121 	register struct filedesc *fdp = td->td_proc->p_fd;
1122 	int i;
1123 	struct file *fp;
1124 
1125 	for (i = 0; i < nfd; i++, fds++) {
1126 		if (0 <= fds->fd && fds->fd < fdp->fd_nfiles) {
1127 			fp = fdp->fd_ofiles[fds->fd];
1128 			if (hold) {
1129 				if (fp != NULL) {
1130 					fhold(fp);
1131 					fds->revents = 1;
1132 				} else
1133 					fds->revents = 0;
1134 			} else if(fp != NULL && fds->revents)
1135 				fdrop(fp, td);
1136 		}
1137 	}
1138 	return (0);
1139 }
1140 
1141 static int
1142 pollscan(td, fds, nfd)
1143 	struct thread *td;
1144 	struct pollfd *fds;
1145 	u_int nfd;
1146 {
1147 	register struct filedesc *fdp = td->td_proc->p_fd;
1148 	int i;
1149 	struct file *fp;
1150 	int n = 0;
1151 
1152 	for (i = 0; i < nfd; i++, fds++) {
1153 		if (fds->fd >= fdp->fd_nfiles) {
1154 			fds->revents = POLLNVAL;
1155 			n++;
1156 		} else if (fds->fd < 0) {
1157 			fds->revents = 0;
1158 		} else {
1159 			fp = fdp->fd_ofiles[fds->fd];
1160 			if (fp == NULL) {
1161 				fds->revents = POLLNVAL;
1162 				n++;
1163 			} else {
1164 				/*
1165 				 * Note: backend also returns POLLHUP and
1166 				 * POLLERR if appropriate.
1167 				 */
1168 				fds->revents = fo_poll(fp, fds->events,
1169 				    fp->f_cred, td);
1170 				if (fds->revents != 0)
1171 					n++;
1172 			}
1173 		}
1174 	}
1175 	td->td_retval[0] = n;
1176 	return (0);
1177 }
1178 
1179 /*
1180  * OpenBSD poll system call.
1181  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1182  */
1183 #ifndef _SYS_SYSPROTO_H_
1184 struct openbsd_poll_args {
1185 	struct pollfd *fds;
1186 	u_int	nfds;
1187 	int	timeout;
1188 };
1189 #endif
1190 /*
1191  * MPSAFE
1192  */
1193 int
1194 openbsd_poll(td, uap)
1195 	register struct thread *td;
1196 	register struct openbsd_poll_args *uap;
1197 {
1198 	return (poll(td, (struct poll_args *)uap));
1199 }
1200 
1201 /*ARGSUSED*/
1202 int
1203 seltrue(dev, events, td)
1204 	dev_t dev;
1205 	int events;
1206 	struct thread *td;
1207 {
1208 
1209 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1210 }
1211 
1212 static int
1213 find_thread_in_proc(struct proc *p, struct thread *td)
1214 {
1215 	struct thread *td2;
1216 	FOREACH_THREAD_IN_PROC(p, td2) {
1217 		if (td2 == td) {
1218 			return (1);
1219 		}
1220 	}
1221 	return (0);
1222 }
1223 
1224 /*
1225  * Record a select request.
1226  */
1227 void
1228 selrecord(selector, sip)
1229 	struct thread *selector;
1230 	struct selinfo *sip;
1231 {
1232 	struct proc *p;
1233 	pid_t mypid;
1234 
1235 	mypid = selector->td_proc->p_pid;
1236 	if ((sip->si_pid == mypid) &&
1237 	    (sip->si_thread == selector)) { /* XXXKSE should be an ID? */
1238 		return;
1239 	}
1240 	if (sip->si_pid &&
1241 	    (p = pfind(sip->si_pid)) &&
1242 	    (find_thread_in_proc(p, sip->si_thread))) {
1243 		mtx_lock_spin(&sched_lock);
1244 	    	if (sip->si_thread->td_wchan == (caddr_t)&selwait) {
1245 			mtx_unlock_spin(&sched_lock);
1246 			PROC_UNLOCK(p);
1247 			sip->si_flags |= SI_COLL;
1248 			return;
1249 		}
1250 		mtx_unlock_spin(&sched_lock);
1251 		PROC_UNLOCK(p);
1252 	}
1253 	sip->si_pid = mypid;
1254 	sip->si_thread = selector;
1255 }
1256 
1257 /*
1258  * Do a wakeup when a selectable event occurs.
1259  */
1260 void
1261 selwakeup(sip)
1262 	register struct selinfo *sip;
1263 {
1264 	struct thread *td;
1265 	register struct proc *p;
1266 
1267 	if (sip->si_pid == 0)
1268 		return;
1269 	if (sip->si_flags & SI_COLL) {
1270 		nselcoll++;
1271 		sip->si_flags &= ~SI_COLL;
1272 		cv_broadcast(&selwait);
1273 	}
1274 	p = pfind(sip->si_pid);
1275 	sip->si_pid = 0;
1276 	td = sip->si_thread;
1277 	if (p != NULL) {
1278 		if (!find_thread_in_proc(p, td)) {
1279 			PROC_UNLOCK(p); /* lock is in pfind() */;
1280 			return;
1281 		}
1282 		mtx_lock_spin(&sched_lock);
1283 		if (td->td_wchan == (caddr_t)&selwait) {
1284 			if (td->td_proc->p_stat == SSLEEP)
1285 				setrunnable(td);
1286 			else
1287 				cv_waitq_remove(td);
1288 		} else
1289 			td->td_flags &= ~TDF_SELECT;
1290 		mtx_unlock_spin(&sched_lock);
1291 		PROC_UNLOCK(p); /* Lock is in pfind() */
1292 	}
1293 }
1294 
1295 static void selectinit __P((void *));
1296 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1297 
1298 /* ARGSUSED*/
1299 static void
1300 selectinit(dummy)
1301 	void *dummy;
1302 {
1303 	cv_init(&selwait, "select");
1304 }
1305