xref: /freebsd/sys/kern/sys_generic.c (revision 41466b50c1d5bfd1cf6adaae547a579a75d7c04e)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $FreeBSD$
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/signalvar.h>
53 #include <sys/socketvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/resourcevar.h>
59 #include <sys/selinfo.h>
60 #include <sys/sysctl.h>
61 #include <sys/sysent.h>
62 #include <sys/bio.h>
63 #include <sys/buf.h>
64 #include <sys/condvar.h>
65 #ifdef KTRACE
66 #include <sys/ktrace.h>
67 #endif
68 #include <vm/vm.h>
69 #include <vm/vm_page.h>
70 
71 #include <machine/limits.h>
72 
73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
75 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
76 
77 static int	pollscan __P((struct thread *, struct pollfd *, u_int));
78 static int	pollholddrop __P((struct thread *, struct pollfd *, u_int, int));
79 static int	selscan __P((struct thread *, fd_mask **, fd_mask **, int));
80 static int	selholddrop __P((struct thread *, fd_mask *, fd_mask *, int, int));
81 static int	dofileread __P((struct thread *, struct file *, int, void *,
82 		    size_t, off_t, int));
83 static int	dofilewrite __P((struct thread *, struct file *, int,
84 		    const void *, size_t, off_t, int));
85 
86 struct file*
87 holdfp(fdp, fd, flag)
88 	struct filedesc* fdp;
89 	int fd, flag;
90 {
91 	struct file* fp;
92 
93 	if (((u_int)fd) >= fdp->fd_nfiles ||
94 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
95 	    (fp->f_flag & flag) == 0) {
96 		return (NULL);
97 	}
98 	fhold(fp);
99 	return (fp);
100 }
101 
102 /*
103  * Read system call.
104  */
105 #ifndef _SYS_SYSPROTO_H_
106 struct read_args {
107 	int	fd;
108 	void	*buf;
109 	size_t	nbyte;
110 };
111 #endif
112 /*
113  * MPSAFE
114  */
115 int
116 read(td, uap)
117 	struct thread *td;
118 	register struct read_args *uap;
119 {
120 	register struct file *fp;
121 	int error;
122 
123 	mtx_lock(&Giant);
124 	if ((fp = holdfp(td->td_proc->p_fd, uap->fd, FREAD)) != NULL) {
125 		error = dofileread(td, fp, uap->fd, uap->buf,
126 			    uap->nbyte, (off_t)-1, 0);
127 		fdrop(fp, td);
128 	} else {
129 		error = EBADF;
130 	}
131 	mtx_unlock(&Giant);
132 	return(error);
133 }
134 
135 /*
136  * Pread system call
137  */
138 #ifndef _SYS_SYSPROTO_H_
139 struct pread_args {
140 	int	fd;
141 	void	*buf;
142 	size_t	nbyte;
143 	int	pad;
144 	off_t	offset;
145 };
146 #endif
147 /*
148  * MPSAFE
149  */
150 int
151 pread(td, uap)
152 	struct thread *td;
153 	register struct pread_args *uap;
154 {
155 	register struct file *fp;
156 	int error;
157 
158 	mtx_lock(&Giant);
159 	if ((fp = holdfp(td->td_proc->p_fd, uap->fd, FREAD)) == NULL) {
160 		error = EBADF;
161 	} else if (fp->f_type != DTYPE_VNODE) {
162 		error = ESPIPE;
163 		fdrop(fp, td);
164 	} else {
165 		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
166 			    uap->offset, FOF_OFFSET);
167 		fdrop(fp, td);
168 	}
169 	mtx_unlock(&Giant);
170 	return(error);
171 }
172 
173 /*
174  * Code common for read and pread
175  */
176 int
177 dofileread(td, fp, fd, buf, nbyte, offset, flags)
178 	struct thread *td;
179 	struct file *fp;
180 	int fd, flags;
181 	void *buf;
182 	size_t nbyte;
183 	off_t offset;
184 {
185 	struct uio auio;
186 	struct iovec aiov;
187 	long cnt, error = 0;
188 #ifdef KTRACE
189 	struct iovec ktriov;
190 	struct uio ktruio;
191 	int didktr = 0;
192 #endif
193 
194 	aiov.iov_base = (caddr_t)buf;
195 	aiov.iov_len = nbyte;
196 	auio.uio_iov = &aiov;
197 	auio.uio_iovcnt = 1;
198 	auio.uio_offset = offset;
199 	if (nbyte > INT_MAX)
200 		return (EINVAL);
201 	auio.uio_resid = nbyte;
202 	auio.uio_rw = UIO_READ;
203 	auio.uio_segflg = UIO_USERSPACE;
204 	auio.uio_td = td;
205 #ifdef KTRACE
206 	/*
207 	 * if tracing, save a copy of iovec
208 	 */
209 	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
210 		ktriov = aiov;
211 		ktruio = auio;
212 		didktr = 1;
213 	}
214 #endif
215 	cnt = nbyte;
216 
217 	if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) {
218 		if (auio.uio_resid != cnt && (error == ERESTART ||
219 		    error == EINTR || error == EWOULDBLOCK))
220 			error = 0;
221 	}
222 	cnt -= auio.uio_resid;
223 #ifdef KTRACE
224 	if (didktr && error == 0) {
225 		ktruio.uio_iov = &ktriov;
226 		ktruio.uio_resid = cnt;
227 		ktrgenio(td->td_proc->p_tracep, fd, UIO_READ, &ktruio, error);
228 	}
229 #endif
230 	td->td_retval[0] = cnt;
231 	return (error);
232 }
233 
234 /*
235  * Scatter read system call.
236  */
237 #ifndef _SYS_SYSPROTO_H_
238 struct readv_args {
239 	int	fd;
240 	struct	iovec *iovp;
241 	u_int	iovcnt;
242 };
243 #endif
244 /*
245  * MPSAFE
246  */
247 int
248 readv(td, uap)
249 	struct thread *td;
250 	register struct readv_args *uap;
251 {
252 	register struct file *fp;
253 	register struct filedesc *fdp;
254 	struct uio auio;
255 	register struct iovec *iov;
256 	struct iovec *needfree;
257 	struct iovec aiov[UIO_SMALLIOV];
258 	long i, cnt, error = 0;
259 	u_int iovlen;
260 #ifdef KTRACE
261 	struct iovec *ktriov = NULL;
262 	struct uio ktruio;
263 #endif
264 	mtx_lock(&Giant);
265 	fdp = td->td_proc->p_fd;
266 
267 	if ((fp = holdfp(fdp, uap->fd, FREAD)) == NULL) {
268 		error = EBADF;
269 		goto done2;
270 	}
271 	/* note: can't use iovlen until iovcnt is validated */
272 	iovlen = uap->iovcnt * sizeof (struct iovec);
273 	if (uap->iovcnt > UIO_SMALLIOV) {
274 		if (uap->iovcnt > UIO_MAXIOV) {
275 			error = EINVAL;
276 			goto done2;
277 		}
278 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
279 		needfree = iov;
280 	} else {
281 		iov = aiov;
282 		needfree = NULL;
283 	}
284 	auio.uio_iov = iov;
285 	auio.uio_iovcnt = uap->iovcnt;
286 	auio.uio_rw = UIO_READ;
287 	auio.uio_segflg = UIO_USERSPACE;
288 	auio.uio_td = td;
289 	auio.uio_offset = -1;
290 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
291 		goto done;
292 	auio.uio_resid = 0;
293 	for (i = 0; i < uap->iovcnt; i++) {
294 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
295 			error = EINVAL;
296 			goto done;
297 		}
298 		auio.uio_resid += iov->iov_len;
299 		iov++;
300 	}
301 #ifdef KTRACE
302 	/*
303 	 * if tracing, save a copy of iovec
304 	 */
305 	if (KTRPOINT(td->td_proc, KTR_GENIO))  {
306 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
307 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
308 		ktruio = auio;
309 	}
310 #endif
311 	cnt = auio.uio_resid;
312 	if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) {
313 		if (auio.uio_resid != cnt && (error == ERESTART ||
314 		    error == EINTR || error == EWOULDBLOCK))
315 			error = 0;
316 	}
317 	cnt -= auio.uio_resid;
318 #ifdef KTRACE
319 	if (ktriov != NULL) {
320 		if (error == 0) {
321 			ktruio.uio_iov = ktriov;
322 			ktruio.uio_resid = cnt;
323 			ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_READ, &ktruio,
324 			    error);
325 		}
326 		FREE(ktriov, M_TEMP);
327 	}
328 #endif
329 	td->td_retval[0] = cnt;
330 done:
331 	fdrop(fp, td);
332 	if (needfree)
333 		FREE(needfree, M_IOV);
334 done2:
335 	mtx_unlock(&Giant);
336 	return (error);
337 }
338 
339 /*
340  * Write system call
341  */
342 #ifndef _SYS_SYSPROTO_H_
343 struct write_args {
344 	int	fd;
345 	const void *buf;
346 	size_t	nbyte;
347 };
348 #endif
349 /*
350  * MPSAFE
351  */
352 int
353 write(td, uap)
354 	struct thread *td;
355 	register struct write_args *uap;
356 {
357 	register struct file *fp;
358 	int error;
359 
360 	mtx_lock(&Giant);
361 	if ((fp = holdfp(td->td_proc->p_fd, uap->fd, FWRITE)) != NULL) {
362 		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
363 			    (off_t)-1, 0);
364 		fdrop(fp, td);
365 	} else {
366 		error = EBADF;
367 	}
368 	mtx_unlock(&Giant);
369 	return(error);
370 }
371 
372 /*
373  * Pwrite system call
374  */
375 #ifndef _SYS_SYSPROTO_H_
376 struct pwrite_args {
377 	int	fd;
378 	const void *buf;
379 	size_t	nbyte;
380 	int	pad;
381 	off_t	offset;
382 };
383 #endif
384 /*
385  * MPSAFE
386  */
387 int
388 pwrite(td, uap)
389 	struct thread *td;
390 	register struct pwrite_args *uap;
391 {
392 	register struct file *fp;
393 	int error;
394 
395 	mtx_lock(&Giant);
396 	if ((fp = holdfp(td->td_proc->p_fd, uap->fd, FWRITE)) == NULL) {
397 		error = EBADF;
398 	} else if (fp->f_type != DTYPE_VNODE) {
399 		error = ESPIPE;
400 		fdrop(fp, td);
401 	} else {
402 		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
403 			    uap->offset, FOF_OFFSET);
404 		fdrop(fp, td);
405 	}
406 	mtx_unlock(&Giant);
407 	return(error);
408 }
409 
/*
 * Code common for write and pwrite: build a single-segment uio for the
 * user buffer and hand it to the file's fo_write method.
 *
 * td     - calling thread
 * fp     - held file pointer; the caller drops the reference
 * fd     - descriptor number, used only for ktrace logging
 * buf    - user-space source buffer
 * nbyte  - requested byte count; must fit in uio_resid (<= INT_MAX)
 * offset - explicit position, honored only when flags has FOF_OFFSET
 * flags  - FOF_OFFSET for pwrite-style positioned writes, 0 otherwise
 *
 * On return td->td_retval[0] holds the number of bytes transferred.
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Cast through uintptr_t to shed the const qualifier for iov_base. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* uio_resid is int-sized; refuse requests that would overflow it. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	/* Give the buffer cache a chance to flush before a vnode write. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) {
		/*
		 * An interrupted transfer that already moved some data
		 * reports the partial count instead of the error.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* A write to a broken pipe also raises SIGPIPE. */
		if (error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(td->td_proc->p_tracep, fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
473 
474 /*
475  * Gather write system call
476  */
477 #ifndef _SYS_SYSPROTO_H_
478 struct writev_args {
479 	int	fd;
480 	struct	iovec *iovp;
481 	u_int	iovcnt;
482 };
483 #endif
484 /*
485  * MPSAFE
486  */
/*
 * Gather write system call: writev(2).
 *
 * Copies the user's iovec array into the kernel (a stack array for up
 * to UIO_SMALLIOV entries, otherwise a temporary M_IOV allocation),
 * validates that the total transfer fits in uio_resid, and issues a
 * single fo_write at the file's current offset.
 *
 * MPSAFE: acquires Giant internally.
 */
int
writev(td, uap)
	struct thread *td;
	register struct writev_args *uap;
{
	register struct file *fp;
	register struct filedesc *fdp;
	struct uio auio;
	register struct iovec *iov;
	struct iovec *needfree;		/* heap copy to free, NULL if on stack */
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	mtx_lock(&Giant);
	fdp = td->td_proc->p_fd;
	if ((fp = holdfp(fdp, uap->fd, FWRITE)) == NULL) {
		error = EBADF;
		goto done2;
	}
	/* note: can't use iovlen until iovcnt is validated */
	iovlen = uap->iovcnt * sizeof (struct iovec);
	if (uap->iovcnt > UIO_SMALLIOV) {
		if (uap->iovcnt > UIO_MAXIOV) {
			/* Exit via `done' so the held file is dropped. */
			needfree = NULL;
			error = EINVAL;
			goto done;
		}
		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
		needfree = iov;
	} else {
		iov = aiov;
		needfree = NULL;
	}
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
	auio.uio_offset = -1;	/* write at the file's current position */
	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
		goto done;
	auio.uio_resid = 0;
	for (i = 0; i < uap->iovcnt; i++) {
		/* Reject totals that would overflow uio_resid's int range. */
		if (iov->iov_len > INT_MAX - auio.uio_resid) {
			error = EINVAL;
			goto done;
		}
		auio.uio_resid += iov->iov_len;
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td->td_proc, KTR_GENIO))  {
		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = auio;
	}
#endif
	cnt = auio.uio_resid;
	/* Give the buffer cache a chance to flush before a vnode write. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) {
		/* Report a partial transfer rather than an interruption. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* A write to a broken pipe also raises SIGPIPE. */
		if (error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = cnt;
			ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_WRITE, &ktruio,
			    error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	td->td_retval[0] = cnt;
done:
	fdrop(fp, td);
	if (needfree)
		FREE(needfree, M_IOV);
done2:
	mtx_unlock(&Giant);
	return (error);
}
586 
587 /*
588  * Ioctl system call
589  */
590 #ifndef _SYS_SYSPROTO_H_
591 struct ioctl_args {
592 	int	fd;
593 	u_long	com;
594 	caddr_t	data;
595 };
596 #endif
597 /*
598  * MPSAFE
599  */
600 /* ARGSUSED */
/*
 * Ioctl system call: decode the command word, shuttle the argument
 * between user and kernel space as the IOC_IN/IOC_OUT/IOC_VOID bits
 * dictate, and dispatch to the file's fo_ioctl method.
 *
 * FIONCLEX/FIOCLEX are handled entirely here (they only touch the
 * descriptor table's close-on-exec flag); FIONBIO/FIOASYNC additionally
 * maintain the corresponding f_flag bits before being passed down.
 *
 * MPSAFE: acquires Giant internally.
 */
int
ioctl(td, uap)
	struct thread *td;
	register struct ioctl_args *uap;
{
	register struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	int error = 0;
	register u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	union {
	    char stkbuf[STK_PARAMS];	/* small arguments stay on the stack */
	    long align;			/* forces worst-case alignment */
	} ubuf;

	mtx_lock(&Giant);
	fdp = td->td_proc->p_fd;
	if ((u_int)uap->fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[uap->fd]) == NULL) {
		error = EBADF;
		goto done2;
	}

	/* The descriptor must be open for reading or writing. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		goto done2;
	}

	/* Close-on-exec updates never reach the file itself. */
	switch (com = uap->com) {
	case FIONCLEX:
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		goto done2;
	case FIOCLEX:
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		goto done2;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto done2;
	}

	/* From here on every exit path must fdrop() this reference. */
	fhold(fp);

	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com&IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				goto done2;
			}
		} else {
			/* Zero-size IOC_IN: the "argument" is the pointer. */
			*(caddr_t *)data = uap->data;
		}
	} else if ((com&IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com&IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		/* Keep f_flag's non-blocking bit in sync with the request. */
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
		break;

	case FIOASYNC:
		/* Likewise for the async-I/O bit. */
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
done2:
	mtx_unlock(&Giant);
	return (error);
}
717 
718 static int	nselcoll;	/* Select collisions since boot */
719 struct cv	selwait;
720 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
721 
722 /*
723  * Select system call.
724  */
725 #ifndef _SYS_SYSPROTO_H_
726 struct select_args {
727 	int	nd;
728 	fd_set	*in, *ou, *ex;
729 	struct	timeval *tv;
730 };
731 #endif
732 /*
733  * MPSAFE
734  */
/*
 * Select system call: wait until a descriptor in one of the user's
 * in/ou/ex fd_sets is ready, or the optional timeout expires.
 *
 * Buffer layout: ibits[3] point at kernel copies of the input sets,
 * obits[3] at the (zeroed) result sets; heldbits holds two masks --
 * hibits, the union of all three input sets, and hobits, the set of
 * descriptors actually referenced by selholddrop() so they can be
 * released after the sleep.
 *
 * Sleep protocol: TDF_SELECT is set before each scan and selwakeup()
 * clears it (or wakes us off selwait).  If the flag is gone, or the
 * global collision counter moved, an event may have fired between our
 * scan and the sleep, so we rescan instead of sleeping.
 *
 * MPSAFE: acquires Giant internally.
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask s_heldbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp, *heldbits, *hibits, *hobits;
	struct timeval atv, rtv, ttv;
	int ncoll, error, timo, i;
	u_int nbufbytes, ncpbytes, nfdbits;

	if (uap->nd < 0)
		return (EINVAL);

	mtx_lock(&Giant);

	if (uap->nd > td->td_proc->p_fd->fd_nfiles)
		uap->nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(uap->nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (uap->in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
	if (2 * ncpbytes <= sizeof s_heldbits) {
		bzero(s_heldbits, sizeof(s_heldbits));
		heldbits = &s_heldbits[0];
	} else
		heldbits = malloc(2 * ncpbytes, M_SELECT, M_WAITOK | M_ZERO);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
	hibits = heldbits + ncpbytes / sizeof *heldbits;
	hobits = heldbits;
	/*
	 * getbits: copy in fd_set `name' as ibits[x], carve out its
	 * output buffer as obits[x], and OR it into the hibits union.
	 */
#define	getbits(name, x) \
	do {								\
		if (uap->name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(uap->name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_noproclock;			\
			for (i = 0;					\
			     i < ncpbytes / sizeof ibits[i][0];		\
			     i++)					\
				hibits[i] |= ibits[x][i];		\
		}							\
	} while (0)
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
	/* Clear the contiguous output half of the buffer. */
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (uap->tv) {
		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
			sizeof (atv));
		if (error)
			goto done_noproclock;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_noproclock;
		}
		/* Convert the relative timeout into an absolute deadline. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	/* Pin the files in hibits; hobits records what was actually held. */
	selholddrop(td, hibits, hobits, uap->nd, 1);
	timo = 0;
	PROC_LOCK(td->td_proc);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
	error = selscan(td, ibits, obits, uap->nd);
	PROC_LOCK(td->td_proc);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=)) {
			/*
			 * An event of our interest may occur during locking a process.
			 * In order to avoid missing the event that occured during locking
			 * the process, test TDF_SELECT and rescan file descriptors if
			 * necessary.
			 */
			mtx_lock_spin(&sched_lock);
			if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
				ncoll = nselcoll;
				td->td_flags |= TDF_SELECT;
				mtx_unlock_spin(&sched_lock);
				PROC_UNLOCK(td->td_proc);
				error = selscan(td, ibits, obits, uap->nd);
				PROC_LOCK(td->td_proc);
			} else
				mtx_unlock_spin(&sched_lock);
			goto done;
		}
		/* Clamp the remaining time to something tvtohz can take. */
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo);
	else
		error = cv_wait_sig(&selwait, &td->td_proc->p_mtx);

	if (error == 0)
		goto retry;

done:
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
	/* Release the references recorded in hobits by the hold pass. */
	selholddrop(td, hibits, hobits, uap->nd, 0);
done_noproclock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	/* putbits: copy result set x back to the user's fd_set `name'. */
#define	putbits(name, x) \
	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);
	if (heldbits != &s_heldbits[0])
		free(heldbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}
913 
/*
 * Hold (hold != 0) or drop (hold == 0) a file reference for every
 * descriptor named in a select bit mask, so the files survive the
 * select() sleep.
 *
 * Hold pass: for each bit set in ibits, fhold() the file and set the
 * corresponding bit in obits -- obits thus records exactly which
 * descriptors were successfully held.  Drop pass: fdrop() each file
 * whose bit is set in obits.
 *
 * Returns EBADF when a named slot has no open file.  NOTE(review): on
 * that early return, references taken so far remain recorded in obits,
 * so a subsequent drop pass still balances them; the caller (select)
 * ignores the return value -- confirm that is intentional.
 */
static int
selholddrop(td, ibits, obits, nfd, hold)
	struct thread *td;
	fd_mask *ibits, *obits;
	int nfd, hold;
{
	struct filedesc *fdp = td->td_proc->p_fd;
	int i, fd;
	fd_mask bits;
	struct file *fp;

	for (i = 0; i < nfd; i += NFDBITS) {
		if (hold)
			bits = ibits[i/NFDBITS];
		else
			bits = obits[i/NFDBITS];
		/* ffs(int mask) not portable, fd_mask is long */
		for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
			if (!(bits & 1))
				continue;
			fp = fdp->fd_ofiles[fd];
			if (fp == NULL)
				return (EBADF);
			if (hold) {
				fhold(fp);
				obits[(fd)/NFDBITS] |=
				    ((fd_mask)1 << ((fd) % NFDBITS));
			} else
				fdrop(fp, td);
		}
	}
	return (0);
}
947 
/*
 * Scan the three select bit masks: poll every descriptor named in
 * ibits[msk] for the event class belonging to that set (read, write,
 * exceptional) and set the matching bit in obits[msk] for each file
 * that reports ready.  The ready count is returned via
 * td->td_retval[0]; EBADF if a named slot has no open file.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	struct filedesc *fdp = td->td_proc->p_fd;
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };

	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				fp = fdp->fd_ofiles[fd];
				if (fp == NULL)
					return (EBADF);
				if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	td->td_retval[0] = n;
	return (0);
}
985 
986 /*
987  * Poll system call.
988  */
989 #ifndef _SYS_SYSPROTO_H_
990 struct poll_args {
991 	struct pollfd *fds;
992 	u_int	nfds;
993 	int	timeout;
994 };
995 #endif
996 /*
997  * MPSAFE
998  */
/*
 * Poll system call: wait until one of the descriptors in the user's
 * pollfd array reports a requested event, or the timeout expires.
 *
 * Two kernel copies of the array are kept: `bits' is scanned and
 * eventually copied back to the user with the revents results, while
 * `heldbits' is handed to pollholddrop() to pin the files across the
 * sleep (its revents fields are reused as hold markers and never reach
 * user space).
 *
 * The sleep protocol (TDF_SELECT / nselcoll / selwait) is identical to
 * select()'s; see the comments there.
 *
 * MPSAFE: acquires Giant internally.
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int ncoll, error = 0, timo;
	u_int nfds;
	size_t ni;
	struct pollfd p_heldbits[32];
	struct pollfd *heldbits;

	nfds = SCARG(uap, nfds);

	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
	    (nfds > FD_SETSIZE)) {
		error = EINVAL;
		goto done2;
	}
	/* Use the stack buffers when the array is small enough. */
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	if (ni > sizeof(p_heldbits))
		heldbits = malloc(ni, M_TEMP, M_WAITOK);
	else {
		bzero(p_heldbits, sizeof(p_heldbits));
		heldbits = p_heldbits;
	}
	error = copyin(SCARG(uap, fds), bits, ni);
	if (error)
		goto done_noproclock;
	/* Duplicate the array for the hold/drop bookkeeping. */
	bcopy(bits, heldbits, ni);
	if (SCARG(uap, timeout) != INFTIM) {
		/* Convert the millisecond timeout to an absolute deadline. */
		atv.tv_sec = SCARG(uap, timeout) / 1000;
		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_noproclock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	/* Pin the referenced files for the duration of the wait. */
	pollholddrop(td, heldbits, nfds, 1);
	timo = 0;
	PROC_LOCK(td->td_proc);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
	error = pollscan(td, (struct pollfd *)bits, nfds);
	PROC_LOCK(td->td_proc);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=)) {
			/*
			 * An event of our interest may occur during locking a process.
			 * In order to avoid missing the event that occured during locking
			 * the process, test TDF_SELECT and rescan file descriptors if
			 * necessary.
			 */
			mtx_lock_spin(&sched_lock);
			if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
				ncoll = nselcoll;
				td->td_flags |= TDF_SELECT;
				mtx_unlock_spin(&sched_lock);
				PROC_UNLOCK(td->td_proc);
				error = pollscan(td, (struct pollfd *)bits, nfds);
				PROC_LOCK(td->td_proc);
			} else
				mtx_unlock_spin(&sched_lock);
			goto done;
		}
		/* Clamp the remaining time to something tvtohz can take. */
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo);
	else
		error = cv_wait_sig(&selwait, &td->td_proc->p_mtx);
	if (error == 0)
		goto retry;

done:
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
	/* Release the references taken by the hold pass above. */
	pollholddrop(td, heldbits, nfds, 0);
done_noproclock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, SCARG(uap, fds), ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
	if (ni > sizeof(p_heldbits))
		free(heldbits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}
1131 
/*
 * Acquire (hold != 0) or release (hold == 0) a reference on every open
 * file named in the pollfd array so the files cannot go away while
 * poll() sleeps.
 *
 * During the hold pass, revents is borrowed as a marker: 1 when the
 * file was held, 0 when the slot had no open file.  The drop pass only
 * fdrop()s entries whose marker is non-zero.  poll() only ever passes
 * its private `heldbits' copy here, so the markers never reach user
 * space.
 *
 * NOTE(review): the drop pass looks the descriptor up again; if the
 * table entry changed while the thread slept, the reference accounting
 * could be applied to a different file -- verify callers keep the
 * descriptor table stable (Giant) across the sleep.
 */
static int
pollholddrop(td, fds, nfd, hold)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
	int hold;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;

	for (i = 0; i < nfd; i++, fds++) {
		/* Negative and out-of-range descriptors are skipped. */
		if (0 <= fds->fd && fds->fd < fdp->fd_nfiles) {
			fp = fdp->fd_ofiles[fds->fd];
			if (hold) {
				if (fp != NULL) {
					fhold(fp);
					fds->revents = 1;
				} else
					fds->revents = 0;
			} else if(fp != NULL && fds->revents)
				fdrop(fp, td);
		}
	}
	return (0);
}
1158 
1159 static int
1160 pollscan(td, fds, nfd)
1161 	struct thread *td;
1162 	struct pollfd *fds;
1163 	u_int nfd;
1164 {
1165 	register struct filedesc *fdp = td->td_proc->p_fd;
1166 	int i;
1167 	struct file *fp;
1168 	int n = 0;
1169 
1170 	for (i = 0; i < nfd; i++, fds++) {
1171 		if (fds->fd >= fdp->fd_nfiles) {
1172 			fds->revents = POLLNVAL;
1173 			n++;
1174 		} else if (fds->fd < 0) {
1175 			fds->revents = 0;
1176 		} else {
1177 			fp = fdp->fd_ofiles[fds->fd];
1178 			if (fp == NULL) {
1179 				fds->revents = POLLNVAL;
1180 				n++;
1181 			} else {
1182 				/*
1183 				 * Note: backend also returns POLLHUP and
1184 				 * POLLERR if appropriate.
1185 				 */
1186 				fds->revents = fo_poll(fp, fds->events,
1187 				    fp->f_cred, td);
1188 				if (fds->revents != 0)
1189 					n++;
1190 			}
1191 		}
1192 	}
1193 	td->td_retval[0] = n;
1194 	return (0);
1195 }
1196 
1197 /*
1198  * OpenBSD poll system call.
1199  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1200  */
1201 #ifndef _SYS_SYSPROTO_H_
1202 struct openbsd_poll_args {
1203 	struct pollfd *fds;
1204 	u_int	nfds;
1205 	int	timeout;
1206 };
1207 #endif
1208 /*
1209  * MPSAFE
1210  */
/*
 * OpenBSD-compatible poll entry point.  The argument structures are
 * layout-compatible, so this simply forwards to the native poll().
 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	return (poll(td, (struct poll_args *)uap));
}
1218 
1219 /*ARGSUSED*/
1220 int
1221 seltrue(dev, events, td)
1222 	dev_t dev;
1223 	int events;
1224 	struct thread *td;
1225 {
1226 
1227 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1228 }
1229 
1230 static int
1231 find_thread_in_proc(struct proc *p, struct thread *td)
1232 {
1233 	struct thread *td2;
1234 	FOREACH_THREAD_IN_PROC(p, td2) {
1235 		if (td2 == td) {
1236 			return (1);
1237 		}
1238 	}
1239 	return (0);
1240 }
1241 
1242 /*
1243  * Record a select request.
1244  */
1245 void
1246 selrecord(selector, sip)
1247 	struct thread *selector;
1248 	struct selinfo *sip;
1249 {
1250 	struct proc *p;
1251 	pid_t mypid;
1252 
1253 	mypid = selector->td_proc->p_pid;
1254 	if ((sip->si_pid == mypid) &&
1255 	    (sip->si_thread == selector)) { /* XXXKSE should be an ID? */
1256 		return;
1257 	}
1258 	if (sip->si_pid &&
1259 	    (p = pfind(sip->si_pid)) &&
1260 	    (find_thread_in_proc(p, sip->si_thread))) {
1261 		mtx_lock_spin(&sched_lock);
1262 	    	if (sip->si_thread->td_wchan == (caddr_t)&selwait) {
1263 			mtx_unlock_spin(&sched_lock);
1264 			PROC_UNLOCK(p);
1265 			sip->si_flags |= SI_COLL;
1266 			return;
1267 		}
1268 		mtx_unlock_spin(&sched_lock);
1269 		PROC_UNLOCK(p);
1270 	}
1271 	sip->si_pid = mypid;
1272 	sip->si_thread = selector;
1273 }
1274 
1275 /*
1276  * Do a wakeup when a selectable event occurs.
1277  */
/*
 * Do a wakeup when a selectable event occurs: wake the thread recorded
 * in the selinfo (broadcasting on selwait first if a collision was
 * noted) and clear the registration so the next selrecord() starts
 * fresh.
 */
void
selwakeup(sip)
	register struct selinfo *sip;
{
	struct thread *td;
	register struct proc *p;

	/* No selector recorded: nothing to wake. */
	if (sip->si_pid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		/* Multiple selectors collided; wake them all. */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcast(&selwait);
	}
	p = pfind(sip->si_pid);
	sip->si_pid = 0;
	td = sip->si_thread;
	if (p != NULL) {
		/* The recorded thread may have exited the process. */
		if (!find_thread_in_proc(p, td)) {
			PROC_UNLOCK(p); /* lock is in pfind() */;
			return;
		}
		mtx_lock_spin(&sched_lock);
		if (td->td_wchan == (caddr_t)&selwait) {
			/* Asleep on selwait: make it runnable again. */
			if (td->td_proc->p_stat == SSLEEP)
				setrunnable(td);
			else
				cv_waitq_remove(td);
		} else
			/* Not asleep yet: force a rescan (see select()). */
			td->td_flags &= ~TDF_SELECT;
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(p); /* Lock is in pfind() */
	}
}
1312 
1313 static void selectinit __P((void *));
1314 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1315 
1316 /* ARGSUSED*/
/*
 * SYSINIT hook: initialize the global select/poll condition variable
 * (selwait) before anything can sleep on it.  The dummy argument is
 * unused.
 */
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
}
1323