xref: /freebsd/sys/kern/sys_generic.c (revision eacee0ff7ec955b32e09515246bd97b6edcd2b0f)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $FreeBSD$
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/signalvar.h>
53 #include <sys/socketvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/resourcevar.h>
59 #include <sys/selinfo.h>
60 #include <sys/sysctl.h>
61 #include <sys/sysent.h>
62 #include <sys/bio.h>
63 #include <sys/buf.h>
64 #include <sys/condvar.h>
65 #ifdef KTRACE
66 #include <sys/ktrace.h>
67 #endif
68 #include <vm/vm.h>
69 #include <vm/vm_page.h>
70 
71 #include <machine/limits.h>
72 
73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
75 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
76 
77 static int	pollscan __P((struct thread *, struct pollfd *, u_int));
78 static int	selscan __P((struct thread *, fd_mask **, fd_mask **, int));
79 static int	dofileread __P((struct thread *, struct file *, int, void *,
80 		    size_t, off_t, int));
81 static int	dofilewrite __P((struct thread *, struct file *, int,
82 		    const void *, size_t, off_t, int));
83 
84 /*
85  * Read system call.
86  */
87 #ifndef _SYS_SYSPROTO_H_
88 struct read_args {
89 	int	fd;
90 	void	*buf;
91 	size_t	nbyte;
92 };
93 #endif
94 /*
95  * MPSAFE
96  */
97 int
98 read(td, uap)
99 	struct thread *td;
100 	struct read_args *uap;
101 {
102 	struct file *fp;
103 	int error;
104 
105 	mtx_lock(&Giant);
106 	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
107 		error = dofileread(td, fp, uap->fd, uap->buf,
108 			    uap->nbyte, (off_t)-1, 0);
109 		fdrop(fp, td);
110 	}
111 	mtx_unlock(&Giant);
112 	return(error);
113 }
114 
115 /*
116  * Pread system call
117  */
118 #ifndef _SYS_SYSPROTO_H_
119 struct pread_args {
120 	int	fd;
121 	void	*buf;
122 	size_t	nbyte;
123 	int	pad;
124 	off_t	offset;
125 };
126 #endif
127 /*
128  * MPSAFE
129  */
130 int
131 pread(td, uap)
132 	struct thread *td;
133 	struct pread_args *uap;
134 {
135 	struct file *fp;
136 	int error;
137 
138 	if ((error = fget_read(td, uap->fd, &fp)) != 0)
139 		return (error);
140 	mtx_lock(&Giant);
141 	if (fp->f_type != DTYPE_VNODE) {
142 		error = ESPIPE;
143 	} else {
144 		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
145 			    uap->offset, FOF_OFFSET);
146 	}
147 	fdrop(fp, td);
148 	mtx_unlock(&Giant);
149 	return(error);
150 }
151 
152 /*
153  * Code common for read and pread
154  */
155 int
156 dofileread(td, fp, fd, buf, nbyte, offset, flags)
157 	struct thread *td;
158 	struct file *fp;
159 	int fd, flags;
160 	void *buf;
161 	size_t nbyte;
162 	off_t offset;
163 {
164 	struct uio auio;
165 	struct iovec aiov;
166 	long cnt, error = 0;
167 #ifdef KTRACE
168 	struct iovec ktriov;
169 	struct uio ktruio;
170 	int didktr = 0;
171 #endif
172 
173 	aiov.iov_base = (caddr_t)buf;
174 	aiov.iov_len = nbyte;
175 	auio.uio_iov = &aiov;
176 	auio.uio_iovcnt = 1;
177 	auio.uio_offset = offset;
178 	if (nbyte > INT_MAX)
179 		return (EINVAL);
180 	auio.uio_resid = nbyte;
181 	auio.uio_rw = UIO_READ;
182 	auio.uio_segflg = UIO_USERSPACE;
183 	auio.uio_td = td;
184 #ifdef KTRACE
185 	/*
186 	 * if tracing, save a copy of iovec
187 	 */
188 	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
189 		ktriov = aiov;
190 		ktruio = auio;
191 		didktr = 1;
192 	}
193 #endif
194 	cnt = nbyte;
195 
196 	if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) {
197 		if (auio.uio_resid != cnt && (error == ERESTART ||
198 		    error == EINTR || error == EWOULDBLOCK))
199 			error = 0;
200 	}
201 	cnt -= auio.uio_resid;
202 #ifdef KTRACE
203 	if (didktr && error == 0) {
204 		ktruio.uio_iov = &ktriov;
205 		ktruio.uio_resid = cnt;
206 		ktrgenio(td->td_proc->p_tracep, fd, UIO_READ, &ktruio, error);
207 	}
208 #endif
209 	td->td_retval[0] = cnt;
210 	return (error);
211 }
212 
213 /*
214  * Scatter read system call.
215  */
216 #ifndef _SYS_SYSPROTO_H_
217 struct readv_args {
218 	int	fd;
219 	struct	iovec *iovp;
220 	u_int	iovcnt;
221 };
222 #endif
223 /*
224  * MPSAFE
225  */
226 int
227 readv(td, uap)
228 	struct thread *td;
229 	struct readv_args *uap;
230 {
231 	struct file *fp;
232 	struct uio auio;
233 	struct iovec *iov;
234 	struct iovec *needfree;
235 	struct iovec aiov[UIO_SMALLIOV];
236 	long i, cnt, error = 0;
237 	u_int iovlen;
238 #ifdef KTRACE
239 	struct iovec *ktriov = NULL;
240 	struct uio ktruio;
241 #endif
242 	mtx_lock(&Giant);
243 
244 	if ((error = fget_read(td, uap->fd, &fp)) != 0)
245 		goto done2;
246 	/* note: can't use iovlen until iovcnt is validated */
247 	iovlen = uap->iovcnt * sizeof (struct iovec);
248 	if (uap->iovcnt > UIO_SMALLIOV) {
249 		if (uap->iovcnt > UIO_MAXIOV) {
250 			error = EINVAL;
251 			goto done2;
252 		}
253 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
254 		needfree = iov;
255 	} else {
256 		iov = aiov;
257 		needfree = NULL;
258 	}
259 	auio.uio_iov = iov;
260 	auio.uio_iovcnt = uap->iovcnt;
261 	auio.uio_rw = UIO_READ;
262 	auio.uio_segflg = UIO_USERSPACE;
263 	auio.uio_td = td;
264 	auio.uio_offset = -1;
265 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
266 		goto done;
267 	auio.uio_resid = 0;
268 	for (i = 0; i < uap->iovcnt; i++) {
269 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
270 			error = EINVAL;
271 			goto done;
272 		}
273 		auio.uio_resid += iov->iov_len;
274 		iov++;
275 	}
276 #ifdef KTRACE
277 	/*
278 	 * if tracing, save a copy of iovec
279 	 */
280 	if (KTRPOINT(td->td_proc, KTR_GENIO))  {
281 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
282 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
283 		ktruio = auio;
284 	}
285 #endif
286 	cnt = auio.uio_resid;
287 	if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) {
288 		if (auio.uio_resid != cnt && (error == ERESTART ||
289 		    error == EINTR || error == EWOULDBLOCK))
290 			error = 0;
291 	}
292 	cnt -= auio.uio_resid;
293 #ifdef KTRACE
294 	if (ktriov != NULL) {
295 		if (error == 0) {
296 			ktruio.uio_iov = ktriov;
297 			ktruio.uio_resid = cnt;
298 			ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_READ, &ktruio,
299 			    error);
300 		}
301 		FREE(ktriov, M_TEMP);
302 	}
303 #endif
304 	td->td_retval[0] = cnt;
305 done:
306 	fdrop(fp, td);
307 	if (needfree)
308 		FREE(needfree, M_IOV);
309 done2:
310 	mtx_unlock(&Giant);
311 	return (error);
312 }
313 
314 /*
315  * Write system call
316  */
317 #ifndef _SYS_SYSPROTO_H_
318 struct write_args {
319 	int	fd;
320 	const void *buf;
321 	size_t	nbyte;
322 };
323 #endif
324 /*
325  * MPSAFE
326  */
327 int
328 write(td, uap)
329 	struct thread *td;
330 	struct write_args *uap;
331 {
332 	struct file *fp;
333 	int error;
334 
335 	mtx_lock(&Giant);
336 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
337 		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
338 			    (off_t)-1, 0);
339 		fdrop(fp, td);
340 	} else {
341 		error = EBADF;	/* XXX this can't be right */
342 	}
343 	mtx_unlock(&Giant);
344 	return(error);
345 }
346 
347 /*
348  * Pwrite system call
349  */
350 #ifndef _SYS_SYSPROTO_H_
351 struct pwrite_args {
352 	int	fd;
353 	const void *buf;
354 	size_t	nbyte;
355 	int	pad;
356 	off_t	offset;
357 };
358 #endif
359 /*
360  * MPSAFE
361  */
362 int
363 pwrite(td, uap)
364 	struct thread *td;
365 	struct pwrite_args *uap;
366 {
367 	struct file *fp;
368 	int error;
369 
370 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
371 		mtx_lock(&Giant);
372 		if (fp->f_type == DTYPE_VNODE) {
373 			error = dofilewrite(td, fp, uap->fd, uap->buf,
374 				    uap->nbyte, uap->offset, FOF_OFFSET);
375 		} else {
376 			error = ESPIPE;
377 		}
378 		fdrop(fp, td);
379 		mtx_unlock(&Giant);
380 	} else {
381 		error = EBADF;	/* this can't be right */
382 	}
383 	return(error);
384 }
385 
/*
 * Code common for write and pwrite: build a single-segment uio over
 * the user buffer and push it through the file's fo_write method.
 * With FOF_OFFSET in `flags' the supplied `offset' is used instead of
 * the file's seek position.  Bytes written are returned in
 * td->td_retval[0].
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Cast away const: the uio machinery takes a non-const base. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* uio_resid holds an int-sized count; refuse larger requests. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	/* Throttle writers when the buffer cache is under pressure. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) {
		/*
		 * If some data moved before the signal/would-block,
		 * report the short transfer as success.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* POSIX: a write to a broken pipe also raises SIGPIPE. */
		if (error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(td->td_proc->p_tracep, fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
449 
450 /*
451  * Gather write system call
452  */
453 #ifndef _SYS_SYSPROTO_H_
454 struct writev_args {
455 	int	fd;
456 	struct	iovec *iovp;
457 	u_int	iovcnt;
458 };
459 #endif
460 /*
461  * MPSAFE
462  */
463 int
464 writev(td, uap)
465 	struct thread *td;
466 	register struct writev_args *uap;
467 {
468 	struct file *fp;
469 	struct uio auio;
470 	register struct iovec *iov;
471 	struct iovec *needfree;
472 	struct iovec aiov[UIO_SMALLIOV];
473 	long i, cnt, error = 0;
474 	u_int iovlen;
475 #ifdef KTRACE
476 	struct iovec *ktriov = NULL;
477 	struct uio ktruio;
478 #endif
479 
480 	mtx_lock(&Giant);
481 	if ((error = fget_write(td, uap->fd, &fp)) != 0) {
482 		error = EBADF;
483 		goto done2;
484 	}
485 	/* note: can't use iovlen until iovcnt is validated */
486 	iovlen = uap->iovcnt * sizeof (struct iovec);
487 	if (uap->iovcnt > UIO_SMALLIOV) {
488 		if (uap->iovcnt > UIO_MAXIOV) {
489 			needfree = NULL;
490 			error = EINVAL;
491 			goto done;
492 		}
493 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
494 		needfree = iov;
495 	} else {
496 		iov = aiov;
497 		needfree = NULL;
498 	}
499 	auio.uio_iov = iov;
500 	auio.uio_iovcnt = uap->iovcnt;
501 	auio.uio_rw = UIO_WRITE;
502 	auio.uio_segflg = UIO_USERSPACE;
503 	auio.uio_td = td;
504 	auio.uio_offset = -1;
505 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
506 		goto done;
507 	auio.uio_resid = 0;
508 	for (i = 0; i < uap->iovcnt; i++) {
509 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
510 			error = EINVAL;
511 			goto done;
512 		}
513 		auio.uio_resid += iov->iov_len;
514 		iov++;
515 	}
516 #ifdef KTRACE
517 	/*
518 	 * if tracing, save a copy of iovec and uio
519 	 */
520 	if (KTRPOINT(td->td_proc, KTR_GENIO))  {
521 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
522 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
523 		ktruio = auio;
524 	}
525 #endif
526 	cnt = auio.uio_resid;
527 	if (fp->f_type == DTYPE_VNODE)
528 		bwillwrite();
529 	if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) {
530 		if (auio.uio_resid != cnt && (error == ERESTART ||
531 		    error == EINTR || error == EWOULDBLOCK))
532 			error = 0;
533 		if (error == EPIPE) {
534 			PROC_LOCK(td->td_proc);
535 			psignal(td->td_proc, SIGPIPE);
536 			PROC_UNLOCK(td->td_proc);
537 		}
538 	}
539 	cnt -= auio.uio_resid;
540 #ifdef KTRACE
541 	if (ktriov != NULL) {
542 		if (error == 0) {
543 			ktruio.uio_iov = ktriov;
544 			ktruio.uio_resid = cnt;
545 			ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_WRITE, &ktruio,
546 			    error);
547 		}
548 		FREE(ktriov, M_TEMP);
549 	}
550 #endif
551 	td->td_retval[0] = cnt;
552 done:
553 	fdrop(fp, td);
554 	if (needfree)
555 		FREE(needfree, M_IOV);
556 done2:
557 	mtx_unlock(&Giant);
558 	return (error);
559 }
560 
561 /*
562  * Ioctl system call
563  */
564 #ifndef _SYS_SYSPROTO_H_
565 struct ioctl_args {
566 	int	fd;
567 	u_long	com;
568 	caddr_t	data;
569 };
570 #endif
571 /*
572  * MPSAFE
573  */
574 /* ARGSUSED */
/*
 * ioctl() entry point.  Handles the close-on-exec flag requests and
 * the generic FIONBIO/FIOASYNC flag updates here; everything else is
 * dispatched to the file's fo_ioctl method.  Argument data is staged
 * in a kernel buffer (stack for small requests, malloc for large)
 * according to the IOC_IN/IOC_OUT/IOC_VOID direction bits encoded in
 * the command word.
 */
int
ioctl(td, uap)
	struct thread *td;
	register struct ioctl_args *uap;
{
	struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	int error = 0;
	register u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	union {
	    char stkbuf[STK_PARAMS];
	    long align;
	} ubuf;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	mtx_lock(&Giant);
	/* The descriptor must be open for reading or writing. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	/*
	 * Close-on-exec requests only touch per-descriptor flags;
	 * handle them without involving the file itself.
	 */
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (ENOTTY);
	}

	/* Stage the argument: stack buffer if it fits, malloc if not. */
	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com&IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				goto done;
			}
		} else {
			/* Zero-size IOC_IN: the argument IS the pointer. */
			*(caddr_t *)data = uap->data;
		}
	} else if ((com&IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com&IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		/* Keep f_flag in sync, then let the backend know too. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
		break;

	case FIOASYNC:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
done:
	mtx_unlock(&Giant);
	return (error);
}
698 
699 static int	nselcoll;	/* Select collisions since boot */
700 struct cv	selwait;
701 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
702 
703 /*
704  * Select system call.
705  */
706 #ifndef _SYS_SYSPROTO_H_
707 struct select_args {
708 	int	nd;
709 	fd_set	*in, *ou, *ex;
710 	struct	timeval *tv;
711 };
712 #endif
713 /*
714  * MPSAFE
715  */
/*
 * select() entry point.  Copies in up to three descriptor sets, scans
 * them via selscan(), and if nothing is ready sleeps on the global
 * `selwait' condition variable until a selwakeup(), timeout, or
 * signal.  The nselcoll/TDF_SELECT protocol detects events that race
 * with going to sleep and forces a rescan instead of a missed wakeup.
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int ncoll, error, timo;
	u_int nbufbytes, ncpbytes, nfdbits;

	if (uap->nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	if (uap->nd > td->td_proc->p_fd->fd_nfiles)
		uap->nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(uap->nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (uap->in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (uap->name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(uap->name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_noproclock;			\
		}							\
	} while (0)
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
	/* Output halves occupy the front; clear them in one go. */
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/* Convert a relative timeout into an absolute uptime deadline. */
	if (uap->tv) {
		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
			sizeof (atv));
		if (error)
			goto done_noproclock;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_noproclock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	PROC_LOCK(td->td_proc);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
	error = selscan(td, ibits, obits, uap->nd);
	PROC_LOCK(td->td_proc);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=)) {
			/*
			 * An event of our interest may occur during locking a process.
			 * In order to avoid missing the event that occured during locking
			 * the process, test TDF_SELECT and rescan file descriptors if
			 * necessary.
			 */
			mtx_lock_spin(&sched_lock);
			if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
				ncoll = nselcoll;
				td->td_flags |= TDF_SELECT;
				mtx_unlock_spin(&sched_lock);
				PROC_UNLOCK(td->td_proc);
				error = selscan(td, ibits, obits, uap->nd);
				PROC_LOCK(td->td_proc);
			} else
				mtx_unlock_spin(&sched_lock);
			goto done;
		}
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp sleeps to one day to keep tvtohz() in range. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo);
	else
		error = cv_wait_sig(&selwait, &td->td_proc->p_mtx);

	/* A plain wakeup (selwakeup) means: rescan the descriptors. */
	if (error == 0)
		goto retry;

done:
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
done_noproclock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}
881 
/*
 * Scan the three input descriptor sets and mark ready descriptors in
 * the corresponding output sets.  Returns EBADF if a set bit names a
 * closed descriptor; otherwise 0, with the ready count in
 * td->td_retval[0].  The filedesc lock is held across all fo_poll
 * calls.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
922 
923 /*
924  * Poll system call.
925  */
926 #ifndef _SYS_SYSPROTO_H_
927 struct poll_args {
928 	struct pollfd *fds;
929 	u_int	nfds;
930 	int	timeout;
931 };
932 #endif
933 /*
934  * MPSAFE
935  */
/*
 * poll() entry point.  Copies in the pollfd array, scans it via
 * pollscan(), and if nothing is ready sleeps on `selwait' using the
 * same nselcoll/TDF_SELECT race-avoidance protocol as select().
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int ncoll, error = 0, timo;
	u_int nfds;
	size_t ni;

	nfds = SCARG(uap, nfds);

	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
	    (nfds > FD_SETSIZE)) {
		error = EINVAL;
		goto done2;
	}
	/* Stage the pollfd array: stack buffer if it fits, else malloc. */
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(SCARG(uap, fds), bits, ni);
	if (error)
		goto done_noproclock;
	/* Convert a relative ms timeout into an absolute uptime deadline. */
	if (SCARG(uap, timeout) != INFTIM) {
		atv.tv_sec = SCARG(uap, timeout) / 1000;
		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_noproclock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	PROC_LOCK(td->td_proc);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
	error = pollscan(td, (struct pollfd *)bits, nfds);
	PROC_LOCK(td->td_proc);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=)) {
			/*
			 * An event of our interest may occur during locking a process.
			 * In order to avoid missing the event that occured during locking
			 * the process, test TDF_SELECT and rescan file descriptors if
			 * necessary.
			 */
			mtx_lock_spin(&sched_lock);
			if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
				ncoll = nselcoll;
				td->td_flags |= TDF_SELECT;
				mtx_unlock_spin(&sched_lock);
				PROC_UNLOCK(td->td_proc);
				error = pollscan(td, (struct pollfd *)bits, nfds);
				PROC_LOCK(td->td_proc);
			} else
				mtx_unlock_spin(&sched_lock);
			goto done;
		}
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp sleeps to one day to keep tvtohz() in range. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo);
	else
		error = cv_wait_sig(&selwait, &td->td_proc->p_mtx);
	/* A plain wakeup (selwakeup) means: rescan the descriptors. */
	if (error == 0)
		goto retry;

done:
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
done_noproclock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* Hand the updated revents back to the user. */
		error = copyout(bits, SCARG(uap, fds), ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}
1055 
1056 static int
1057 pollscan(td, fds, nfd)
1058 	struct thread *td;
1059 	struct pollfd *fds;
1060 	u_int nfd;
1061 {
1062 	register struct filedesc *fdp = td->td_proc->p_fd;
1063 	int i;
1064 	struct file *fp;
1065 	int n = 0;
1066 
1067 	FILEDESC_LOCK(fdp);
1068 	for (i = 0; i < nfd; i++, fds++) {
1069 		if (fds->fd >= fdp->fd_nfiles) {
1070 			fds->revents = POLLNVAL;
1071 			n++;
1072 		} else if (fds->fd < 0) {
1073 			fds->revents = 0;
1074 		} else {
1075 			fp = fdp->fd_ofiles[fds->fd];
1076 			if (fp == NULL) {
1077 				fds->revents = POLLNVAL;
1078 				n++;
1079 			} else {
1080 				/*
1081 				 * Note: backend also returns POLLHUP and
1082 				 * POLLERR if appropriate.
1083 				 */
1084 				fds->revents = fo_poll(fp, fds->events,
1085 				    fp->f_cred, td);
1086 				if (fds->revents != 0)
1087 					n++;
1088 			}
1089 		}
1090 	}
1091 	FILEDESC_UNLOCK(fdp);
1092 	td->td_retval[0] = n;
1093 	return (0);
1094 }
1095 
1096 /*
1097  * OpenBSD poll system call.
1098  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1099  */
1100 #ifndef _SYS_SYSPROTO_H_
1101 struct openbsd_poll_args {
1102 	struct pollfd *fds;
1103 	u_int	nfds;
1104 	int	timeout;
1105 };
1106 #endif
1107 /*
1108  * MPSAFE
1109  */
int
openbsd_poll(td, uap)
	struct thread *td;
	struct openbsd_poll_args *uap;
{
	struct poll_args *pargs;

	/* The argument layouts are identical; just forward the call. */
	pargs = (struct poll_args *)uap;
	return (poll(td, pargs));
}
1117 
1118 /*ARGSUSED*/
1119 int
1120 seltrue(dev, events, td)
1121 	dev_t dev;
1122 	int events;
1123 	struct thread *td;
1124 {
1125 
1126 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1127 }
1128 
1129 static int
1130 find_thread_in_proc(struct proc *p, struct thread *td)
1131 {
1132 	struct thread *td2;
1133 	FOREACH_THREAD_IN_PROC(p, td2) {
1134 		if (td2 == td) {
1135 			return (1);
1136 		}
1137 	}
1138 	return (0);
1139 }
1140 
1141 /*
1142  * Record a select request.
1143  */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{
	struct proc *p;
	pid_t mypid;

	/* Already recorded as the selector for this selinfo: nothing to do. */
	mypid = selector->td_proc->p_pid;
	if ((sip->si_pid == mypid) &&
	    (sip->si_thread == selector)) { /* XXXKSE should be an ID? */
		return;
	}
	/*
	 * Another live thread is already recorded here.  If it is still
	 * asleep on selwait, mark the collision so selwakeup() will
	 * broadcast; otherwise fall through and take over the slot.
	 * NOTE: pfind() returns with the proc locked; unlock on each exit.
	 */
	if (sip->si_pid &&
	    (p = pfind(sip->si_pid)) &&
	    (find_thread_in_proc(p, sip->si_thread))) {
		mtx_lock_spin(&sched_lock);
	    	if (sip->si_thread->td_wchan == (caddr_t)&selwait) {
			mtx_unlock_spin(&sched_lock);
			PROC_UNLOCK(p);
			sip->si_flags |= SI_COLL;
			return;
		}
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(p);
	}
	/* Record ourselves as the (single) selecting thread. */
	sip->si_pid = mypid;
	sip->si_thread = selector;
}
1173 
1174 /*
1175  * Do a wakeup when a selectable event occurs.
1176  */
void
selwakeup(sip)
	register struct selinfo *sip;
{
	struct thread *td;
	register struct proc *p;

	/* No thread recorded: nothing to wake. */
	if (sip->si_pid == 0)
		return;
	/*
	 * Multiple threads collided on this selinfo; wake them all and
	 * bump the collision counter so racing selects rescan.
	 */
	if (sip->si_flags & SI_COLL) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcast(&selwait);
	}
	p = pfind(sip->si_pid);
	sip->si_pid = 0;
	td = sip->si_thread;
	if (p != NULL) {
		/* The recorded thread may have exited; verify it. */
		if (!find_thread_in_proc(p, td)) {
			PROC_UNLOCK(p); /* lock is in pfind() */;
			return;
		}
		mtx_lock_spin(&sched_lock);
		if (td->td_wchan == (caddr_t)&selwait) {
			/* Asleep on selwait: put it back on the run path. */
			if (td->td_proc->p_stat == SSLEEP)
				setrunnable(td);
			else
				cv_waitq_remove(td);
		} else
			/* Not asleep yet: clear the flag to force a rescan. */
			td->td_flags &= ~TDF_SELECT;
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(p); /* Lock is in pfind() */
	}
}
1211 
1212 static void selectinit __P((void *));
1213 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1214 
1215 /* ARGSUSED*/
1216 static void
1217 selectinit(dummy)
1218 	void *dummy;
1219 {
1220 	cv_init(&selwait, "select");
1221 }
1222