xref: /freebsd/sys/kern/sys_generic.c (revision 8fa113e5fc65fe6abc757f0089f477a87ee4d185)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $FreeBSD$
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/signalvar.h>
53 #include <sys/socketvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/resourcevar.h>
59 #include <sys/selinfo.h>
60 #include <sys/sysctl.h>
61 #include <sys/sysent.h>
62 #include <sys/bio.h>
63 #include <sys/buf.h>
64 #include <sys/condvar.h>
65 #ifdef KTRACE
66 #include <sys/ktrace.h>
67 #endif
68 #include <vm/vm.h>
69 #include <vm/vm_page.h>
70 
71 #include <machine/limits.h>
72 
73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
75 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
76 
77 static int	pollscan __P((struct thread *, struct pollfd *, u_int));
78 static int	pollholddrop __P((struct thread *, struct pollfd *, u_int, int));
79 static int	selscan __P((struct thread *, fd_mask **, fd_mask **, int));
80 static int	selholddrop __P((struct thread *, fd_mask *, fd_mask *, int, int));
81 static int	dofileread __P((struct thread *, struct file *, int, void *,
82 		    size_t, off_t, int));
83 static int	dofilewrite __P((struct thread *, struct file *, int,
84 		    const void *, size_t, off_t, int));
85 
86 /*
87  * Read system call.
88  */
89 #ifndef _SYS_SYSPROTO_H_
90 struct read_args {
91 	int	fd;
92 	void	*buf;
93 	size_t	nbyte;
94 };
95 #endif
96 /*
97  * MPSAFE
98  */
99 int
100 read(td, uap)
101 	struct thread *td;
102 	struct read_args *uap;
103 {
104 	struct file *fp;
105 	int error;
106 
107 	mtx_lock(&Giant);
108 	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
109 		error = dofileread(td, fp, uap->fd, uap->buf,
110 			    uap->nbyte, (off_t)-1, 0);
111 		fdrop(fp, td);
112 	}
113 	mtx_unlock(&Giant);
114 	return(error);
115 }
116 
117 /*
118  * Pread system call
119  */
120 #ifndef _SYS_SYSPROTO_H_
121 struct pread_args {
122 	int	fd;
123 	void	*buf;
124 	size_t	nbyte;
125 	int	pad;
126 	off_t	offset;
127 };
128 #endif
129 /*
130  * MPSAFE
131  */
132 int
133 pread(td, uap)
134 	struct thread *td;
135 	struct pread_args *uap;
136 {
137 	struct file *fp;
138 	int error;
139 
140 	mtx_lock(&Giant);
141 	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
142 		if (fp->f_type == DTYPE_VNODE) {
143 			error = dofileread(td, fp, uap->fd, uap->buf,
144 				    uap->nbyte, uap->offset, FOF_OFFSET);
145 		} else {
146 			error = ESPIPE;
147 		}
148 		fdrop(fp, td);
149 	}
150 	mtx_unlock(&Giant);
151 	return(error);
152 }
153 
154 /*
155  * Code common for read and pread
156  */
157 int
158 dofileread(td, fp, fd, buf, nbyte, offset, flags)
159 	struct thread *td;
160 	struct file *fp;
161 	int fd, flags;
162 	void *buf;
163 	size_t nbyte;
164 	off_t offset;
165 {
166 	struct uio auio;
167 	struct iovec aiov;
168 	long cnt, error = 0;
169 #ifdef KTRACE
170 	struct iovec ktriov;
171 	struct uio ktruio;
172 	int didktr = 0;
173 #endif
174 
175 	aiov.iov_base = (caddr_t)buf;
176 	aiov.iov_len = nbyte;
177 	auio.uio_iov = &aiov;
178 	auio.uio_iovcnt = 1;
179 	auio.uio_offset = offset;
180 	if (nbyte > INT_MAX)
181 		return (EINVAL);
182 	auio.uio_resid = nbyte;
183 	auio.uio_rw = UIO_READ;
184 	auio.uio_segflg = UIO_USERSPACE;
185 	auio.uio_td = td;
186 #ifdef KTRACE
187 	/*
188 	 * if tracing, save a copy of iovec
189 	 */
190 	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
191 		ktriov = aiov;
192 		ktruio = auio;
193 		didktr = 1;
194 	}
195 #endif
196 	cnt = nbyte;
197 
198 	if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) {
199 		if (auio.uio_resid != cnt && (error == ERESTART ||
200 		    error == EINTR || error == EWOULDBLOCK))
201 			error = 0;
202 	}
203 	cnt -= auio.uio_resid;
204 #ifdef KTRACE
205 	if (didktr && error == 0) {
206 		ktruio.uio_iov = &ktriov;
207 		ktruio.uio_resid = cnt;
208 		ktrgenio(td->td_proc->p_tracep, fd, UIO_READ, &ktruio, error);
209 	}
210 #endif
211 	td->td_retval[0] = cnt;
212 	return (error);
213 }
214 
215 /*
216  * Scatter read system call.
217  */
218 #ifndef _SYS_SYSPROTO_H_
219 struct readv_args {
220 	int	fd;
221 	struct	iovec *iovp;
222 	u_int	iovcnt;
223 };
224 #endif
225 /*
226  * MPSAFE
227  */
228 int
229 readv(td, uap)
230 	struct thread *td;
231 	struct readv_args *uap;
232 {
233 	struct file *fp;
234 	struct uio auio;
235 	struct iovec *iov;
236 	struct iovec *needfree;
237 	struct iovec aiov[UIO_SMALLIOV];
238 	long i, cnt, error = 0;
239 	u_int iovlen;
240 #ifdef KTRACE
241 	struct iovec *ktriov = NULL;
242 	struct uio ktruio;
243 #endif
244 	mtx_lock(&Giant);
245 
246 	if ((error = fget_read(td, uap->fd, &fp)) != 0)
247 		goto done2;
248 	/* note: can't use iovlen until iovcnt is validated */
249 	iovlen = uap->iovcnt * sizeof (struct iovec);
250 	if (uap->iovcnt > UIO_SMALLIOV) {
251 		if (uap->iovcnt > UIO_MAXIOV) {
252 			error = EINVAL;
253 			goto done2;
254 		}
255 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
256 		needfree = iov;
257 	} else {
258 		iov = aiov;
259 		needfree = NULL;
260 	}
261 	auio.uio_iov = iov;
262 	auio.uio_iovcnt = uap->iovcnt;
263 	auio.uio_rw = UIO_READ;
264 	auio.uio_segflg = UIO_USERSPACE;
265 	auio.uio_td = td;
266 	auio.uio_offset = -1;
267 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
268 		goto done;
269 	auio.uio_resid = 0;
270 	for (i = 0; i < uap->iovcnt; i++) {
271 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
272 			error = EINVAL;
273 			goto done;
274 		}
275 		auio.uio_resid += iov->iov_len;
276 		iov++;
277 	}
278 #ifdef KTRACE
279 	/*
280 	 * if tracing, save a copy of iovec
281 	 */
282 	if (KTRPOINT(td->td_proc, KTR_GENIO))  {
283 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
284 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
285 		ktruio = auio;
286 	}
287 #endif
288 	cnt = auio.uio_resid;
289 	if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) {
290 		if (auio.uio_resid != cnt && (error == ERESTART ||
291 		    error == EINTR || error == EWOULDBLOCK))
292 			error = 0;
293 	}
294 	cnt -= auio.uio_resid;
295 #ifdef KTRACE
296 	if (ktriov != NULL) {
297 		if (error == 0) {
298 			ktruio.uio_iov = ktriov;
299 			ktruio.uio_resid = cnt;
300 			ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_READ, &ktruio,
301 			    error);
302 		}
303 		FREE(ktriov, M_TEMP);
304 	}
305 #endif
306 	td->td_retval[0] = cnt;
307 done:
308 	fdrop(fp, td);
309 	if (needfree)
310 		FREE(needfree, M_IOV);
311 done2:
312 	mtx_unlock(&Giant);
313 	return (error);
314 }
315 
316 /*
317  * Write system call
318  */
319 #ifndef _SYS_SYSPROTO_H_
320 struct write_args {
321 	int	fd;
322 	const void *buf;
323 	size_t	nbyte;
324 };
325 #endif
326 /*
327  * MPSAFE
328  */
329 int
330 write(td, uap)
331 	struct thread *td;
332 	struct write_args *uap;
333 {
334 	struct file *fp;
335 	int error;
336 
337 	mtx_lock(&Giant);
338 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
339 		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
340 			    (off_t)-1, 0);
341 		fdrop(fp, td);
342 	} else {
343 		error = EBADF;	/* XXX this can't be right */
344 	}
345 	mtx_unlock(&Giant);
346 	return(error);
347 }
348 
349 /*
350  * Pwrite system call
351  */
352 #ifndef _SYS_SYSPROTO_H_
353 struct pwrite_args {
354 	int	fd;
355 	const void *buf;
356 	size_t	nbyte;
357 	int	pad;
358 	off_t	offset;
359 };
360 #endif
361 /*
362  * MPSAFE
363  */
364 int
365 pwrite(td, uap)
366 	struct thread *td;
367 	struct pwrite_args *uap;
368 {
369 	struct file *fp;
370 	int error;
371 
372 	mtx_lock(&Giant);
373 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
374 		if (fp->f_type == DTYPE_VNODE) {
375 			error = dofilewrite(td, fp, uap->fd, uap->buf,
376 				    uap->nbyte, uap->offset, FOF_OFFSET);
377 		} else {
378 			error = ESPIPE;
379 		}
380 		fdrop(fp, td);
381 	} else {
382 		error = EBADF;	/* this can't be right */
383 	}
384 	mtx_unlock(&Giant);
385 	return(error);
386 }
387 
/*
 * Code common for write() and pwrite(): build a single-element uio
 * describing the user buffer and hand it to the file's fo_write
 * method.  flags is 0 (write at, and advance, the current offset) or
 * FOF_OFFSET (write at the given offset).  On success
 * td->td_retval[0] holds the byte count actually transferred.
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Cast away const: uio's iov_base is non-const but fo_write only reads. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* uio_resid is an int-sized quantity; reject oversized requests. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	/* Give the buffer cache a chance to flush before vnode writes. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) {
		/* A partial transfer before interruption counts as success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Writes to a broken pipe also deliver SIGPIPE. */
		if (error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	/* cnt becomes the number of bytes actually written. */
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(td->td_proc->p_tracep, fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
451 
452 /*
453  * Gather write system call
454  */
455 #ifndef _SYS_SYSPROTO_H_
456 struct writev_args {
457 	int	fd;
458 	struct	iovec *iovp;
459 	u_int	iovcnt;
460 };
461 #endif
462 /*
463  * MPSAFE
464  */
465 int
466 writev(td, uap)
467 	struct thread *td;
468 	register struct writev_args *uap;
469 {
470 	struct file *fp;
471 	struct uio auio;
472 	register struct iovec *iov;
473 	struct iovec *needfree;
474 	struct iovec aiov[UIO_SMALLIOV];
475 	long i, cnt, error = 0;
476 	u_int iovlen;
477 #ifdef KTRACE
478 	struct iovec *ktriov = NULL;
479 	struct uio ktruio;
480 #endif
481 
482 	mtx_lock(&Giant);
483 	if ((error = fget_write(td, uap->fd, &fp)) != 0) {
484 		error = EBADF;
485 		goto done2;
486 	}
487 	/* note: can't use iovlen until iovcnt is validated */
488 	iovlen = uap->iovcnt * sizeof (struct iovec);
489 	if (uap->iovcnt > UIO_SMALLIOV) {
490 		if (uap->iovcnt > UIO_MAXIOV) {
491 			needfree = NULL;
492 			error = EINVAL;
493 			goto done;
494 		}
495 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
496 		needfree = iov;
497 	} else {
498 		iov = aiov;
499 		needfree = NULL;
500 	}
501 	auio.uio_iov = iov;
502 	auio.uio_iovcnt = uap->iovcnt;
503 	auio.uio_rw = UIO_WRITE;
504 	auio.uio_segflg = UIO_USERSPACE;
505 	auio.uio_td = td;
506 	auio.uio_offset = -1;
507 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
508 		goto done;
509 	auio.uio_resid = 0;
510 	for (i = 0; i < uap->iovcnt; i++) {
511 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
512 			error = EINVAL;
513 			goto done;
514 		}
515 		auio.uio_resid += iov->iov_len;
516 		iov++;
517 	}
518 #ifdef KTRACE
519 	/*
520 	 * if tracing, save a copy of iovec and uio
521 	 */
522 	if (KTRPOINT(td->td_proc, KTR_GENIO))  {
523 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
524 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
525 		ktruio = auio;
526 	}
527 #endif
528 	cnt = auio.uio_resid;
529 	if (fp->f_type == DTYPE_VNODE)
530 		bwillwrite();
531 	if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) {
532 		if (auio.uio_resid != cnt && (error == ERESTART ||
533 		    error == EINTR || error == EWOULDBLOCK))
534 			error = 0;
535 		if (error == EPIPE) {
536 			PROC_LOCK(td->td_proc);
537 			psignal(td->td_proc, SIGPIPE);
538 			PROC_UNLOCK(td->td_proc);
539 		}
540 	}
541 	cnt -= auio.uio_resid;
542 #ifdef KTRACE
543 	if (ktriov != NULL) {
544 		if (error == 0) {
545 			ktruio.uio_iov = ktriov;
546 			ktruio.uio_resid = cnt;
547 			ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_WRITE, &ktruio,
548 			    error);
549 		}
550 		FREE(ktriov, M_TEMP);
551 	}
552 #endif
553 	td->td_retval[0] = cnt;
554 done:
555 	fdrop(fp, td);
556 	if (needfree)
557 		FREE(needfree, M_IOV);
558 done2:
559 	mtx_unlock(&Giant);
560 	return (error);
561 }
562 
563 /*
564  * Ioctl system call
565  */
566 #ifndef _SYS_SYSPROTO_H_
567 struct ioctl_args {
568 	int	fd;
569 	u_long	com;
570 	caddr_t	data;
571 };
572 #endif
573 /*
574  * MPSAFE
575  */
576 /* ARGSUSED */
/*
 * ioctl() system call handler.  Decodes the size/direction encoded in
 * the high bits of the command word, stages the argument data in a
 * kernel buffer (stack for <= STK_PARAMS bytes, malloc otherwise),
 * dispatches to the file's fo_ioctl method, and copies OUT data back.
 * FIONCLEX/FIOCLEX are handled entirely in the descriptor table, and
 * FIONBIO/FIOASYNC additionally update f_flag here.
 */
int
ioctl(td, uap)
	struct thread *td;
	register struct ioctl_args *uap;
{
	register struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	int error = 0;
	register u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	/* Stack staging buffer; the union forces suitable alignment. */
	union {
	    char stkbuf[STK_PARAMS];
	    long align;
	} ubuf;

	mtx_lock(&Giant);
	fdp = td->td_proc->p_fd;
	/* The unsigned cast also rejects negative descriptors. */
	if ((u_int)uap->fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[uap->fd]) == NULL) {
		error = EBADF;
		goto done2;
	}

	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		goto done2;
	}

	/* Close-on-exec flags live in the descriptor table, not the file. */
	switch (com = uap->com) {
	case FIONCLEX:
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		goto done2;
	case FIOCLEX:
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		goto done2;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto done2;
	}

	/* Hold the file across the (possibly sleeping) ioctl call. */
	fhold(fp);

	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com&IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				goto done2;
			}
		} else {
			/* Zero-size IN: the "data" is the pointer value itself. */
			*(caddr_t *)data = uap->data;
		}
	} else if ((com&IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com&IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		/* Mirror the blocking mode into f_flag before telling the file. */
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
		break;

	case FIOASYNC:
		/* Likewise for async-notification mode. */
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
done2:
	mtx_unlock(&Giant);
	return (error);
}
693 
694 static int	nselcoll;	/* Select collisions since boot */
695 struct cv	selwait;
696 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
697 
698 /*
699  * Select system call.
700  */
701 #ifndef _SYS_SYSPROTO_H_
702 struct select_args {
703 	int	nd;
704 	fd_set	*in, *ou, *ex;
705 	struct	timeval *tv;
706 };
707 #endif
708 /*
709  * MPSAFE
710  */
/*
 * select() system call handler.  Copies in up to three fd_sets, holds
 * a reference on every selected file (via selholddrop), then loops:
 * scan the descriptors with selscan(), and if nothing is ready and no
 * collision occurred, sleep on the global selwait condition variable
 * until the timeout or a selwakeup().  TDF_SELECT plus the nselcoll
 * counter are used to detect events that race with the scan.
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask s_heldbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp, *heldbits, *hibits, *hobits;
	struct timeval atv, rtv, ttv;
	int ncoll, error, timo, i;
	u_int nbufbytes, ncpbytes, nfdbits;

	if (uap->nd < 0)
		return (EINVAL);

	mtx_lock(&Giant);

	if (uap->nd > td->td_proc->p_fd->fd_nfiles)
		uap->nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(uap->nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (uap->in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
	/* heldbits: union of all input sets (hibits) + held-tracking (hobits). */
	if (2 * ncpbytes <= sizeof s_heldbits) {
		bzero(s_heldbits, sizeof(s_heldbits));
		heldbits = &s_heldbits[0];
	} else
		heldbits = malloc(2 * ncpbytes, M_SELECT, M_WAITOK | M_ZERO);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
	hibits = heldbits + ncpbytes / sizeof *heldbits;
	hobits = heldbits;
#define	getbits(name, x) \
	do {								\
		if (uap->name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(uap->name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_noproclock;			\
			for (i = 0;					\
			     i < ncpbytes / sizeof ibits[i][0];		\
			     i++)					\
				hibits[i] |= ibits[x][i];		\
		}							\
	} while (0)
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
	/* Zero the output halves in one sweep (they are contiguous). */
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (uap->tv) {
		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
			sizeof (atv));
		if (error)
			goto done_noproclock;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_noproclock;
		}
		/* Convert the relative timeout into an absolute deadline. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	/* Hold a reference on every file in the union of the input sets.
	 * NOTE(review): the return value (possibly EBADF) is ignored here. */
	selholddrop(td, hibits, hobits, uap->nd, 1);
	timo = 0;
	PROC_LOCK(td->td_proc);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
	error = selscan(td, ibits, obits, uap->nd);
	PROC_LOCK(td->td_proc);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=)) {
			/*
			 * An event of our interest may occur during locking a process.
			 * In order to avoid missing the event that occured during locking
			 * the process, test TDF_SELECT and rescan file descriptors if
			 * necessary.
			 */
			mtx_lock_spin(&sched_lock);
			if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
				ncoll = nselcoll;
				td->td_flags |= TDF_SELECT;
				mtx_unlock_spin(&sched_lock);
				PROC_UNLOCK(td->td_proc);
				error = selscan(td, ibits, obits, uap->nd);
				PROC_LOCK(td->td_proc);
			} else
				mtx_unlock_spin(&sched_lock);
			goto done;
		}
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to one day to avoid tvtohz() overflow. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo);
	else
		error = cv_wait_sig(&selwait, &td->td_proc->p_mtx);

	/* Woken without error: something may be ready, scan again. */
	if (error == 0)
		goto retry;

done:
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
	/* Drop the references recorded in hobits during the hold pass. */
	selholddrop(td, hibits, hobits, uap->nd, 0);
done_noproclock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);
	if (heldbits != &s_heldbits[0])
		free(heldbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}
889 
/*
 * Hold (hold != 0) or drop (hold == 0) a file reference for every
 * descriptor whose bit is set in the given mask.  In hold mode, ibits
 * is the union of the caller's input sets and each successfully held
 * descriptor is recorded in obits, which the later drop pass reads so
 * only actually-held files are dropped.
 *
 * NOTE(review): in hold mode, hitting a NULL file returns EBADF
 * immediately, leaving earlier descriptors held (recorded in obits);
 * the caller in select() ignores this return and relies on the drop
 * pass to release them — verify against callers before changing.
 */
static int
selholddrop(td, ibits, obits, nfd, hold)
	struct thread *td;
	fd_mask *ibits, *obits;
	int nfd, hold;
{
	struct filedesc *fdp = td->td_proc->p_fd;
	int i, fd;
	fd_mask bits;
	struct file *fp;

	for (i = 0; i < nfd; i += NFDBITS) {
		if (hold)
			bits = ibits[i/NFDBITS];
		else
			bits = obits[i/NFDBITS];
		/* ffs(int mask) not portable, fd_mask is long */
		for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
			if (!(bits & 1))
				continue;
			fp = fdp->fd_ofiles[fd];
			if (fp == NULL)
				return (EBADF);
			if (hold) {
				fhold(fp);
				/* Record the hold so the drop pass can find it. */
				obits[(fd)/NFDBITS] |=
				    ((fd_mask)1 << ((fd) % NFDBITS));
			} else
				fdrop(fp, td);
		}
	}
	return (0);
}
923 
/*
 * Poll every descriptor whose bit is set in one of the three input
 * masks (read/write/except) and set the corresponding bit in the
 * output mask for each ready descriptor.  The ready count is returned
 * in td->td_retval[0].
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	struct filedesc *fdp = td->td_proc->p_fd;
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };

	for (msk = 0; msk < 3; msk++) {
		/* NULL means the caller passed no fd_set for this category. */
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				fp = fdp->fd_ofiles[fd];
				if (fp == NULL)
					return (EBADF);
				if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	td->td_retval[0] = n;
	return (0);
}
961 
962 /*
963  * Poll system call.
964  */
965 #ifndef _SYS_SYSPROTO_H_
966 struct poll_args {
967 	struct pollfd *fds;
968 	u_int	nfds;
969 	int	timeout;
970 };
971 #endif
972 /*
973  * MPSAFE
974  */
/*
 * poll() system call handler.  Copies in the pollfd array, holds a
 * reference on each valid descriptor (via pollholddrop), then loops:
 * scan with pollscan(), and if nothing is ready and no collision
 * occurred, sleep on the global selwait condition variable until the
 * timeout or a selwakeup().  Uses the same TDF_SELECT/nselcoll race
 * detection as select().
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int ncoll, error = 0, timo;
	u_int nfds;
	size_t ni;
	struct pollfd p_heldbits[32];
	struct pollfd *heldbits;

	nfds = SCARG(uap, nfds);

	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
	    (nfds > FD_SETSIZE)) {
		error = EINVAL;
		goto done2;
	}
	ni = nfds * sizeof(struct pollfd);
	/* Small arrays stay on the stack; larger ones come from M_TEMP. */
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	if (ni > sizeof(p_heldbits))
		heldbits = malloc(ni, M_TEMP, M_WAITOK);
	else {
		bzero(p_heldbits, sizeof(p_heldbits));
		heldbits = p_heldbits;
	}
	error = copyin(SCARG(uap, fds), bits, ni);
	if (error)
		goto done_noproclock;
	/* heldbits is a private copy used only for hold/drop bookkeeping. */
	bcopy(bits, heldbits, ni);
	if (SCARG(uap, timeout) != INFTIM) {
		/* Convert the millisecond timeout to an absolute deadline. */
		atv.tv_sec = SCARG(uap, timeout) / 1000;
		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_noproclock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	/* Hold a reference on each open descriptor in the array. */
	pollholddrop(td, heldbits, nfds, 1);
	timo = 0;
	PROC_LOCK(td->td_proc);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
	error = pollscan(td, (struct pollfd *)bits, nfds);
	PROC_LOCK(td->td_proc);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=)) {
			/*
			 * An event of our interest may occur during locking a process.
			 * In order to avoid missing the event that occured during locking
			 * the process, test TDF_SELECT and rescan file descriptors if
			 * necessary.
			 */
			mtx_lock_spin(&sched_lock);
			if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
				ncoll = nselcoll;
				td->td_flags |= TDF_SELECT;
				mtx_unlock_spin(&sched_lock);
				PROC_UNLOCK(td->td_proc);
				error = pollscan(td, (struct pollfd *)bits, nfds);
				PROC_LOCK(td->td_proc);
			} else
				mtx_unlock_spin(&sched_lock);
			goto done;
		}
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to one day to avoid tvtohz() overflow. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo);
	else
		error = cv_wait_sig(&selwait, &td->td_proc->p_mtx);
	/* Woken without error: something may be ready, scan again. */
	if (error == 0)
		goto retry;

done:
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
	/* Drop the references recorded during the hold pass. */
	pollholddrop(td, heldbits, nfds, 0);
done_noproclock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, SCARG(uap, fds), ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
	if (ni > sizeof(p_heldbits))
		free(heldbits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}
1107 
/*
 * Hold (hold != 0) or drop (hold == 0) a file reference for each open
 * descriptor in a private copy of the pollfd array.  In hold mode the
 * copy's revents field is repurposed as a held/not-held flag (1/0),
 * which the drop pass then consults so only held files are dropped.
 * This works because the caller (poll) passes its separate heldbits
 * copy here, never the array that is copied back to userland.
 */
static int
pollholddrop(td, fds, nfd, hold)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
	int hold;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;

	for (i = 0; i < nfd; i++, fds++) {
		/* Silently skip out-of-range descriptors; pollscan reports them. */
		if (0 <= fds->fd && fds->fd < fdp->fd_nfiles) {
			fp = fdp->fd_ofiles[fds->fd];
			if (hold) {
				if (fp != NULL) {
					fhold(fp);
					fds->revents = 1;
				} else
					fds->revents = 0;
			} else if(fp != NULL && fds->revents)
				fdrop(fp, td);
		}
	}
	return (0);
}
1134 
1135 static int
1136 pollscan(td, fds, nfd)
1137 	struct thread *td;
1138 	struct pollfd *fds;
1139 	u_int nfd;
1140 {
1141 	register struct filedesc *fdp = td->td_proc->p_fd;
1142 	int i;
1143 	struct file *fp;
1144 	int n = 0;
1145 
1146 	for (i = 0; i < nfd; i++, fds++) {
1147 		if (fds->fd >= fdp->fd_nfiles) {
1148 			fds->revents = POLLNVAL;
1149 			n++;
1150 		} else if (fds->fd < 0) {
1151 			fds->revents = 0;
1152 		} else {
1153 			fp = fdp->fd_ofiles[fds->fd];
1154 			if (fp == NULL) {
1155 				fds->revents = POLLNVAL;
1156 				n++;
1157 			} else {
1158 				/*
1159 				 * Note: backend also returns POLLHUP and
1160 				 * POLLERR if appropriate.
1161 				 */
1162 				fds->revents = fo_poll(fp, fds->events,
1163 				    fp->f_cred, td);
1164 				if (fds->revents != 0)
1165 					n++;
1166 			}
1167 		}
1168 	}
1169 	td->td_retval[0] = n;
1170 	return (0);
1171 }
1172 
1173 /*
1174  * OpenBSD poll system call.
1175  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1176  */
1177 #ifndef _SYS_SYSPROTO_H_
1178 struct openbsd_poll_args {
1179 	struct pollfd *fds;
1180 	u_int	nfds;
1181 	int	timeout;
1182 };
1183 #endif
1184 /*
1185  * MPSAFE
1186  */
/*
 * OpenBSD poll(2) entry point.  The argument layout matches the
 * native poll_args, so the call is simply forwarded.
 */
int
openbsd_poll(struct thread *td, struct openbsd_poll_args *uap)
{
	return (poll(td, (struct poll_args *)uap));
}
1194 
1195 /*ARGSUSED*/
1196 int
1197 seltrue(dev, events, td)
1198 	dev_t dev;
1199 	int events;
1200 	struct thread *td;
1201 {
1202 
1203 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1204 }
1205 
1206 static int
1207 find_thread_in_proc(struct proc *p, struct thread *td)
1208 {
1209 	struct thread *td2;
1210 	FOREACH_THREAD_IN_PROC(p, td2) {
1211 		if (td2 == td) {
1212 			return (1);
1213 		}
1214 	}
1215 	return (0);
1216 }
1217 
1218 /*
1219  * Record a select request.
1220  */
/*
 * Record a select request: remember which thread is selecting on this
 * selinfo so selwakeup() can later wake it.  If another thread is
 * already sleeping in select on this selinfo, mark a collision
 * (SI_COLL) instead, which forces a broadcast wakeup.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{
	struct proc *p;
	pid_t mypid;

	mypid = selector->td_proc->p_pid;
	/* Already recorded as the selecting thread: nothing to do. */
	if ((sip->si_pid == mypid) &&
	    (sip->si_thread == selector)) { /* XXXKSE should be an ID? */
		return;
	}
	/* A different, still-existing thread is recorded here. */
	if (sip->si_pid &&
	    (p = pfind(sip->si_pid)) &&
	    (find_thread_in_proc(p, sip->si_thread))) {
		mtx_lock_spin(&sched_lock);
	    	if (sip->si_thread->td_wchan == (caddr_t)&selwait) {
			/* It is asleep in select: flag the collision. */
			mtx_unlock_spin(&sched_lock);
			PROC_UNLOCK(p);	/* pfind() returned p locked */
			sip->si_flags |= SI_COLL;
			return;
		}
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(p);
	}
	/* Take over the slot for the current selector. */
	sip->si_pid = mypid;
	sip->si_thread = selector;
}
1250 
1251 /*
1252  * Do a wakeup when a selectable event occurs.
1253  */
/*
 * Do a wakeup when a selectable event occurs: wake the thread recorded
 * by selrecord(), or broadcast to all select sleepers when a collision
 * was flagged.
 */
void
selwakeup(sip)
	register struct selinfo *sip;
{
	struct thread *td;
	register struct proc *p;

	/* No thread ever recorded interest: nothing to wake. */
	if (sip->si_pid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		/* Multiple selectors collided: wake them all. */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcast(&selwait);
	}
	p = pfind(sip->si_pid);
	sip->si_pid = 0;
	td = sip->si_thread;
	if (p != NULL) {
		/* The recorded thread may have exited; verify membership. */
		if (!find_thread_in_proc(p, td)) {
			PROC_UNLOCK(p); /* lock is in pfind() */;
			return;
		}
		mtx_lock_spin(&sched_lock);
		if (td->td_wchan == (caddr_t)&selwait) {
			/* Asleep in select/poll: make it runnable again. */
			if (td->td_proc->p_stat == SSLEEP)
				setrunnable(td);
			else
				cv_waitq_remove(td);
		} else
			/* Not yet asleep: clear the flag so it rescans. */
			td->td_flags &= ~TDF_SELECT;
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(p); /* Lock is in pfind() */
	}
}
1288 
1289 static void selectinit __P((void *));
1290 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1291 
1292 /* ARGSUSED*/
1293 static void
1294 selectinit(dummy)
1295 	void *dummy;
1296 {
1297 	cv_init(&selwait, "select");
1298 }
1299