xref: /freebsd/sys/kern/sys_generic.c (revision f6a4109212fd8fbabc731f07b2dd5c7e07fbec33)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 #include "opt_ktrace.h"
45 
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/sysproto.h>
49 #include <sys/filedesc.h>
50 #include <sys/filio.h>
51 #include <sys/fcntl.h>
52 #include <sys/file.h>
53 #include <sys/proc.h>
54 #include <sys/signalvar.h>
55 #include <sys/socketvar.h>
56 #include <sys/uio.h>
57 #include <sys/kernel.h>
58 #include <sys/limits.h>
59 #include <sys/malloc.h>
60 #include <sys/poll.h>
61 #include <sys/resourcevar.h>
62 #include <sys/selinfo.h>
63 #include <sys/syscallsubr.h>
64 #include <sys/sysctl.h>
65 #include <sys/sysent.h>
66 #include <sys/vnode.h>
67 #include <sys/bio.h>
68 #include <sys/buf.h>
69 #include <sys/condvar.h>
70 #ifdef KTRACE
71 #include <sys/ktrace.h>
72 #endif
73 #include <vm/vm.h>
74 #include <vm/vm_page.h>
75 
/* Malloc types: transient kernel copies of ioctl(2) data and select(2) bitmaps. */
static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
/* M_IOV is global: other files also allocate large copied-in iovec arrays. */
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

/* File-local helpers; see the definitions below for their contracts. */
static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	dofileread(struct thread *, struct file *, int, void *,
		    size_t, off_t, int);
static int	dofilewrite(struct thread *, struct file *, int,
		    const void *, size_t, off_t, int);
static void	doselwakeup(struct selinfo *, int);
87 
88 /*
89  * Read system call.
90  */
91 #ifndef _SYS_SYSPROTO_H_
92 struct read_args {
93 	int	fd;
94 	void	*buf;
95 	size_t	nbyte;
96 };
97 #endif
98 /*
99  * MPSAFE
100  */
101 int
102 read(td, uap)
103 	struct thread *td;
104 	struct read_args *uap;
105 {
106 	struct file *fp;
107 	int error;
108 
109 	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
110 		error = dofileread(td, fp, uap->fd, uap->buf,
111 			    uap->nbyte, (off_t)-1, 0);
112 		fdrop(fp, td);
113 	}
114 	return(error);
115 }
116 
117 /*
118  * Pread system call
119  */
120 #ifndef _SYS_SYSPROTO_H_
121 struct pread_args {
122 	int	fd;
123 	void	*buf;
124 	size_t	nbyte;
125 	int	pad;
126 	off_t	offset;
127 };
128 #endif
129 /*
130  * MPSAFE
131  */
132 int
133 pread(td, uap)
134 	struct thread *td;
135 	struct pread_args *uap;
136 {
137 	struct file *fp;
138 	int error;
139 
140 	if ((error = fget_read(td, uap->fd, &fp)) != 0)
141 		return (error);
142 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
143 		error = ESPIPE;
144 	else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
145 		error = EINVAL;
146 	else {
147 		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
148 			    uap->offset, FOF_OFFSET);
149 	}
150 	fdrop(fp, td);
151 	return(error);
152 }
153 
154 /*
155  * Code common for read and pread
156  */
/*
 * Perform a read of up to nbyte bytes from fp into the user buffer buf,
 * at offset (FOF_OFFSET set) or at the current file offset (offset == -1).
 * On success td->td_retval[0] holds the byte count transferred.
 */
static int
dofileread(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Build a single-segment uio describing the user buffer. */
	aiov.iov_base = buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* Transfers are capped at INT_MAX so the residual fits in an int. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec (fo_read mutates auio/aiov)
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;

	if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
		/*
		 * A partially completed transfer interrupted by a
		 * restartable condition is reported as success for the
		 * bytes already moved.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	/* cnt now holds the number of bytes actually transferred. */
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_READ, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
214 
215 /*
216  * Scatter read system call.
217  */
218 #ifndef _SYS_SYSPROTO_H_
219 struct readv_args {
220 	int	fd;
221 	struct	iovec *iovp;
222 	u_int	iovcnt;
223 };
224 #endif
225 /*
226  * MPSAFE
227  */
int
readv(td, uap)
	struct thread *td;
	struct readv_args *uap;
{
	struct file *fp;
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;		/* non-NULL iff iov was malloc'd */
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt;
	int error;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	if ((error = fget_read(td, uap->fd, &fp)) != 0)
		return (error);
	needfree = NULL;
	/* note: can't use iovlen until iovcnt is validated */
	iovlen = uap->iovcnt * sizeof (struct iovec);
	/* Small iovec arrays live on the stack; larger ones are malloc'd. */
	if (uap->iovcnt > UIO_SMALLIOV) {
		if (uap->iovcnt > UIO_MAXIOV) {
			error = EINVAL;
			goto done;
		}
		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
		needfree = iov;
	} else
		iov = aiov;
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
	auio.uio_offset = -1;		/* use the current file offset */
	if ((error = copyin(uap->iovp, iov, iovlen)))
		goto done;
	auio.uio_resid = 0;
	/* Sum segment lengths, rejecting a total transfer above INT_MAX. */
	for (i = 0; i < uap->iovcnt; i++) {
		if (iov->iov_len > INT_MAX - auio.uio_resid) {
			error = EINVAL;
			goto done;
		}
		auio.uio_resid += iov->iov_len;
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec (fo_read consumes auio)
	 */
	if (KTRPOINT(td, KTR_GENIO))  {
		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy(auio.uio_iov, ktriov, iovlen);
		ktruio = auio;
	}
#endif
	cnt = auio.uio_resid;
	if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) {
		/* Partial transfer + restartable error counts as success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = cnt;
			ktrgenio(uap->fd, UIO_READ, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	td->td_retval[0] = cnt;
done:
	fdrop(fp, td);
	if (needfree)
		FREE(needfree, M_IOV);
	return (error);
}
311 
312 /*
313  * Write system call
314  */
315 #ifndef _SYS_SYSPROTO_H_
316 struct write_args {
317 	int	fd;
318 	const void *buf;
319 	size_t	nbyte;
320 };
321 #endif
322 /*
323  * MPSAFE
324  */
325 int
326 write(td, uap)
327 	struct thread *td;
328 	struct write_args *uap;
329 {
330 	struct file *fp;
331 	int error;
332 
333 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
334 		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
335 			    (off_t)-1, 0);
336 		fdrop(fp, td);
337 	} else {
338 		error = EBADF;	/* XXX this can't be right */
339 	}
340 	return(error);
341 }
342 
343 /*
344  * Pwrite system call
345  */
346 #ifndef _SYS_SYSPROTO_H_
347 struct pwrite_args {
348 	int	fd;
349 	const void *buf;
350 	size_t	nbyte;
351 	int	pad;
352 	off_t	offset;
353 };
354 #endif
355 /*
356  * MPSAFE
357  */
358 int
359 pwrite(td, uap)
360 	struct thread *td;
361 	struct pwrite_args *uap;
362 {
363 	struct file *fp;
364 	int error;
365 
366 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
367 		if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
368 			error = ESPIPE;
369 		else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
370 			error = EINVAL;
371 		else {
372 			error = dofilewrite(td, fp, uap->fd, uap->buf,
373 				    uap->nbyte, uap->offset, FOF_OFFSET);
374 		}
375 		fdrop(fp, td);
376 	} else {
377 		error = EBADF;	/* this can't be right */
378 	}
379 	return(error);
380 }
381 
/*
 * Code common for write and pwrite: write nbyte bytes from the user
 * buffer buf to fp, at offset (FOF_OFFSET set) or at the current file
 * offset (offset == -1).  On success td->td_retval[0] holds the byte
 * count transferred.  EPIPE on a non-socket raises SIGPIPE here; the
 * socket layer issues its own.
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Cast away const: the uio machinery takes a non-const base. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* Transfers are capped at INT_MAX so the residual fits in an int. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	/* Give the buffer cache advance warning of vnode writes. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
		/* Partial transfer + restartable error counts as success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
446 
447 /*
448  * Gather write system call
449  */
450 #ifndef _SYS_SYSPROTO_H_
451 struct writev_args {
452 	int	fd;
453 	struct	iovec *iovp;
454 	u_int	iovcnt;
455 };
456 #endif
457 /*
458  * MPSAFE
459  */
460 int
461 writev(td, uap)
462 	struct thread *td;
463 	register struct writev_args *uap;
464 {
465 	struct file *fp;
466 	struct uio auio;
467 	register struct iovec *iov;
468 	struct iovec *needfree;
469 	struct iovec aiov[UIO_SMALLIOV];
470 	long i, cnt, error = 0;
471 	u_int iovlen;
472 #ifdef KTRACE
473 	struct iovec *ktriov = NULL;
474 	struct uio ktruio;
475 #endif
476 
477 	if ((error = fget_write(td, uap->fd, &fp)) != 0)
478 		return (EBADF);
479 	needfree = NULL;
480 	/* note: can't use iovlen until iovcnt is validated */
481 	iovlen = uap->iovcnt * sizeof (struct iovec);
482 	if (uap->iovcnt > UIO_SMALLIOV) {
483 		if (uap->iovcnt > UIO_MAXIOV) {
484 			error = EINVAL;
485 			goto done;
486 		}
487 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
488 		needfree = iov;
489 	} else
490 		iov = aiov;
491 	auio.uio_iov = iov;
492 	auio.uio_iovcnt = uap->iovcnt;
493 	auio.uio_rw = UIO_WRITE;
494 	auio.uio_segflg = UIO_USERSPACE;
495 	auio.uio_td = td;
496 	auio.uio_offset = -1;
497 	if ((error = copyin(uap->iovp, iov, iovlen)))
498 		goto done;
499 	auio.uio_resid = 0;
500 	for (i = 0; i < uap->iovcnt; i++) {
501 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
502 			error = EINVAL;
503 			goto done;
504 		}
505 		auio.uio_resid += iov->iov_len;
506 		iov++;
507 	}
508 #ifdef KTRACE
509 	/*
510 	 * if tracing, save a copy of iovec and uio
511 	 */
512 	if (KTRPOINT(td, KTR_GENIO))  {
513 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
514 		bcopy(auio.uio_iov, ktriov, iovlen);
515 		ktruio = auio;
516 	}
517 #endif
518 	cnt = auio.uio_resid;
519 	if (fp->f_type == DTYPE_VNODE)
520 		bwillwrite();
521 	if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) {
522 		if (auio.uio_resid != cnt && (error == ERESTART ||
523 		    error == EINTR || error == EWOULDBLOCK))
524 			error = 0;
525 		if (error == EPIPE) {
526 			PROC_LOCK(td->td_proc);
527 			psignal(td->td_proc, SIGPIPE);
528 			PROC_UNLOCK(td->td_proc);
529 		}
530 	}
531 	cnt -= auio.uio_resid;
532 #ifdef KTRACE
533 	if (ktriov != NULL) {
534 		if (error == 0) {
535 			ktruio.uio_iov = ktriov;
536 			ktruio.uio_resid = cnt;
537 			ktrgenio(uap->fd, UIO_WRITE, &ktruio, error);
538 		}
539 		FREE(ktriov, M_TEMP);
540 	}
541 #endif
542 	td->td_retval[0] = cnt;
543 done:
544 	fdrop(fp, td);
545 	if (needfree)
546 		FREE(needfree, M_IOV);
547 	return (error);
548 }
549 
550 /*
551  * Ioctl system call
552  */
553 #ifndef _SYS_SYSPROTO_H_
554 struct ioctl_args {
555 	int	fd;
556 	u_long	com;
557 	caddr_t	data;
558 };
559 #endif
560 /*
561  * MPSAFE
562  */
563 /* ARGSUSED */
int
ioctl(td, uap)
	struct thread *td;
	register struct ioctl_args *uap;
{
	struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	int error = 0;
	register u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	/* Small argument buffers avoid malloc; align forces long alignment. */
	union {
	    char stkbuf[STK_PARAMS];
	    long align;
	} ubuf;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	mtx_lock(&Giant);
	/* ioctl requires the descriptor be open for reading or writing. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	/*
	 * FIONCLEX/FIOCLEX only toggle the per-descriptor close-on-exec
	 * flag; handle them here without calling the file's ioctl op.
	 */
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (ENOTTY);
	}

	/* Arguments too big for the stack union are malloc'd. */
	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com&IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				goto done;
			}
		} else {
			/* IOC_IN with zero size: pass the pointer itself. */
			*(caddr_t *)data = uap->data;
		}
	} else if ((com&IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com&IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		/* Keep f_flag in sync, then inform the file's backend. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		break;

	case FIOASYNC:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td->td_ucred, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
done:
	mtx_unlock(&Giant);
	return (error);
}
687 
688 /*
689  * sellock and selwait are initialized in selectinit() via SYSINIT.
690  */
struct mtx	sellock;	/* guards si_thread/td_selq links and SI_COLL */
struct cv	selwait;	/* select()/poll() threads sleep here */
u_int		nselcoll;	/* Select collisions since boot */
SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
695 
696 /*
697  * Select system call.
698  */
699 #ifndef _SYS_SYSPROTO_H_
700 struct select_args {
701 	int	nd;
702 	fd_set	*in, *ou, *ex;
703 	struct	timeval *tv;
704 };
705 #endif
706 /*
707  * MPSAFE
708  */
709 int
710 select(td, uap)
711 	register struct thread *td;
712 	register struct select_args *uap;
713 {
714 	struct timeval tv, *tvp;
715 	int error;
716 
717 	if (uap->tv != NULL) {
718 		error = copyin(uap->tv, &tv, sizeof(tv));
719 		if (error)
720 			return (error);
721 		tvp = &tv;
722 	} else
723 		tvp = NULL;
724 
725 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
726 }
727 
/*
 * Common backend for select(2).  Copies in up to three descriptor sets,
 * scans them via selscan(), and sleeps on selwait (rechecking after
 * wakeups/collisions) until an event, timeout, or signal.  Surviving
 * output bits are copied back to the caller's sets on success.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	/* Clamp nd to the table size rather than failing. */
	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/* Convert the relative timeout into an absolute uptime deadline. */
	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count so missed wakeups force a rescan. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Cap the sleep at one day to avoid tvtohz overflow. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}
886 
/*
 * Poll every descriptor whose bit is set in the input sets, recording
 * ready descriptors in the output sets.  td->td_retval[0] is set to the
 * number of ready descriptors; returns EBADF if a set bit names a
 * closed descriptor.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	/* The descriptor table stays locked across the whole scan. */
	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
928 
929 /*
930  * Poll system call.
931  */
932 #ifndef _SYS_SYSPROTO_H_
933 struct poll_args {
934 	struct pollfd *fds;
935 	u_int	nfds;
936 	int	timeout;
937 };
938 #endif
939 /*
940  * MPSAFE
941  */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	/* Up to 32 pollfds fit on the stack; larger arrays are malloc'd. */
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	/* Convert the millisecond timeout into an absolute uptime deadline. */
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count so missed wakeups force a rescan. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, (struct pollfd *)bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Cap the sleep at one day to avoid tvtohz overflow. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* Copy the revents fields back out to the user array. */
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}
1061 
/*
 * Poll each entry of the copied-in pollfd array, filling in revents.
 * td->td_retval[0] is set to the number of entries with nonzero
 * revents; bad descriptors get POLLNVAL and negative fds are skipped.
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	/* The descriptor table stays locked across the whole scan. */
	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fds are ignored per poll(2) semantics. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
1101 
1102 /*
1103  * OpenBSD poll system call.
1104  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1105  */
1106 #ifndef _SYS_SYSPROTO_H_
1107 struct openbsd_poll_args {
1108 	struct pollfd *fds;
1109 	u_int	nfds;
1110 	int	timeout;
1111 };
1112 #endif
1113 /*
1114  * MPSAFE
1115  */
int
openbsd_poll(struct thread *td, struct openbsd_poll_args *uap)
{

	/* The argument layouts are identical; defer to the native poll(). */
	return (poll(td, (struct poll_args *)uap));
}
1123 
1124 /*
1125  * Remove the references to the thread from all of the objects
1126  * we were polling.
1127  *
1128  * This code assumes that the underlying owner of the selinfo
1129  * structure will hold sellock before it changes it, and that
1130  * it will unlink itself from our list if it goes away.
1131  */
1132 void
1133 clear_selinfo_list(td)
1134 	struct thread *td;
1135 {
1136 	struct selinfo *si;
1137 
1138 	mtx_assert(&sellock, MA_OWNED);
1139 	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1140 		si->si_thread = NULL;
1141 	TAILQ_INIT(&td->td_selq);
1142 }
1143 
1144 /*
1145  * Record a select request.
1146  */
1147 void
1148 selrecord(selector, sip)
1149 	struct thread *selector;
1150 	struct selinfo *sip;
1151 {
1152 
1153 	mtx_lock(&sellock);
1154 	/*
1155 	 * If the selinfo's thread pointer is NULL then take ownership of it.
1156 	 *
1157 	 * If the thread pointer is not NULL and it points to another
1158 	 * thread, then we have a collision.
1159 	 *
1160 	 * If the thread pointer is not NULL and points back to us then leave
1161 	 * it alone as we've already added pointed it at us and added it to
1162 	 * our list.
1163 	 */
1164 	if (sip->si_thread == NULL) {
1165 		sip->si_thread = selector;
1166 		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1167 	} else if (sip->si_thread != selector) {
1168 		sip->si_flags |= SI_COLL;
1169 	}
1170 
1171 	mtx_unlock(&sellock);
1172 }
1173 
1174 /* Wake up a selecting thread. */
void
selwakeup(struct selinfo *sip)
{

	/* -1 leaves the woken thread's priority unchanged. */
	doselwakeup(sip, -1);
}
1181 
1182 /* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(struct selinfo *sip, int pri)
{

	/* Like selwakeup(), but may boost the woken thread to pri. */
	doselwakeup(sip, pri);
}
1190 
1191 /*
1192  * Do a wakeup when a selectable event occurs.
1193  */
/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	/* A collision means multiple selectors: broadcast to all of them. */
	if ((sip->si_flags & SI_COLL) != 0) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	/* Detach the selinfo from the owning thread's select list. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	if (td->td_wchan == &selwait) {
		/* The thread is asleep on selwait: wake it directly. */
		cv_waitq_remove(td);
		TD_CLR_SLEEPING(td);
		if (pri >= PRI_MIN && pri <= PRI_MAX && td->td_priority > pri)
			td->td_priority = pri;
		setrunnable(td);
	} else
		/* Still scanning: clearing TDF_SELECT forces a rescan. */
		td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);
}
1226 
1227 static void selectinit(void *);
1228 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1229 
1230 /* ARGSUSED*/
1231 static void
1232 selectinit(dummy)
1233 	void *dummy;
1234 {
1235 	cv_init(&selwait, "select");
1236 	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1237 }
1238