xref: /freebsd/sys/kern/sys_generic.c (revision 77b7cdf1999ee965ad494fddd184b18f532ac91a)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $FreeBSD$
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/signalvar.h>
53 #include <sys/socketvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/limits.h>
57 #include <sys/malloc.h>
58 #include <sys/poll.h>
59 #include <sys/resourcevar.h>
60 #include <sys/selinfo.h>
61 #include <sys/syscallsubr.h>
62 #include <sys/sysctl.h>
63 #include <sys/sysent.h>
64 #include <sys/bio.h>
65 #include <sys/buf.h>
66 #include <sys/condvar.h>
67 #ifdef KTRACE
68 #include <sys/ktrace.h>
69 #endif
70 #include <vm/vm.h>
71 #include <vm/vm_page.h>
72 
73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
75 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
76 
77 static int	pollscan(struct thread *, struct pollfd *, u_int);
78 static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
79 static int	dofileread(struct thread *, struct file *, int, void *,
80 		    size_t, off_t, int);
81 static int	dofilewrite(struct thread *, struct file *, int,
82 		    const void *, size_t, off_t, int);
83 
84 /*
85  * Read system call.
86  */
87 #ifndef _SYS_SYSPROTO_H_
88 struct read_args {
89 	int	fd;
90 	void	*buf;
91 	size_t	nbyte;
92 };
93 #endif
94 /*
95  * MPSAFE
96  */
97 int
98 read(td, uap)
99 	struct thread *td;
100 	struct read_args *uap;
101 {
102 	struct file *fp;
103 	int error;
104 
105 	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
106 		error = dofileread(td, fp, uap->fd, uap->buf,
107 			    uap->nbyte, (off_t)-1, 0);
108 		fdrop(fp, td);
109 	}
110 	return(error);
111 }
112 
113 /*
114  * Pread system call
115  */
116 #ifndef _SYS_SYSPROTO_H_
117 struct pread_args {
118 	int	fd;
119 	void	*buf;
120 	size_t	nbyte;
121 	int	pad;
122 	off_t	offset;
123 };
124 #endif
125 /*
126  * MPSAFE
127  */
128 int
129 pread(td, uap)
130 	struct thread *td;
131 	struct pread_args *uap;
132 {
133 	struct file *fp;
134 	int error;
135 
136 	if ((error = fget_read(td, uap->fd, &fp)) != 0)
137 		return (error);
138 	if (fp->f_type != DTYPE_VNODE) {
139 		error = ESPIPE;
140 	} else {
141 		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
142 			    uap->offset, FOF_OFFSET);
143 	}
144 	fdrop(fp, td);
145 	return(error);
146 }
147 
148 /*
149  * Code common for read and pread
150  */
/*
 * Common backend for read(2) and pread(2): build a single-segment uio
 * over the user buffer and hand it to the file's fo_read method.
 * 'offset'/'flags' select positioned (FOF_OFFSET) vs. current-offset I/O.
 * The byte count transferred is returned via td->td_retval[0].
 */
static int
dofileread(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Describe the user buffer as a one-element iovec. */
	aiov.iov_base = buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* uio_resid is int-sized in effect; reject oversized requests. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;

	/*
	 * If the transfer was interrupted after some data moved, report
	 * the partial success instead of ERESTART/EINTR/EWOULDBLOCK.
	 */
	if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	/* cnt becomes the number of bytes actually transferred. */
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_READ, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
208 
209 /*
210  * Scatter read system call.
211  */
212 #ifndef _SYS_SYSPROTO_H_
213 struct readv_args {
214 	int	fd;
215 	struct	iovec *iovp;
216 	u_int	iovcnt;
217 };
218 #endif
219 /*
220  * MPSAFE
221  */
222 int
223 readv(td, uap)
224 	struct thread *td;
225 	struct readv_args *uap;
226 {
227 	struct file *fp;
228 	struct uio auio;
229 	struct iovec *iov;
230 	struct iovec *needfree;
231 	struct iovec aiov[UIO_SMALLIOV];
232 	long i, cnt;
233 	int error;
234 	u_int iovlen;
235 #ifdef KTRACE
236 	struct iovec *ktriov = NULL;
237 	struct uio ktruio;
238 #endif
239 
240 	if ((error = fget_read(td, uap->fd, &fp)) != 0)
241 		return (error);
242 	needfree = NULL;
243 	/* note: can't use iovlen until iovcnt is validated */
244 	iovlen = uap->iovcnt * sizeof (struct iovec);
245 	if (uap->iovcnt > UIO_SMALLIOV) {
246 		if (uap->iovcnt > UIO_MAXIOV) {
247 			error = EINVAL;
248 			goto done;
249 		}
250 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
251 		needfree = iov;
252 	} else
253 		iov = aiov;
254 	auio.uio_iov = iov;
255 	auio.uio_iovcnt = uap->iovcnt;
256 	auio.uio_rw = UIO_READ;
257 	auio.uio_segflg = UIO_USERSPACE;
258 	auio.uio_td = td;
259 	auio.uio_offset = -1;
260 	if ((error = copyin(uap->iovp, iov, iovlen)))
261 		goto done;
262 	auio.uio_resid = 0;
263 	for (i = 0; i < uap->iovcnt; i++) {
264 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
265 			error = EINVAL;
266 			goto done;
267 		}
268 		auio.uio_resid += iov->iov_len;
269 		iov++;
270 	}
271 #ifdef KTRACE
272 	/*
273 	 * if tracing, save a copy of iovec
274 	 */
275 	if (KTRPOINT(td, KTR_GENIO))  {
276 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
277 		bcopy(auio.uio_iov, ktriov, iovlen);
278 		ktruio = auio;
279 	}
280 #endif
281 	cnt = auio.uio_resid;
282 	if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) {
283 		if (auio.uio_resid != cnt && (error == ERESTART ||
284 		    error == EINTR || error == EWOULDBLOCK))
285 			error = 0;
286 	}
287 	cnt -= auio.uio_resid;
288 #ifdef KTRACE
289 	if (ktriov != NULL) {
290 		if (error == 0) {
291 			ktruio.uio_iov = ktriov;
292 			ktruio.uio_resid = cnt;
293 			ktrgenio(uap->fd, UIO_READ, &ktruio, error);
294 		}
295 		FREE(ktriov, M_TEMP);
296 	}
297 #endif
298 	td->td_retval[0] = cnt;
299 done:
300 	fdrop(fp, td);
301 	if (needfree)
302 		FREE(needfree, M_IOV);
303 	return (error);
304 }
305 
306 /*
307  * Write system call
308  */
309 #ifndef _SYS_SYSPROTO_H_
310 struct write_args {
311 	int	fd;
312 	const void *buf;
313 	size_t	nbyte;
314 };
315 #endif
316 /*
317  * MPSAFE
318  */
319 int
320 write(td, uap)
321 	struct thread *td;
322 	struct write_args *uap;
323 {
324 	struct file *fp;
325 	int error;
326 
327 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
328 		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
329 			    (off_t)-1, 0);
330 		fdrop(fp, td);
331 	} else {
332 		error = EBADF;	/* XXX this can't be right */
333 	}
334 	return(error);
335 }
336 
337 /*
338  * Pwrite system call
339  */
340 #ifndef _SYS_SYSPROTO_H_
341 struct pwrite_args {
342 	int	fd;
343 	const void *buf;
344 	size_t	nbyte;
345 	int	pad;
346 	off_t	offset;
347 };
348 #endif
349 /*
350  * MPSAFE
351  */
352 int
353 pwrite(td, uap)
354 	struct thread *td;
355 	struct pwrite_args *uap;
356 {
357 	struct file *fp;
358 	int error;
359 
360 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
361 		if (fp->f_type == DTYPE_VNODE) {
362 			error = dofilewrite(td, fp, uap->fd, uap->buf,
363 				    uap->nbyte, uap->offset, FOF_OFFSET);
364 		} else {
365 			error = ESPIPE;
366 		}
367 		fdrop(fp, td);
368 	} else {
369 		error = EBADF;	/* this can't be right */
370 	}
371 	return(error);
372 }
373 
/*
 * Common backend for write(2) and pwrite(2): build a single-segment uio
 * over the user buffer and hand it to the file's fo_write method.
 * Posts SIGPIPE on EPIPE for non-socket files; the byte count written
 * is returned via td->td_retval[0].
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Cast away const; the uio machinery takes a non-const base. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* uio_resid is int-sized in effect; reject oversized requests. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	/* Let the buffer cache throttle us before we dirty more buffers. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	/*
	 * Report partial transfers as success if interrupted mid-way;
	 * deliver SIGPIPE for EPIPE on non-socket files only.
	 */
	if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	/* cnt becomes the number of bytes actually written. */
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
438 
439 /*
440  * Gather write system call
441  */
442 #ifndef _SYS_SYSPROTO_H_
443 struct writev_args {
444 	int	fd;
445 	struct	iovec *iovp;
446 	u_int	iovcnt;
447 };
448 #endif
449 /*
450  * MPSAFE
451  */
452 int
453 writev(td, uap)
454 	struct thread *td;
455 	register struct writev_args *uap;
456 {
457 	struct file *fp;
458 	struct uio auio;
459 	register struct iovec *iov;
460 	struct iovec *needfree;
461 	struct iovec aiov[UIO_SMALLIOV];
462 	long i, cnt, error = 0;
463 	u_int iovlen;
464 #ifdef KTRACE
465 	struct iovec *ktriov = NULL;
466 	struct uio ktruio;
467 #endif
468 
469 	mtx_lock(&Giant);
470 	if ((error = fget_write(td, uap->fd, &fp)) != 0) {
471 		error = EBADF;
472 		goto done2;
473 	}
474 	/* note: can't use iovlen until iovcnt is validated */
475 	iovlen = uap->iovcnt * sizeof (struct iovec);
476 	if (uap->iovcnt > UIO_SMALLIOV) {
477 		if (uap->iovcnt > UIO_MAXIOV) {
478 			needfree = NULL;
479 			error = EINVAL;
480 			goto done;
481 		}
482 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
483 		needfree = iov;
484 	} else {
485 		iov = aiov;
486 		needfree = NULL;
487 	}
488 	auio.uio_iov = iov;
489 	auio.uio_iovcnt = uap->iovcnt;
490 	auio.uio_rw = UIO_WRITE;
491 	auio.uio_segflg = UIO_USERSPACE;
492 	auio.uio_td = td;
493 	auio.uio_offset = -1;
494 	if ((error = copyin(uap->iovp, iov, iovlen)))
495 		goto done;
496 	auio.uio_resid = 0;
497 	for (i = 0; i < uap->iovcnt; i++) {
498 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
499 			error = EINVAL;
500 			goto done;
501 		}
502 		auio.uio_resid += iov->iov_len;
503 		iov++;
504 	}
505 #ifdef KTRACE
506 	/*
507 	 * if tracing, save a copy of iovec and uio
508 	 */
509 	if (KTRPOINT(td, KTR_GENIO))  {
510 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
511 		bcopy(auio.uio_iov, ktriov, iovlen);
512 		ktruio = auio;
513 	}
514 #endif
515 	cnt = auio.uio_resid;
516 	if (fp->f_type == DTYPE_VNODE)
517 		bwillwrite();
518 	if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) {
519 		if (auio.uio_resid != cnt && (error == ERESTART ||
520 		    error == EINTR || error == EWOULDBLOCK))
521 			error = 0;
522 		if (error == EPIPE) {
523 			PROC_LOCK(td->td_proc);
524 			psignal(td->td_proc, SIGPIPE);
525 			PROC_UNLOCK(td->td_proc);
526 		}
527 	}
528 	cnt -= auio.uio_resid;
529 #ifdef KTRACE
530 	if (ktriov != NULL) {
531 		if (error == 0) {
532 			ktruio.uio_iov = ktriov;
533 			ktruio.uio_resid = cnt;
534 			ktrgenio(uap->fd, UIO_WRITE, &ktruio, error);
535 		}
536 		FREE(ktriov, M_TEMP);
537 	}
538 #endif
539 	td->td_retval[0] = cnt;
540 done:
541 	fdrop(fp, td);
542 	if (needfree)
543 		FREE(needfree, M_IOV);
544 done2:
545 	mtx_unlock(&Giant);
546 	return (error);
547 }
548 
549 /*
550  * Ioctl system call
551  */
552 #ifndef _SYS_SYSPROTO_H_
553 struct ioctl_args {
554 	int	fd;
555 	u_long	com;
556 	caddr_t	data;
557 };
558 #endif
559 /*
560  * MPSAFE
561  */
562 /* ARGSUSED */
563 int
564 ioctl(td, uap)
565 	struct thread *td;
566 	register struct ioctl_args *uap;
567 {
568 	struct file *fp;
569 	register struct filedesc *fdp;
570 	register u_long com;
571 	int error = 0;
572 	register u_int size;
573 	caddr_t data, memp;
574 	int tmp;
575 #define STK_PARAMS	128
576 	union {
577 	    char stkbuf[STK_PARAMS];
578 	    long align;
579 	} ubuf;
580 
581 	if ((error = fget(td, uap->fd, &fp)) != 0)
582 		return (error);
583 	mtx_lock(&Giant);
584 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
585 		fdrop(fp, td);
586 		mtx_unlock(&Giant);
587 		return (EBADF);
588 	}
589 	fdp = td->td_proc->p_fd;
590 	switch (com = uap->com) {
591 	case FIONCLEX:
592 		FILEDESC_LOCK(fdp);
593 		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
594 		FILEDESC_UNLOCK(fdp);
595 		fdrop(fp, td);
596 		mtx_unlock(&Giant);
597 		return (0);
598 	case FIOCLEX:
599 		FILEDESC_LOCK(fdp);
600 		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
601 		FILEDESC_UNLOCK(fdp);
602 		fdrop(fp, td);
603 		mtx_unlock(&Giant);
604 		return (0);
605 	}
606 
607 	/*
608 	 * Interpret high order word to find amount of data to be
609 	 * copied to/from the user's address space.
610 	 */
611 	size = IOCPARM_LEN(com);
612 	if (size > IOCPARM_MAX) {
613 		fdrop(fp, td);
614 		mtx_unlock(&Giant);
615 		return (ENOTTY);
616 	}
617 
618 	memp = NULL;
619 	if (size > sizeof (ubuf.stkbuf)) {
620 		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
621 		data = memp;
622 	} else {
623 		data = ubuf.stkbuf;
624 	}
625 	if (com&IOC_IN) {
626 		if (size) {
627 			error = copyin(uap->data, data, (u_int)size);
628 			if (error) {
629 				if (memp)
630 					free(memp, M_IOCTLOPS);
631 				fdrop(fp, td);
632 				goto done;
633 			}
634 		} else {
635 			*(caddr_t *)data = uap->data;
636 		}
637 	} else if ((com&IOC_OUT) && size) {
638 		/*
639 		 * Zero the buffer so the user always
640 		 * gets back something deterministic.
641 		 */
642 		bzero(data, size);
643 	} else if (com&IOC_VOID) {
644 		*(caddr_t *)data = uap->data;
645 	}
646 
647 	switch (com) {
648 
649 	case FIONBIO:
650 		FILE_LOCK(fp);
651 		if ((tmp = *(int *)data))
652 			fp->f_flag |= FNONBLOCK;
653 		else
654 			fp->f_flag &= ~FNONBLOCK;
655 		FILE_UNLOCK(fp);
656 		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
657 		break;
658 
659 	case FIOASYNC:
660 		FILE_LOCK(fp);
661 		if ((tmp = *(int *)data))
662 			fp->f_flag |= FASYNC;
663 		else
664 			fp->f_flag &= ~FASYNC;
665 		FILE_UNLOCK(fp);
666 		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
667 		break;
668 
669 	default:
670 		error = fo_ioctl(fp, com, data, td->td_ucred, td);
671 		/*
672 		 * Copy any data to user, size was
673 		 * already set and checked above.
674 		 */
675 		if (error == 0 && (com&IOC_OUT) && size)
676 			error = copyout(data, uap->data, (u_int)size);
677 		break;
678 	}
679 	if (memp)
680 		free(memp, M_IOCTLOPS);
681 	fdrop(fp, td);
682 done:
683 	mtx_unlock(&Giant);
684 	return (error);
685 }
686 
687 /*
688  * sellock and selwait are initialized in selectinit() via SYSINIT.
689  */
690 struct mtx	sellock;
691 struct cv	selwait;
692 u_int		nselcoll;	/* Select collisions since boot */
693 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
694 
695 /*
696  * Select system call.
697  */
698 #ifndef _SYS_SYSPROTO_H_
699 struct select_args {
700 	int	nd;
701 	fd_set	*in, *ou, *ex;
702 	struct	timeval *tv;
703 };
704 #endif
705 /*
706  * MPSAFE
707  */
708 int
709 select(td, uap)
710 	register struct thread *td;
711 	register struct select_args *uap;
712 {
713 	struct timeval tv, *tvp;
714 	int error;
715 
716 	if (uap->tv != NULL) {
717 		error = copyin(uap->tv, &tv, sizeof(tv));
718 		if (error)
719 			return (error);
720 		tvp = &tv;
721 	} else
722 		tvp = NULL;
723 
724 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
725 }
726 
727 int
728 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
729     fd_set *fd_ex, struct timeval *tvp)
730 {
731 	struct filedesc *fdp;
732 	/*
733 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
734 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
735 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
736 	 * of 256.
737 	 */
738 	fd_mask s_selbits[howmany(2048, NFDBITS)];
739 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
740 	struct timeval atv, rtv, ttv;
741 	int error, timo;
742 	u_int ncoll, nbufbytes, ncpbytes, nfdbits;
743 
744 	if (nd < 0)
745 		return (EINVAL);
746 	fdp = td->td_proc->p_fd;
747 	mtx_lock(&Giant);
748 	FILEDESC_LOCK(fdp);
749 
750 	if (nd > td->td_proc->p_fd->fd_nfiles)
751 		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
752 	FILEDESC_UNLOCK(fdp);
753 
754 	/*
755 	 * Allocate just enough bits for the non-null fd_sets.  Use the
756 	 * preallocated auto buffer if possible.
757 	 */
758 	nfdbits = roundup(nd, NFDBITS);
759 	ncpbytes = nfdbits / NBBY;
760 	nbufbytes = 0;
761 	if (fd_in != NULL)
762 		nbufbytes += 2 * ncpbytes;
763 	if (fd_ou != NULL)
764 		nbufbytes += 2 * ncpbytes;
765 	if (fd_ex != NULL)
766 		nbufbytes += 2 * ncpbytes;
767 	if (nbufbytes <= sizeof s_selbits)
768 		selbits = &s_selbits[0];
769 	else
770 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
771 
772 	/*
773 	 * Assign pointers into the bit buffers and fetch the input bits.
774 	 * Put the output buffers together so that they can be bzeroed
775 	 * together.
776 	 */
777 	sbp = selbits;
778 #define	getbits(name, x) \
779 	do {								\
780 		if (name == NULL)					\
781 			ibits[x] = NULL;				\
782 		else {							\
783 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
784 			obits[x] = sbp;					\
785 			sbp += ncpbytes / sizeof *sbp;			\
786 			error = copyin(name, ibits[x], ncpbytes);	\
787 			if (error != 0)					\
788 				goto done_nosellock;			\
789 		}							\
790 	} while (0)
791 	getbits(fd_in, 0);
792 	getbits(fd_ou, 1);
793 	getbits(fd_ex, 2);
794 #undef	getbits
795 	if (nbufbytes != 0)
796 		bzero(selbits, nbufbytes / 2);
797 
798 	if (tvp != NULL) {
799 		atv = *tvp;
800 		if (itimerfix(&atv)) {
801 			error = EINVAL;
802 			goto done_nosellock;
803 		}
804 		getmicrouptime(&rtv);
805 		timevaladd(&atv, &rtv);
806 	} else {
807 		atv.tv_sec = 0;
808 		atv.tv_usec = 0;
809 	}
810 	timo = 0;
811 	TAILQ_INIT(&td->td_selq);
812 	mtx_lock(&sellock);
813 retry:
814 	ncoll = nselcoll;
815 	mtx_lock_spin(&sched_lock);
816 	td->td_flags |= TDF_SELECT;
817 	mtx_unlock_spin(&sched_lock);
818 	mtx_unlock(&sellock);
819 
820 	error = selscan(td, ibits, obits, nd);
821 	mtx_lock(&sellock);
822 	if (error || td->td_retval[0])
823 		goto done;
824 	if (atv.tv_sec || atv.tv_usec) {
825 		getmicrouptime(&rtv);
826 		if (timevalcmp(&rtv, &atv, >=))
827 			goto done;
828 		ttv = atv;
829 		timevalsub(&ttv, &rtv);
830 		timo = ttv.tv_sec > 24 * 60 * 60 ?
831 		    24 * 60 * 60 * hz : tvtohz(&ttv);
832 	}
833 
834 	/*
835 	 * An event of interest may occur while we do not hold
836 	 * sellock, so check TDF_SELECT and the number of
837 	 * collisions and rescan the file descriptors if
838 	 * necessary.
839 	 */
840 	mtx_lock_spin(&sched_lock);
841 	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
842 		mtx_unlock_spin(&sched_lock);
843 		goto retry;
844 	}
845 	mtx_unlock_spin(&sched_lock);
846 
847 	if (timo > 0)
848 		error = cv_timedwait_sig(&selwait, &sellock, timo);
849 	else
850 		error = cv_wait_sig(&selwait, &sellock);
851 
852 	if (error == 0)
853 		goto retry;
854 
855 done:
856 	clear_selinfo_list(td);
857 	mtx_lock_spin(&sched_lock);
858 	td->td_flags &= ~TDF_SELECT;
859 	mtx_unlock_spin(&sched_lock);
860 	mtx_unlock(&sellock);
861 
862 done_nosellock:
863 	/* select is not restarted after signals... */
864 	if (error == ERESTART)
865 		error = EINTR;
866 	if (error == EWOULDBLOCK)
867 		error = 0;
868 #define	putbits(name, x) \
869 	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
870 		error = error2;
871 	if (error == 0) {
872 		int error2;
873 
874 		putbits(fd_in, 0);
875 		putbits(fd_ou, 1);
876 		putbits(fd_ex, 2);
877 #undef putbits
878 	}
879 	if (selbits != &s_selbits[0])
880 		free(selbits, M_SELECT);
881 
882 	mtx_unlock(&Giant);
883 	return (error);
884 }
885 
/*
 * Poll every descriptor whose bit is set in the input sets; set the
 * corresponding bit in the output sets when the descriptor is ready.
 * The ready count is returned via td->td_retval[0]; EBADF is returned
 * if a set bit names a closed descriptor.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	/* msk: 0 = read set, 1 = write set, 2 = except set. */
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
927 
928 /*
929  * Poll system call.
930  */
931 #ifndef _SYS_SYSPROTO_H_
932 struct poll_args {
933 	struct pollfd *fds;
934 	u_int	nfds;
935 	int	timeout;
936 };
937 #endif
938 /*
939  * MPSAFE
940  */
941 int
942 poll(td, uap)
943 	struct thread *td;
944 	struct poll_args *uap;
945 {
946 	caddr_t bits;
947 	char smallbits[32 * sizeof(struct pollfd)];
948 	struct timeval atv, rtv, ttv;
949 	int error = 0, timo;
950 	u_int ncoll, nfds;
951 	size_t ni;
952 
953 	nfds = uap->nfds;
954 
955 	mtx_lock(&Giant);
956 	/*
957 	 * This is kinda bogus.  We have fd limits, but that is not
958 	 * really related to the size of the pollfd array.  Make sure
959 	 * we let the process use at least FD_SETSIZE entries and at
960 	 * least enough for the current limits.  We want to be reasonably
961 	 * safe, but not overly restrictive.
962 	 */
963 	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
964 	    (nfds > FD_SETSIZE)) {
965 		error = EINVAL;
966 		goto done2;
967 	}
968 	ni = nfds * sizeof(struct pollfd);
969 	if (ni > sizeof(smallbits))
970 		bits = malloc(ni, M_TEMP, M_WAITOK);
971 	else
972 		bits = smallbits;
973 	error = copyin(uap->fds, bits, ni);
974 	if (error)
975 		goto done_nosellock;
976 	if (uap->timeout != INFTIM) {
977 		atv.tv_sec = uap->timeout / 1000;
978 		atv.tv_usec = (uap->timeout % 1000) * 1000;
979 		if (itimerfix(&atv)) {
980 			error = EINVAL;
981 			goto done_nosellock;
982 		}
983 		getmicrouptime(&rtv);
984 		timevaladd(&atv, &rtv);
985 	} else {
986 		atv.tv_sec = 0;
987 		atv.tv_usec = 0;
988 	}
989 	timo = 0;
990 	TAILQ_INIT(&td->td_selq);
991 	mtx_lock(&sellock);
992 retry:
993 	ncoll = nselcoll;
994 	mtx_lock_spin(&sched_lock);
995 	td->td_flags |= TDF_SELECT;
996 	mtx_unlock_spin(&sched_lock);
997 	mtx_unlock(&sellock);
998 
999 	error = pollscan(td, (struct pollfd *)bits, nfds);
1000 	mtx_lock(&sellock);
1001 	if (error || td->td_retval[0])
1002 		goto done;
1003 	if (atv.tv_sec || atv.tv_usec) {
1004 		getmicrouptime(&rtv);
1005 		if (timevalcmp(&rtv, &atv, >=))
1006 			goto done;
1007 		ttv = atv;
1008 		timevalsub(&ttv, &rtv);
1009 		timo = ttv.tv_sec > 24 * 60 * 60 ?
1010 		    24 * 60 * 60 * hz : tvtohz(&ttv);
1011 	}
1012 	/*
1013 	 * An event of interest may occur while we do not hold
1014 	 * sellock, so check TDF_SELECT and the number of collisions
1015 	 * and rescan the file descriptors if necessary.
1016 	 */
1017 	mtx_lock_spin(&sched_lock);
1018 	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
1019 		mtx_unlock_spin(&sched_lock);
1020 		goto retry;
1021 	}
1022 	mtx_unlock_spin(&sched_lock);
1023 
1024 	if (timo > 0)
1025 		error = cv_timedwait_sig(&selwait, &sellock, timo);
1026 	else
1027 		error = cv_wait_sig(&selwait, &sellock);
1028 
1029 	if (error == 0)
1030 		goto retry;
1031 
1032 done:
1033 	clear_selinfo_list(td);
1034 	mtx_lock_spin(&sched_lock);
1035 	td->td_flags &= ~TDF_SELECT;
1036 	mtx_unlock_spin(&sched_lock);
1037 	mtx_unlock(&sellock);
1038 
1039 done_nosellock:
1040 	/* poll is not restarted after signals... */
1041 	if (error == ERESTART)
1042 		error = EINTR;
1043 	if (error == EWOULDBLOCK)
1044 		error = 0;
1045 	if (error == 0) {
1046 		error = copyout(bits, uap->fds, ni);
1047 		if (error)
1048 			goto out;
1049 	}
1050 out:
1051 	if (ni > sizeof(smallbits))
1052 		free(bits, M_TEMP);
1053 done2:
1054 	mtx_unlock(&Giant);
1055 	return (error);
1056 }
1057 
/*
 * Fill in revents for each pollfd entry: POLLNVAL for out-of-range or
 * closed descriptors, 0 for negative fds (skipped per poll semantics),
 * otherwise whatever fo_poll reports.  The count of entries with
 * nonzero revents is returned via td->td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fd: ignore this entry. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
1097 
1098 /*
1099  * OpenBSD poll system call.
1100  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1101  */
1102 #ifndef _SYS_SYSPROTO_H_
1103 struct openbsd_poll_args {
1104 	struct pollfd *fds;
1105 	u_int	nfds;
1106 	int	timeout;
1107 };
1108 #endif
1109 /*
1110  * MPSAFE
1111  */
1112 int
1113 openbsd_poll(td, uap)
1114 	register struct thread *td;
1115 	register struct openbsd_poll_args *uap;
1116 {
1117 	return (poll(td, (struct poll_args *)uap));
1118 }
1119 
1120 /*
1121  * Remove the references to the thread from all of the objects
1122  * we were polling.
1123  *
1124  * This code assumes that the underlying owner of the selinfo
1125  * structure will hold sellock before it changes it, and that
1126  * it will unlink itself from our list if it goes away.
1127  */
1128 void
1129 clear_selinfo_list(td)
1130 	struct thread *td;
1131 {
1132 	struct selinfo *si;
1133 
1134 	mtx_assert(&sellock, MA_OWNED);
1135 	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1136 		si->si_thread = NULL;
1137 	TAILQ_INIT(&td->td_selq);
1138 }
1139 
1140 /*ARGSUSED*/
1141 int
1142 seltrue(dev, events, td)
1143 	dev_t dev;
1144 	int events;
1145 	struct thread *td;
1146 {
1147 
1148 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1149 }
1150 
1151 /*
1152  * Record a select request.
1153  */
1154 void
1155 selrecord(selector, sip)
1156 	struct thread *selector;
1157 	struct selinfo *sip;
1158 {
1159 
1160 	mtx_lock(&sellock);
1161 	/*
1162 	 * If the selinfo's thread pointer is NULL then take ownership of it.
1163 	 *
1164 	 * If the thread pointer is not NULL and it points to another
1165 	 * thread, then we have a collision.
1166 	 *
1167 	 * If the thread pointer is not NULL and points back to us then leave
1168 	 * it alone as we've already added pointed it at us and added it to
1169 	 * our list.
1170 	 */
1171 	if (sip->si_thread == NULL) {
1172 		sip->si_thread = selector;
1173 		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1174 	} else if (sip->si_thread != selector) {
1175 		sip->si_flags |= SI_COLL;
1176 	}
1177 
1178 	mtx_unlock(&sellock);
1179 }
1180 
1181 /*
1182  * Do a wakeup when a selectable event occurs.
1183  */
1184 void
1185 selwakeup(sip)
1186 	struct selinfo *sip;
1187 {
1188 	struct thread *td;
1189 
1190 	mtx_lock(&sellock);
1191 	td = sip->si_thread;
1192 	if ((sip->si_flags & SI_COLL) != 0) {
1193 		nselcoll++;
1194 		sip->si_flags &= ~SI_COLL;
1195 		cv_broadcast(&selwait);
1196 	}
1197 	if (td == NULL) {
1198 		mtx_unlock(&sellock);
1199 		return;
1200 	}
1201 	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
1202 	sip->si_thread = NULL;
1203 	mtx_lock_spin(&sched_lock);
1204 	if (td->td_wchan == &selwait) {
1205 		cv_waitq_remove(td);
1206 		TD_CLR_SLEEPING(td);
1207 		setrunnable(td);
1208 	} else
1209 		td->td_flags &= ~TDF_SELECT;
1210 	mtx_unlock_spin(&sched_lock);
1211 	mtx_unlock(&sellock);
1212 }
1213 
1214 static void selectinit(void *);
1215 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1216 
1217 /* ARGSUSED*/
1218 static void
1219 selectinit(dummy)
1220 	void *dummy;
1221 {
1222 	cv_init(&selwait, "select");
1223 	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1224 }
1225