xref: /freebsd/sys/kern/sys_generic.c (revision 3ff369fed2a08f32dda232c10470b949bef9489f)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $FreeBSD$
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/signalvar.h>
53 #include <sys/socketvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/resourcevar.h>
59 #include <sys/selinfo.h>
60 #include <sys/sysctl.h>
61 #include <sys/sysent.h>
62 #include <sys/bio.h>
63 #include <sys/buf.h>
64 #include <sys/condvar.h>
65 #ifdef __alpha__
66 #include <sys/disklabel.h>
67 #endif
68 #ifdef KTRACE
69 #include <sys/ktrace.h>
70 #endif
71 #include <vm/vm.h>
72 #include <vm/vm_page.h>
73 
74 #include <machine/limits.h>
75 
76 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
77 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
78 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
79 
80 static int	pollscan(struct thread *, struct pollfd *, u_int);
81 static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
82 static int	dofileread(struct thread *, struct file *, int, void *,
83 		    size_t, off_t, int);
84 static int	dofilewrite(struct thread *, struct file *, int,
85 		    const void *, size_t, off_t, int);
86 
87 /*
88  * Read system call.
89  */
90 #ifndef _SYS_SYSPROTO_H_
91 struct read_args {
92 	int	fd;
93 	void	*buf;
94 	size_t	nbyte;
95 };
96 #endif
97 /*
98  * MPSAFE
99  */
100 int
101 read(td, uap)
102 	struct thread *td;
103 	struct read_args *uap;
104 {
105 	struct file *fp;
106 	int error;
107 
108 	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
109 		error = dofileread(td, fp, uap->fd, uap->buf,
110 			    uap->nbyte, (off_t)-1, 0);
111 		fdrop(fp, td);
112 	}
113 	return(error);
114 }
115 
116 /*
117  * Pread system call
118  */
119 #ifndef _SYS_SYSPROTO_H_
120 struct pread_args {
121 	int	fd;
122 	void	*buf;
123 	size_t	nbyte;
124 	int	pad;
125 	off_t	offset;
126 };
127 #endif
128 /*
129  * MPSAFE
130  */
131 int
132 pread(td, uap)
133 	struct thread *td;
134 	struct pread_args *uap;
135 {
136 	struct file *fp;
137 	int error;
138 
139 	if ((error = fget_read(td, uap->fd, &fp)) != 0)
140 		return (error);
141 	if (fp->f_type != DTYPE_VNODE) {
142 		error = ESPIPE;
143 	} else {
144 		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
145 			    uap->offset, FOF_OFFSET);
146 	}
147 	fdrop(fp, td);
148 	return(error);
149 }
150 
151 /*
152  * Code common for read and pread
153  */
154 int
155 dofileread(td, fp, fd, buf, nbyte, offset, flags)
156 	struct thread *td;
157 	struct file *fp;
158 	int fd, flags;
159 	void *buf;
160 	size_t nbyte;
161 	off_t offset;
162 {
163 	struct uio auio;
164 	struct iovec aiov;
165 	long cnt, error = 0;
166 #ifdef KTRACE
167 	struct iovec ktriov;
168 	struct uio ktruio;
169 	int didktr = 0;
170 #endif
171 
172 	aiov.iov_base = (caddr_t)buf;
173 	aiov.iov_len = nbyte;
174 	auio.uio_iov = &aiov;
175 	auio.uio_iovcnt = 1;
176 	auio.uio_offset = offset;
177 	if (nbyte > INT_MAX)
178 		return (EINVAL);
179 	auio.uio_resid = nbyte;
180 	auio.uio_rw = UIO_READ;
181 	auio.uio_segflg = UIO_USERSPACE;
182 	auio.uio_td = td;
183 #ifdef KTRACE
184 	/*
185 	 * if tracing, save a copy of iovec
186 	 */
187 	if (KTRPOINT(td, KTR_GENIO)) {
188 		ktriov = aiov;
189 		ktruio = auio;
190 		didktr = 1;
191 	}
192 #endif
193 	cnt = nbyte;
194 
195 	if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) {
196 		if (auio.uio_resid != cnt && (error == ERESTART ||
197 		    error == EINTR || error == EWOULDBLOCK))
198 			error = 0;
199 	}
200 	cnt -= auio.uio_resid;
201 #ifdef KTRACE
202 	if (didktr && error == 0) {
203 		ktruio.uio_iov = &ktriov;
204 		ktruio.uio_resid = cnt;
205 		ktrgenio(fd, UIO_READ, &ktruio, error);
206 	}
207 #endif
208 	td->td_retval[0] = cnt;
209 	return (error);
210 }
211 
212 /*
213  * Scatter read system call.
214  */
215 #ifndef _SYS_SYSPROTO_H_
216 struct readv_args {
217 	int	fd;
218 	struct	iovec *iovp;
219 	u_int	iovcnt;
220 };
221 #endif
222 /*
223  * MPSAFE
224  */
225 int
226 readv(td, uap)
227 	struct thread *td;
228 	struct readv_args *uap;
229 {
230 	struct file *fp;
231 	struct uio auio;
232 	struct iovec *iov;
233 	struct iovec *needfree;
234 	struct iovec aiov[UIO_SMALLIOV];
235 	long i, cnt;
236 	int error;
237 	u_int iovlen;
238 #ifdef KTRACE
239 	struct iovec *ktriov = NULL;
240 	struct uio ktruio;
241 #endif
242 
243 	if ((error = fget_read(td, uap->fd, &fp)) != 0)
244 		return (error);
245 	needfree = NULL;
246 	/* note: can't use iovlen until iovcnt is validated */
247 	iovlen = uap->iovcnt * sizeof (struct iovec);
248 	if (uap->iovcnt > UIO_SMALLIOV) {
249 		if (uap->iovcnt > UIO_MAXIOV) {
250 			error = EINVAL;
251 			goto done;
252 		}
253 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
254 		needfree = iov;
255 	} else
256 		iov = aiov;
257 	auio.uio_iov = iov;
258 	auio.uio_iovcnt = uap->iovcnt;
259 	auio.uio_rw = UIO_READ;
260 	auio.uio_segflg = UIO_USERSPACE;
261 	auio.uio_td = td;
262 	auio.uio_offset = -1;
263 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
264 		goto done;
265 	auio.uio_resid = 0;
266 	for (i = 0; i < uap->iovcnt; i++) {
267 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
268 			error = EINVAL;
269 			goto done;
270 		}
271 		auio.uio_resid += iov->iov_len;
272 		iov++;
273 	}
274 #ifdef KTRACE
275 	/*
276 	 * if tracing, save a copy of iovec
277 	 */
278 	if (KTRPOINT(td, KTR_GENIO))  {
279 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
280 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
281 		ktruio = auio;
282 	}
283 #endif
284 	cnt = auio.uio_resid;
285 	if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) {
286 		if (auio.uio_resid != cnt && (error == ERESTART ||
287 		    error == EINTR || error == EWOULDBLOCK))
288 			error = 0;
289 	}
290 	cnt -= auio.uio_resid;
291 #ifdef KTRACE
292 	if (ktriov != NULL) {
293 		if (error == 0) {
294 			ktruio.uio_iov = ktriov;
295 			ktruio.uio_resid = cnt;
296 			ktrgenio(uap->fd, UIO_READ, &ktruio, error);
297 		}
298 		FREE(ktriov, M_TEMP);
299 	}
300 #endif
301 	td->td_retval[0] = cnt;
302 done:
303 	fdrop(fp, td);
304 	if (needfree)
305 		FREE(needfree, M_IOV);
306 	return (error);
307 }
308 
309 /*
310  * Write system call
311  */
312 #ifndef _SYS_SYSPROTO_H_
313 struct write_args {
314 	int	fd;
315 	const void *buf;
316 	size_t	nbyte;
317 };
318 #endif
319 /*
320  * MPSAFE
321  */
322 int
323 write(td, uap)
324 	struct thread *td;
325 	struct write_args *uap;
326 {
327 	struct file *fp;
328 	int error;
329 
330 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
331 		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
332 			    (off_t)-1, 0);
333 		fdrop(fp, td);
334 	} else {
335 		error = EBADF;	/* XXX this can't be right */
336 	}
337 	return(error);
338 }
339 
340 /*
341  * Pwrite system call
342  */
343 #ifndef _SYS_SYSPROTO_H_
344 struct pwrite_args {
345 	int	fd;
346 	const void *buf;
347 	size_t	nbyte;
348 	int	pad;
349 	off_t	offset;
350 };
351 #endif
352 /*
353  * MPSAFE
354  */
355 int
356 pwrite(td, uap)
357 	struct thread *td;
358 	struct pwrite_args *uap;
359 {
360 	struct file *fp;
361 	int error;
362 
363 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
364 		if (fp->f_type == DTYPE_VNODE) {
365 			error = dofilewrite(td, fp, uap->fd, uap->buf,
366 				    uap->nbyte, uap->offset, FOF_OFFSET);
367 		} else {
368 			error = ESPIPE;
369 		}
370 		fdrop(fp, td);
371 	} else {
372 		error = EBADF;	/* this can't be right */
373 	}
374 	return(error);
375 }
376 
/*
 * Code common for write and pwrite.
 *
 * Perform a single-iovec write from the user buffer to fp, set
 * td->td_retval[0] to the number of bytes transferred, post SIGPIPE
 * on EPIPE, and log the transfer via ktrace when GENIO tracing is
 * enabled.  "flags" is 0 (current file offset) or FOF_OFFSET.
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Cast away const: iov_base is non-const but fo_write only reads. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* uio_resid is an int downstream; refuse oversized requests. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	/* Give the buffer cache a chance to flush before dirtying more. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) {
		/* A partial transfer before interruption counts as success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Writing to a broken pipe also raises SIGPIPE. */
		if (error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
440 
441 /*
442  * Gather write system call
443  */
444 #ifndef _SYS_SYSPROTO_H_
445 struct writev_args {
446 	int	fd;
447 	struct	iovec *iovp;
448 	u_int	iovcnt;
449 };
450 #endif
451 /*
452  * MPSAFE
453  */
454 int
455 writev(td, uap)
456 	struct thread *td;
457 	register struct writev_args *uap;
458 {
459 	struct file *fp;
460 	struct uio auio;
461 	register struct iovec *iov;
462 	struct iovec *needfree;
463 	struct iovec aiov[UIO_SMALLIOV];
464 	long i, cnt, error = 0;
465 	u_int iovlen;
466 #ifdef KTRACE
467 	struct iovec *ktriov = NULL;
468 	struct uio ktruio;
469 #endif
470 
471 	mtx_lock(&Giant);
472 	if ((error = fget_write(td, uap->fd, &fp)) != 0) {
473 		error = EBADF;
474 		goto done2;
475 	}
476 	/* note: can't use iovlen until iovcnt is validated */
477 	iovlen = uap->iovcnt * sizeof (struct iovec);
478 	if (uap->iovcnt > UIO_SMALLIOV) {
479 		if (uap->iovcnt > UIO_MAXIOV) {
480 			needfree = NULL;
481 			error = EINVAL;
482 			goto done;
483 		}
484 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
485 		needfree = iov;
486 	} else {
487 		iov = aiov;
488 		needfree = NULL;
489 	}
490 	auio.uio_iov = iov;
491 	auio.uio_iovcnt = uap->iovcnt;
492 	auio.uio_rw = UIO_WRITE;
493 	auio.uio_segflg = UIO_USERSPACE;
494 	auio.uio_td = td;
495 	auio.uio_offset = -1;
496 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
497 		goto done;
498 	auio.uio_resid = 0;
499 	for (i = 0; i < uap->iovcnt; i++) {
500 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
501 			error = EINVAL;
502 			goto done;
503 		}
504 		auio.uio_resid += iov->iov_len;
505 		iov++;
506 	}
507 #ifdef KTRACE
508 	/*
509 	 * if tracing, save a copy of iovec and uio
510 	 */
511 	if (KTRPOINT(td, KTR_GENIO))  {
512 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
513 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
514 		ktruio = auio;
515 	}
516 #endif
517 	cnt = auio.uio_resid;
518 	if (fp->f_type == DTYPE_VNODE)
519 		bwillwrite();
520 	if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) {
521 		if (auio.uio_resid != cnt && (error == ERESTART ||
522 		    error == EINTR || error == EWOULDBLOCK))
523 			error = 0;
524 		if (error == EPIPE) {
525 			PROC_LOCK(td->td_proc);
526 			psignal(td->td_proc, SIGPIPE);
527 			PROC_UNLOCK(td->td_proc);
528 		}
529 	}
530 	cnt -= auio.uio_resid;
531 #ifdef KTRACE
532 	if (ktriov != NULL) {
533 		if (error == 0) {
534 			ktruio.uio_iov = ktriov;
535 			ktruio.uio_resid = cnt;
536 			ktrgenio(uap->fd, UIO_WRITE, &ktruio, error);
537 		}
538 		FREE(ktriov, M_TEMP);
539 	}
540 #endif
541 	td->td_retval[0] = cnt;
542 done:
543 	fdrop(fp, td);
544 	if (needfree)
545 		FREE(needfree, M_IOV);
546 done2:
547 	mtx_unlock(&Giant);
548 	return (error);
549 }
550 
551 /*
552  * Ioctl system call
553  */
554 #ifndef _SYS_SYSPROTO_H_
555 struct ioctl_args {
556 	int	fd;
557 	u_long	com;
558 	caddr_t	data;
559 };
560 #endif
561 /*
562  * MPSAFE
563  */
564 /* ARGSUSED */
565 int
566 ioctl(td, uap)
567 	struct thread *td;
568 	register struct ioctl_args *uap;
569 {
570 	struct file *fp;
571 	register struct filedesc *fdp;
572 	register u_long com;
573 	int error = 0;
574 	register u_int size;
575 	caddr_t data, memp;
576 	int tmp;
577 #define STK_PARAMS	128
578 	union {
579 	    char stkbuf[STK_PARAMS];
580 	    long align;
581 	} ubuf;
582 
583 	if ((error = fget(td, uap->fd, &fp)) != 0)
584 		return (error);
585 	mtx_lock(&Giant);
586 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
587 		fdrop(fp, td);
588 		mtx_unlock(&Giant);
589 		return (EBADF);
590 	}
591 	fdp = td->td_proc->p_fd;
592 	switch (com = uap->com) {
593 	case FIONCLEX:
594 		FILEDESC_LOCK(fdp);
595 		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
596 		FILEDESC_UNLOCK(fdp);
597 		fdrop(fp, td);
598 		mtx_unlock(&Giant);
599 		return (0);
600 	case FIOCLEX:
601 		FILEDESC_LOCK(fdp);
602 		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
603 		FILEDESC_UNLOCK(fdp);
604 		fdrop(fp, td);
605 		mtx_unlock(&Giant);
606 		return (0);
607 	}
608 
609 	/*
610 	 * Interpret high order word to find amount of data to be
611 	 * copied to/from the user's address space.
612 	 */
613 	size = IOCPARM_LEN(com);
614 	if (size > IOCPARM_MAX) {
615 		fdrop(fp, td);
616 		mtx_unlock(&Giant);
617 		return (ENOTTY);
618 	}
619 
620 	memp = NULL;
621 	if (size > sizeof (ubuf.stkbuf)) {
622 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
623 		data = memp;
624 	} else {
625 		data = ubuf.stkbuf;
626 	}
627 	if (com&IOC_IN) {
628 		if (size) {
629 			error = copyin(uap->data, data, (u_int)size);
630 			if (error) {
631 				if (memp)
632 					free(memp, M_IOCTLOPS);
633 				fdrop(fp, td);
634 				goto done;
635 			}
636 		} else {
637 			*(caddr_t *)data = uap->data;
638 		}
639 	} else if ((com&IOC_OUT) && size) {
640 		/*
641 		 * Zero the buffer so the user always
642 		 * gets back something deterministic.
643 		 */
644 		bzero(data, size);
645 	} else if (com&IOC_VOID) {
646 		*(caddr_t *)data = uap->data;
647 	}
648 
649 	switch (com) {
650 
651 	case FIONBIO:
652 		FILE_LOCK(fp);
653 		if ((tmp = *(int *)data))
654 			fp->f_flag |= FNONBLOCK;
655 		else
656 			fp->f_flag &= ~FNONBLOCK;
657 		FILE_UNLOCK(fp);
658 		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
659 		break;
660 
661 	case FIOASYNC:
662 		FILE_LOCK(fp);
663 		if ((tmp = *(int *)data))
664 			fp->f_flag |= FASYNC;
665 		else
666 			fp->f_flag &= ~FASYNC;
667 		FILE_UNLOCK(fp);
668 		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
669 		break;
670 
671 	default:
672 		error = fo_ioctl(fp, com, data, td);
673 		/*
674 		 * Copy any data to user, size was
675 		 * already set and checked above.
676 		 */
677 		if (error == 0 && (com&IOC_OUT) && size)
678 			error = copyout(data, uap->data, (u_int)size);
679 		break;
680 	}
681 	if (memp)
682 		free(memp, M_IOCTLOPS);
683 	fdrop(fp, td);
684 done:
685 	mtx_unlock(&Giant);
686 	return (error);
687 }
688 
689 /*
690  * sellock and selwait are initialized in selectinit() via SYSINIT.
691  */
692 struct mtx	sellock;
693 struct cv	selwait;
694 u_int		nselcoll;	/* Select collisions since boot */
695 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
696 
697 /*
698  * Select system call.
699  */
700 #ifndef _SYS_SYSPROTO_H_
701 struct select_args {
702 	int	nd;
703 	fd_set	*in, *ou, *ex;
704 	struct	timeval *tv;
705 };
706 #endif
707 /*
708  * MPSAFE
709  */
710 int
711 select(td, uap)
712 	register struct thread *td;
713 	register struct select_args *uap;
714 {
715 	struct filedesc *fdp;
716 	/*
717 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
718 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
719 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
720 	 * of 256.
721 	 */
722 	fd_mask s_selbits[howmany(2048, NFDBITS)];
723 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
724 	struct timeval atv, rtv, ttv;
725 	int error, timo;
726 	u_int ncoll, nbufbytes, ncpbytes, nfdbits;
727 
728 	if (uap->nd < 0)
729 		return (EINVAL);
730 	fdp = td->td_proc->p_fd;
731 	mtx_lock(&Giant);
732 	FILEDESC_LOCK(fdp);
733 
734 	if (uap->nd > td->td_proc->p_fd->fd_nfiles)
735 		uap->nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
736 	FILEDESC_UNLOCK(fdp);
737 
738 	/*
739 	 * Allocate just enough bits for the non-null fd_sets.  Use the
740 	 * preallocated auto buffer if possible.
741 	 */
742 	nfdbits = roundup(uap->nd, NFDBITS);
743 	ncpbytes = nfdbits / NBBY;
744 	nbufbytes = 0;
745 	if (uap->in != NULL)
746 		nbufbytes += 2 * ncpbytes;
747 	if (uap->ou != NULL)
748 		nbufbytes += 2 * ncpbytes;
749 	if (uap->ex != NULL)
750 		nbufbytes += 2 * ncpbytes;
751 	if (nbufbytes <= sizeof s_selbits)
752 		selbits = &s_selbits[0];
753 	else
754 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
755 
756 	/*
757 	 * Assign pointers into the bit buffers and fetch the input bits.
758 	 * Put the output buffers together so that they can be bzeroed
759 	 * together.
760 	 */
761 	sbp = selbits;
762 #define	getbits(name, x) \
763 	do {								\
764 		if (uap->name == NULL)					\
765 			ibits[x] = NULL;				\
766 		else {							\
767 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
768 			obits[x] = sbp;					\
769 			sbp += ncpbytes / sizeof *sbp;			\
770 			error = copyin(uap->name, ibits[x], ncpbytes);	\
771 			if (error != 0)					\
772 				goto done_nosellock;			\
773 		}							\
774 	} while (0)
775 	getbits(in, 0);
776 	getbits(ou, 1);
777 	getbits(ex, 2);
778 #undef	getbits
779 	if (nbufbytes != 0)
780 		bzero(selbits, nbufbytes / 2);
781 
782 	if (uap->tv) {
783 		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
784 			sizeof (atv));
785 		if (error)
786 			goto done_nosellock;
787 		if (itimerfix(&atv)) {
788 			error = EINVAL;
789 			goto done_nosellock;
790 		}
791 		getmicrouptime(&rtv);
792 		timevaladd(&atv, &rtv);
793 	} else {
794 		atv.tv_sec = 0;
795 		atv.tv_usec = 0;
796 	}
797 	timo = 0;
798 	mtx_lock(&sellock);
799 retry:
800 	ncoll = nselcoll;
801 	mtx_lock_spin(&sched_lock);
802 	td->td_flags |= TDF_SELECT;
803 	mtx_unlock_spin(&sched_lock);
804 	mtx_unlock(&sellock);
805 
806 	/* XXX Is there a better place for this? */
807 	TAILQ_INIT(&td->td_selq);
808 	error = selscan(td, ibits, obits, uap->nd);
809 	mtx_lock(&sellock);
810 	if (error || td->td_retval[0])
811 		goto done;
812 	if (atv.tv_sec || atv.tv_usec) {
813 		getmicrouptime(&rtv);
814 		if (timevalcmp(&rtv, &atv, >=))
815 			goto done;
816 		ttv = atv;
817 		timevalsub(&ttv, &rtv);
818 		timo = ttv.tv_sec > 24 * 60 * 60 ?
819 		    24 * 60 * 60 * hz : tvtohz(&ttv);
820 	}
821 
822 	/*
823 	 * An event of interest may occur while we do not hold
824 	 * sellock, so check TDF_SELECT and the number of
825 	 * collisions and rescan the file descriptors if
826 	 * necessary.
827 	 */
828 	mtx_lock_spin(&sched_lock);
829 	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
830 		mtx_unlock_spin(&sched_lock);
831 		goto retry;
832 	}
833 	mtx_unlock_spin(&sched_lock);
834 
835 	if (timo > 0)
836 		error = cv_timedwait_sig(&selwait, &sellock, timo);
837 	else
838 		error = cv_wait_sig(&selwait, &sellock);
839 
840 	if (error == 0)
841 		goto retry;
842 
843 done:
844 	clear_selinfo_list(td);
845 	mtx_lock_spin(&sched_lock);
846 	td->td_flags &= ~TDF_SELECT;
847 	mtx_unlock_spin(&sched_lock);
848 	mtx_unlock(&sellock);
849 
850 done_nosellock:
851 	/* select is not restarted after signals... */
852 	if (error == ERESTART)
853 		error = EINTR;
854 	if (error == EWOULDBLOCK)
855 		error = 0;
856 #define	putbits(name, x) \
857 	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
858 		error = error2;
859 	if (error == 0) {
860 		int error2;
861 
862 		putbits(in, 0);
863 		putbits(ou, 1);
864 		putbits(ex, 2);
865 #undef putbits
866 	}
867 	if (selbits != &s_selbits[0])
868 		free(selbits, M_SELECT);
869 
870 	mtx_unlock(&Giant);
871 	return (error);
872 }
873 
/*
 * Poll every descriptor set in the input bitmaps, recording ready
 * descriptors in the output bitmaps.  Returns 0 with the ready count
 * in td->td_retval[0], or EBADF if a set bit names a closed descriptor.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;	/* count of ready descriptors */
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	/* msk indexes the three sets: read, write, exception. */
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
914 
915 /*
916  * Poll system call.
917  */
918 #ifndef _SYS_SYSPROTO_H_
919 struct poll_args {
920 	struct pollfd *fds;
921 	u_int	nfds;
922 	int	timeout;
923 };
924 #endif
925 /*
926  * MPSAFE
927  */
928 int
929 poll(td, uap)
930 	struct thread *td;
931 	struct poll_args *uap;
932 {
933 	caddr_t bits;
934 	char smallbits[32 * sizeof(struct pollfd)];
935 	struct timeval atv, rtv, ttv;
936 	int error = 0, timo;
937 	u_int ncoll, nfds;
938 	size_t ni;
939 
940 	nfds = SCARG(uap, nfds);
941 
942 	mtx_lock(&Giant);
943 	/*
944 	 * This is kinda bogus.  We have fd limits, but that is not
945 	 * really related to the size of the pollfd array.  Make sure
946 	 * we let the process use at least FD_SETSIZE entries and at
947 	 * least enough for the current limits.  We want to be reasonably
948 	 * safe, but not overly restrictive.
949 	 */
950 	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
951 	    (nfds > FD_SETSIZE)) {
952 		error = EINVAL;
953 		goto done2;
954 	}
955 	ni = nfds * sizeof(struct pollfd);
956 	if (ni > sizeof(smallbits))
957 		bits = malloc(ni, M_TEMP, M_WAITOK);
958 	else
959 		bits = smallbits;
960 	error = copyin(SCARG(uap, fds), bits, ni);
961 	if (error)
962 		goto done_nosellock;
963 	if (SCARG(uap, timeout) != INFTIM) {
964 		atv.tv_sec = SCARG(uap, timeout) / 1000;
965 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
966 		if (itimerfix(&atv)) {
967 			error = EINVAL;
968 			goto done_nosellock;
969 		}
970 		getmicrouptime(&rtv);
971 		timevaladd(&atv, &rtv);
972 	} else {
973 		atv.tv_sec = 0;
974 		atv.tv_usec = 0;
975 	}
976 	timo = 0;
977 	mtx_lock(&sellock);
978 retry:
979 	ncoll = nselcoll;
980 	mtx_lock_spin(&sched_lock);
981 	td->td_flags |= TDF_SELECT;
982 	mtx_unlock_spin(&sched_lock);
983 	mtx_unlock(&sellock);
984 
985 	/* XXX Is there a better place for this? */
986 	TAILQ_INIT(&td->td_selq);
987 	error = pollscan(td, (struct pollfd *)bits, nfds);
988 	mtx_lock(&sellock);
989 	if (error || td->td_retval[0])
990 		goto done;
991 	if (atv.tv_sec || atv.tv_usec) {
992 		getmicrouptime(&rtv);
993 		if (timevalcmp(&rtv, &atv, >=))
994 			goto done;
995 		ttv = atv;
996 		timevalsub(&ttv, &rtv);
997 		timo = ttv.tv_sec > 24 * 60 * 60 ?
998 		    24 * 60 * 60 * hz : tvtohz(&ttv);
999 	}
1000 	/*
1001 	 * An event of interest may occur while we do not hold
1002 	 * sellock, so check TDF_SELECT and the number of collisions
1003 	 * and rescan the file descriptors if necessary.
1004 	 */
1005 	mtx_lock_spin(&sched_lock);
1006 	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
1007 		mtx_unlock_spin(&sched_lock);
1008 		goto retry;
1009 	}
1010 	mtx_unlock_spin(&sched_lock);
1011 
1012 	if (timo > 0)
1013 		error = cv_timedwait_sig(&selwait, &sellock, timo);
1014 	else
1015 		error = cv_wait_sig(&selwait, &sellock);
1016 
1017 	if (error == 0)
1018 		goto retry;
1019 
1020 done:
1021 	clear_selinfo_list(td);
1022 	mtx_lock_spin(&sched_lock);
1023 	td->td_flags &= ~TDF_SELECT;
1024 	mtx_unlock_spin(&sched_lock);
1025 	mtx_unlock(&sellock);
1026 
1027 done_nosellock:
1028 	/* poll is not restarted after signals... */
1029 	if (error == ERESTART)
1030 		error = EINTR;
1031 	if (error == EWOULDBLOCK)
1032 		error = 0;
1033 	if (error == 0) {
1034 		error = copyout(bits, SCARG(uap, fds), ni);
1035 		if (error)
1036 			goto out;
1037 	}
1038 out:
1039 	if (ni > sizeof(smallbits))
1040 		free(bits, M_TEMP);
1041 done2:
1042 	mtx_unlock(&Giant);
1043 	return (error);
1044 }
1045 
/*
 * Poll each entry of a pollfd array, filling in revents.  Returns 0
 * with the count of entries having nonzero revents in td->td_retval[0].
 * Bad descriptors get POLLNVAL; negative fds are skipped per POSIX.
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;	/* count of entries with events */

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			/* Out of range: flag but keep scanning. */
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fd: ignore this entry. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    fp->f_cred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
1085 
1086 /*
1087  * OpenBSD poll system call.
1088  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1089  */
1090 #ifndef _SYS_SYSPROTO_H_
1091 struct openbsd_poll_args {
1092 	struct pollfd *fds;
1093 	u_int	nfds;
1094 	int	timeout;
1095 };
1096 #endif
1097 /*
1098  * MPSAFE
1099  */
1100 int
1101 openbsd_poll(td, uap)
1102 	register struct thread *td;
1103 	register struct openbsd_poll_args *uap;
1104 {
1105 	return (poll(td, (struct poll_args *)uap));
1106 }
1107 
1108 /*
1109  * Remove the references to the thread from all of the objects
1110  * we were polling.
1111  *
1112  * This code assumes that the underlying owner of the selinfo
1113  * structure will hold sellock before it changes it, and that
1114  * it will unlink itself from our list if it goes away.
1115  */
1116 void
1117 clear_selinfo_list(td)
1118 	struct thread *td;
1119 {
1120 	struct selinfo *si;
1121 
1122 	mtx_assert(&sellock, MA_OWNED);
1123 	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1124 		si->si_thread = NULL;
1125 	TAILQ_INIT(&td->td_selq);
1126 }
1127 
1128 /*ARGSUSED*/
1129 int
1130 seltrue(dev, events, td)
1131 	dev_t dev;
1132 	int events;
1133 	struct thread *td;
1134 {
1135 
1136 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1137 }
1138 
1139 /*
1140  * Record a select request.
1141  */
1142 void
1143 selrecord(selector, sip)
1144 	struct thread *selector;
1145 	struct selinfo *sip;
1146 {
1147 
1148 	mtx_lock(&sellock);
1149 	/*
1150 	 * If the thread is NULL then take ownership of selinfo
1151 	 * however if the thread is not NULL and the thread points to
1152 	 * someone else, then we have a collision, otherwise leave it alone
1153 	 * as we've owned it in a previous selrecord on this selinfo.
1154 	 */
1155 	if (sip->si_thread == NULL) {
1156 		sip->si_thread = selector;
1157 		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1158 	} else if (sip->si_thread != selector) {
1159 		sip->si_flags |= SI_COLL;
1160 	}
1161 
1162 	mtx_unlock(&sellock);
1163 }
1164 
1165 /*
1166  * Do a wakeup when a selectable event occurs.
1167  */
1168 void
1169 selwakeup(sip)
1170 	struct selinfo *sip;
1171 {
1172 	struct thread *td;
1173 
1174 	mtx_lock(&sellock);
1175 	td = sip->si_thread;
1176 	if ((sip->si_flags & SI_COLL) != 0) {
1177 		nselcoll++;
1178 		sip->si_flags &= ~SI_COLL;
1179 		cv_broadcast(&selwait);
1180 	}
1181 	if (td == NULL) {
1182 		mtx_unlock(&sellock);
1183 		return;
1184 	}
1185 	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
1186 	sip->si_thread = NULL;
1187 	mtx_lock_spin(&sched_lock);
1188 	if (td->td_wchan == (caddr_t)&selwait) {
1189 		if (td->td_proc->p_stat == SSLEEP)
1190 			setrunnable(td);
1191 		else
1192 			cv_waitq_remove(td);
1193 	} else
1194 		td->td_flags &= ~TDF_SELECT;
1195 	mtx_unlock_spin(&sched_lock);
1196 	mtx_unlock(&sellock);
1197 }
1198 
1199 static void selectinit(void *);
1200 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1201 
1202 /* ARGSUSED*/
1203 static void
1204 selectinit(dummy)
1205 	void *dummy;
1206 {
1207 	cv_init(&selwait, "select");
1208 	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1209 }
1210