xref: /freebsd/sys/kern/sys_generic.c (revision ee2ea5ceafed78a5bd9810beb9e3ca927180c226)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $FreeBSD$
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/signalvar.h>
53 #include <sys/socketvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/resourcevar.h>
59 #include <sys/selinfo.h>
60 #include <sys/sysctl.h>
61 #include <sys/sysent.h>
62 #include <sys/bio.h>
63 #include <sys/buf.h>
64 #include <sys/condvar.h>
65 #ifdef __alpha__
66 #include <sys/disklabel.h>
67 #endif
68 #ifdef KTRACE
69 #include <sys/ktrace.h>
70 #endif
71 #include <vm/vm.h>
72 #include <vm/vm_page.h>
73 
74 #include <machine/limits.h>
75 
76 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
77 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
78 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
79 
80 static int	pollscan(struct thread *, struct pollfd *, u_int);
81 static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
82 static int	dofileread(struct thread *, struct file *, int, void *,
83 		    size_t, off_t, int);
84 static int	dofilewrite(struct thread *, struct file *, int,
85 		    const void *, size_t, off_t, int);
86 
87 /*
88  * Read system call.
89  */
90 #ifndef _SYS_SYSPROTO_H_
91 struct read_args {
92 	int	fd;
93 	void	*buf;
94 	size_t	nbyte;
95 };
96 #endif
97 /*
98  * MPSAFE
99  */
100 int
101 read(td, uap)
102 	struct thread *td;
103 	struct read_args *uap;
104 {
105 	struct file *fp;
106 	int error;
107 
108 	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
109 		error = dofileread(td, fp, uap->fd, uap->buf,
110 			    uap->nbyte, (off_t)-1, 0);
111 		fdrop(fp, td);
112 	}
113 	return(error);
114 }
115 
116 /*
117  * Pread system call
118  */
119 #ifndef _SYS_SYSPROTO_H_
120 struct pread_args {
121 	int	fd;
122 	void	*buf;
123 	size_t	nbyte;
124 	int	pad;
125 	off_t	offset;
126 };
127 #endif
128 /*
129  * MPSAFE
130  */
131 int
132 pread(td, uap)
133 	struct thread *td;
134 	struct pread_args *uap;
135 {
136 	struct file *fp;
137 	int error;
138 
139 	if ((error = fget_read(td, uap->fd, &fp)) != 0)
140 		return (error);
141 	if (fp->f_type != DTYPE_VNODE) {
142 		error = ESPIPE;
143 	} else {
144 		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
145 			    uap->offset, FOF_OFFSET);
146 	}
147 	fdrop(fp, td);
148 	return(error);
149 }
150 
151 /*
152  * Code common for read and pread
153  */
154 int
155 dofileread(td, fp, fd, buf, nbyte, offset, flags)
156 	struct thread *td;
157 	struct file *fp;
158 	int fd, flags;
159 	void *buf;
160 	size_t nbyte;
161 	off_t offset;
162 {
163 	struct uio auio;
164 	struct iovec aiov;
165 	long cnt, error = 0;
166 #ifdef KTRACE
167 	struct iovec ktriov;
168 	struct uio ktruio;
169 	int didktr = 0;
170 #endif
171 
172 	aiov.iov_base = (caddr_t)buf;
173 	aiov.iov_len = nbyte;
174 	auio.uio_iov = &aiov;
175 	auio.uio_iovcnt = 1;
176 	auio.uio_offset = offset;
177 	if (nbyte > INT_MAX)
178 		return (EINVAL);
179 	auio.uio_resid = nbyte;
180 	auio.uio_rw = UIO_READ;
181 	auio.uio_segflg = UIO_USERSPACE;
182 	auio.uio_td = td;
183 #ifdef KTRACE
184 	/*
185 	 * if tracing, save a copy of iovec
186 	 */
187 	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
188 		ktriov = aiov;
189 		ktruio = auio;
190 		didktr = 1;
191 	}
192 #endif
193 	cnt = nbyte;
194 
195 	if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) {
196 		if (auio.uio_resid != cnt && (error == ERESTART ||
197 		    error == EINTR || error == EWOULDBLOCK))
198 			error = 0;
199 	}
200 	cnt -= auio.uio_resid;
201 #ifdef KTRACE
202 	if (didktr && error == 0) {
203 		ktruio.uio_iov = &ktriov;
204 		ktruio.uio_resid = cnt;
205 		ktrgenio(td->td_proc->p_tracep, fd, UIO_READ, &ktruio, error);
206 	}
207 #endif
208 	td->td_retval[0] = cnt;
209 	return (error);
210 }
211 
212 /*
213  * Scatter read system call.
214  */
215 #ifndef _SYS_SYSPROTO_H_
216 struct readv_args {
217 	int	fd;
218 	struct	iovec *iovp;
219 	u_int	iovcnt;
220 };
221 #endif
222 /*
223  * MPSAFE
224  */
225 int
226 readv(td, uap)
227 	struct thread *td;
228 	struct readv_args *uap;
229 {
230 	struct file *fp;
231 	struct uio auio;
232 	struct iovec *iov;
233 	struct iovec *needfree;
234 	struct iovec aiov[UIO_SMALLIOV];
235 	long i, cnt, error = 0;
236 	u_int iovlen;
237 #ifdef KTRACE
238 	struct iovec *ktriov = NULL;
239 	struct uio ktruio;
240 #endif
241 	mtx_lock(&Giant);
242 
243 	if ((error = fget_read(td, uap->fd, &fp)) != 0)
244 		goto done2;
245 	/* note: can't use iovlen until iovcnt is validated */
246 	iovlen = uap->iovcnt * sizeof (struct iovec);
247 	if (uap->iovcnt > UIO_SMALLIOV) {
248 		if (uap->iovcnt > UIO_MAXIOV) {
249 			error = EINVAL;
250 			goto done2;
251 		}
252 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
253 		needfree = iov;
254 	} else {
255 		iov = aiov;
256 		needfree = NULL;
257 	}
258 	auio.uio_iov = iov;
259 	auio.uio_iovcnt = uap->iovcnt;
260 	auio.uio_rw = UIO_READ;
261 	auio.uio_segflg = UIO_USERSPACE;
262 	auio.uio_td = td;
263 	auio.uio_offset = -1;
264 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
265 		goto done;
266 	auio.uio_resid = 0;
267 	for (i = 0; i < uap->iovcnt; i++) {
268 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
269 			error = EINVAL;
270 			goto done;
271 		}
272 		auio.uio_resid += iov->iov_len;
273 		iov++;
274 	}
275 #ifdef KTRACE
276 	/*
277 	 * if tracing, save a copy of iovec
278 	 */
279 	if (KTRPOINT(td->td_proc, KTR_GENIO))  {
280 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
281 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
282 		ktruio = auio;
283 	}
284 #endif
285 	cnt = auio.uio_resid;
286 	if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) {
287 		if (auio.uio_resid != cnt && (error == ERESTART ||
288 		    error == EINTR || error == EWOULDBLOCK))
289 			error = 0;
290 	}
291 	cnt -= auio.uio_resid;
292 #ifdef KTRACE
293 	if (ktriov != NULL) {
294 		if (error == 0) {
295 			ktruio.uio_iov = ktriov;
296 			ktruio.uio_resid = cnt;
297 			ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_READ, &ktruio,
298 			    error);
299 		}
300 		FREE(ktriov, M_TEMP);
301 	}
302 #endif
303 	td->td_retval[0] = cnt;
304 done:
305 	fdrop(fp, td);
306 	if (needfree)
307 		FREE(needfree, M_IOV);
308 done2:
309 	mtx_unlock(&Giant);
310 	return (error);
311 }
312 
313 /*
314  * Write system call
315  */
316 #ifndef _SYS_SYSPROTO_H_
317 struct write_args {
318 	int	fd;
319 	const void *buf;
320 	size_t	nbyte;
321 };
322 #endif
323 /*
324  * MPSAFE
325  */
326 int
327 write(td, uap)
328 	struct thread *td;
329 	struct write_args *uap;
330 {
331 	struct file *fp;
332 	int error;
333 
334 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
335 		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
336 			    (off_t)-1, 0);
337 		fdrop(fp, td);
338 	} else {
339 		error = EBADF;	/* XXX this can't be right */
340 	}
341 	return(error);
342 }
343 
344 /*
345  * Pwrite system call
346  */
347 #ifndef _SYS_SYSPROTO_H_
348 struct pwrite_args {
349 	int	fd;
350 	const void *buf;
351 	size_t	nbyte;
352 	int	pad;
353 	off_t	offset;
354 };
355 #endif
356 /*
357  * MPSAFE
358  */
359 int
360 pwrite(td, uap)
361 	struct thread *td;
362 	struct pwrite_args *uap;
363 {
364 	struct file *fp;
365 	int error;
366 
367 	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
368 		if (fp->f_type == DTYPE_VNODE) {
369 			error = dofilewrite(td, fp, uap->fd, uap->buf,
370 				    uap->nbyte, uap->offset, FOF_OFFSET);
371 		} else {
372 			error = ESPIPE;
373 		}
374 		fdrop(fp, td);
375 	} else {
376 		error = EBADF;	/* this can't be right */
377 	}
378 	return(error);
379 }
380 
/*
 * Code common for write and pwrite.
 *
 * Performs a single uio-based write of up to nbyte bytes from the user
 * buffer buf to fp.  When FOF_OFFSET is set in flags, offset is the
 * explicit position to write at; otherwise the file's own offset is used
 * (offset is then -1 by convention).  The byte count actually transferred
 * is returned in td->td_retval[0].  The fd argument is only used for
 * ktrace logging.  EPIPE from the backend additionally raises SIGPIPE,
 * per write(2) semantics.
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Cast away const; the uio machinery takes a non-const pointer. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* uio_resid is int-sized; reject requests that would overflow it. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) {
		/* A partial transfer suppresses restartable errors. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(td->td_proc->p_tracep, fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
444 
445 /*
446  * Gather write system call
447  */
448 #ifndef _SYS_SYSPROTO_H_
449 struct writev_args {
450 	int	fd;
451 	struct	iovec *iovp;
452 	u_int	iovcnt;
453 };
454 #endif
455 /*
456  * MPSAFE
457  */
458 int
459 writev(td, uap)
460 	struct thread *td;
461 	register struct writev_args *uap;
462 {
463 	struct file *fp;
464 	struct uio auio;
465 	register struct iovec *iov;
466 	struct iovec *needfree;
467 	struct iovec aiov[UIO_SMALLIOV];
468 	long i, cnt, error = 0;
469 	u_int iovlen;
470 #ifdef KTRACE
471 	struct iovec *ktriov = NULL;
472 	struct uio ktruio;
473 #endif
474 
475 	mtx_lock(&Giant);
476 	if ((error = fget_write(td, uap->fd, &fp)) != 0) {
477 		error = EBADF;
478 		goto done2;
479 	}
480 	/* note: can't use iovlen until iovcnt is validated */
481 	iovlen = uap->iovcnt * sizeof (struct iovec);
482 	if (uap->iovcnt > UIO_SMALLIOV) {
483 		if (uap->iovcnt > UIO_MAXIOV) {
484 			needfree = NULL;
485 			error = EINVAL;
486 			goto done;
487 		}
488 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
489 		needfree = iov;
490 	} else {
491 		iov = aiov;
492 		needfree = NULL;
493 	}
494 	auio.uio_iov = iov;
495 	auio.uio_iovcnt = uap->iovcnt;
496 	auio.uio_rw = UIO_WRITE;
497 	auio.uio_segflg = UIO_USERSPACE;
498 	auio.uio_td = td;
499 	auio.uio_offset = -1;
500 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
501 		goto done;
502 	auio.uio_resid = 0;
503 	for (i = 0; i < uap->iovcnt; i++) {
504 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
505 			error = EINVAL;
506 			goto done;
507 		}
508 		auio.uio_resid += iov->iov_len;
509 		iov++;
510 	}
511 #ifdef KTRACE
512 	/*
513 	 * if tracing, save a copy of iovec and uio
514 	 */
515 	if (KTRPOINT(td->td_proc, KTR_GENIO))  {
516 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
517 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
518 		ktruio = auio;
519 	}
520 #endif
521 	cnt = auio.uio_resid;
522 	if (fp->f_type == DTYPE_VNODE)
523 		bwillwrite();
524 	if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) {
525 		if (auio.uio_resid != cnt && (error == ERESTART ||
526 		    error == EINTR || error == EWOULDBLOCK))
527 			error = 0;
528 		if (error == EPIPE) {
529 			PROC_LOCK(td->td_proc);
530 			psignal(td->td_proc, SIGPIPE);
531 			PROC_UNLOCK(td->td_proc);
532 		}
533 	}
534 	cnt -= auio.uio_resid;
535 #ifdef KTRACE
536 	if (ktriov != NULL) {
537 		if (error == 0) {
538 			ktruio.uio_iov = ktriov;
539 			ktruio.uio_resid = cnt;
540 			ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_WRITE, &ktruio,
541 			    error);
542 		}
543 		FREE(ktriov, M_TEMP);
544 	}
545 #endif
546 	td->td_retval[0] = cnt;
547 done:
548 	fdrop(fp, td);
549 	if (needfree)
550 		FREE(needfree, M_IOV);
551 done2:
552 	mtx_unlock(&Giant);
553 	return (error);
554 }
555 
556 /*
557  * Ioctl system call
558  */
559 #ifndef _SYS_SYSPROTO_H_
560 struct ioctl_args {
561 	int	fd;
562 	u_long	com;
563 	caddr_t	data;
564 };
565 #endif
566 /*
567  * MPSAFE
568  */
569 /* ARGSUSED */
570 int
571 ioctl(td, uap)
572 	struct thread *td;
573 	register struct ioctl_args *uap;
574 {
575 	struct file *fp;
576 	register struct filedesc *fdp;
577 	register u_long com;
578 	int error = 0;
579 	register u_int size;
580 	caddr_t data, memp;
581 	int tmp;
582 #define STK_PARAMS	128
583 	union {
584 	    char stkbuf[STK_PARAMS];
585 	    long align;
586 	} ubuf;
587 
588 	if ((error = fget(td, uap->fd, &fp)) != 0)
589 		return (error);
590 	mtx_lock(&Giant);
591 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
592 		fdrop(fp, td);
593 		mtx_unlock(&Giant);
594 		return (EBADF);
595 	}
596 	fdp = td->td_proc->p_fd;
597 	switch (com = uap->com) {
598 	case FIONCLEX:
599 		FILEDESC_LOCK(fdp);
600 		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
601 		FILEDESC_UNLOCK(fdp);
602 		fdrop(fp, td);
603 		mtx_unlock(&Giant);
604 		return (0);
605 	case FIOCLEX:
606 		FILEDESC_LOCK(fdp);
607 		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
608 		FILEDESC_UNLOCK(fdp);
609 		fdrop(fp, td);
610 		mtx_unlock(&Giant);
611 		return (0);
612 	}
613 
614 	/*
615 	 * Interpret high order word to find amount of data to be
616 	 * copied to/from the user's address space.
617 	 */
618 	size = IOCPARM_LEN(com);
619 	if (size > IOCPARM_MAX) {
620 		fdrop(fp, td);
621 		mtx_unlock(&Giant);
622 		return (ENOTTY);
623 	}
624 
625 	memp = NULL;
626 	if (size > sizeof (ubuf.stkbuf)) {
627 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
628 		data = memp;
629 	} else {
630 		data = ubuf.stkbuf;
631 	}
632 	if (com&IOC_IN) {
633 		if (size) {
634 			error = copyin(uap->data, data, (u_int)size);
635 			if (error) {
636 				if (memp)
637 					free(memp, M_IOCTLOPS);
638 				fdrop(fp, td);
639 				goto done;
640 			}
641 		} else {
642 			*(caddr_t *)data = uap->data;
643 		}
644 	} else if ((com&IOC_OUT) && size) {
645 		/*
646 		 * Zero the buffer so the user always
647 		 * gets back something deterministic.
648 		 */
649 		bzero(data, size);
650 	} else if (com&IOC_VOID) {
651 		*(caddr_t *)data = uap->data;
652 	}
653 
654 #ifdef __alpha__
655 	{
656 	int annoy = 1;
657 
658 	if (com == DIOCGDINFO_ALPHAHACK)
659 		com = DIOCGDINFO;
660 	else if (com == DIOCSDINFO_ALPHAHACK)
661 		com = DIOCSDINFO;
662 	else if (com == DIOCWDINFO_ALPHAHACK)
663 		com = DIOCWDINFO;
664 	else if (com == DIOCGDVIRGIN_ALPHAHACK)
665 		com = DIOCGDVIRGIN;
666 	else
667 		annoy = 0;
668 	if (annoy) {
669 		uprintf("Recompile this program, it uses obsolete ioctls.\n");
670 		printf("Program using uses obsolete ioctls used, recompile.\n");
671 		tsleep(&annoy, PPAUSE, "syncer", 15 * hz);
672 	}
673 	}
674 #endif
675 
676 	switch (com) {
677 
678 	case FIONBIO:
679 		FILE_LOCK(fp);
680 		if ((tmp = *(int *)data))
681 			fp->f_flag |= FNONBLOCK;
682 		else
683 			fp->f_flag &= ~FNONBLOCK;
684 		FILE_UNLOCK(fp);
685 		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
686 		break;
687 
688 	case FIOASYNC:
689 		FILE_LOCK(fp);
690 		if ((tmp = *(int *)data))
691 			fp->f_flag |= FASYNC;
692 		else
693 			fp->f_flag &= ~FASYNC;
694 		FILE_UNLOCK(fp);
695 		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
696 		break;
697 
698 	default:
699 		error = fo_ioctl(fp, com, data, td);
700 		/*
701 		 * Copy any data to user, size was
702 		 * already set and checked above.
703 		 */
704 		if (error == 0 && (com&IOC_OUT) && size)
705 			error = copyout(data, uap->data, (u_int)size);
706 		break;
707 	}
708 	if (memp)
709 		free(memp, M_IOCTLOPS);
710 	fdrop(fp, td);
711 done:
712 	mtx_unlock(&Giant);
713 	return (error);
714 }
715 
716 /*
717  * sellock and selwait are initialized in selectinit() via SYSINIT.
718  */
struct mtx	sellock;	/* guards td_selq lists and selinfo thread links */
struct cv	selwait;	/* select()/poll() sleepers block on this cv */
int	nselcoll;	/* Select collisions since boot */
SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
723 
724 /*
725  * Select system call.
726  */
727 #ifndef _SYS_SYSPROTO_H_
728 struct select_args {
729 	int	nd;
730 	fd_set	*in, *ou, *ex;
731 	struct	timeval *tv;
732 };
733 #endif
734 /*
735  * MPSAFE
736  */
737 int
738 select(td, uap)
739 	register struct thread *td;
740 	register struct select_args *uap;
741 {
742 	struct filedesc *fdp;
743 	/*
744 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
745 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
746 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
747 	 * of 256.
748 	 */
749 	fd_mask s_selbits[howmany(2048, NFDBITS)];
750 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
751 	struct timeval atv, rtv, ttv;
752 	int ncoll, error, timo;
753 	u_int nbufbytes, ncpbytes, nfdbits;
754 
755 	if (uap->nd < 0)
756 		return (EINVAL);
757 	fdp = td->td_proc->p_fd;
758 	mtx_lock(&Giant);
759 	FILEDESC_LOCK(fdp);
760 
761 	if (uap->nd > td->td_proc->p_fd->fd_nfiles)
762 		uap->nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
763 	FILEDESC_UNLOCK(fdp);
764 
765 	/*
766 	 * Allocate just enough bits for the non-null fd_sets.  Use the
767 	 * preallocated auto buffer if possible.
768 	 */
769 	nfdbits = roundup(uap->nd, NFDBITS);
770 	ncpbytes = nfdbits / NBBY;
771 	nbufbytes = 0;
772 	if (uap->in != NULL)
773 		nbufbytes += 2 * ncpbytes;
774 	if (uap->ou != NULL)
775 		nbufbytes += 2 * ncpbytes;
776 	if (uap->ex != NULL)
777 		nbufbytes += 2 * ncpbytes;
778 	if (nbufbytes <= sizeof s_selbits)
779 		selbits = &s_selbits[0];
780 	else
781 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
782 
783 	/*
784 	 * Assign pointers into the bit buffers and fetch the input bits.
785 	 * Put the output buffers together so that they can be bzeroed
786 	 * together.
787 	 */
788 	sbp = selbits;
789 #define	getbits(name, x) \
790 	do {								\
791 		if (uap->name == NULL)					\
792 			ibits[x] = NULL;				\
793 		else {							\
794 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
795 			obits[x] = sbp;					\
796 			sbp += ncpbytes / sizeof *sbp;			\
797 			error = copyin(uap->name, ibits[x], ncpbytes);	\
798 			if (error != 0)					\
799 				goto done_nosellock;			\
800 		}							\
801 	} while (0)
802 	getbits(in, 0);
803 	getbits(ou, 1);
804 	getbits(ex, 2);
805 #undef	getbits
806 	if (nbufbytes != 0)
807 		bzero(selbits, nbufbytes / 2);
808 
809 	if (uap->tv) {
810 		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
811 			sizeof (atv));
812 		if (error)
813 			goto done_nosellock;
814 		if (itimerfix(&atv)) {
815 			error = EINVAL;
816 			goto done_nosellock;
817 		}
818 		getmicrouptime(&rtv);
819 		timevaladd(&atv, &rtv);
820 	} else {
821 		atv.tv_sec = 0;
822 		atv.tv_usec = 0;
823 	}
824 	timo = 0;
825 	mtx_lock(&sellock);
826 retry:
827 	ncoll = nselcoll;
828 	mtx_lock_spin(&sched_lock);
829 	td->td_flags |= TDF_SELECT;
830 	mtx_unlock_spin(&sched_lock);
831 	mtx_unlock(&sellock);
832 
833 	/* XXX Is there a better place for this? */
834 	TAILQ_INIT(&td->td_selq);
835 	error = selscan(td, ibits, obits, uap->nd);
836 	mtx_lock(&sellock);
837 	if (error || td->td_retval[0])
838 		goto done;
839 	if (atv.tv_sec || atv.tv_usec) {
840 		getmicrouptime(&rtv);
841 		if (timevalcmp(&rtv, &atv, >=))
842 			goto done;
843 		ttv = atv;
844 		timevalsub(&ttv, &rtv);
845 		timo = ttv.tv_sec > 24 * 60 * 60 ?
846 		    24 * 60 * 60 * hz : tvtohz(&ttv);
847 	}
848 
849 	/*
850 	 * An event of interest may occur while we do not hold
851 	 * sellock, so check TDF_SELECT and the number of
852 	 * collisions and rescan the file descriptors if
853 	 * necessary.
854 	 */
855 	mtx_lock_spin(&sched_lock);
856 	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
857 		mtx_unlock_spin(&sched_lock);
858 		goto retry;
859 	}
860 	mtx_unlock_spin(&sched_lock);
861 
862 	if (timo > 0)
863 		error = cv_timedwait_sig(&selwait, &sellock, timo);
864 	else
865 		error = cv_wait_sig(&selwait, &sellock);
866 
867 	if (error == 0)
868 		goto retry;
869 
870 done:
871 	clear_selinfo_list(td);
872 	mtx_lock_spin(&sched_lock);
873 	td->td_flags &= ~TDF_SELECT;
874 	mtx_unlock_spin(&sched_lock);
875 	mtx_unlock(&sellock);
876 
877 done_nosellock:
878 	/* select is not restarted after signals... */
879 	if (error == ERESTART)
880 		error = EINTR;
881 	if (error == EWOULDBLOCK)
882 		error = 0;
883 #define	putbits(name, x) \
884 	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
885 		error = error2;
886 	if (error == 0) {
887 		int error2;
888 
889 		putbits(in, 0);
890 		putbits(ou, 1);
891 		putbits(ex, 2);
892 #undef putbits
893 	}
894 	if (selbits != &s_selbits[0])
895 		free(selbits, M_SELECT);
896 
897 	mtx_unlock(&Giant);
898 	return (error);
899 }
900 
/*
 * Scan the three input bit vectors (read/write/except) once, polling
 * each descriptor whose bit is set and recording ready descriptors in
 * the corresponding output vector.  The ready count is returned in
 * td->td_retval[0]; EBADF is returned if a set bit names a closed
 * descriptor.  Called with sellock dropped so fo_poll can selrecord().
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
941 
942 /*
943  * Poll system call.
944  */
945 #ifndef _SYS_SYSPROTO_H_
946 struct poll_args {
947 	struct pollfd *fds;
948 	u_int	nfds;
949 	int	timeout;
950 };
951 #endif
952 /*
953  * MPSAFE
954  */
955 int
956 poll(td, uap)
957 	struct thread *td;
958 	struct poll_args *uap;
959 {
960 	caddr_t bits;
961 	char smallbits[32 * sizeof(struct pollfd)];
962 	struct timeval atv, rtv, ttv;
963 	int ncoll, error = 0, timo;
964 	u_int nfds;
965 	size_t ni;
966 
967 	nfds = SCARG(uap, nfds);
968 
969 	mtx_lock(&Giant);
970 	/*
971 	 * This is kinda bogus.  We have fd limits, but that is not
972 	 * really related to the size of the pollfd array.  Make sure
973 	 * we let the process use at least FD_SETSIZE entries and at
974 	 * least enough for the current limits.  We want to be reasonably
975 	 * safe, but not overly restrictive.
976 	 */
977 	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
978 	    (nfds > FD_SETSIZE)) {
979 		error = EINVAL;
980 		goto done2;
981 	}
982 	ni = nfds * sizeof(struct pollfd);
983 	if (ni > sizeof(smallbits))
984 		bits = malloc(ni, M_TEMP, M_WAITOK);
985 	else
986 		bits = smallbits;
987 	error = copyin(SCARG(uap, fds), bits, ni);
988 	if (error)
989 		goto done_nosellock;
990 	if (SCARG(uap, timeout) != INFTIM) {
991 		atv.tv_sec = SCARG(uap, timeout) / 1000;
992 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
993 		if (itimerfix(&atv)) {
994 			error = EINVAL;
995 			goto done_nosellock;
996 		}
997 		getmicrouptime(&rtv);
998 		timevaladd(&atv, &rtv);
999 	} else {
1000 		atv.tv_sec = 0;
1001 		atv.tv_usec = 0;
1002 	}
1003 	timo = 0;
1004 	mtx_lock(&sellock);
1005 retry:
1006 	ncoll = nselcoll;
1007 	mtx_lock_spin(&sched_lock);
1008 	td->td_flags |= TDF_SELECT;
1009 	mtx_unlock_spin(&sched_lock);
1010 	mtx_unlock(&sellock);
1011 
1012 	/* XXX Is there a better place for this? */
1013 	TAILQ_INIT(&td->td_selq);
1014 	error = pollscan(td, (struct pollfd *)bits, nfds);
1015 	mtx_lock(&sellock);
1016 	if (error || td->td_retval[0])
1017 		goto done;
1018 	if (atv.tv_sec || atv.tv_usec) {
1019 		getmicrouptime(&rtv);
1020 		if (timevalcmp(&rtv, &atv, >=))
1021 			goto done;
1022 		ttv = atv;
1023 		timevalsub(&ttv, &rtv);
1024 		timo = ttv.tv_sec > 24 * 60 * 60 ?
1025 		    24 * 60 * 60 * hz : tvtohz(&ttv);
1026 	}
1027 	/*
1028 	 * An event of interest may occur while we do not hold
1029 	 * sellock, so check TDF_SELECT and the number of collisions
1030 	 * and rescan the file descriptors if necessary.
1031 	 */
1032 	mtx_lock_spin(&sched_lock);
1033 	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
1034 		mtx_unlock_spin(&sched_lock);
1035 		goto retry;
1036 	}
1037 	mtx_unlock_spin(&sched_lock);
1038 
1039 	if (timo > 0)
1040 		error = cv_timedwait_sig(&selwait, &sellock, timo);
1041 	else
1042 		error = cv_wait_sig(&selwait, &sellock);
1043 
1044 	if (error == 0)
1045 		goto retry;
1046 
1047 done:
1048 	clear_selinfo_list(td);
1049 	mtx_lock_spin(&sched_lock);
1050 	td->td_flags &= ~TDF_SELECT;
1051 	mtx_unlock_spin(&sched_lock);
1052 	mtx_unlock(&sellock);
1053 
1054 done_nosellock:
1055 	/* poll is not restarted after signals... */
1056 	if (error == ERESTART)
1057 		error = EINTR;
1058 	if (error == EWOULDBLOCK)
1059 		error = 0;
1060 	if (error == 0) {
1061 		error = copyout(bits, SCARG(uap, fds), ni);
1062 		if (error)
1063 			goto out;
1064 	}
1065 out:
1066 	if (ni > sizeof(smallbits))
1067 		free(bits, M_TEMP);
1068 done2:
1069 	mtx_unlock(&Giant);
1070 	return (error);
1071 }
1072 
/*
 * Poll each entry of fds[0..nfd-1] once, storing the resulting event
 * mask in its revents field.  The number of entries with non-zero
 * revents (including POLLNVAL for bad descriptors) is returned in
 * td->td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative descriptors are skipped, not errors. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    fp->f_cred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
1112 
1113 /*
1114  * OpenBSD poll system call.
1115  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1116  */
1117 #ifndef _SYS_SYSPROTO_H_
1118 struct openbsd_poll_args {
1119 	struct pollfd *fds;
1120 	u_int	nfds;
1121 	int	timeout;
1122 };
1123 #endif
1124 /*
1125  * MPSAFE
1126  */
1127 int
1128 openbsd_poll(td, uap)
1129 	register struct thread *td;
1130 	register struct openbsd_poll_args *uap;
1131 {
1132 	return (poll(td, (struct poll_args *)uap));
1133 }
1134 
1135 /*
1136  * Remove the references to the thread from all of the objects
1137  * we were polling.
1138  *
1139  * This code assumes that the underlying owner of the selinfo
1140  * structure will hold sellock before it changes it, and that
1141  * it will unlink itself from our list if it goes away.
1142  */
1143 void
1144 clear_selinfo_list(td)
1145 	struct thread *td;
1146 {
1147 	struct selinfo *si;
1148 
1149 	mtx_assert(&sellock, MA_OWNED);
1150 	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1151 		si->si_thread = NULL;
1152 	TAILQ_INIT(&td->td_selq);
1153 }
1154 
1155 /*ARGSUSED*/
1156 int
1157 seltrue(dev, events, td)
1158 	dev_t dev;
1159 	int events;
1160 	struct thread *td;
1161 {
1162 
1163 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1164 }
1165 
1166 /*
1167  * Record a select request.
1168  */
1169 void
1170 selrecord(selector, sip)
1171 	struct thread *selector;
1172 	struct selinfo *sip;
1173 {
1174 
1175 	mtx_lock(&sellock);
1176 	/*
1177 	 * If the thread is NULL then take ownership of selinfo
1178 	 * however if the thread is not NULL and the thread points to
1179 	 * someone else, then we have a collision, otherwise leave it alone
1180 	 * as we've owned it in a previous selrecord on this selinfo.
1181 	 */
1182 	if (sip->si_thread == NULL) {
1183 		sip->si_thread = selector;
1184 		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1185 	} else if (sip->si_thread != selector) {
1186 		sip->si_flags |= SI_COLL;
1187 	}
1188 
1189 	mtx_unlock(&sellock);
1190 }
1191 
1192 /*
1193  * Do a wakeup when a selectable event occurs.
1194  */
1195 void
1196 selwakeup(sip)
1197 	struct selinfo *sip;
1198 {
1199 	struct thread *td;
1200 
1201 	mtx_lock(&sellock);
1202 	td = sip->si_thread;
1203 	if ((sip->si_flags & SI_COLL) != 0) {
1204 		nselcoll++;
1205 		sip->si_flags &= ~SI_COLL;
1206 		cv_broadcast(&selwait);
1207 	}
1208 	if (td == NULL) {
1209 		mtx_unlock(&sellock);
1210 		return;
1211 	}
1212 	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
1213 	sip->si_thread = NULL;
1214 	mtx_lock_spin(&sched_lock);
1215 	if (td->td_wchan == (caddr_t)&selwait) {
1216 		if (td->td_proc->p_stat == SSLEEP)
1217 			setrunnable(td);
1218 		else
1219 			cv_waitq_remove(td);
1220 	} else
1221 		td->td_flags &= ~TDF_SELECT;
1222 	mtx_unlock_spin(&sched_lock);
1223 	mtx_unlock(&sellock);
1224 }
1225 
/* selectinit() runs at boot (SI_SUB_LOCK) to set up sellock and selwait. */
static void selectinit(void *);
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1228 
1229 /* ARGSUSED*/
1230 static void
1231 selectinit(dummy)
1232 	void *dummy;
1233 {
1234 	cv_init(&selwait, "select");
1235 	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1236 }
1237