xref: /freebsd/sys/kern/kern_descrip.c (revision 77b7cdf1999ee965ad494fddd184b18f532ac91a)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
39  * $FreeBSD$
40  */
41 
42 #include "opt_compat.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/syscallsubr.h>
47 #include <sys/sysproto.h>
48 #include <sys/conf.h>
49 #include <sys/filedesc.h>
50 #include <sys/lock.h>
51 #include <sys/kernel.h>
52 #include <sys/limits.h>
53 #include <sys/malloc.h>
54 #include <sys/mutex.h>
55 #include <sys/sysctl.h>
56 #include <sys/vnode.h>
57 #include <sys/mount.h>
58 #include <sys/proc.h>
59 #include <sys/namei.h>
60 #include <sys/file.h>
61 #include <sys/stat.h>
62 #include <sys/filio.h>
63 #include <sys/fcntl.h>
64 #include <sys/unistd.h>
65 #include <sys/resourcevar.h>
66 #include <sys/event.h>
67 #include <sys/sx.h>
68 #include <sys/socketvar.h>
69 #include <sys/signalvar.h>
70 
71 #include <vm/vm.h>
72 #include <vm/vm_extern.h>
73 #include <vm/uma.h>
74 
/* malloc(9) types used by the descriptor and SIGIO code. */
static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");
static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");

/* uma(9) zone from which every struct file is allocated (see falloc()). */
static uma_zone_t file_zone;

static	 d_open_t  fdopen;
/* Number of /dev/fd/N minor nodes served by the fdesc pseudo-device. */
#define	NUMFDESC 64

#define	CDEV_MAJOR 22
/* Character device switch for the /dev/fd pseudo-device. */
static struct cdevsw fildesc_cdevsw = {
	.d_open =	fdopen,
	.d_name =	"FD",
	.d_maj =	CDEV_MAJOR,
};

/* How to treat 'new' parameter when allocating a fd for do_dup(). */
enum dup_type { DUP_VARIABLE, DUP_FIXED };

static int do_dup(struct thread *td, enum dup_type type, int old, int new,
    register_t *retval);

/*
 * Descriptor management.
 */
struct filelist filehead;	/* head of list of open files */
int nfiles;			/* actual number of open files */
extern int cmask;
struct sx filelist_lock;	/* sx to protect filelist */
struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
104 
105 /*
106  * System calls on descriptors.
107  */
108 #ifndef _SYS_SYSPROTO_H_
109 struct getdtablesize_args {
110 	int	dummy;
111 };
112 #endif
113 /*
114  * MPSAFE
115  */
116 /* ARGSUSED */
117 int
118 getdtablesize(td, uap)
119 	struct thread *td;
120 	struct getdtablesize_args *uap;
121 {
122 	struct proc *p = td->td_proc;
123 
124 	mtx_lock(&Giant);
125 	td->td_retval[0] =
126 	    min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
127 	mtx_unlock(&Giant);
128 	return (0);
129 }
130 
131 /*
132  * Duplicate a file descriptor to a particular value.
133  *
134  * note: keep in mind that a potential race condition exists when closing
135  * descriptors from a shared descriptor table (via rfork).
136  */
137 #ifndef _SYS_SYSPROTO_H_
138 struct dup2_args {
139 	u_int	from;
140 	u_int	to;
141 };
142 #endif
143 /*
144  * MPSAFE
145  */
146 /* ARGSUSED */
147 int
148 dup2(td, uap)
149 	struct thread *td;
150 	struct dup2_args *uap;
151 {
152 
153 	return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to,
154 		    td->td_retval));
155 }
156 
157 /*
158  * Duplicate a file descriptor.
159  */
160 #ifndef _SYS_SYSPROTO_H_
161 struct dup_args {
162 	u_int	fd;
163 };
164 #endif
165 /*
166  * MPSAFE
167  */
168 /* ARGSUSED */
169 int
170 dup(td, uap)
171 	struct thread *td;
172 	struct dup_args *uap;
173 {
174 
175 	return (do_dup(td, DUP_VARIABLE, (int)uap->fd, 0, td->td_retval));
176 }
177 
178 /*
179  * The file control system call.
180  */
181 #ifndef _SYS_SYSPROTO_H_
182 struct fcntl_args {
183 	int	fd;
184 	int	cmd;
185 	long	arg;
186 };
187 #endif
188 /*
189  * MPSAFE
190  */
191 /* ARGSUSED */
192 int
193 fcntl(td, uap)
194 	struct thread *td;
195 	struct fcntl_args *uap;
196 {
197 	struct flock fl;
198 	intptr_t arg;
199 	int error;
200 
201 	error = 0;
202 	switch (uap->cmd) {
203 	case F_GETLK:
204 	case F_SETLK:
205 	case F_SETLKW:
206 		error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
207 		arg = (intptr_t)&fl;
208 		break;
209 	default:
210 		arg = uap->arg;
211 		break;
212 	}
213 	if (error)
214 		return (error);
215 	error = kern_fcntl(td, uap->fd, uap->cmd, arg);
216 	if (error)
217 		return (error);
218 	if (uap->cmd == F_GETLK)
219 		error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
220 	return (error);
221 }
222 
/*
 * kern_fcntl: in-kernel implementation of fcntl(2).
 *
 * 'arg' is either a plain integer or, for the locking commands, a
 * pointer to an in-kernel struct flock (the syscall wrapper has already
 * done the copyin).  Returns 0 or an errno; integer results are passed
 * back through td->td_retval[0].
 */
int
kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
{
	struct filedesc *fdp;
	struct flock *flp;
	struct file *fp;
	struct proc *p;
	char *pop;
	struct vnode *vp;
	u_int newmin;
	int error, flg, tmp;

	error = 0;
	flg = F_POSIX;
	p = td->td_proc;
	fdp = p->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);
	/* Translate fd to a file pointer under the filedesc lock. */
	if ((unsigned)fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fd]) == NULL) {
		FILEDESC_UNLOCK(fdp);
		error = EBADF;
		goto done2;
	}
	/* Per-descriptor flag byte (UF_EXCLOSE etc.), distinct from f_flag. */
	pop = &fdp->fd_ofileflags[fd];

	switch (cmd) {
	case F_DUPFD:
		FILEDESC_UNLOCK(fdp);
		newmin = arg;
		/* The requested minimum must itself be a legal descriptor. */
		if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
		    newmin >= maxfilesperproc) {
			error = EINVAL;
			break;
		}
		error = do_dup(td, DUP_VARIABLE, fd, newmin, td->td_retval);
		break;

	case F_GETFD:
		td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
		FILEDESC_UNLOCK(fdp);
		break;

	case F_SETFD:
		*pop = (*pop &~ UF_EXCLOSE) |
		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
		FILEDESC_UNLOCK(fdp);
		break;

	case F_GETFL:
		FILE_LOCK(fp);
		FILEDESC_UNLOCK(fdp);
		td->td_retval[0] = OFLAGS(fp->f_flag);
		FILE_UNLOCK(fp);
		break;

	case F_SETFL:
		FILE_LOCK(fp);
		FILEDESC_UNLOCK(fdp);
		fhold_locked(fp);
		/* Only the FCNTLFLAGS subset may be changed through here. */
		fp->f_flag &= ~FCNTLFLAGS;
		fp->f_flag |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
		FILE_UNLOCK(fp);
		/* Push the new blocking mode down to the underlying object. */
		tmp = fp->f_flag & FNONBLOCK;
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		if (error) {
			fdrop(fp, td);
			break;
		}
		tmp = fp->f_flag & FASYNC;
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		if (error == 0) {
			fdrop(fp, td);
			break;
		}
		/*
		 * FIOASYNC failed: back out the FIONBIO change so f_flag
		 * and the object's actual state stay consistent.
		 */
		FILE_LOCK(fp);
		fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		tmp = 0;
		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		fdrop(fp, td);
		break;

	case F_GETOWN:
		fhold(fp);
		FILEDESC_UNLOCK(fdp);
		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
		if (error == 0)
			td->td_retval[0] = tmp;
		fdrop(fp, td);
		break;

	case F_SETOWN:
		fhold(fp);
		FILEDESC_UNLOCK(fdp);
		tmp = arg;
		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
		fdrop(fp, td);
		break;

	case F_SETLKW:
		flg |= F_WAIT;
		/* FALLTHROUGH F_SETLK */

	case F_SETLK:
		if (fp->f_type != DTYPE_VNODE) {
			FILEDESC_UNLOCK(fdp);
			error = EBADF;
			break;
		}

		flp = (struct flock *)arg;
		/* Convert a SEEK_CUR range to absolute, guarding overflow. */
		if (flp->l_whence == SEEK_CUR) {
			if (fp->f_offset < 0 ||
			    (flp->l_start > 0 &&
			     fp->f_offset > OFF_MAX - flp->l_start)) {
				FILEDESC_UNLOCK(fdp);
				error = EOVERFLOW;
				break;
			}
			flp->l_start += fp->f_offset;
		}

		/*
		 * VOP_ADVLOCK() may block.
		 */
		fhold(fp);
		FILEDESC_UNLOCK(fdp);
		vp = fp->f_data;

		/* Locks are owned by the process group leader. */
		switch (flp->l_type) {
		case F_RDLCK:
			if ((fp->f_flag & FREAD) == 0) {
				error = EBADF;
				break;
			}
			PROC_LOCK(p->p_leader);
			p->p_leader->p_flag |= P_ADVLOCK;
			PROC_UNLOCK(p->p_leader);
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    flp, flg);
			break;
		case F_WRLCK:
			if ((fp->f_flag & FWRITE) == 0) {
				error = EBADF;
				break;
			}
			PROC_LOCK(p->p_leader);
			p->p_leader->p_flag |= P_ADVLOCK;
			PROC_UNLOCK(p->p_leader);
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    flp, flg);
			break;
		case F_UNLCK:
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
			    flp, F_POSIX);
			break;
		default:
			error = EINVAL;
			break;
		}
		/* Check for race with close */
		FILEDESC_LOCK(fdp);
		if ((unsigned) fd >= fdp->fd_nfiles ||
		    fp != fdp->fd_ofiles[fd]) {
			/*
			 * The descriptor was closed (and possibly reused)
			 * while we slept in VOP_ADVLOCK(); release any lock
			 * we may just have acquired.
			 */
			FILEDESC_UNLOCK(fdp);
			flp->l_whence = SEEK_SET;
			flp->l_start = 0;
			flp->l_len = 0;
			flp->l_type = F_UNLCK;
			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
					   F_UNLCK, flp, F_POSIX);
		} else
			FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		break;

	case F_GETLK:
		if (fp->f_type != DTYPE_VNODE) {
			FILEDESC_UNLOCK(fdp);
			error = EBADF;
			break;
		}
		flp = (struct flock *)arg;
		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
		    flp->l_type != F_UNLCK) {
			FILEDESC_UNLOCK(fdp);
			error = EINVAL;
			break;
		}
		/* Convert a SEEK_CUR range to absolute, guarding overflow. */
		if (flp->l_whence == SEEK_CUR) {
			if ((flp->l_start > 0 &&
			    fp->f_offset > OFF_MAX - flp->l_start) ||
			    (flp->l_start < 0 &&
			     fp->f_offset < OFF_MIN - flp->l_start)) {
				FILEDESC_UNLOCK(fdp);
				error = EOVERFLOW;
				break;
			}
			flp->l_start += fp->f_offset;
		}
		/*
		 * VOP_ADVLOCK() may block.
		 */
		fhold(fp);
		FILEDESC_UNLOCK(fdp);
		vp = fp->f_data;
		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
		    F_POSIX);
		fdrop(fp, td);
		break;
	default:
		FILEDESC_UNLOCK(fdp);
		error = EINVAL;
		break;
	}
done2:
	mtx_unlock(&Giant);
	return (error);
}
443 
444 /*
445  * Common code for dup, dup2, and fcntl(F_DUPFD).
446  */
/*
 * Common code for dup, dup2, and fcntl(F_DUPFD).
 *
 * 'type' selects the allocation policy: DUP_FIXED places the duplicate
 * on exactly descriptor 'new' (closing whatever was there), while
 * DUP_VARIABLE uses the lowest free descriptor >= 'new'.  The chosen
 * descriptor number is returned through *retval.
 */
static int
do_dup(td, type, old, new, retval)
	enum dup_type type;
	int old, new;
	register_t *retval;
	struct thread *td;
{
	struct filedesc *fdp;
	struct proc *p;
	struct file *fp;
	struct file *delfp;	/* file displaced from slot 'new', if any */
	int error, newfd;

	p = td->td_proc;
	fdp = p->p_fd;

	/*
	 * Verify we have a valid descriptor to dup from and possibly to
	 * dup to.
	 */
	if (old < 0 || new < 0 || new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
	    new >= maxfilesperproc)
		return (EBADF);
	FILEDESC_LOCK(fdp);
	if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
		FILEDESC_UNLOCK(fdp);
		return (EBADF);
	}
	/* dup2(fd, fd) is a no-op that just reports 'fd'. */
	if (type == DUP_FIXED && old == new) {
		*retval = new;
		FILEDESC_UNLOCK(fdp);
		return (0);
	}
	fp = fdp->fd_ofiles[old];
	fhold(fp);

	/*
	 * Expand the table for the new descriptor if needed.  This may
	 * block and drop and reacquire the filedesc lock.
	 */
	if (type == DUP_VARIABLE || new >= fdp->fd_nfiles) {
		error = fdalloc(td, new, &newfd);
		if (error) {
			FILEDESC_UNLOCK(fdp);
			fdrop(fp, td);
			return (error);
		}
	}
	if (type == DUP_VARIABLE)
		new = newfd;

	/*
	 * If the old file changed out from under us then treat it as a
	 * bad file descriptor.  Userland should do its own locking to
	 * avoid this case.
	 */
	if (fdp->fd_ofiles[old] != fp) {
		/* Give back the slot fdalloc() may have reserved for us. */
		if (fdp->fd_ofiles[new] == NULL) {
			if (new < fdp->fd_freefile)
				fdp->fd_freefile = new;
			while (fdp->fd_lastfile > 0 &&
			    fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
				fdp->fd_lastfile--;
		}
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		return (EBADF);
	}
	KASSERT(old != new, ("new fd is same as old"));

	/*
	 * Save info on the descriptor being overwritten.  We have
	 * to do the unmap now, but we cannot close it without
	 * introducing an ownership race for the slot.
	 */
	delfp = fdp->fd_ofiles[new];
	KASSERT(delfp == NULL || type == DUP_FIXED,
	    ("dup() picked an open file"));
#if 0
	if (delfp && (fdp->fd_ofileflags[new] & UF_MAPPED))
		(void) munmapfd(td, new);
#endif

	/*
	 * Duplicate the source descriptor, update lastfile
	 */
	fdp->fd_ofiles[new] = fp;
	/* Per POSIX, the duplicate never inherits close-on-exec. */
 	fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
	if (new > fdp->fd_lastfile)
		fdp->fd_lastfile = new;
	FILEDESC_UNLOCK(fdp);
	*retval = new;

	/*
	 * If we dup'd over a valid file, we now own the reference to it
	 * and must dispose of it using closef() semantics (as if a
	 * close() were performed on it).
	 */
	if (delfp) {
		mtx_lock(&Giant);
		(void) closef(delfp, td);
		mtx_unlock(&Giant);
	}
	return (0);
}
552 
553 /*
554  * If sigio is on the list associated with a process or process group,
555  * disable signalling from the device, remove sigio from the list and
556  * free sigio.
557  */
/*
 * If sigio is on the list associated with a process or process group,
 * disable signalling from the device, remove sigio from the list and
 * free sigio.
 */
void
funsetown(sigiop)
	struct sigio **sigiop;
{
	struct sigio *sigio;

	SIGIO_LOCK();
	sigio = *sigiop;
	if (sigio == NULL) {
		SIGIO_UNLOCK();
		return;
	}
	/* Clear the owner's back pointer so no further signals are sent. */
	*(sigio->sio_myref) = NULL;
	/* A negative sio_pgid means the owner is a process group. */
	if ((sigio)->sio_pgid < 0) {
		struct pgrp *pg = (sigio)->sio_pgrp;
		PGRP_LOCK(pg);
		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
			     sigio, sio_pgsigio);
		PGRP_UNLOCK(pg);
	} else {
		struct proc *p = (sigio)->sio_proc;
		PROC_LOCK(p);
		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
			     sigio, sio_pgsigio);
		PROC_UNLOCK(p);
	}
	SIGIO_UNLOCK();
	/* Release resources outside of the SIGIO lock. */
	crfree(sigio->sio_ucred);
	FREE(sigio, M_SIGIO);
}
588 
589 /*
590  * Free a list of sigio structures.
591  * We only need to lock the SIGIO_LOCK because we have made ourselves
592  * inaccessable to callers of fsetown and therefore do not need to lock
593  * the proc or pgrp struct for the list manipulation.
594  */
/*
 * Free a list of sigio structures.
 * We only need to lock the SIGIO_LOCK because we have made ourselves
 * inaccessable to callers of fsetown and therefore do not need to lock
 * the proc or pgrp struct for the list manipulation.
 */
void
funsetownlst(sigiolst)
	struct sigiolst *sigiolst;
{
	struct proc *p;
	struct pgrp *pg;
	struct sigio *sigio;

	sigio = SLIST_FIRST(sigiolst);
	if (sigio == NULL)
		return;
	p = NULL;
	pg = NULL;

	/*
	 * Every entry of the list should belong
	 * to a single proc or pgrp.
	 */
	if (sigio->sio_pgid < 0) {
		pg = sigio->sio_pgrp;
		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
	} else /* if (sigio->sio_pgid > 0) */ {
		p = sigio->sio_proc;
		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
	}

	SIGIO_LOCK();
	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
		/* Detach the owner's pointer before freeing the entry. */
		*(sigio->sio_myref) = NULL;
		if (pg != NULL) {
			KASSERT(sigio->sio_pgid < 0,
			    ("Proc sigio in pgrp sigio list"));
			KASSERT(sigio->sio_pgrp == pg,
			    ("Bogus pgrp in sigio list"));
			PGRP_LOCK(pg);
			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
			    sio_pgsigio);
			PGRP_UNLOCK(pg);
		} else /* if (p != NULL) */ {
			KASSERT(sigio->sio_pgid > 0,
			    ("Pgrp sigio in proc sigio list"));
			KASSERT(sigio->sio_proc == p,
			    ("Bogus proc in sigio list"));
			PROC_LOCK(p);
			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
			    sio_pgsigio);
			PROC_UNLOCK(p);
		}
		/*
		 * Drop the SIGIO lock around crfree/FREE, which may
		 * sleep, then retake it for the next iteration.
		 */
		SIGIO_UNLOCK();
		crfree(sigio->sio_ucred);
		FREE(sigio, M_SIGIO);
		SIGIO_LOCK();
	}
	SIGIO_UNLOCK();
}
650 
651 /*
652  * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
653  *
654  * After permission checking, add a sigio structure to the sigio list for
655  * the process or process group.
656  */
/*
 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
 *
 * After permission checking, add a sigio structure to the sigio list for
 * the process or process group.  A pgid of 0 clears the ownership
 * (equivalent to funsetown()); pgid > 0 names a process, pgid < 0 names
 * the process group -pgid.
 */
int
fsetown(pgid, sigiop)
	pid_t pgid;
	struct sigio **sigiop;
{
	struct proc *proc;
	struct pgrp *pgrp;
	struct sigio *sigio;
	int ret;

	if (pgid == 0) {
		funsetown(sigiop);
		return (0);
	}

	ret = 0;

	/* Allocate and fill in the new sigio out of locks. */
	MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, M_WAITOK);
	sigio->sio_pgid = pgid;
	sigio->sio_ucred = crhold(curthread->td_ucred);
	sigio->sio_myref = sigiop;

	sx_slock(&proctree_lock);
	if (pgid > 0) {
		proc = pfind(pgid);
		if (proc == NULL) {
			ret = ESRCH;
			goto fail;
		}

		/*
		 * Policy - Don't allow a process to FSETOWN a process
		 * in another session.
		 *
		 * Remove this test to allow maximum flexibility or
		 * restrict FSETOWN to the current process or process
		 * group for maximum safety.
		 */
		/*
		 * NOTE(review): pfind() returns with the proc locked; it is
		 * unlocked before p_session is read, presumably because
		 * session linkage is stable under the shared proctree_lock
		 * held here — confirm against proc locking rules.
		 */
		PROC_UNLOCK(proc);
		if (proc->p_session != curthread->td_proc->p_session) {
			ret = EPERM;
			goto fail;
		}

		pgrp = NULL;
	} else /* if (pgid < 0) */ {
		pgrp = pgfind(-pgid);
		if (pgrp == NULL) {
			ret = ESRCH;
			goto fail;
		}
		PGRP_UNLOCK(pgrp);

		/*
		 * Policy - Don't allow a process to FSETOWN a process
		 * in another session.
		 *
		 * Remove this test to allow maximum flexibility or
		 * restrict FSETOWN to the current process or process
		 * group for maximum safety.
		 */
		if (pgrp->pg_session != curthread->td_proc->p_session) {
			ret = EPERM;
			goto fail;
		}

		proc = NULL;
	}
	/* Replace any previous owner before installing the new sigio. */
	funsetown(sigiop);
	if (pgid > 0) {
		PROC_LOCK(proc);
		/*
		 * Since funsetownlst() is called without the proctree
		 * locked, we need to check for P_WEXIT.
		 * XXX: is ESRCH correct?
		 */
		if ((proc->p_flag & P_WEXIT) != 0) {
			PROC_UNLOCK(proc);
			ret = ESRCH;
			goto fail;
		}
		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
		sigio->sio_proc = proc;
		PROC_UNLOCK(proc);
	} else {
		PGRP_LOCK(pgrp);
		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
		sigio->sio_pgrp = pgrp;
		PGRP_UNLOCK(pgrp);
	}
	sx_sunlock(&proctree_lock);
	/* Publish the new sigio under the SIGIO lock. */
	SIGIO_LOCK();
	*sigiop = sigio;
	SIGIO_UNLOCK();
	return (0);

fail:
	sx_sunlock(&proctree_lock);
	crfree(sigio->sio_ucred);
	FREE(sigio, M_SIGIO);
	return (ret);
}
760 
761 /*
762  * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
763  */
764 pid_t
765 fgetown(sigiop)
766 	struct sigio **sigiop;
767 {
768 	pid_t pgid;
769 
770 	SIGIO_LOCK();
771 	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
772 	SIGIO_UNLOCK();
773 	return (pgid);
774 }
775 
776 /*
777  * Close a file descriptor.
778  */
779 #ifndef _SYS_SYSPROTO_H_
780 struct close_args {
781         int     fd;
782 };
783 #endif
784 /*
785  * MPSAFE
786  */
787 /* ARGSUSED */
/*
 * close(2): remove the descriptor from the table, detach any kqueue
 * knotes referencing it, and drop the table's reference via closef().
 */
int
close(td, uap)
	struct thread *td;
	struct close_args *uap;
{
	struct filedesc *fdp;
	struct file *fp;
	int fd, error;

	fd = uap->fd;
	error = 0;
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);
	if ((unsigned)fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fd]) == NULL) {
		FILEDESC_UNLOCK(fdp);
		error = EBADF;
		goto done2;
	}
#if 0
	if (fdp->fd_ofileflags[fd] & UF_MAPPED)
		(void) munmapfd(td, fd);
#endif
	/* Clear the slot first so the descriptor cannot be reused raced. */
	fdp->fd_ofiles[fd] = NULL;
	fdp->fd_ofileflags[fd] = 0;

	/*
	 * we now hold the fp reference that used to be owned by the descriptor
	 * array.
	 */
	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
		fdp->fd_lastfile--;
	if (fd < fdp->fd_freefile)
		fdp->fd_freefile = fd;
	/* Detach knotes outside the filedesc lock. */
	if (fd < fdp->fd_knlistsize) {
		FILEDESC_UNLOCK(fdp);
		knote_fdclose(td, fd);
	} else
		FILEDESC_UNLOCK(fdp);

	error = closef(fp, td);
done2:
	mtx_unlock(&Giant);
	return (error);
}
834 
835 #if defined(COMPAT_43) || defined(COMPAT_SUNOS)
836 /*
837  * Return status information about a file descriptor.
838  */
839 #ifndef _SYS_SYSPROTO_H_
840 struct ofstat_args {
841 	int	fd;
842 	struct	ostat *sb;
843 };
844 #endif
845 /*
846  * MPSAFE
847  */
848 /* ARGSUSED */
849 int
850 ofstat(td, uap)
851 	struct thread *td;
852 	struct ofstat_args *uap;
853 {
854 	struct file *fp;
855 	struct stat ub;
856 	struct ostat oub;
857 	int error;
858 
859 	mtx_lock(&Giant);
860 	if ((error = fget(td, uap->fd, &fp)) != 0)
861 		goto done2;
862 	error = fo_stat(fp, &ub, td->td_ucred, td);
863 	if (error == 0) {
864 		cvtstat(&ub, &oub);
865 		error = copyout(&oub, uap->sb, sizeof(oub));
866 	}
867 	fdrop(fp, td);
868 done2:
869 	mtx_unlock(&Giant);
870 	return (error);
871 }
872 #endif /* COMPAT_43 || COMPAT_SUNOS */
873 
874 /*
875  * Return status information about a file descriptor.
876  */
877 #ifndef _SYS_SYSPROTO_H_
878 struct fstat_args {
879 	int	fd;
880 	struct	stat *sb;
881 };
882 #endif
883 /*
884  * MPSAFE
885  */
886 /* ARGSUSED */
887 int
888 fstat(td, uap)
889 	struct thread *td;
890 	struct fstat_args *uap;
891 {
892 	struct file *fp;
893 	struct stat ub;
894 	int error;
895 
896 	mtx_lock(&Giant);
897 	if ((error = fget(td, uap->fd, &fp)) != 0)
898 		goto done2;
899 	error = fo_stat(fp, &ub, td->td_ucred, td);
900 	if (error == 0)
901 		error = copyout(&ub, uap->sb, sizeof(ub));
902 	fdrop(fp, td);
903 done2:
904 	mtx_unlock(&Giant);
905 	return (error);
906 }
907 
908 /*
909  * Return status information about a file descriptor.
910  */
911 #ifndef _SYS_SYSPROTO_H_
912 struct nfstat_args {
913 	int	fd;
914 	struct	nstat *sb;
915 };
916 #endif
917 /*
918  * MPSAFE
919  */
920 /* ARGSUSED */
921 int
922 nfstat(td, uap)
923 	struct thread *td;
924 	struct nfstat_args *uap;
925 {
926 	struct file *fp;
927 	struct stat ub;
928 	struct nstat nub;
929 	int error;
930 
931 	mtx_lock(&Giant);
932 	if ((error = fget(td, uap->fd, &fp)) != 0)
933 		goto done2;
934 	error = fo_stat(fp, &ub, td->td_ucred, td);
935 	if (error == 0) {
936 		cvtnstat(&ub, &nub);
937 		error = copyout(&nub, uap->sb, sizeof(nub));
938 	}
939 	fdrop(fp, td);
940 done2:
941 	mtx_unlock(&Giant);
942 	return (error);
943 }
944 
945 /*
946  * Return pathconf information about a file descriptor.
947  */
948 #ifndef _SYS_SYSPROTO_H_
949 struct fpathconf_args {
950 	int	fd;
951 	int	name;
952 };
953 #endif
954 /*
955  * MPSAFE
956  */
957 /* ARGSUSED */
958 int
959 fpathconf(td, uap)
960 	struct thread *td;
961 	struct fpathconf_args *uap;
962 {
963 	struct file *fp;
964 	struct vnode *vp;
965 	int error;
966 
967 	if ((error = fget(td, uap->fd, &fp)) != 0)
968 		return (error);
969 
970 	/* If asynchronous I/O is available, it works for all descriptors. */
971 	if (uap->name == _PC_ASYNC_IO) {
972 		td->td_retval[0] = async_io_version;
973 		goto out;
974 	}
975 	switch (fp->f_type) {
976 	case DTYPE_PIPE:
977 	case DTYPE_SOCKET:
978 		if (uap->name != _PC_PIPE_BUF) {
979 			error = EINVAL;
980 		} else {
981 			td->td_retval[0] = PIPE_BUF;
982 			error = 0;
983 		}
984 		break;
985 	case DTYPE_FIFO:
986 	case DTYPE_VNODE:
987 		vp = fp->f_data;
988 		mtx_lock(&Giant);
989 		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
990 		mtx_unlock(&Giant);
991 		break;
992 	default:
993 		error = EOPNOTSUPP;
994 		break;
995 	}
996 out:
997 	fdrop(fp, td);
998 	return (error);
999 }
1000 
1001 /*
1002  * Allocate a file descriptor for the process.
1003  */
static int fdexpand;	/* statistic: number of ofile table expansions */
SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, "");

/*
 * Allocate a descriptor >= 'want' for the current process, returning
 * the chosen index through *result.  Called with the filedesc lock
 * held; the lock may be dropped and reacquired while the table is
 * being grown.
 */
int
fdalloc(td, want, result)
	struct thread *td;
	int want;
	int *result;
{
	struct proc *p = td->td_proc;
	struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	int lim, last, nfiles;
	struct file **newofile, **oldofile;
	char *newofileflags;

	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);

	/*
	 * Search for a free descriptor starting at the higher
	 * of want or fd_freefile.  If that fails, consider
	 * expanding the ofile array.
	 */
	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
	for (;;) {
		last = min(fdp->fd_nfiles, lim);
		i = max(want, fdp->fd_freefile);
		for (; i < last; i++) {
			if (fdp->fd_ofiles[i] == NULL) {
				fdp->fd_ofileflags[i] = 0;
				if (i > fdp->fd_lastfile)
					fdp->fd_lastfile = i;
				if (want <= fdp->fd_freefile)
					fdp->fd_freefile = i;
				*result = i;
				return (0);
			}
		}

		/*
		 * No space in current array.  Expand?
		 */
		if (i >= lim)
			return (EMFILE);
		/* Double the table (at least NDEXTENT) until 'want' fits. */
		if (fdp->fd_nfiles < NDEXTENT)
			nfiles = NDEXTENT;
		else
			nfiles = 2 * fdp->fd_nfiles;
		while (nfiles < want)
			nfiles <<= 1;
		FILEDESC_UNLOCK(fdp);
		/*
		 * XXX malloc() calls uma_large_malloc() for sizes larger
		 * than KMEM_ZMAX bytes. uma_large_malloc() requires Giant.
		 */
		mtx_lock(&Giant);
		newofile = malloc(nfiles * OFILESIZE, M_FILEDESC, M_WAITOK);
		mtx_unlock(&Giant);

		/*
		 * Deal with file-table extend race that might have
		 * occurred while filedesc was unlocked.
		 */
		FILEDESC_LOCK(fdp);
		if (fdp->fd_nfiles >= nfiles) {
			/* XXX uma_large_free() needs Giant. */
			FILEDESC_UNLOCK(fdp);
			mtx_lock(&Giant);
			free(newofile, M_FILEDESC);
			mtx_unlock(&Giant);
			FILEDESC_LOCK(fdp);
			continue;
		}
		/* Flag bytes live immediately after the pointer array. */
		newofileflags = (char *) &newofile[nfiles];
		/*
		 * Copy the existing ofile and ofileflags arrays
		 * and zero the new portion of each array.
		 */
		i = fdp->fd_nfiles * sizeof(struct file *);
		bcopy(fdp->fd_ofiles, newofile,	i);
		bzero((char *)newofile + i,
		    nfiles * sizeof(struct file *) - i);
		i = fdp->fd_nfiles * sizeof(char);
		bcopy(fdp->fd_ofileflags, newofileflags, i);
		bzero(newofileflags + i, nfiles * sizeof(char) - i);
		/* The initial in-line NDFILE table must not be freed. */
		if (fdp->fd_nfiles > NDFILE)
			oldofile = fdp->fd_ofiles;
		else
			oldofile = NULL;
		fdp->fd_ofiles = newofile;
		fdp->fd_ofileflags = newofileflags;
		fdp->fd_nfiles = nfiles;
		fdexpand++;
		if (oldofile != NULL) {
			/* XXX uma_large_free() needs Giant. */
			FILEDESC_UNLOCK(fdp);
			mtx_lock(&Giant);
			free(oldofile, M_FILEDESC);
			mtx_unlock(&Giant);
			FILEDESC_LOCK(fdp);
		}
	}
	/* NOTREACHED: the loop above only exits via return. */
	return (0);
}
1108 
1109 /*
1110  * Check to see whether n user file descriptors
1111  * are available to the process p.
1112  */
1113 int
1114 fdavail(td, n)
1115 	struct thread *td;
1116 	int n;
1117 {
1118 	struct proc *p = td->td_proc;
1119 	struct filedesc *fdp = td->td_proc->p_fd;
1120 	struct file **fpp;
1121 	int i, lim, last;
1122 
1123 	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
1124 
1125 	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
1126 	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
1127 		return (1);
1128 	last = min(fdp->fd_nfiles, lim);
1129 	fpp = &fdp->fd_ofiles[fdp->fd_freefile];
1130 	for (i = last - fdp->fd_freefile; --i >= 0; fpp++) {
1131 		if (*fpp == NULL && --n <= 0)
1132 			return (1);
1133 	}
1134 	return (0);
1135 }
1136 
1137 /*
1138  * Create a new open file structure and allocate
1139  * a file decriptor for the process that refers to it.
1140  */
/*
 * Create a new open file structure and allocate
 * a file decriptor for the process that refers to it.
 * On success the new struct file (refcount 1) is stored in *resultfp
 * and its descriptor index in *resultfd; either pointer may be NULL.
 */
int
falloc(td, resultfp, resultfd)
	struct thread *td;
	struct file **resultfp;
	int *resultfd;
{
	struct proc *p = td->td_proc;
	struct file *fp, *fq;
	int error, i;

	/* Allocate before taking any locks; uma_zalloc may sleep. */
	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
	sx_xlock(&filelist_lock);
	/* Enforce the system-wide open file limit. */
	if (nfiles >= maxfiles) {
		sx_xunlock(&filelist_lock);
		uma_zfree(file_zone, fp);
		tablefull("file");
		return (ENFILE);
	}
	nfiles++;

	/*
	 * If the process has file descriptor zero open, add the new file
	 * descriptor to the list of open files at that point, otherwise
	 * put it at the front of the list of open files.
	 */
	fp->f_mtxp = mtx_pool_alloc();
	fp->f_gcflag = 0;
	fp->f_count = 1;	/* reference owned by the descriptor table */
	fp->f_cred = crhold(td->td_ucred);
	fp->f_ops = &badfileops;	/* replaced once the object is set up */
	fp->f_seqcount = 1;
	FILEDESC_LOCK(p->p_fd);
	if ((fq = p->p_fd->fd_ofiles[0])) {
		LIST_INSERT_AFTER(fq, fp, f_list);
	} else {
		LIST_INSERT_HEAD(&filehead, fp, f_list);
	}
	sx_xunlock(&filelist_lock);
	if ((error = fdalloc(td, 0, &i))) {
		FILEDESC_UNLOCK(p->p_fd);
		/* fdrop undoes our reference and unlinks via ffree(). */
		fdrop(fp, td);
		return (error);
	}
	p->p_fd->fd_ofiles[i] = fp;
	FILEDESC_UNLOCK(p->p_fd);
	if (resultfp)
		*resultfp = fp;
	if (resultfd)
		*resultfd = i;
	return (0);
}
1192 
1193 /*
1194  * Free a file descriptor.
1195  */
1196 void
1197 ffree(fp)
1198 	struct file *fp;
1199 {
1200 
1201 	KASSERT(fp->f_count == 0, ("ffree: fp_fcount not 0!"));
1202 	sx_xlock(&filelist_lock);
1203 	LIST_REMOVE(fp, f_list);
1204 	nfiles--;
1205 	sx_xunlock(&filelist_lock);
1206 	crfree(fp->f_cred);
1207 	uma_zfree(file_zone, fp);
1208 }
1209 
1210 /*
1211  * Build a new filedesc structure from another.
1212  * Copy the current, root, and jail root vnode references.
1213  */
1214 struct filedesc *
1215 fdinit(fdp)
1216 	struct filedesc *fdp;
1217 {
1218 	struct filedesc0 *newfdp;
1219 
1220 	MALLOC(newfdp, struct filedesc0 *, sizeof(struct filedesc0),
1221 	    M_FILEDESC, M_WAITOK | M_ZERO);
1222 	mtx_init(&newfdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
1223 	newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
1224 	if (newfdp->fd_fd.fd_cdir)
1225 		VREF(newfdp->fd_fd.fd_cdir);
1226 	newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
1227 	if (newfdp->fd_fd.fd_rdir)
1228 		VREF(newfdp->fd_fd.fd_rdir);
1229 	newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
1230 	if (newfdp->fd_fd.fd_jdir)
1231 		VREF(newfdp->fd_fd.fd_jdir);
1232 
1233 	/* Create the file descriptor table. */
1234 	newfdp->fd_fd.fd_refcnt = 1;
1235 	newfdp->fd_fd.fd_cmask = cmask;
1236 	newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
1237 	newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
1238 	newfdp->fd_fd.fd_nfiles = NDFILE;
1239 	newfdp->fd_fd.fd_knlistsize = -1;
1240 	return (&newfdp->fd_fd);
1241 }
1242 
1243 /*
1244  * Share a filedesc structure.
1245  */
1246 struct filedesc *
1247 fdshare(fdp)
1248 	struct filedesc *fdp;
1249 {
1250 	FILEDESC_LOCK(fdp);
1251 	fdp->fd_refcnt++;
1252 	FILEDESC_UNLOCK(fdp);
1253 	return (fdp);
1254 }
1255 
1256 /*
1257  * Copy a filedesc structure.
1258  * A NULL pointer in returns a NULL reference, this is to ease callers,
1259  * not catch errors.
1260  */
1261 struct filedesc *
1262 fdcopy(fdp)
1263 	struct filedesc *fdp;
1264 {
1265 	struct filedesc *newfdp;
1266 	struct file **fpp;
1267 	int i, j;
1268 
1269 	/* Certain daemons might not have file descriptors. */
1270 	if (fdp == NULL)
1271 		return (NULL);
1272 
1273 	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
1274 
1275 	FILEDESC_UNLOCK(fdp);
1276 	MALLOC(newfdp, struct filedesc *, sizeof(struct filedesc0),
1277 	    M_FILEDESC, M_WAITOK);
1278 	FILEDESC_LOCK(fdp);
1279 	bcopy(fdp, newfdp, sizeof(struct filedesc));
1280 	FILEDESC_UNLOCK(fdp);
1281 	bzero(&newfdp->fd_mtx, sizeof(newfdp->fd_mtx));
1282 	mtx_init(&newfdp->fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
1283 	if (newfdp->fd_cdir)
1284 		VREF(newfdp->fd_cdir);
1285 	if (newfdp->fd_rdir)
1286 		VREF(newfdp->fd_rdir);
1287 	if (newfdp->fd_jdir)
1288 		VREF(newfdp->fd_jdir);
1289 	newfdp->fd_refcnt = 1;
1290 
1291 	/*
1292 	 * If the number of open files fits in the internal arrays
1293 	 * of the open file structure, use them, otherwise allocate
1294 	 * additional memory for the number of descriptors currently
1295 	 * in use.
1296 	 */
1297 	FILEDESC_LOCK(fdp);
1298 	newfdp->fd_lastfile = fdp->fd_lastfile;
1299 	newfdp->fd_nfiles = fdp->fd_nfiles;
1300 	if (newfdp->fd_lastfile < NDFILE) {
1301 		newfdp->fd_ofiles = ((struct filedesc0 *) newfdp)->fd_dfiles;
1302 		newfdp->fd_ofileflags =
1303 		    ((struct filedesc0 *) newfdp)->fd_dfileflags;
1304 		i = NDFILE;
1305 	} else {
1306 		/*
1307 		 * Compute the smallest multiple of NDEXTENT needed
1308 		 * for the file descriptors currently in use,
1309 		 * allowing the table to shrink.
1310 		 */
1311 retry:
1312 		i = newfdp->fd_nfiles;
1313 		while (i > 2 * NDEXTENT && i > newfdp->fd_lastfile * 2)
1314 			i /= 2;
1315 		FILEDESC_UNLOCK(fdp);
1316 		MALLOC(newfdp->fd_ofiles, struct file **, i * OFILESIZE,
1317 		    M_FILEDESC, M_WAITOK);
1318 		FILEDESC_LOCK(fdp);
1319 		newfdp->fd_lastfile = fdp->fd_lastfile;
1320 		newfdp->fd_nfiles = fdp->fd_nfiles;
1321 		j = newfdp->fd_nfiles;
1322 		while (j > 2 * NDEXTENT && j > newfdp->fd_lastfile * 2)
1323 			j /= 2;
1324 		if (i != j) {
1325 			/*
1326 			 * The size of the original table has changed.
1327 			 * Go over once again.
1328 			 */
1329 			FILEDESC_UNLOCK(fdp);
1330 			FREE(newfdp->fd_ofiles, M_FILEDESC);
1331 			FILEDESC_LOCK(fdp);
1332 			newfdp->fd_lastfile = fdp->fd_lastfile;
1333 			newfdp->fd_nfiles = fdp->fd_nfiles;
1334 			goto retry;
1335 		}
1336 		newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i];
1337 	}
1338 	newfdp->fd_nfiles = i;
1339 	bcopy(fdp->fd_ofiles, newfdp->fd_ofiles, i * sizeof(struct file **));
1340 	bcopy(fdp->fd_ofileflags, newfdp->fd_ofileflags, i * sizeof(char));
1341 
1342 	/*
1343 	 * kq descriptors cannot be copied.
1344 	 */
1345 	if (newfdp->fd_knlistsize != -1) {
1346 		fpp = &newfdp->fd_ofiles[newfdp->fd_lastfile];
1347 		for (i = newfdp->fd_lastfile; i >= 0; i--, fpp--) {
1348 			if (*fpp != NULL && (*fpp)->f_type == DTYPE_KQUEUE) {
1349 				*fpp = NULL;
1350 				if (i < newfdp->fd_freefile)
1351 					newfdp->fd_freefile = i;
1352 			}
1353 			if (*fpp == NULL && i == newfdp->fd_lastfile && i > 0)
1354 				newfdp->fd_lastfile--;
1355 		}
1356 		newfdp->fd_knlist = NULL;
1357 		newfdp->fd_knlistsize = -1;
1358 		newfdp->fd_knhash = NULL;
1359 		newfdp->fd_knhashmask = 0;
1360 	}
1361 
1362 	fpp = newfdp->fd_ofiles;
1363 	for (i = newfdp->fd_lastfile; i-- >= 0; fpp++) {
1364 		if (*fpp != NULL)
1365 			fhold(*fpp);
1366 	}
1367 	return (newfdp);
1368 }
1369 
1370 /* A mutex to protect the association between a proc and filedesc. */
1371 struct mtx	fdesc_mtx;
1372 MTX_SYSINIT(fdesc, &fdesc_mtx, "fdesc", MTX_DEF);
1373 
1374 /*
1375  * Release a filedesc structure.
1376  */
1377 void
1378 fdfree(td)
1379 	struct thread *td;
1380 {
1381 	struct filedesc *fdp;
1382 	struct file **fpp;
1383 	int i;
1384 
1385 	/* Certain daemons might not have file descriptors. */
1386 	fdp = td->td_proc->p_fd;
1387 	if (fdp == NULL)
1388 		return;
1389 
1390 	FILEDESC_LOCK(fdp);
1391 	if (--fdp->fd_refcnt > 0) {
1392 		FILEDESC_UNLOCK(fdp);
1393 		return;
1394 	}
1395 
1396 	/*
1397 	 * We are the last reference to the structure, so we can
1398 	 * safely assume it will not change out from under us.
1399 	 */
1400 	FILEDESC_UNLOCK(fdp);
1401 	fpp = fdp->fd_ofiles;
1402 	for (i = fdp->fd_lastfile; i-- >= 0; fpp++) {
1403 		if (*fpp)
1404 			(void) closef(*fpp, td);
1405 	}
1406 
1407 	/* XXX This should happen earlier. */
1408 	mtx_lock(&fdesc_mtx);
1409 	td->td_proc->p_fd = NULL;
1410 	mtx_unlock(&fdesc_mtx);
1411 
1412 	if (fdp->fd_nfiles > NDFILE)
1413 		FREE(fdp->fd_ofiles, M_FILEDESC);
1414 	if (fdp->fd_cdir)
1415 		vrele(fdp->fd_cdir);
1416 	if (fdp->fd_rdir)
1417 		vrele(fdp->fd_rdir);
1418 	if (fdp->fd_jdir)
1419 		vrele(fdp->fd_jdir);
1420 	if (fdp->fd_knlist)
1421 		FREE(fdp->fd_knlist, M_KQUEUE);
1422 	if (fdp->fd_knhash)
1423 		FREE(fdp->fd_knhash, M_KQUEUE);
1424 	mtx_destroy(&fdp->fd_mtx);
1425 	FREE(fdp, M_FILEDESC);
1426 }
1427 
1428 /*
1429  * For setugid programs, we don't want to people to use that setugidness
1430  * to generate error messages which write to a file which otherwise would
1431  * otherwise be off-limits to the process.  We check for filesystems where
1432  * the vnode can change out from under us after execve (like [lin]procfs).
1433  *
1434  * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
1435  * sufficient.  We also don't for check setugidness since we know we are.
1436  */
1437 static int
1438 is_unsafe(struct file *fp)
1439 {
1440 	if (fp->f_type == DTYPE_VNODE) {
1441 		struct vnode *vp = fp->f_data;
1442 
1443 		if ((vp->v_vflag & VV_PROCDEP) != 0)
1444 			return (1);
1445 	}
1446 	return (0);
1447 }
1448 
1449 /*
1450  * Make this setguid thing safe, if at all possible.
1451  */
1452 void
1453 setugidsafety(td)
1454 	struct thread *td;
1455 {
1456 	struct filedesc *fdp;
1457 	int i;
1458 
1459 	/* Certain daemons might not have file descriptors. */
1460 	fdp = td->td_proc->p_fd;
1461 	if (fdp == NULL)
1462 		return;
1463 
1464 	/*
1465 	 * Note: fdp->fd_ofiles may be reallocated out from under us while
1466 	 * we are blocked in a close.  Be careful!
1467 	 */
1468 	FILEDESC_LOCK(fdp);
1469 	for (i = 0; i <= fdp->fd_lastfile; i++) {
1470 		if (i > 2)
1471 			break;
1472 		if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) {
1473 			struct file *fp;
1474 
1475 #if 0
1476 			if ((fdp->fd_ofileflags[i] & UF_MAPPED) != 0)
1477 				(void) munmapfd(td, i);
1478 #endif
1479 			if (i < fdp->fd_knlistsize) {
1480 				FILEDESC_UNLOCK(fdp);
1481 				knote_fdclose(td, i);
1482 				FILEDESC_LOCK(fdp);
1483 			}
1484 			/*
1485 			 * NULL-out descriptor prior to close to avoid
1486 			 * a race while close blocks.
1487 			 */
1488 			fp = fdp->fd_ofiles[i];
1489 			fdp->fd_ofiles[i] = NULL;
1490 			fdp->fd_ofileflags[i] = 0;
1491 			if (i < fdp->fd_freefile)
1492 				fdp->fd_freefile = i;
1493 			FILEDESC_UNLOCK(fdp);
1494 			(void) closef(fp, td);
1495 			FILEDESC_LOCK(fdp);
1496 		}
1497 	}
1498 	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
1499 		fdp->fd_lastfile--;
1500 	FILEDESC_UNLOCK(fdp);
1501 }
1502 
1503 /*
1504  * Close any files on exec?
1505  */
1506 void
1507 fdcloseexec(td)
1508 	struct thread *td;
1509 {
1510 	struct filedesc *fdp;
1511 	int i;
1512 
1513 	/* Certain daemons might not have file descriptors. */
1514 	fdp = td->td_proc->p_fd;
1515 	if (fdp == NULL)
1516 		return;
1517 
1518 	FILEDESC_LOCK(fdp);
1519 
1520 	/*
1521 	 * We cannot cache fd_ofiles or fd_ofileflags since operations
1522 	 * may block and rip them out from under us.
1523 	 */
1524 	for (i = 0; i <= fdp->fd_lastfile; i++) {
1525 		if (fdp->fd_ofiles[i] != NULL &&
1526 		    (fdp->fd_ofileflags[i] & UF_EXCLOSE)) {
1527 			struct file *fp;
1528 
1529 #if 0
1530 			if (fdp->fd_ofileflags[i] & UF_MAPPED)
1531 				(void) munmapfd(td, i);
1532 #endif
1533 			if (i < fdp->fd_knlistsize) {
1534 				FILEDESC_UNLOCK(fdp);
1535 				knote_fdclose(td, i);
1536 				FILEDESC_LOCK(fdp);
1537 			}
1538 			/*
1539 			 * NULL-out descriptor prior to close to avoid
1540 			 * a race while close blocks.
1541 			 */
1542 			fp = fdp->fd_ofiles[i];
1543 			fdp->fd_ofiles[i] = NULL;
1544 			fdp->fd_ofileflags[i] = 0;
1545 			if (i < fdp->fd_freefile)
1546 				fdp->fd_freefile = i;
1547 			FILEDESC_UNLOCK(fdp);
1548 			(void) closef(fp, td);
1549 			FILEDESC_LOCK(fdp);
1550 		}
1551 	}
1552 	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
1553 		fdp->fd_lastfile--;
1554 	FILEDESC_UNLOCK(fdp);
1555 }
1556 
1557 /*
1558  * It is unsafe for set[ug]id processes to be started with file
1559  * descriptors 0..2 closed, as these descriptors are given implicit
1560  * significance in the Standard C library.  fdcheckstd() will create a
1561  * descriptor referencing /dev/null for each of stdin, stdout, and
1562  * stderr that is not already open.
1563  */
1564 int
1565 fdcheckstd(td)
1566 	struct thread *td;
1567 {
1568 	struct nameidata nd;
1569 	struct filedesc *fdp;
1570 	struct file *fp;
1571 	register_t retval;
1572 	int fd, i, error, flags, devnull;
1573 
1574 	fdp = td->td_proc->p_fd;
1575 	if (fdp == NULL)
1576 		return (0);
1577 	devnull = -1;
1578 	error = 0;
1579 	for (i = 0; i < 3; i++) {
1580 		if (fdp->fd_ofiles[i] != NULL)
1581 			continue;
1582 		if (devnull < 0) {
1583 			error = falloc(td, &fp, &fd);
1584 			if (error != 0)
1585 				break;
1586 			KASSERT(fd == i, ("oof, we didn't get our fd"));
1587 			NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/dev/null",
1588 			    td);
1589 			flags = FREAD | FWRITE;
1590 			error = vn_open(&nd, &flags, 0);
1591 			if (error != 0) {
1592 				FILEDESC_LOCK(fdp);
1593 				fdp->fd_ofiles[fd] = NULL;
1594 				FILEDESC_UNLOCK(fdp);
1595 				fdrop(fp, td);
1596 				break;
1597 			}
1598 			NDFREE(&nd, NDF_ONLY_PNBUF);
1599 			fp->f_data = nd.ni_vp;
1600 			fp->f_flag = flags;
1601 			fp->f_ops = &vnops;
1602 			fp->f_type = DTYPE_VNODE;
1603 			VOP_UNLOCK(nd.ni_vp, 0, td);
1604 			devnull = fd;
1605 		} else {
1606 			error = do_dup(td, DUP_FIXED, devnull, i, &retval);
1607 			if (error != 0)
1608 				break;
1609 		}
1610 	}
1611 	return (error);
1612 }
1613 
1614 /*
1615  * Internal form of close.
1616  * Decrement reference count on file structure.
1617  * Note: td may be NULL when closing a file
1618  * that was being passed in a message.
1619  */
1620 int
1621 closef(fp, td)
1622 	struct file *fp;
1623 	struct thread *td;
1624 {
1625 	struct vnode *vp;
1626 	struct flock lf;
1627 
1628 	if (fp == NULL)
1629 		return (0);
1630 	/*
1631 	 * POSIX record locking dictates that any close releases ALL
1632 	 * locks owned by this process.  This is handled by setting
1633 	 * a flag in the unlock to free ONLY locks obeying POSIX
1634 	 * semantics, and not to free BSD-style file locks.
1635 	 * If the descriptor was in a message, POSIX-style locks
1636 	 * aren't passed with the descriptor.
1637 	 */
1638 	if (td != NULL && (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0 &&
1639 	    fp->f_type == DTYPE_VNODE) {
1640 		lf.l_whence = SEEK_SET;
1641 		lf.l_start = 0;
1642 		lf.l_len = 0;
1643 		lf.l_type = F_UNLCK;
1644 		vp = fp->f_data;
1645 		(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
1646 				   F_UNLCK, &lf, F_POSIX);
1647 	}
1648 	return (fdrop(fp, td));
1649 }
1650 
1651 /*
1652  * Drop reference on struct file passed in, may call closef if the
1653  * reference hits zero.
1654  */
1655 int
1656 fdrop(fp, td)
1657 	struct file *fp;
1658 	struct thread *td;
1659 {
1660 
1661 	FILE_LOCK(fp);
1662 	return (fdrop_locked(fp, td));
1663 }
1664 
1665 /*
1666  * Extract the file pointer associated with the specified descriptor for
1667  * the current user process.
1668  *
1669  * If the descriptor doesn't exist, EBADF is returned.
1670  *
1671  * If the descriptor exists but doesn't match 'flags' then
1672  * return EBADF for read attempts and EINVAL for write attempts.
1673  *
1674  * If 'hold' is set (non-zero) the file's refcount will be bumped on return.
1675  * It should be droped with fdrop().
1676  * If it is not set, then the refcount will not be bumped however the
1677  * thread's filedesc struct will be returned locked (for fgetsock).
1678  *
1679  * If an error occured the non-zero error is returned and *fpp is set to NULL.
1680  * Otherwise *fpp is set and zero is returned.
1681  */
1682 static __inline int
1683 _fget(struct thread *td, int fd, struct file **fpp, int flags, int hold)
1684 {
1685 	struct filedesc *fdp;
1686 	struct file *fp;
1687 
1688 	*fpp = NULL;
1689 	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
1690 		return (EBADF);
1691 	FILEDESC_LOCK(fdp);
1692 	if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) {
1693 		FILEDESC_UNLOCK(fdp);
1694 		return (EBADF);
1695 	}
1696 
1697 	/*
1698 	 * Note: FREAD failures returns EBADF to maintain backwards
1699 	 * compatibility with what routines returned before.
1700 	 *
1701 	 * Only one flag, or 0, may be specified.
1702 	 */
1703 	if (flags == FREAD && (fp->f_flag & FREAD) == 0) {
1704 		FILEDESC_UNLOCK(fdp);
1705 		return (EBADF);
1706 	}
1707 	if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) {
1708 		FILEDESC_UNLOCK(fdp);
1709 		return (EINVAL);
1710 	}
1711 	if (hold) {
1712 		fhold(fp);
1713 		FILEDESC_UNLOCK(fdp);
1714 	}
1715 	*fpp = fp;
1716 	return (0);
1717 }
1718 
/*
 * Get the file for 'fd' with an extra reference held; release the
 * reference with fdrop().  No access-mode check is performed.
 */
int
fget(struct thread *td, int fd, struct file **fpp)
{
	int error;

	error = _fget(td, fd, fpp, 0, 1);
	return (error);
}
1725 
1726 int
1727 fget_read(struct thread *td, int fd, struct file **fpp)
1728 {
1729 
1730 	return(_fget(td, fd, fpp, FREAD, 1));
1731 }
1732 
1733 int
1734 fget_write(struct thread *td, int fd, struct file **fpp)
1735 {
1736 
1737 	return(_fget(td, fd, fpp, FWRITE, 1));
1738 }
1739 
1740 /*
1741  * Like fget() but loads the underlying vnode, or returns an error if
1742  * the descriptor does not represent a vnode.  Note that pipes use vnodes
1743  * but never have VM objects (so VOP_GETVOBJECT() calls will return an
1744  * error).  The returned vnode will be vref()d.
1745  */
1746 static __inline int
1747 _fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags)
1748 {
1749 	struct file *fp;
1750 	int error;
1751 
1752 	*vpp = NULL;
1753 	if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
1754 		return (error);
1755 	if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) {
1756 		error = EINVAL;
1757 	} else {
1758 		*vpp = fp->f_data;
1759 		vref(*vpp);
1760 	}
1761 	FILEDESC_UNLOCK(td->td_proc->p_fd);
1762 	return (error);
1763 }
1764 
/*
 * Get the vnode behind 'fd' without requesting an access-mode check.
 */
int
fgetvp(struct thread *td, int fd, struct vnode **vpp)
{
	int error;

	error = _fgetvp(td, fd, vpp, 0);
	return (error);
}
1771 
1772 int
1773 fgetvp_read(struct thread *td, int fd, struct vnode **vpp)
1774 {
1775 
1776 	return (_fgetvp(td, fd, vpp, FREAD));
1777 }
1778 
1779 int
1780 fgetvp_write(struct thread *td, int fd, struct vnode **vpp)
1781 {
1782 
1783 	return (_fgetvp(td, fd, vpp, FWRITE));
1784 }
1785 
1786 /*
1787  * Like fget() but loads the underlying socket, or returns an error if
1788  * the descriptor does not represent a socket.
1789  *
1790  * We bump the ref count on the returned socket.  XXX Also obtain the SX
1791  * lock in the future.
1792  */
1793 int
1794 fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp)
1795 {
1796 	struct file *fp;
1797 	int error;
1798 
1799 	*spp = NULL;
1800 	if (fflagp != NULL)
1801 		*fflagp = 0;
1802 	if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
1803 		return (error);
1804 	if (fp->f_type != DTYPE_SOCKET) {
1805 		error = ENOTSOCK;
1806 	} else {
1807 		*spp = fp->f_data;
1808 		if (fflagp)
1809 			*fflagp = fp->f_flag;
1810 		soref(*spp);
1811 	}
1812 	FILEDESC_UNLOCK(td->td_proc->p_fd);
1813 	return (error);
1814 }
1815 
1816 /*
1817  * Drop the reference count on the the socket and XXX release the SX lock in
1818  * the future.  The last reference closes the socket.
1819  */
1820 void
1821 fputsock(struct socket *so)
1822 {
1823 
1824 	sorele(so);
1825 }
1826 
1827 /*
1828  * Drop reference on struct file passed in, may call closef if the
1829  * reference hits zero.
1830  * Expects struct file locked, and will unlock it.
1831  */
1832 int
1833 fdrop_locked(fp, td)
1834 	struct file *fp;
1835 	struct thread *td;
1836 {
1837 	struct flock lf;
1838 	struct vnode *vp;
1839 	int error;
1840 
1841 	FILE_LOCK_ASSERT(fp, MA_OWNED);
1842 
1843 	if (--fp->f_count > 0) {
1844 		FILE_UNLOCK(fp);
1845 		return (0);
1846 	}
1847 	mtx_lock(&Giant);
1848 	if (fp->f_count < 0)
1849 		panic("fdrop: count < 0");
1850 	if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
1851 		lf.l_whence = SEEK_SET;
1852 		lf.l_start = 0;
1853 		lf.l_len = 0;
1854 		lf.l_type = F_UNLCK;
1855 		vp = fp->f_data;
1856 		FILE_UNLOCK(fp);
1857 		(void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
1858 	} else
1859 		FILE_UNLOCK(fp);
1860 	if (fp->f_ops != &badfileops)
1861 		error = fo_close(fp, td);
1862 	else
1863 		error = 0;
1864 	ffree(fp);
1865 	mtx_unlock(&Giant);
1866 	return (error);
1867 }
1868 
1869 /*
1870  * Apply an advisory lock on a file descriptor.
1871  *
1872  * Just attempt to get a record lock of the requested type on
1873  * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
1874  */
1875 #ifndef _SYS_SYSPROTO_H_
1876 struct flock_args {
1877 	int	fd;
1878 	int	how;
1879 };
1880 #endif
1881 /*
1882  * MPSAFE
1883  */
1884 /* ARGSUSED */
1885 int
1886 flock(td, uap)
1887 	struct thread *td;
1888 	struct flock_args *uap;
1889 {
1890 	struct file *fp;
1891 	struct vnode *vp;
1892 	struct flock lf;
1893 	int error;
1894 
1895 	if ((error = fget(td, uap->fd, &fp)) != 0)
1896 		return (error);
1897 	if (fp->f_type != DTYPE_VNODE) {
1898 		fdrop(fp, td);
1899 		return (EOPNOTSUPP);
1900 	}
1901 
1902 	mtx_lock(&Giant);
1903 	vp = fp->f_data;
1904 	lf.l_whence = SEEK_SET;
1905 	lf.l_start = 0;
1906 	lf.l_len = 0;
1907 	if (uap->how & LOCK_UN) {
1908 		lf.l_type = F_UNLCK;
1909 		FILE_LOCK(fp);
1910 		fp->f_flag &= ~FHASLOCK;
1911 		FILE_UNLOCK(fp);
1912 		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
1913 		goto done2;
1914 	}
1915 	if (uap->how & LOCK_EX)
1916 		lf.l_type = F_WRLCK;
1917 	else if (uap->how & LOCK_SH)
1918 		lf.l_type = F_RDLCK;
1919 	else {
1920 		error = EBADF;
1921 		goto done2;
1922 	}
1923 	FILE_LOCK(fp);
1924 	fp->f_flag |= FHASLOCK;
1925 	FILE_UNLOCK(fp);
1926 	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
1927 	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
1928 done2:
1929 	fdrop(fp, td);
1930 	mtx_unlock(&Giant);
1931 	return (error);
1932 }
1933 
1934 /*
1935  * File Descriptor pseudo-device driver (/dev/fd/).
1936  *
1937  * Opening minor device N dup()s the file (if any) connected to file
1938  * descriptor N belonging to the calling process.  Note that this driver
1939  * consists of only the ``open()'' routine, because all subsequent
1940  * references to this file will be direct to the other driver.
1941  */
1942 /* ARGSUSED */
1943 static int
1944 fdopen(dev, mode, type, td)
1945 	dev_t dev;
1946 	int mode, type;
1947 	struct thread *td;
1948 {
1949 
1950 	/*
1951 	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
1952 	 * the file descriptor being sought for duplication. The error
1953 	 * return ensures that the vnode for this device will be released
1954 	 * by vn_open. Open will detect this special error and take the
1955 	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
1956 	 * will simply report the error.
1957 	 */
1958 	td->td_dupfd = dev2unit(dev);
1959 	return (ENODEV);
1960 }
1961 
1962 /*
1963  * Duplicate the specified descriptor to a free descriptor.
1964  */
1965 int
1966 dupfdopen(td, fdp, indx, dfd, mode, error)
1967 	struct thread *td;
1968 	struct filedesc *fdp;
1969 	int indx, dfd;
1970 	int mode;
1971 	int error;
1972 {
1973 	struct file *wfp;
1974 	struct file *fp;
1975 
1976 	/*
1977 	 * If the to-be-dup'd fd number is greater than the allowed number
1978 	 * of file descriptors, or the fd to be dup'd has already been
1979 	 * closed, then reject.
1980 	 */
1981 	FILEDESC_LOCK(fdp);
1982 	if (dfd < 0 || dfd >= fdp->fd_nfiles ||
1983 	    (wfp = fdp->fd_ofiles[dfd]) == NULL) {
1984 		FILEDESC_UNLOCK(fdp);
1985 		return (EBADF);
1986 	}
1987 
1988 	/*
1989 	 * There are two cases of interest here.
1990 	 *
1991 	 * For ENODEV simply dup (dfd) to file descriptor
1992 	 * (indx) and return.
1993 	 *
1994 	 * For ENXIO steal away the file structure from (dfd) and
1995 	 * store it in (indx).  (dfd) is effectively closed by
1996 	 * this operation.
1997 	 *
1998 	 * Any other error code is just returned.
1999 	 */
2000 	switch (error) {
2001 	case ENODEV:
2002 		/*
2003 		 * Check that the mode the file is being opened for is a
2004 		 * subset of the mode of the existing descriptor.
2005 		 */
2006 		FILE_LOCK(wfp);
2007 		if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
2008 			FILE_UNLOCK(wfp);
2009 			FILEDESC_UNLOCK(fdp);
2010 			return (EACCES);
2011 		}
2012 		fp = fdp->fd_ofiles[indx];
2013 #if 0
2014 		if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED)
2015 			(void) munmapfd(td, indx);
2016 #endif
2017 		fdp->fd_ofiles[indx] = wfp;
2018 		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
2019 		fhold_locked(wfp);
2020 		FILE_UNLOCK(wfp);
2021 		if (indx > fdp->fd_lastfile)
2022 			fdp->fd_lastfile = indx;
2023 		if (fp != NULL)
2024 			FILE_LOCK(fp);
2025 		FILEDESC_UNLOCK(fdp);
2026 		/*
2027 		 * We now own the reference to fp that the ofiles[] array
2028 		 * used to own.  Release it.
2029 		 */
2030 		if (fp != NULL)
2031 			fdrop_locked(fp, td);
2032 		return (0);
2033 
2034 	case ENXIO:
2035 		/*
2036 		 * Steal away the file pointer from dfd and stuff it into indx.
2037 		 */
2038 		fp = fdp->fd_ofiles[indx];
2039 #if 0
2040 		if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED)
2041 			(void) munmapfd(td, indx);
2042 #endif
2043 		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
2044 		fdp->fd_ofiles[dfd] = NULL;
2045 		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
2046 		fdp->fd_ofileflags[dfd] = 0;
2047 
2048 		/*
2049 		 * Complete the clean up of the filedesc structure by
2050 		 * recomputing the various hints.
2051 		 */
2052 		if (indx > fdp->fd_lastfile) {
2053 			fdp->fd_lastfile = indx;
2054 		} else {
2055 			while (fdp->fd_lastfile > 0 &&
2056 			   fdp->fd_ofiles[fdp->fd_lastfile] == NULL) {
2057 				fdp->fd_lastfile--;
2058 			}
2059 			if (dfd < fdp->fd_freefile)
2060 				fdp->fd_freefile = dfd;
2061 		}
2062 		if (fp != NULL)
2063 			FILE_LOCK(fp);
2064 		FILEDESC_UNLOCK(fdp);
2065 
2066 		/*
2067 		 * we now own the reference to fp that the ofiles[] array
2068 		 * used to own.  Release it.
2069 		 */
2070 		if (fp != NULL)
2071 			fdrop_locked(fp, td);
2072 		return (0);
2073 
2074 	default:
2075 		FILEDESC_UNLOCK(fdp);
2076 		return (error);
2077 	}
2078 	/* NOTREACHED */
2079 }
2080 
2081 /*
2082  * Get file structures.
2083  */
2084 static int
2085 sysctl_kern_file(SYSCTL_HANDLER_ARGS)
2086 {
2087 	struct xfile xf;
2088 	struct filedesc *fdp;
2089 	struct file *fp;
2090 	struct proc *p;
2091 	int error, n;
2092 
2093 	sysctl_wire_old_buffer(req, 0);
2094 	if (req->oldptr == NULL) {
2095 		n = 16;		/* A slight overestimate. */
2096 		sx_slock(&filelist_lock);
2097 		LIST_FOREACH(fp, &filehead, f_list) {
2098 			/*
2099 			 * We should grab the lock, but this is an
2100 			 * estimate, so does it really matter?
2101 			 */
2102 			/* mtx_lock(fp->f_mtxp); */
2103 			n += fp->f_count;
2104 			/* mtx_unlock(f->f_mtxp); */
2105 		}
2106 		sx_sunlock(&filelist_lock);
2107 		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
2108 	}
2109 	error = 0;
2110 	bzero(&xf, sizeof(xf));
2111 	xf.xf_size = sizeof(xf);
2112 	sx_slock(&allproc_lock);
2113 	LIST_FOREACH(p, &allproc, p_list) {
2114 		PROC_LOCK(p);
2115 		xf.xf_pid = p->p_pid;
2116 		xf.xf_uid = p->p_ucred->cr_uid;
2117 		PROC_UNLOCK(p);
2118 		mtx_lock(&fdesc_mtx);
2119 		if ((fdp = p->p_fd) == NULL) {
2120 			mtx_unlock(&fdesc_mtx);
2121 			continue;
2122 		}
2123 		FILEDESC_LOCK(fdp);
2124 		for (n = 0; n < fdp->fd_nfiles; ++n) {
2125 			if ((fp = fdp->fd_ofiles[n]) == NULL)
2126 				continue;
2127 			xf.xf_fd = n;
2128 			xf.xf_file = fp;
2129 			xf.xf_data = fp->f_data;
2130 			xf.xf_type = fp->f_type;
2131 			xf.xf_count = fp->f_count;
2132 			xf.xf_msgcount = fp->f_msgcount;
2133 			xf.xf_offset = fp->f_offset;
2134 			xf.xf_flag = fp->f_flag;
2135 			error = SYSCTL_OUT(req, &xf, sizeof(xf));
2136 			if (error)
2137 				break;
2138 		}
2139 		FILEDESC_UNLOCK(fdp);
2140 		mtx_unlock(&fdesc_mtx);
2141 		if (error)
2142 			break;
2143 	}
2144 	sx_sunlock(&allproc_lock);
2145 	return (error);
2146 }
2147 
2148 SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
2149     0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
2150 
2151 SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
2152     &maxfilesperproc, 0, "Maximum files allowed open per process");
2153 
2154 SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
2155     &maxfiles, 0, "Maximum number of files");
2156 
2157 SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
2158     &nfiles, 0, "System-wide number of open files");
2159 
2160 static void
2161 fildesc_drvinit(void *unused)
2162 {
2163 	dev_t dev;
2164 
2165 	dev = make_dev(&fildesc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "fd/0");
2166 	make_dev_alias(dev, "stdin");
2167 	dev = make_dev(&fildesc_cdevsw, 1, UID_ROOT, GID_WHEEL, 0666, "fd/1");
2168 	make_dev_alias(dev, "stdout");
2169 	dev = make_dev(&fildesc_cdevsw, 2, UID_ROOT, GID_WHEEL, 0666, "fd/2");
2170 	make_dev_alias(dev, "stderr");
2171 }
2172 
2173 static fo_rdwr_t	badfo_readwrite;
2174 static fo_ioctl_t	badfo_ioctl;
2175 static fo_poll_t	badfo_poll;
2176 static fo_kqfilter_t	badfo_kqfilter;
2177 static fo_stat_t	badfo_stat;
2178 static fo_close_t	badfo_close;
2179 
/*
 * Operations vector installed on freshly allocated files (see falloc)
 * before they are fully constructed; every entry fails or is a no-op
 * so a half-built file cannot be used.
 */
struct fileops badfileops = {
	badfo_readwrite,	/* read */
	badfo_readwrite,	/* write */
	badfo_ioctl,		/* ioctl */
	badfo_poll,		/* poll */
	badfo_kqfilter,		/* kqfilter */
	badfo_stat,		/* stat */
	badfo_close,		/* close */
	0			/* flags */
};
2190 
2191 static int
2192 badfo_readwrite(fp, uio, active_cred, flags, td)
2193 	struct file *fp;
2194 	struct uio *uio;
2195 	struct ucred *active_cred;
2196 	struct thread *td;
2197 	int flags;
2198 {
2199 
2200 	return (EBADF);
2201 }
2202 
2203 static int
2204 badfo_ioctl(fp, com, data, active_cred, td)
2205 	struct file *fp;
2206 	u_long com;
2207 	void *data;
2208 	struct ucred *active_cred;
2209 	struct thread *td;
2210 {
2211 
2212 	return (EBADF);
2213 }
2214 
/*
 * poll on a not-yet-constructed file: reports no events ready.
 */
static int
badfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{

	return (0);
}
2225 
/*
 * kqfilter on a not-yet-constructed file: returns 0 without attaching
 * any filter.
 */
static int
badfo_kqfilter(struct file *fp, struct knote *kn)
{

	return (0);
}
2234 
2235 static int
2236 badfo_stat(fp, sb, active_cred, td)
2237 	struct file *fp;
2238 	struct stat *sb;
2239 	struct ucred *active_cred;
2240 	struct thread *td;
2241 {
2242 
2243 	return (EBADF);
2244 }
2245 
2246 static int
2247 badfo_close(fp, td)
2248 	struct file *fp;
2249 	struct thread *td;
2250 {
2251 
2252 	return (EBADF);
2253 }
2254 
2255 SYSINIT(fildescdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,
2256 					fildesc_drvinit,NULL)
2257 
2258 static void filelistinit(void *);
2259 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL)
2260 
2261 /* ARGSUSED*/
2262 static void
2263 filelistinit(dummy)
2264 	void *dummy;
2265 {
2266 
2267 	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
2268 	    NULL, NULL, UMA_ALIGN_PTR, 0);
2269 	sx_init(&filelist_lock, "filelist lock");
2270 	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
2271 }
2272