xref: /freebsd/sys/kern/kern_descrip.c (revision f9218d3d4fd34f082473b3a021c6d4d109fb47cf)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
39  * $FreeBSD$
40  */
41 
42 #include "opt_compat.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/syscallsubr.h>
47 #include <sys/sysproto.h>
48 #include <sys/conf.h>
49 #include <sys/filedesc.h>
50 #include <sys/lock.h>
51 #include <sys/kernel.h>
52 #include <sys/malloc.h>
53 #include <sys/mutex.h>
54 #include <sys/sysctl.h>
55 #include <sys/vnode.h>
56 #include <sys/mount.h>
57 #include <sys/proc.h>
58 #include <sys/namei.h>
59 #include <sys/file.h>
60 #include <sys/stat.h>
61 #include <sys/filio.h>
62 #include <sys/fcntl.h>
63 #include <sys/unistd.h>
64 #include <sys/resourcevar.h>
65 #include <sys/event.h>
66 #include <sys/sx.h>
67 #include <sys/socketvar.h>
68 #include <sys/signalvar.h>
69 
70 #include <machine/limits.h>
71 
72 #include <vm/vm.h>
73 #include <vm/vm_extern.h>
74 #include <vm/uma.h>
75 
/* Malloc types for descriptor-table and SIGIO allocations below. */
static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");
static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");

/* UMA zone from which struct file is allocated (falloc()) and freed (ffree()). */
uma_zone_t file_zone;

static	 d_open_t  fdopen;
#define	NUMFDESC 64

#define	CDEV_MAJOR 22
/*
 * Character device switch for the "FD" device; fdopen is defined
 * elsewhere in this file.  NOTE(review): presumably this backs the
 * /dev/fd/N descriptor nodes — confirm against fdopen's definition.
 */
static struct cdevsw fildesc_cdevsw = {
	.d_open =	fdopen,
	.d_name =	"FD",
	.d_maj =	CDEV_MAJOR,
};

/* How to treat 'new' parameter when allocating a fd for do_dup(). */
enum dup_type { DUP_VARIABLE, DUP_FIXED };

static int do_dup(struct thread *td, enum dup_type type, int old, int new,
    register_t *retval);

/*
 * Descriptor management.
 */
struct filelist filehead;	/* head of list of open files */
int nfiles;			/* actual number of open files */
extern int cmask;
struct sx filelist_lock;	/* sx to protect filelist */
struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
105 
106 /*
107  * System calls on descriptors.
108  */
109 #ifndef _SYS_SYSPROTO_H_
110 struct getdtablesize_args {
111 	int	dummy;
112 };
113 #endif
114 /*
115  * MPSAFE
116  */
117 /* ARGSUSED */
118 int
119 getdtablesize(td, uap)
120 	struct thread *td;
121 	struct getdtablesize_args *uap;
122 {
123 	struct proc *p = td->td_proc;
124 
125 	mtx_lock(&Giant);
126 	td->td_retval[0] =
127 	    min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
128 	mtx_unlock(&Giant);
129 	return (0);
130 }
131 
132 /*
133  * Duplicate a file descriptor to a particular value.
134  *
135  * note: keep in mind that a potential race condition exists when closing
136  * descriptors from a shared descriptor table (via rfork).
137  */
138 #ifndef _SYS_SYSPROTO_H_
139 struct dup2_args {
140 	u_int	from;
141 	u_int	to;
142 };
143 #endif
144 /*
145  * MPSAFE
146  */
147 /* ARGSUSED */
148 int
149 dup2(td, uap)
150 	struct thread *td;
151 	struct dup2_args *uap;
152 {
153 
154 	return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to,
155 		    td->td_retval));
156 }
157 
158 /*
159  * Duplicate a file descriptor.
160  */
161 #ifndef _SYS_SYSPROTO_H_
162 struct dup_args {
163 	u_int	fd;
164 };
165 #endif
166 /*
167  * MPSAFE
168  */
169 /* ARGSUSED */
170 int
171 dup(td, uap)
172 	struct thread *td;
173 	struct dup_args *uap;
174 {
175 
176 	return (do_dup(td, DUP_VARIABLE, (int)uap->fd, 0, td->td_retval));
177 }
178 
179 /*
180  * The file control system call.
181  */
182 #ifndef _SYS_SYSPROTO_H_
183 struct fcntl_args {
184 	int	fd;
185 	int	cmd;
186 	long	arg;
187 };
188 #endif
189 /*
190  * MPSAFE
191  */
192 /* ARGSUSED */
193 int
194 fcntl(td, uap)
195 	struct thread *td;
196 	struct fcntl_args *uap;
197 {
198 	struct flock fl;
199 	intptr_t arg;
200 	int error;
201 
202 	error = 0;
203 	switch (uap->cmd) {
204 	case F_GETLK:
205 	case F_SETLK:
206 	case F_SETLKW:
207 		error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
208 		arg = (intptr_t)&fl;
209 		break;
210 	default:
211 		arg = uap->arg;
212 		break;
213 	}
214 	if (error)
215 		return (error);
216 	error = kern_fcntl(td, uap->fd, uap->cmd, arg);
217 	if (error)
218 		return (error);
219 	if (uap->cmd == F_GETLK)
220 		error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
221 	return (error);
222 }
223 
/*
 * Backend for fcntl(2): perform command 'cmd' on descriptor 'fd' of the
 * calling process.  For the lock commands 'arg' points at a struct flock
 * already copied into the kernel; otherwise it carries the raw user
 * argument.  Returns 0 or an errno value.
 */
int
kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
{
	struct filedesc *fdp;
	struct flock *flp;
	struct file *fp;
	struct proc *p;
	char *pop;
	struct vnode *vp;
	u_int newmin;
	int error, flg, tmp;

	error = 0;
	flg = F_POSIX;
	p = td->td_proc;
	fdp = p->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);
	/* Unsigned compare also rejects negative descriptors. */
	if ((unsigned)fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fd]) == NULL) {
		FILEDESC_UNLOCK(fdp);
		error = EBADF;
		goto done2;
	}
	/* Per-descriptor flag byte (UF_* flags, e.g. close-on-exec). */
	pop = &fdp->fd_ofileflags[fd];

	switch (cmd) {
	case F_DUPFD:
		FILEDESC_UNLOCK(fdp);
		newmin = arg;
		/* The requested minimum must lie below the resource limits. */
		if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
		    newmin >= maxfilesperproc) {
			error = EINVAL;
			break;
		}
		error = do_dup(td, DUP_VARIABLE, fd, newmin, td->td_retval);
		break;

	case F_GETFD:
		/* Report close-on-exec in the fcntl(2) encoding. */
		td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
		FILEDESC_UNLOCK(fdp);
		break;

	case F_SETFD:
		*pop = (*pop &~ UF_EXCLOSE) |
		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
		FILEDESC_UNLOCK(fdp);
		break;

	case F_GETFL:
		FILE_LOCK(fp);
		FILEDESC_UNLOCK(fdp);
		/* Convert kernel FXXX flags back to open(2) OXXX flags. */
		td->td_retval[0] = OFLAGS(fp->f_flag);
		FILE_UNLOCK(fp);
		break;

	case F_SETFL:
		FILE_LOCK(fp);
		FILEDESC_UNLOCK(fdp);
		fhold_locked(fp);	/* keep fp across the ioctls below */
		/* Only the FCNTLFLAGS subset may be changed here. */
		fp->f_flag &= ~FCNTLFLAGS;
		fp->f_flag |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
		FILE_UNLOCK(fp);
		/* Push the new non-blocking mode down to the object. */
		tmp = fp->f_flag & FNONBLOCK;
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		if (error) {
			fdrop(fp, td);
			break;
		}
		tmp = fp->f_flag & FASYNC;
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		if (error == 0) {
			/* Both ioctls succeeded; done. */
			fdrop(fp, td);
			break;
		}
		/* FIOASYNC failed: back out the FIONBIO change. */
		FILE_LOCK(fp);
		fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		tmp = 0;
		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		fdrop(fp, td);
		break;

	case F_GETOWN:
		fhold(fp);
		FILEDESC_UNLOCK(fdp);
		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
		if (error == 0)
			td->td_retval[0] = tmp;
		fdrop(fp, td);
		break;

	case F_SETOWN:
		fhold(fp);
		FILEDESC_UNLOCK(fdp);
		tmp = arg;
		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
		fdrop(fp, td);
		break;

	case F_SETLKW:
		flg |= F_WAIT;
		/* FALLTHROUGH F_SETLK */

	case F_SETLK:
		/* Advisory locks only apply to vnodes. */
		if (fp->f_type != DTYPE_VNODE) {
			FILEDESC_UNLOCK(fdp);
			error = EBADF;
			break;
		}

		flp = (struct flock *)arg;
		/* Rebase SEEK_CUR requests on the current file offset. */
		if (flp->l_whence == SEEK_CUR) {
			if (fp->f_offset < 0 ||
			    (flp->l_start > 0 &&
			     fp->f_offset > OFF_MAX - flp->l_start)) {
				FILEDESC_UNLOCK(fdp);
				error = EOVERFLOW;
				break;
			}
			flp->l_start += fp->f_offset;
		}

		/*
		 * VOP_ADVLOCK() may block.
		 */
		fhold(fp);
		FILEDESC_UNLOCK(fdp);
		vp = fp->f_data;

		/* POSIX locks are recorded against the process leader. */
		switch (flp->l_type) {
		case F_RDLCK:
			if ((fp->f_flag & FREAD) == 0) {
				error = EBADF;
				break;
			}
			PROC_LOCK(p->p_leader);
			p->p_leader->p_flag |= P_ADVLOCK;
			PROC_UNLOCK(p->p_leader);
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    flp, flg);
			break;
		case F_WRLCK:
			if ((fp->f_flag & FWRITE) == 0) {
				error = EBADF;
				break;
			}
			PROC_LOCK(p->p_leader);
			p->p_leader->p_flag |= P_ADVLOCK;
			PROC_UNLOCK(p->p_leader);
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    flp, flg);
			break;
		case F_UNLCK:
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
			    flp, F_POSIX);
			break;
		default:
			error = EINVAL;
			break;
		}
		/* Check for race with close */
		FILEDESC_LOCK(fdp);
		if ((unsigned) fd >= fdp->fd_nfiles ||
		    fp != fdp->fd_ofiles[fd]) {
			FILEDESC_UNLOCK(fdp);
			/* Descriptor was closed under us: undo the lock. */
			flp->l_whence = SEEK_SET;
			flp->l_start = 0;
			flp->l_len = 0;
			flp->l_type = F_UNLCK;
			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
					   F_UNLCK, flp, F_POSIX);
		} else
			FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		break;

	case F_GETLK:
		if (fp->f_type != DTYPE_VNODE) {
			FILEDESC_UNLOCK(fdp);
			error = EBADF;
			break;
		}
		flp = (struct flock *)arg;
		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
		    flp->l_type != F_UNLCK) {
			FILEDESC_UNLOCK(fdp);
			error = EINVAL;
			break;
		}
		/* Rebase SEEK_CUR requests, guarding against overflow. */
		if (flp->l_whence == SEEK_CUR) {
			if ((flp->l_start > 0 &&
			    fp->f_offset > OFF_MAX - flp->l_start) ||
			    (flp->l_start < 0 &&
			     fp->f_offset < OFF_MIN - flp->l_start)) {
				FILEDESC_UNLOCK(fdp);
				error = EOVERFLOW;
				break;
			}
			flp->l_start += fp->f_offset;
		}
		/*
		 * VOP_ADVLOCK() may block.
		 */
		fhold(fp);
		FILEDESC_UNLOCK(fdp);
		vp = fp->f_data;
		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
		    F_POSIX);
		fdrop(fp, td);
		break;
	default:
		FILEDESC_UNLOCK(fdp);
		error = EINVAL;
		break;
	}
done2:
	mtx_unlock(&Giant);
	return (error);
}
444 
445 /*
446  * Common code for dup, dup2, and fcntl(F_DUPFD).
447  */
/*
 * Common code for dup, dup2, and fcntl(F_DUPFD): install a second
 * reference to descriptor 'old' at descriptor 'new' (DUP_FIXED) or at
 * the first free slot at or above 'new' (DUP_VARIABLE).  The resulting
 * descriptor number is stored in *retval.  Returns 0 or an errno value.
 */
static int
do_dup(td, type, old, new, retval)
	enum dup_type type;
	int old, new;
	register_t *retval;
	struct thread *td;
{
	struct filedesc *fdp;
	struct proc *p;
	struct file *fp;
	struct file *delfp;
	int error, newfd;

	p = td->td_proc;
	fdp = p->p_fd;

	/*
	 * Verify we have a valid descriptor to dup from and possibly to
	 * dup to.
	 */
	if (old < 0 || new < 0 || new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
	    new >= maxfilesperproc)
		return (EBADF);
	FILEDESC_LOCK(fdp);
	if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
		FILEDESC_UNLOCK(fdp);
		return (EBADF);
	}
	/* dup2() of a descriptor onto itself just reports the fd. */
	if (type == DUP_FIXED && old == new) {
		*retval = new;
		FILEDESC_UNLOCK(fdp);
		return (0);
	}
	fp = fdp->fd_ofiles[old];
	fhold(fp);	/* keep fp alive across any lock drops below */

	/*
	 * Expand the table for the new descriptor if needed.  This may
	 * block and drop and reacquire the filedesc lock.
	 */
	if (type == DUP_VARIABLE || new >= fdp->fd_nfiles) {
		error = fdalloc(td, new, &newfd);
		if (error) {
			FILEDESC_UNLOCK(fdp);
			fdrop(fp, td);
			return (error);
		}
	}
	if (type == DUP_VARIABLE)
		new = newfd;

	/*
	 * If the old file changed out from under us then treat it as a
	 * bad file descriptor.  Userland should do its own locking to
	 * avoid this case.
	 */
	if (fdp->fd_ofiles[old] != fp) {
		/* Give back the slot fdalloc() may have reserved. */
		if (fdp->fd_ofiles[new] == NULL) {
			if (new < fdp->fd_freefile)
				fdp->fd_freefile = new;
			while (fdp->fd_lastfile > 0 &&
			    fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
				fdp->fd_lastfile--;
		}
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		return (EBADF);
	}
	KASSERT(old != new, ("new fd is same as old"));

	/*
	 * Save info on the descriptor being overwritten.  We have
	 * to do the unmap now, but we cannot close it without
	 * introducing an ownership race for the slot.
	 */
	delfp = fdp->fd_ofiles[new];
	KASSERT(delfp == NULL || type == DUP_FIXED,
	    ("dup() picked an open file"));
#if 0
	if (delfp && (fdp->fd_ofileflags[new] & UF_MAPPED))
		(void) munmapfd(td, new);
#endif

	/*
	 * Duplicate the source descriptor, update lastfile
	 */
	fdp->fd_ofiles[new] = fp;
	/* The duplicate does not inherit close-on-exec. */
	fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
	if (new > fdp->fd_lastfile)
		fdp->fd_lastfile = new;
	FILEDESC_UNLOCK(fdp);
	*retval = new;

	/*
	 * If we dup'd over a valid file, we now own the reference to it
	 * and must dispose of it using closef() semantics (as if a
	 * close() were performed on it).
	 */
	if (delfp) {
		mtx_lock(&Giant);
		(void) closef(delfp, td);
		mtx_unlock(&Giant);
	}
	return (0);
}
553 
554 /*
555  * If sigio is on the list associated with a process or process group,
556  * disable signalling from the device, remove sigio from the list and
557  * free sigio.
558  */
void
funsetown(sigiop)
	struct sigio **sigiop;
{
	struct sigio *sigio;

	/* Detach the sigio from its owner's back-pointer under the lock. */
	SIGIO_LOCK();
	sigio = *sigiop;
	if (sigio == NULL) {
		SIGIO_UNLOCK();
		return;
	}
	*(sigio->sio_myref) = NULL;
	/* A negative sio_pgid denotes a process-group owner. */
	if ((sigio)->sio_pgid < 0) {
		struct pgrp *pg = (sigio)->sio_pgrp;
		PGRP_LOCK(pg);
		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
			     sigio, sio_pgsigio);
		PGRP_UNLOCK(pg);
	} else {
		struct proc *p = (sigio)->sio_proc;
		PROC_LOCK(p);
		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
			     sigio, sio_pgsigio);
		PROC_UNLOCK(p);
	}
	SIGIO_UNLOCK();
	/* Free the credential reference and the sigio outside the lock. */
	crfree(sigio->sio_ucred);
	FREE(sigio, M_SIGIO);
}
589 
590 /*
591  * Free a list of sigio structures.
592  * We only need to lock the SIGIO_LOCK because we have made ourselves
593  * inaccessable to callers of fsetown and therefore do not need to lock
594  * the proc or pgrp struct for the list manipulation.
595  */
void
funsetownlst(sigiolst)
	struct sigiolst *sigiolst;
{
	struct proc *p;
	struct pgrp *pg;
	struct sigio *sigio;

	sigio = SLIST_FIRST(sigiolst);
	if (sigio == NULL)
		return;
	p = NULL;
	pg = NULL;

	/*
	 * Every entry of the list should belong
	 * to a single proc or pgrp.
	 */
	if (sigio->sio_pgid < 0) {
		pg = sigio->sio_pgrp;
		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
	} else /* if (sigio->sio_pgid > 0) */ {
		p = sigio->sio_proc;
		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
	}

	SIGIO_LOCK();
	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
		/* Clear the owner's back pointer before unlinking. */
		*(sigio->sio_myref) = NULL;
		if (pg != NULL) {
			KASSERT(sigio->sio_pgid < 0,
			    ("Proc sigio in pgrp sigio list"));
			KASSERT(sigio->sio_pgrp == pg,
			    ("Bogus pgrp in sigio list"));
			PGRP_LOCK(pg);
			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
			    sio_pgsigio);
			PGRP_UNLOCK(pg);
		} else /* if (p != NULL) */ {
			KASSERT(sigio->sio_pgid > 0,
			    ("Pgrp sigio in proc sigio list"));
			KASSERT(sigio->sio_proc == p,
			    ("Bogus proc in sigio list"));
			PROC_LOCK(p);
			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
			    sio_pgsigio);
			PROC_UNLOCK(p);
		}
		/* Drop the SIGIO lock while freeing each entry. */
		SIGIO_UNLOCK();
		crfree(sigio->sio_ucred);
		FREE(sigio, M_SIGIO);
		SIGIO_LOCK();
	}
	SIGIO_UNLOCK();
}
651 
652 /*
653  * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
654  *
655  * After permission checking, add a sigio structure to the sigio list for
656  * the process or process group.
657  */
int
fsetown(pgid, sigiop)
	pid_t pgid;
	struct sigio **sigiop;
{
	struct proc *proc;
	struct pgrp *pgrp;
	struct sigio *sigio;
	int ret;

	/* A pgid of zero simply clears the current ownership. */
	if (pgid == 0) {
		funsetown(sigiop);
		return (0);
	}

	ret = 0;

	/* Allocate and fill in the new sigio out of locks. */
	MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, M_WAITOK);
	sigio->sio_pgid = pgid;
	sigio->sio_ucred = crhold(curthread->td_ucred);
	sigio->sio_myref = sigiop;

	sx_slock(&proctree_lock);
	if (pgid > 0) {
		/* Positive pgid targets a single process. */
		proc = pfind(pgid);
		if (proc == NULL) {
			ret = ESRCH;
			goto fail;
		}

		/*
		 * Policy - Don't allow a process to FSETOWN a process
		 * in another session.
		 *
		 * Remove this test to allow maximum flexibility or
		 * restrict FSETOWN to the current process or process
		 * group for maximum safety.
		 */
		/* NOTE(review): proc is unlocked before the p_session
		 * read below — presumably safe because proctree_lock is
		 * held shared; confirm. */
		PROC_UNLOCK(proc);
		if (proc->p_session != curthread->td_proc->p_session) {
			ret = EPERM;
			goto fail;
		}

		pgrp = NULL;
	} else /* if (pgid < 0) */ {
		/* Negative pgid targets the process group -pgid. */
		pgrp = pgfind(-pgid);
		if (pgrp == NULL) {
			ret = ESRCH;
			goto fail;
		}
		PGRP_UNLOCK(pgrp);

		/*
		 * Policy - Don't allow a process to FSETOWN a process
		 * in another session.
		 *
		 * Remove this test to allow maximum flexibility or
		 * restrict FSETOWN to the current process or process
		 * group for maximum safety.
		 */
		if (pgrp->pg_session != curthread->td_proc->p_session) {
			ret = EPERM;
			goto fail;
		}

		proc = NULL;
	}
	/* Drop any previous owner before installing the new one. */
	funsetown(sigiop);
	if (pgid > 0) {
		PROC_LOCK(proc);
		/*
		 * Since funsetownlst() is called without the proctree
		 * locked, we need to check for P_WEXIT.
		 * XXX: is ESRCH correct?
		 */
		if ((proc->p_flag & P_WEXIT) != 0) {
			PROC_UNLOCK(proc);
			ret = ESRCH;
			goto fail;
		}
		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
		sigio->sio_proc = proc;
		PROC_UNLOCK(proc);
	} else {
		PGRP_LOCK(pgrp);
		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
		sigio->sio_pgrp = pgrp;
		PGRP_UNLOCK(pgrp);
	}
	sx_sunlock(&proctree_lock);
	/* Publish the new sigio under the SIGIO lock. */
	SIGIO_LOCK();
	*sigiop = sigio;
	SIGIO_UNLOCK();
	return (0);

fail:
	/* Undo the allocation done above. */
	sx_sunlock(&proctree_lock);
	crfree(sigio->sio_ucred);
	FREE(sigio, M_SIGIO);
	return (ret);
}
761 
762 /*
763  * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
764  */
765 pid_t
766 fgetown(sigiop)
767 	struct sigio **sigiop;
768 {
769 	pid_t pgid;
770 
771 	SIGIO_LOCK();
772 	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
773 	SIGIO_UNLOCK();
774 	return (pgid);
775 }
776 
777 /*
778  * Close a file descriptor.
779  */
780 #ifndef _SYS_SYSPROTO_H_
781 struct close_args {
782         int     fd;
783 };
784 #endif
785 /*
786  * MPSAFE
787  */
788 /* ARGSUSED */
int
close(td, uap)
	struct thread *td;
	struct close_args *uap;
{
	struct filedesc *fdp;
	struct file *fp;
	int fd, error;

	fd = uap->fd;
	error = 0;
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);
	/* Unsigned compare also rejects negative descriptors. */
	if ((unsigned)fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fd]) == NULL) {
		FILEDESC_UNLOCK(fdp);
		error = EBADF;
		goto done2;
	}
#if 0
	if (fdp->fd_ofileflags[fd] & UF_MAPPED)
		(void) munmapfd(td, fd);
#endif
	/* Clear the slot while the table lock is still held. */
	fdp->fd_ofiles[fd] = NULL;
	fdp->fd_ofileflags[fd] = 0;

	/*
	 * we now hold the fp reference that used to be owned by the descriptor
	 * array.
	 */
	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
		fdp->fd_lastfile--;
	if (fd < fdp->fd_freefile)
		fdp->fd_freefile = fd;
	/* Detach any knotes attached to this descriptor. */
	if (fd < fdp->fd_knlistsize) {
		FILEDESC_UNLOCK(fdp);
		knote_fdclose(td, fd);
	} else
		FILEDESC_UNLOCK(fdp);

	/* Release the reference we took over from the table. */
	error = closef(fp, td);
done2:
	mtx_unlock(&Giant);
	return (error);
}
835 
836 #if defined(COMPAT_43) || defined(COMPAT_SUNOS)
837 /*
838  * Return status information about a file descriptor.
839  */
840 #ifndef _SYS_SYSPROTO_H_
841 struct ofstat_args {
842 	int	fd;
843 	struct	ostat *sb;
844 };
845 #endif
846 /*
847  * MPSAFE
848  */
849 /* ARGSUSED */
850 int
851 ofstat(td, uap)
852 	struct thread *td;
853 	struct ofstat_args *uap;
854 {
855 	struct file *fp;
856 	struct stat ub;
857 	struct ostat oub;
858 	int error;
859 
860 	mtx_lock(&Giant);
861 	if ((error = fget(td, uap->fd, &fp)) != 0)
862 		goto done2;
863 	error = fo_stat(fp, &ub, td->td_ucred, td);
864 	if (error == 0) {
865 		cvtstat(&ub, &oub);
866 		error = copyout(&oub, uap->sb, sizeof(oub));
867 	}
868 	fdrop(fp, td);
869 done2:
870 	mtx_unlock(&Giant);
871 	return (error);
872 }
873 #endif /* COMPAT_43 || COMPAT_SUNOS */
874 
875 /*
876  * Return status information about a file descriptor.
877  */
878 #ifndef _SYS_SYSPROTO_H_
879 struct fstat_args {
880 	int	fd;
881 	struct	stat *sb;
882 };
883 #endif
884 /*
885  * MPSAFE
886  */
887 /* ARGSUSED */
888 int
889 fstat(td, uap)
890 	struct thread *td;
891 	struct fstat_args *uap;
892 {
893 	struct file *fp;
894 	struct stat ub;
895 	int error;
896 
897 	mtx_lock(&Giant);
898 	if ((error = fget(td, uap->fd, &fp)) != 0)
899 		goto done2;
900 	error = fo_stat(fp, &ub, td->td_ucred, td);
901 	if (error == 0)
902 		error = copyout(&ub, uap->sb, sizeof(ub));
903 	fdrop(fp, td);
904 done2:
905 	mtx_unlock(&Giant);
906 	return (error);
907 }
908 
909 /*
910  * Return status information about a file descriptor.
911  */
912 #ifndef _SYS_SYSPROTO_H_
913 struct nfstat_args {
914 	int	fd;
915 	struct	nstat *sb;
916 };
917 #endif
918 /*
919  * MPSAFE
920  */
921 /* ARGSUSED */
922 int
923 nfstat(td, uap)
924 	struct thread *td;
925 	struct nfstat_args *uap;
926 {
927 	struct file *fp;
928 	struct stat ub;
929 	struct nstat nub;
930 	int error;
931 
932 	mtx_lock(&Giant);
933 	if ((error = fget(td, uap->fd, &fp)) != 0)
934 		goto done2;
935 	error = fo_stat(fp, &ub, td->td_ucred, td);
936 	if (error == 0) {
937 		cvtnstat(&ub, &nub);
938 		error = copyout(&nub, uap->sb, sizeof(nub));
939 	}
940 	fdrop(fp, td);
941 done2:
942 	mtx_unlock(&Giant);
943 	return (error);
944 }
945 
946 /*
947  * Return pathconf information about a file descriptor.
948  */
949 #ifndef _SYS_SYSPROTO_H_
950 struct fpathconf_args {
951 	int	fd;
952 	int	name;
953 };
954 #endif
955 /*
956  * MPSAFE
957  */
958 /* ARGSUSED */
959 int
960 fpathconf(td, uap)
961 	struct thread *td;
962 	struct fpathconf_args *uap;
963 {
964 	struct file *fp;
965 	struct vnode *vp;
966 	int error;
967 
968 	if ((error = fget(td, uap->fd, &fp)) != 0)
969 		return (error);
970 
971 	/* If asynchronous I/O is available, it works for all descriptors. */
972 	if (uap->name == _PC_ASYNC_IO) {
973 		td->td_retval[0] = async_io_version;
974 		goto out;
975 	}
976 	switch (fp->f_type) {
977 	case DTYPE_PIPE:
978 	case DTYPE_SOCKET:
979 		if (uap->name != _PC_PIPE_BUF) {
980 			error = EINVAL;
981 		} else {
982 			td->td_retval[0] = PIPE_BUF;
983 			error = 0;
984 		}
985 		break;
986 	case DTYPE_FIFO:
987 	case DTYPE_VNODE:
988 		vp = fp->f_data;
989 		mtx_lock(&Giant);
990 		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
991 		mtx_unlock(&Giant);
992 		break;
993 	default:
994 		error = EOPNOTSUPP;
995 		break;
996 	}
997 out:
998 	fdrop(fp, td);
999 	return (error);
1000 }
1001 
1002 /*
1003  * Allocate a file descriptor for the process.
1004  */
1005 static int fdexpand;
1006 SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, "");
1007 
int
fdalloc(td, want, result)
	struct thread *td;
	int want;
	int *result;
{
	struct proc *p = td->td_proc;
	struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	int lim, last, nfiles;
	struct file **newofile, **oldofile;
	char *newofileflags;

	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);

	/*
	 * Search for a free descriptor starting at the higher
	 * of want or fd_freefile.  If that fails, consider
	 * expanding the ofile array.
	 */
	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
	for (;;) {
		last = min(fdp->fd_nfiles, lim);
		i = max(want, fdp->fd_freefile);
		for (; i < last; i++) {
			if (fdp->fd_ofiles[i] == NULL) {
				/* Found a free slot; claim it. */
				fdp->fd_ofileflags[i] = 0;
				if (i > fdp->fd_lastfile)
					fdp->fd_lastfile = i;
				if (want <= fdp->fd_freefile)
					fdp->fd_freefile = i;
				*result = i;
				return (0);
			}
		}

		/*
		 * No space in current array.  Expand?
		 */
		if (i >= lim)
			return (EMFILE);
		/* Grow geometrically, starting at NDEXTENT slots. */
		if (fdp->fd_nfiles < NDEXTENT)
			nfiles = NDEXTENT;
		else
			nfiles = 2 * fdp->fd_nfiles;
		while (nfiles < want)
			nfiles <<= 1;
		FILEDESC_UNLOCK(fdp);
		/*
		 * XXX malloc() calls uma_large_malloc() for sizes larger
		 * than KMEM_ZMAX bytes. uma_large_malloc() requires Giant.
		 */
		mtx_lock(&Giant);
		newofile = malloc(nfiles * OFILESIZE, M_FILEDESC, M_WAITOK);
		mtx_unlock(&Giant);

		/*
		 * Deal with file-table extend race that might have
		 * occurred while filedesc was unlocked.
		 */
		FILEDESC_LOCK(fdp);
		if (fdp->fd_nfiles >= nfiles) {
			/* Someone else grew the table; discard ours. */
			/* XXX uma_large_free() needs Giant. */
			FILEDESC_UNLOCK(fdp);
			mtx_lock(&Giant);
			free(newofile, M_FILEDESC);
			mtx_unlock(&Giant);
			FILEDESC_LOCK(fdp);
			continue;
		}
		/* The flag bytes live directly after the pointer array. */
		newofileflags = (char *) &newofile[nfiles];
		/*
		 * Copy the existing ofile and ofileflags arrays
		 * and zero the new portion of each array.
		 */
		i = fdp->fd_nfiles * sizeof(struct file *);
		bcopy(fdp->fd_ofiles, newofile,	i);
		bzero((char *)newofile + i,
		    nfiles * sizeof(struct file *) - i);
		i = fdp->fd_nfiles * sizeof(char);
		bcopy(fdp->fd_ofileflags, newofileflags, i);
		bzero(newofileflags + i, nfiles * sizeof(char) - i);
		/* The initial NDFILE-slot table is embedded, not malloc'd. */
		if (fdp->fd_nfiles > NDFILE)
			oldofile = fdp->fd_ofiles;
		else
			oldofile = NULL;
		fdp->fd_ofiles = newofile;
		fdp->fd_ofileflags = newofileflags;
		fdp->fd_nfiles = nfiles;
		fdexpand++;
		if (oldofile != NULL) {
			/* XXX uma_large_free() needs Giant. */
			FILEDESC_UNLOCK(fdp);
			mtx_lock(&Giant);
			free(oldofile, M_FILEDESC);
			mtx_unlock(&Giant);
			FILEDESC_LOCK(fdp);
		}
	}
	/* NOTREACHED: the loop above only exits via return. */
	return (0);
}
1109 
1110 /*
1111  * Check to see whether n user file descriptors
1112  * are available to the process p.
1113  */
int
fdavail(td, n)
	struct thread *td;
	int n;
{
	struct proc *p = td->td_proc;
	struct filedesc *fdp = td->td_proc->p_fd;
	struct file **fpp;
	int i, lim, last;

	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);

	/* Effective descriptor limit for this process. */
	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
	/* Room beyond the current table size counts as free slots. */
	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
		return (1);
	last = min(fdp->fd_nfiles, lim);
	/* Count holes from the first possibly-free slot onward. */
	fpp = &fdp->fd_ofiles[fdp->fd_freefile];
	for (i = last - fdp->fd_freefile; --i >= 0; fpp++) {
		if (*fpp == NULL && --n <= 0)
			return (1);
	}
	return (0);
}
1137 
1138 /*
1139  * Create a new open file structure and allocate
1140  * a file decriptor for the process that refers to it.
1141  */
int
falloc(td, resultfp, resultfd)
	struct thread *td;
	struct file **resultfp;
	int *resultfd;
{
	struct proc *p = td->td_proc;
	struct file *fp, *fq;
	int error, i;

	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
	/* Enforce the global open-file limit. */
	sx_xlock(&filelist_lock);
	if (nfiles >= maxfiles) {
		sx_xunlock(&filelist_lock);
		uma_zfree(file_zone, fp);
		tablefull("file");
		return (ENFILE);
	}
	nfiles++;

	/*
	 * If the process has file descriptor zero open, add the new file
	 * descriptor to the list of open files at that point, otherwise
	 * put it at the front of the list of open files.
	 */
	fp->f_mtxp = mtx_pool_alloc();
	fp->f_gcflag = 0;
	fp->f_count = 1;	/* this reference is handed to the caller */
	fp->f_cred = crhold(td->td_ucred);
	fp->f_ops = &badfileops;	/* no real ops until f_ops is set */
	fp->f_seqcount = 1;
	FILEDESC_LOCK(p->p_fd);
	if ((fq = p->p_fd->fd_ofiles[0])) {
		LIST_INSERT_AFTER(fq, fp, f_list);
	} else {
		LIST_INSERT_HEAD(&filehead, fp, f_list);
	}
	sx_xunlock(&filelist_lock);
	/* fdalloc() may drop and reacquire the filedesc lock. */
	if ((error = fdalloc(td, 0, &i))) {
		FILEDESC_UNLOCK(p->p_fd);
		fdrop(fp, td);
		return (error);
	}
	p->p_fd->fd_ofiles[i] = fp;
	FILEDESC_UNLOCK(p->p_fd);
	if (resultfp)
		*resultfp = fp;
	if (resultfd)
		*resultfd = i;
	return (0);
}
1193 
1194 /*
1195  * Free a file descriptor.
1196  */
1197 void
1198 ffree(fp)
1199 	struct file *fp;
1200 {
1201 
1202 	KASSERT(fp->f_count == 0, ("ffree: fp_fcount not 0!"));
1203 	sx_xlock(&filelist_lock);
1204 	LIST_REMOVE(fp, f_list);
1205 	nfiles--;
1206 	sx_xunlock(&filelist_lock);
1207 	crfree(fp->f_cred);
1208 	uma_zfree(file_zone, fp);
1209 }
1210 
1211 /*
1212  * Build a new filedesc structure from another.
1213  * Copy the current, root, and jail root vnode references.
1214  */
struct filedesc *
fdinit(fdp)
	struct filedesc *fdp;
{
	struct filedesc0 *newfdp;

	/*
	 * Allocate the extended filedesc0, which embeds the static
	 * NDFILE-entry descriptor tables, zeroed.
	 */
	MALLOC(newfdp, struct filedesc0 *, sizeof(struct filedesc0),
	    M_FILEDESC, M_WAITOK | M_ZERO);
	mtx_init(&newfdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
	/*
	 * Copy the current, root and jail directories and take a vnode
	 * reference on each.  NOTE(review): fdp is read unlocked here;
	 * presumably the caller guarantees it is stable -- confirm.
	 */
	newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
	if (newfdp->fd_fd.fd_cdir)
		VREF(newfdp->fd_fd.fd_cdir);
	newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
	if (newfdp->fd_fd.fd_rdir)
		VREF(newfdp->fd_fd.fd_rdir);
	newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
	if (newfdp->fd_fd.fd_jdir)
		VREF(newfdp->fd_fd.fd_jdir);

	/* Create the file descriptor table. */
	newfdp->fd_fd.fd_refcnt = 1;
	newfdp->fd_fd.fd_cmask = cmask;
	/* Start out on the embedded static tables. */
	newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
	newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
	newfdp->fd_fd.fd_nfiles = NDFILE;
	newfdp->fd_fd.fd_knlistsize = -1;	/* no knotes yet */
	return (&newfdp->fd_fd);
}
1243 
1244 /*
1245  * Share a filedesc structure.
1246  */
struct filedesc *
fdshare(fdp)
	struct filedesc *fdp;
{
	/*
	 * Take an additional reference on the structure under its lock;
	 * fdfree() releases it.
	 */
	FILEDESC_LOCK(fdp);
	fdp->fd_refcnt++;
	FILEDESC_UNLOCK(fdp);
	return (fdp);
}
1256 
1257 /*
1258  * Copy a filedesc structure.
1259  * A NULL pointer in returns a NULL reference, this is to ease callers,
1260  * not catch errors.
1261  */
struct filedesc *
fdcopy(fdp)
	struct filedesc *fdp;
{
	struct filedesc *newfdp;
	struct file **fpp;
	int i, j;

	/* Certain daemons might not have file descriptors. */
	if (fdp == NULL)
		return (NULL);

	/* Entered with fdp locked; we also return with it locked. */
	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);

	/*
	 * Drop the lock across the blocking allocation; the contents
	 * are snapshotted under the lock immediately afterwards.
	 */
	FILEDESC_UNLOCK(fdp);
	MALLOC(newfdp, struct filedesc *, sizeof(struct filedesc0),
	    M_FILEDESC, M_WAITOK);
	FILEDESC_LOCK(fdp);
	bcopy(fdp, newfdp, sizeof(struct filedesc));
	FILEDESC_UNLOCK(fdp);
	/* The copy must get its own, freshly initialized lock. */
	bzero(&newfdp->fd_mtx, sizeof(newfdp->fd_mtx));
	mtx_init(&newfdp->fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
	/* bcopy() copied bare vnode pointers; take our own references. */
	if (newfdp->fd_cdir)
		VREF(newfdp->fd_cdir);
	if (newfdp->fd_rdir)
		VREF(newfdp->fd_rdir);
	if (newfdp->fd_jdir)
		VREF(newfdp->fd_jdir);
	newfdp->fd_refcnt = 1;

	/*
	 * If the number of open files fits in the internal arrays
	 * of the open file structure, use them, otherwise allocate
	 * additional memory for the number of descriptors currently
	 * in use.
	 */
	FILEDESC_LOCK(fdp);
	newfdp->fd_lastfile = fdp->fd_lastfile;
	newfdp->fd_nfiles = fdp->fd_nfiles;
	if (newfdp->fd_lastfile < NDFILE) {
		newfdp->fd_ofiles = ((struct filedesc0 *) newfdp)->fd_dfiles;
		newfdp->fd_ofileflags =
		    ((struct filedesc0 *) newfdp)->fd_dfileflags;
		i = NDFILE;
	} else {
		/*
		 * Compute the smallest multiple of NDEXTENT needed
		 * for the file descriptors currently in use,
		 * allowing the table to shrink.
		 */
retry:
		i = newfdp->fd_nfiles;
		while (i > 2 * NDEXTENT && i > newfdp->fd_lastfile * 2)
			i /= 2;
		/* Drop the lock again for the blocking table allocation. */
		FILEDESC_UNLOCK(fdp);
		MALLOC(newfdp->fd_ofiles, struct file **, i * OFILESIZE,
		    M_FILEDESC, M_WAITOK);
		FILEDESC_LOCK(fdp);
		/*
		 * The source table may have grown or shrunk while
		 * unlocked; recompute the desired size and verify.
		 */
		newfdp->fd_lastfile = fdp->fd_lastfile;
		newfdp->fd_nfiles = fdp->fd_nfiles;
		j = newfdp->fd_nfiles;
		while (j > 2 * NDEXTENT && j > newfdp->fd_lastfile * 2)
			j /= 2;
		if (i != j) {
			/*
			 * The size of the original table has changed.
			 * Go over once again.
			 */
			FILEDESC_UNLOCK(fdp);
			FREE(newfdp->fd_ofiles, M_FILEDESC);
			FILEDESC_LOCK(fdp);
			newfdp->fd_lastfile = fdp->fd_lastfile;
			newfdp->fd_nfiles = fdp->fd_nfiles;
			goto retry;
		}
		/* The flag array lives directly behind the pointer array. */
		newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i];
	}
	newfdp->fd_nfiles = i;
	bcopy(fdp->fd_ofiles, newfdp->fd_ofiles, i * sizeof(struct file **));
	bcopy(fdp->fd_ofileflags, newfdp->fd_ofileflags, i * sizeof(char));

	/*
	 * kq descriptors cannot be copied.
	 */
	if (newfdp->fd_knlistsize != -1) {
		/* Walk downward so fd_lastfile can be trimmed as we go. */
		fpp = &newfdp->fd_ofiles[newfdp->fd_lastfile];
		for (i = newfdp->fd_lastfile; i >= 0; i--, fpp--) {
			if (*fpp != NULL && (*fpp)->f_type == DTYPE_KQUEUE) {
				*fpp = NULL;
				if (i < newfdp->fd_freefile)
					newfdp->fd_freefile = i;
			}
			if (*fpp == NULL && i == newfdp->fd_lastfile && i > 0)
				newfdp->fd_lastfile--;
		}
		newfdp->fd_knlist = NULL;
		newfdp->fd_knlistsize = -1;
		newfdp->fd_knhash = NULL;
		newfdp->fd_knhashmask = 0;
	}

	/* Take a reference on every file the new table points at. */
	fpp = newfdp->fd_ofiles;
	for (i = newfdp->fd_lastfile; i-- >= 0; fpp++) {
		if (*fpp != NULL)
			fhold(*fpp);
	}
	return (newfdp);
}
1370 
/*
 * A mutex to protect the association between a proc and filedesc.
 * It guards the p_fd pointer itself (set to NULL in fdfree(), read in
 * sysctl_kern_file()), not the contents of the filedesc.
 */
struct mtx	fdesc_mtx;
MTX_SYSINIT(fdesc, &fdesc_mtx, "fdesc", MTX_DEF);
1374 
1375 /*
1376  * Release a filedesc structure.
1377  */
void
fdfree(td)
	struct thread *td;
{
	struct filedesc *fdp;
	struct file **fpp;
	int i;

	/* Certain daemons might not have file descriptors. */
	fdp = td->td_proc->p_fd;
	if (fdp == NULL)
		return;

	/* Drop our reference; bail out if other sharers remain. */
	FILEDESC_LOCK(fdp);
	if (--fdp->fd_refcnt > 0) {
		FILEDESC_UNLOCK(fdp);
		return;
	}

	/*
	 * We are the last reference to the structure, so we can
	 * safely assume it will not change out from under us.
	 */
	FILEDESC_UNLOCK(fdp);
	/* Close every open file; closef() drops each table reference. */
	fpp = fdp->fd_ofiles;
	for (i = fdp->fd_lastfile; i-- >= 0; fpp++) {
		if (*fpp)
			(void) closef(*fpp, td);
	}

	/* XXX This should happen earlier. */
	mtx_lock(&fdesc_mtx);
	td->td_proc->p_fd = NULL;
	mtx_unlock(&fdesc_mtx);

	/* Only a grown table was allocated separately from filedesc0. */
	if (fdp->fd_nfiles > NDFILE)
		FREE(fdp->fd_ofiles, M_FILEDESC);
	/* Release directory vnode references taken at fdinit()/fdcopy(). */
	if (fdp->fd_cdir)
		vrele(fdp->fd_cdir);
	if (fdp->fd_rdir)
		vrele(fdp->fd_rdir);
	if (fdp->fd_jdir)
		vrele(fdp->fd_jdir);
	if (fdp->fd_knlist)
		FREE(fdp->fd_knlist, M_KQUEUE);
	if (fdp->fd_knhash)
		FREE(fdp->fd_knhash, M_KQUEUE);
	mtx_destroy(&fdp->fd_mtx);
	FREE(fdp, M_FILEDESC);
}
1428 
1429 /*
1430  * For setugid programs, we don't want to people to use that setugidness
1431  * to generate error messages which write to a file which otherwise would
1432  * otherwise be off-limits to the process.  We check for filesystems where
1433  * the vnode can change out from under us after execve (like [lin]procfs).
1434  *
1435  * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
1436  * sufficient.  We also don't for check setugidness since we know we are.
1437  */
1438 static int
1439 is_unsafe(struct file *fp)
1440 {
1441 	if (fp->f_type == DTYPE_VNODE) {
1442 		struct vnode *vp = fp->f_data;
1443 
1444 		if ((vp->v_vflag & VV_PROCDEP) != 0)
1445 			return (1);
1446 	}
1447 	return (0);
1448 }
1449 
1450 /*
1451  * Make this setguid thing safe, if at all possible.
1452  */
1453 void
1454 setugidsafety(td)
1455 	struct thread *td;
1456 {
1457 	struct filedesc *fdp;
1458 	int i;
1459 
1460 	/* Certain daemons might not have file descriptors. */
1461 	fdp = td->td_proc->p_fd;
1462 	if (fdp == NULL)
1463 		return;
1464 
1465 	/*
1466 	 * Note: fdp->fd_ofiles may be reallocated out from under us while
1467 	 * we are blocked in a close.  Be careful!
1468 	 */
1469 	FILEDESC_LOCK(fdp);
1470 	for (i = 0; i <= fdp->fd_lastfile; i++) {
1471 		if (i > 2)
1472 			break;
1473 		if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) {
1474 			struct file *fp;
1475 
1476 #if 0
1477 			if ((fdp->fd_ofileflags[i] & UF_MAPPED) != 0)
1478 				(void) munmapfd(td, i);
1479 #endif
1480 			if (i < fdp->fd_knlistsize) {
1481 				FILEDESC_UNLOCK(fdp);
1482 				knote_fdclose(td, i);
1483 				FILEDESC_LOCK(fdp);
1484 			}
1485 			/*
1486 			 * NULL-out descriptor prior to close to avoid
1487 			 * a race while close blocks.
1488 			 */
1489 			fp = fdp->fd_ofiles[i];
1490 			fdp->fd_ofiles[i] = NULL;
1491 			fdp->fd_ofileflags[i] = 0;
1492 			if (i < fdp->fd_freefile)
1493 				fdp->fd_freefile = i;
1494 			FILEDESC_UNLOCK(fdp);
1495 			(void) closef(fp, td);
1496 			FILEDESC_LOCK(fdp);
1497 		}
1498 	}
1499 	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
1500 		fdp->fd_lastfile--;
1501 	FILEDESC_UNLOCK(fdp);
1502 }
1503 
1504 /*
1505  * Close any files on exec?
1506  */
1507 void
1508 fdcloseexec(td)
1509 	struct thread *td;
1510 {
1511 	struct filedesc *fdp;
1512 	int i;
1513 
1514 	/* Certain daemons might not have file descriptors. */
1515 	fdp = td->td_proc->p_fd;
1516 	if (fdp == NULL)
1517 		return;
1518 
1519 	FILEDESC_LOCK(fdp);
1520 
1521 	/*
1522 	 * We cannot cache fd_ofiles or fd_ofileflags since operations
1523 	 * may block and rip them out from under us.
1524 	 */
1525 	for (i = 0; i <= fdp->fd_lastfile; i++) {
1526 		if (fdp->fd_ofiles[i] != NULL &&
1527 		    (fdp->fd_ofileflags[i] & UF_EXCLOSE)) {
1528 			struct file *fp;
1529 
1530 #if 0
1531 			if (fdp->fd_ofileflags[i] & UF_MAPPED)
1532 				(void) munmapfd(td, i);
1533 #endif
1534 			if (i < fdp->fd_knlistsize) {
1535 				FILEDESC_UNLOCK(fdp);
1536 				knote_fdclose(td, i);
1537 				FILEDESC_LOCK(fdp);
1538 			}
1539 			/*
1540 			 * NULL-out descriptor prior to close to avoid
1541 			 * a race while close blocks.
1542 			 */
1543 			fp = fdp->fd_ofiles[i];
1544 			fdp->fd_ofiles[i] = NULL;
1545 			fdp->fd_ofileflags[i] = 0;
1546 			if (i < fdp->fd_freefile)
1547 				fdp->fd_freefile = i;
1548 			FILEDESC_UNLOCK(fdp);
1549 			(void) closef(fp, td);
1550 			FILEDESC_LOCK(fdp);
1551 		}
1552 	}
1553 	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
1554 		fdp->fd_lastfile--;
1555 	FILEDESC_UNLOCK(fdp);
1556 }
1557 
1558 /*
1559  * It is unsafe for set[ug]id processes to be started with file
1560  * descriptors 0..2 closed, as these descriptors are given implicit
1561  * significance in the Standard C library.  fdcheckstd() will create a
1562  * descriptor referencing /dev/null for each of stdin, stdout, and
1563  * stderr that is not already open.
1564  */
int
fdcheckstd(td)
	struct thread *td;
{
	struct nameidata nd;
	struct filedesc *fdp;
	struct file *fp;
	register_t retval;
	int fd, i, error, flags, devnull;

	fdp = td->td_proc->p_fd;
	if (fdp == NULL)
		return (0);
	devnull = -1;		/* fd holding /dev/null once opened */
	error = 0;
	for (i = 0; i < 3; i++) {
		if (fdp->fd_ofiles[i] != NULL)
			continue;
		if (devnull < 0) {
			/* First hole: open /dev/null onto it. */
			error = falloc(td, &fp, &fd);
			if (error != 0)
				break;
			/* Holes are visited in order, so falloc returns i. */
			KASSERT(fd == i, ("oof, we didn't get our fd"));
			NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/dev/null",
			    td);
			flags = FREAD | FWRITE;
			error = vn_open(&nd, &flags, 0);
			if (error != 0) {
				/* Undo falloc()'s table entry and ref. */
				FILEDESC_LOCK(fdp);
				fdp->fd_ofiles[fd] = NULL;
				FILEDESC_UNLOCK(fdp);
				fdrop(fp, td);
				break;
			}
			NDFREE(&nd, NDF_ONLY_PNBUF);
			/* Wire the opened vnode into the file. */
			fp->f_data = nd.ni_vp;
			fp->f_flag = flags;
			fp->f_ops = &vnops;
			fp->f_type = DTYPE_VNODE;
			VOP_UNLOCK(nd.ni_vp, 0, td);
			devnull = fd;
		} else {
			/* Subsequent holes: just dup the /dev/null fd. */
			error = do_dup(td, DUP_FIXED, devnull, i, &retval);
			if (error != 0)
				break;
		}
	}
	return (error);
}
1614 
1615 /*
1616  * Internal form of close.
1617  * Decrement reference count on file structure.
1618  * Note: td may be NULL when closing a file
1619  * that was being passed in a message.
1620  */
1621 int
1622 closef(fp, td)
1623 	struct file *fp;
1624 	struct thread *td;
1625 {
1626 	struct vnode *vp;
1627 	struct flock lf;
1628 
1629 	if (fp == NULL)
1630 		return (0);
1631 	/*
1632 	 * POSIX record locking dictates that any close releases ALL
1633 	 * locks owned by this process.  This is handled by setting
1634 	 * a flag in the unlock to free ONLY locks obeying POSIX
1635 	 * semantics, and not to free BSD-style file locks.
1636 	 * If the descriptor was in a message, POSIX-style locks
1637 	 * aren't passed with the descriptor.
1638 	 */
1639 	if (td != NULL && (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0 &&
1640 	    fp->f_type == DTYPE_VNODE) {
1641 		lf.l_whence = SEEK_SET;
1642 		lf.l_start = 0;
1643 		lf.l_len = 0;
1644 		lf.l_type = F_UNLCK;
1645 		vp = fp->f_data;
1646 		(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
1647 				   F_UNLCK, &lf, F_POSIX);
1648 	}
1649 	return (fdrop(fp, td));
1650 }
1651 
1652 /*
1653  * Drop reference on struct file passed in, may call closef if the
1654  * reference hits zero.
1655  */
int
fdrop(fp, td)
	struct file *fp;
	struct thread *td;
{

	/* Take the per-file lock; fdrop_locked() consumes (unlocks) it. */
	FILE_LOCK(fp);
	return (fdrop_locked(fp, td));
}
1665 
1666 /*
1667  * Extract the file pointer associated with the specified descriptor for
1668  * the current user process.
1669  *
1670  * If the descriptor doesn't exist, EBADF is returned.
1671  *
1672  * If the descriptor exists but doesn't match 'flags' then
1673  * return EBADF for read attempts and EINVAL for write attempts.
1674  *
1675  * If 'hold' is set (non-zero) the file's refcount will be bumped on return.
1676  * It should be droped with fdrop().
1677  * If it is not set, then the refcount will not be bumped however the
1678  * thread's filedesc struct will be returned locked (for fgetsock).
1679  *
1680  * If an error occured the non-zero error is returned and *fpp is set to NULL.
1681  * Otherwise *fpp is set and zero is returned.
1682  */
static __inline int
_fget(struct thread *td, int fd, struct file **fpp, int flags, int hold)
{
	struct filedesc *fdp;
	struct file *fp;

	*fpp = NULL;
	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
		return (EBADF);
	FILEDESC_LOCK(fdp);
	/* badfileops means the slot is allocated but not yet usable. */
	if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) {
		FILEDESC_UNLOCK(fdp);
		return (EBADF);
	}

	/*
	 * Note: FREAD failures returns EBADF to maintain backwards
	 * compatibility with what routines returned before.
	 *
	 * Only one flag, or 0, may be specified.
	 */
	if (flags == FREAD && (fp->f_flag & FREAD) == 0) {
		FILEDESC_UNLOCK(fdp);
		return (EBADF);
	}
	if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) {
		FILEDESC_UNLOCK(fdp);
		return (EINVAL);
	}
	/*
	 * Success: with hold != 0 bump f_count and unlock; with hold == 0
	 * leave the filedesc locked for the caller to release.
	 */
	if (hold) {
		fhold(fp);
		FILEDESC_UNLOCK(fdp);
	}
	*fpp = fp;
	return (0);
}
1719 
int
fget(struct thread *td, int fd, struct file **fpp)
{

	/* No access-mode check; *fpp is returned with a held reference. */
	return(_fget(td, fd, fpp, 0, 1));
}
1726 
int
fget_read(struct thread *td, int fd, struct file **fpp)
{

	/* Require FREAD access; *fpp is returned with a held reference. */
	return(_fget(td, fd, fpp, FREAD, 1));
}
1733 
int
fget_write(struct thread *td, int fd, struct file **fpp)
{

	/* Require FWRITE access; *fpp is returned with a held reference. */
	return(_fget(td, fd, fpp, FWRITE, 1));
}
1740 
1741 /*
1742  * Like fget() but loads the underlying vnode, or returns an error if
1743  * the descriptor does not represent a vnode.  Note that pipes use vnodes
1744  * but never have VM objects (so VOP_GETVOBJECT() calls will return an
1745  * error).  The returned vnode will be vref()d.
1746  */
1747 static __inline int
1748 _fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags)
1749 {
1750 	struct file *fp;
1751 	int error;
1752 
1753 	*vpp = NULL;
1754 	if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
1755 		return (error);
1756 	if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) {
1757 		error = EINVAL;
1758 	} else {
1759 		*vpp = fp->f_data;
1760 		vref(*vpp);
1761 	}
1762 	FILEDESC_UNLOCK(td->td_proc->p_fd);
1763 	return (error);
1764 }
1765 
int
fgetvp(struct thread *td, int fd, struct vnode **vpp)
{

	/* Any access mode; *vpp is returned referenced. */
	return (_fgetvp(td, fd, vpp, 0));
}
1772 
int
fgetvp_read(struct thread *td, int fd, struct vnode **vpp)
{

	/* Read-access variant of fgetvp(). */
	return (_fgetvp(td, fd, vpp, FREAD));
}
1779 
int
fgetvp_write(struct thread *td, int fd, struct vnode **vpp)
{

	/* Write-access variant of fgetvp(). */
	return (_fgetvp(td, fd, vpp, FWRITE));
}
1786 
1787 /*
1788  * Like fget() but loads the underlying socket, or returns an error if
1789  * the descriptor does not represent a socket.
1790  *
1791  * We bump the ref count on the returned socket.  XXX Also obtain the SX
1792  * lock in the future.
1793  */
int
fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp)
{
	struct file *fp;
	int error;

	*spp = NULL;
	if (fflagp != NULL)
		*fflagp = 0;
	/* hold == 0: _fget() returns with the filedesc locked. */
	if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
		return (error);
	if (fp->f_type != DTYPE_SOCKET) {
		error = ENOTSOCK;
	} else {
		*spp = fp->f_data;
		if (fflagp)
			*fflagp = fp->f_flag;
		soref(*spp);	/* reference released by fputsock() */
	}
	/* Drop the lock that _fget() left held. */
	FILEDESC_UNLOCK(td->td_proc->p_fd);
	return (error);
}
1816 
1817 /*
1818  * Drop the reference count on the the socket and XXX release the SX lock in
1819  * the future.  The last reference closes the socket.
1820  */
void
fputsock(struct socket *so)
{

	/* Drop the socket reference taken by fgetsock(). */
	sorele(so);
}
1827 
1828 /*
1829  * Drop reference on struct file passed in, may call closef if the
1830  * reference hits zero.
1831  * Expects struct file locked, and will unlock it.
1832  */
int
fdrop_locked(fp, td)
	struct file *fp;
	struct thread *td;
{
	struct flock lf;
	struct vnode *vp;
	int error;

	FILE_LOCK_ASSERT(fp, MA_OWNED);

	/* Fast path: references remain, nothing else to do. */
	if (--fp->f_count > 0) {
		FILE_UNLOCK(fp);
		return (0);
	}
	/* Last reference: tear the file down under Giant. */
	mtx_lock(&Giant);
	if (fp->f_count < 0)
		panic("fdrop: count < 0");
	/* Release any flock()-style lock this file still holds. */
	if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		vp = fp->f_data;
		FILE_UNLOCK(fp);
		(void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
	} else
		FILE_UNLOCK(fp);
	/* A badfileops file was never fully opened; nothing to close. */
	if (fp->f_ops != &badfileops)
		error = fo_close(fp, td);
	else
		error = 0;
	ffree(fp);
	mtx_unlock(&Giant);
	return (error);
}
1869 
1870 /*
1871  * Apply an advisory lock on a file descriptor.
1872  *
1873  * Just attempt to get a record lock of the requested type on
1874  * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
1875  */
#ifndef _SYS_SYSPROTO_H_
/* Argument structure for flock(2) when sysproto.h does not supply it. */
struct flock_args {
	int	fd;	/* descriptor to operate on */
	int	how;	/* LOCK_SH/LOCK_EX/LOCK_UN, optionally | LOCK_NB */
};
#endif
1882 /*
1883  * MPSAFE
1884  */
1885 /* ARGSUSED */
int
flock(td, uap)
	struct thread *td;
	struct flock_args *uap;
{
	struct file *fp;
	struct vnode *vp;
	struct flock lf;
	int error;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	/* Advisory locks apply only to vnode-backed descriptors. */
	if (fp->f_type != DTYPE_VNODE) {
		fdrop(fp, td);
		return (EOPNOTSUPP);
	}

	mtx_lock(&Giant);
	vp = fp->f_data;
	/* flock() locks always cover the whole file. */
	lf.l_whence = SEEK_SET;
	lf.l_start = 0;
	lf.l_len = 0;
	if (uap->how & LOCK_UN) {
		lf.l_type = F_UNLCK;
		FILE_LOCK(fp);
		fp->f_flag &= ~FHASLOCK;
		FILE_UNLOCK(fp);
		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
		goto done2;
	}
	if (uap->how & LOCK_EX)
		lf.l_type = F_WRLCK;
	else if (uap->how & LOCK_SH)
		lf.l_type = F_RDLCK;
	else {
		/* Neither shared, exclusive nor unlock was requested. */
		error = EBADF;
		goto done2;
	}
	/* Mark the file so the lock is released on last close. */
	FILE_LOCK(fp);
	fp->f_flag |= FHASLOCK;
	FILE_UNLOCK(fp);
	/* LOCK_NB requests a non-blocking attempt. */
	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
done2:
	fdrop(fp, td);
	mtx_unlock(&Giant);
	return (error);
}
1934 
1935 /*
1936  * File Descriptor pseudo-device driver (/dev/fd/).
1937  *
1938  * Opening minor device N dup()s the file (if any) connected to file
1939  * descriptor N belonging to the calling process.  Note that this driver
1940  * consists of only the ``open()'' routine, because all subsequent
1941  * references to this file will be direct to the other driver.
1942  */
1943 /* ARGSUSED */
static int
fdopen(dev, mode, type, td)
	dev_t dev;
	int mode, type;
	struct thread *td;
{

	/*
	 * Open handler for /dev/fd/N: arrange for the descriptor N to be
	 * duplicated instead of opening the device itself.
	 *
	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
	 * the file descriptor being sought for duplication. The error
	 * return ensures that the vnode for this device will be released
	 * by vn_open. Open will detect this special error and take the
	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
	 * will simply report the error.
	 */
	td->td_dupfd = dev2unit(dev);	/* minor number == fd to dup */
	return (ENODEV);
}
1962 
1963 /*
1964  * Duplicate the specified descriptor to a free descriptor.
1965  */
int
dupfdopen(td, fdp, indx, dfd, mode, error)
	struct thread *td;
	struct filedesc *fdp;
	int indx, dfd;
	int mode;
	int error;
{
	struct file *wfp;
	struct file *fp;

	/*
	 * If the to-be-dup'd fd number is greater than the allowed number
	 * of file descriptors, or the fd to be dup'd has already been
	 * closed, then reject.
	 */
	FILEDESC_LOCK(fdp);
	if (dfd < 0 || dfd >= fdp->fd_nfiles ||
	    (wfp = fdp->fd_ofiles[dfd]) == NULL) {
		FILEDESC_UNLOCK(fdp);
		return (EBADF);
	}

	/*
	 * There are two cases of interest here.
	 *
	 * For ENODEV simply dup (dfd) to file descriptor
	 * (indx) and return.
	 *
	 * For ENXIO steal away the file structure from (dfd) and
	 * store it in (indx).  (dfd) is effectively closed by
	 * this operation.
	 *
	 * Any other error code is just returned.
	 */
	switch (error) {
	case ENODEV:
		/*
		 * Check that the mode the file is being opened for is a
		 * subset of the mode of the existing descriptor.
		 */
		FILE_LOCK(wfp);
		if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
			FILE_UNLOCK(wfp);
			FILEDESC_UNLOCK(fdp);
			return (EACCES);
		}
		/* Remember whatever occupied indx so we can drop it below. */
		fp = fdp->fd_ofiles[indx];
#if 0
		if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED)
			(void) munmapfd(td, indx);
#endif
		fdp->fd_ofiles[indx] = wfp;
		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
		fhold_locked(wfp);	/* new table slot owns a reference */
		FILE_UNLOCK(wfp);
		if (indx > fdp->fd_lastfile)
			fdp->fd_lastfile = indx;
		/* fdrop_locked() expects the file locked on entry. */
		if (fp != NULL)
			FILE_LOCK(fp);
		FILEDESC_UNLOCK(fdp);
		/*
		 * We now own the reference to fp that the ofiles[] array
		 * used to own.  Release it.
		 */
		if (fp != NULL)
			fdrop_locked(fp, td);
		return (0);

	case ENXIO:
		/*
		 * Steal away the file pointer from dfd and stuff it into indx.
		 */
		fp = fdp->fd_ofiles[indx];
#if 0
		if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED)
			(void) munmapfd(td, indx);
#endif
		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
		fdp->fd_ofiles[dfd] = NULL;
		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
		fdp->fd_ofileflags[dfd] = 0;

		/*
		 * Complete the clean up of the filedesc structure by
		 * recomputing the various hints.
		 */
		if (indx > fdp->fd_lastfile) {
			fdp->fd_lastfile = indx;
		} else {
			/* dfd was vacated; it may have been the last fd. */
			while (fdp->fd_lastfile > 0 &&
			   fdp->fd_ofiles[fdp->fd_lastfile] == NULL) {
				fdp->fd_lastfile--;
			}
			if (dfd < fdp->fd_freefile)
				fdp->fd_freefile = dfd;
		}
		/* fdrop_locked() expects the file locked on entry. */
		if (fp != NULL)
			FILE_LOCK(fp);
		FILEDESC_UNLOCK(fdp);

		/*
		 * we now own the reference to fp that the ofiles[] array
		 * used to own.  Release it.
		 */
		if (fp != NULL)
			fdrop_locked(fp, td);
		return (0);

	default:
		FILEDESC_UNLOCK(fdp);
		return (error);
	}
	/* NOTREACHED */
}
2081 
2082 /*
2083  * Get file structures.
2084  */
static int
sysctl_kern_file(SYSCTL_HANDLER_ARGS)
{
	struct xfile xf;
	struct filedesc *fdp;
	struct file *fp;
	struct proc *p;
	int error, n;

	/* Wire the user buffer so SYSCTL_OUT cannot fault while locked. */
	sysctl_wire_old_buffer(req, 0);
	if (req->oldptr == NULL) {
		/* Size probe: report an estimate of the space required. */
		n = 16;		/* A slight overestimate. */
		sx_slock(&filelist_lock);
		LIST_FOREACH(fp, &filehead, f_list) {
			/*
			 * We should grab the lock, but this is an
			 * estimate, so does it really matter?
			 */
			/* mtx_lock(fp->f_mtxp); */
			n += fp->f_count;
			/* mtx_unlock(f->f_mtxp); */
		}
		sx_sunlock(&filelist_lock);
		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
	}
	error = 0;
	bzero(&xf, sizeof(xf));
	xf.xf_size = sizeof(xf);
	/* Walk every process, emitting one xfile record per open file. */
	sx_slock(&allproc_lock);
	LIST_FOREACH(p, &allproc, p_list) {
		PROC_LOCK(p);
		xf.xf_pid = p->p_pid;
		xf.xf_uid = p->p_ucred->cr_uid;
		PROC_UNLOCK(p);
		/* fdesc_mtx stabilizes the p_fd pointer itself. */
		mtx_lock(&fdesc_mtx);
		if ((fdp = p->p_fd) == NULL) {
			mtx_unlock(&fdesc_mtx);
			continue;
		}
		FILEDESC_LOCK(fdp);
		for (n = 0; n < fdp->fd_nfiles; ++n) {
			if ((fp = fdp->fd_ofiles[n]) == NULL)
				continue;
			xf.xf_fd = n;
			xf.xf_file = fp;
			xf.xf_data = fp->f_data;
			xf.xf_type = fp->f_type;
			xf.xf_count = fp->f_count;
			xf.xf_msgcount = fp->f_msgcount;
			xf.xf_offset = fp->f_offset;
			xf.xf_flag = fp->f_flag;
			error = SYSCTL_OUT(req, &xf, sizeof(xf));
			if (error)
				break;
		}
		FILEDESC_UNLOCK(fdp);
		mtx_unlock(&fdesc_mtx);
		if (error)
			break;
	}
	sx_sunlock(&allproc_lock);
	return (error);
}
2148 
/* Export the entire file table as struct xfile records. */
SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
    0, 0, sysctl_kern_file, "S,xfile", "Entire file table");

/* Tunable per-process and system-wide open-file limits. */
SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
    &maxfilesperproc, 0, "Maximum files allowed open per process");

SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
    &maxfiles, 0, "Maximum number of files");

/* Read-only view of the current open-file count (see ffree()). */
SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
    &nfiles, 0, "System-wide number of open files");
2160 
2161 static void
2162 fildesc_drvinit(void *unused)
2163 {
2164 	dev_t dev;
2165 
2166 	dev = make_dev(&fildesc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "fd/0");
2167 	make_dev_alias(dev, "stdin");
2168 	dev = make_dev(&fildesc_cdevsw, 1, UID_ROOT, GID_WHEEL, 0666, "fd/1");
2169 	make_dev_alias(dev, "stdout");
2170 	dev = make_dev(&fildesc_cdevsw, 2, UID_ROOT, GID_WHEEL, 0666, "fd/2");
2171 	make_dev_alias(dev, "stderr");
2172 }
2173 
/*
 * Fallback fileops handlers for descriptors that are allocated but do
 * not (or no longer) have a real backing object; see badfileops below.
 */
static fo_rdwr_t	badfo_readwrite;
static fo_ioctl_t	badfo_ioctl;
static fo_poll_t	badfo_poll;
static fo_kqfilter_t	badfo_kqfilter;
static fo_stat_t	badfo_stat;
static fo_close_t	badfo_close;
2180 
/*
 * Operations table installed by falloc() until a real one is set up;
 * each entry fails or no-ops.
 */
struct fileops badfileops = {
	badfo_readwrite,	/* read */
	badfo_readwrite,	/* write */
	badfo_ioctl,
	badfo_poll,
	badfo_kqfilter,
	badfo_stat,
	badfo_close,
	0			/* flags */
};
2191 
2192 static int
2193 badfo_readwrite(fp, uio, active_cred, flags, td)
2194 	struct file *fp;
2195 	struct uio *uio;
2196 	struct ucred *active_cred;
2197 	struct thread *td;
2198 	int flags;
2199 {
2200 
2201 	return (EBADF);
2202 }
2203 
2204 static int
2205 badfo_ioctl(fp, com, data, active_cred, td)
2206 	struct file *fp;
2207 	u_long com;
2208 	void *data;
2209 	struct ucred *active_cred;
2210 	struct thread *td;
2211 {
2212 
2213 	return (EBADF);
2214 }
2215 
/* fo_poll handler for a file with no backing object: no events ready. */
static int
badfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{

	return (0);
}
2226 
/* fo_kqfilter handler for a file with no backing object. */
static int
badfo_kqfilter(struct file *fp, struct knote *kn)
{

	return (0);
}
2235 
2236 static int
2237 badfo_stat(fp, sb, active_cred, td)
2238 	struct file *fp;
2239 	struct stat *sb;
2240 	struct ucred *active_cred;
2241 	struct thread *td;
2242 {
2243 
2244 	return (EBADF);
2245 }
2246 
2247 static int
2248 badfo_close(fp, td)
2249 	struct file *fp;
2250 	struct thread *td;
2251 {
2252 
2253 	return (EBADF);
2254 }
2255 
/* Create the /dev/fd/* nodes once device drivers come up. */
SYSINIT(fildescdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,
					fildesc_drvinit,NULL)

static void filelistinit(void *);
/* Initialize file allocation state early, during lock setup. */
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL)
2261 
2262 /* ARGSUSED*/
2263 static void
2264 filelistinit(dummy)
2265 	void *dummy;
2266 {
2267 
2268 	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
2269 	    NULL, NULL, UMA_ALIGN_PTR, 0);
2270 	sx_init(&filelist_lock, "filelist lock");
2271 	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
2272 }
2273