xref: /freebsd/sys/kern/kern_descrip.c (revision 5d10e1f7dfbe41e77a7bccca3740086b848df587)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
39  * $FreeBSD$
40  */
41 
42 #include "opt_compat.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/syscallsubr.h>
47 #include <sys/sysproto.h>
48 #include <sys/conf.h>
49 #include <sys/filedesc.h>
50 #include <sys/lock.h>
51 #include <sys/kernel.h>
52 #include <sys/malloc.h>
53 #include <sys/mutex.h>
54 #include <sys/sysctl.h>
55 #include <sys/vnode.h>
56 #include <sys/mount.h>
57 #include <sys/proc.h>
58 #include <sys/namei.h>
59 #include <sys/file.h>
60 #include <sys/stat.h>
61 #include <sys/filio.h>
62 #include <sys/fcntl.h>
63 #include <sys/unistd.h>
64 #include <sys/resourcevar.h>
65 #include <sys/event.h>
66 #include <sys/sx.h>
67 #include <sys/socketvar.h>
68 #include <sys/signalvar.h>
69 
70 #include <machine/limits.h>
71 
72 #include <vm/vm.h>
73 #include <vm/vm_extern.h>
74 #include <vm/uma.h>
75 
/*
 * Malloc types used in this file.  M_FILEDESC tags descriptor-table
 * storage (the ofile arrays grown by fdalloc() and the filedesc0
 * structures built by fdinit()/fdcopy()).  M_SIGIO tags the struct sigio
 * records allocated by fsetown() and released by funsetown() and
 * funsetownlst().
 */
76 static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");
77 static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
78 
/* UMA zone that falloc() allocates struct file from and ffree() returns it to. */
79 uma_zone_t file_zone;
80 
/* Open handler for the "FD" device; its definition is not in the visible part of the file. */
81 static	 d_open_t  fdopen;
/* NOTE(review): NUMFDESC is not referenced in the visible part of this file. */
82 #define	NUMFDESC 64
83 
/* Major device number placed in the maj slot of fildesc_cdevsw below. */
84 #define	CDEV_MAJOR 22
/*
 * Character-device switch for the "FD" device.  Only the open slot names a
 * handler from this file (fdopen); every other slot is filled with the
 * corresponding no* entry and the flags word is 0.  The initializer is
 * positional, so the order of the entries must not be changed.
 */
85 static struct cdevsw fildesc_cdevsw = {
86 	/* open */	fdopen,
87 	/* close */	noclose,
88 	/* read */	noread,
89 	/* write */	nowrite,
90 	/* ioctl */	noioctl,
91 	/* poll */	nopoll,
92 	/* mmap */	nommap,
93 	/* strategy */	nostrategy,
94 	/* name */	"FD",
95 	/* maj */	CDEV_MAJOR,
96 	/* dump */	nodump,
97 	/* psize */	nopsize,
98 	/* flags */	0,
99 };
100 
101 /* How to treat 'new' parameter when allocating a fd for do_dup(). */
/*
 * DUP_VARIABLE: 'new' is only a lower bound; do_dup() lets fdalloc() pick
 * the slot (used by dup() and fcntl(F_DUPFD)).
 * DUP_FIXED: the duplicate is installed at exactly 'new' (used by dup2()).
 */
102 enum dup_type { DUP_VARIABLE, DUP_FIXED };
103 
/* Shared implementation behind dup(), dup2() and fcntl(F_DUPFD). */
104 static int do_dup(struct thread *td, enum dup_type type, int old, int new,
105     register_t *retval);
/*
 * Forward declarations of the badfo_* file operations.
 * NOTE(review): presumably these are the methods behind badfileops, which
 * falloc() installs on a fresh file; their definitions are not visible
 * here -- confirm.
 */
106 static int badfo_readwrite(struct file *fp, struct uio *uio,
107     struct ucred *active_cred, int flags, struct thread *td);
108 static int badfo_ioctl(struct file *fp, u_long com, void *data,
109     struct ucred *active_cred, struct thread *td);
110 static int badfo_poll(struct file *fp, int events,
111     struct ucred *active_cred, struct thread *td);
112 static int badfo_kqfilter(struct file *fp, struct knote *kn);
113 static int badfo_stat(struct file *fp, struct stat *sb,
114     struct ucred *active_cred, struct thread *td);
115 static int badfo_close(struct file *fp, struct thread *td);
116 
117 /*
118  * Descriptor management.
 *
 * filehead and nfiles are updated together by falloc() and ffree() while
 * filelist_lock is held exclusively.
119  */
120 struct filelist filehead;	/* head of list of open files */
121 int nfiles;			/* actual number of open files */
122 extern int cmask;		/* copied into fd_cmask by fdinit() */
123 struct sx filelist_lock;	/* sx to protect filelist */
124 struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
125 
126 /*
127  * System calls on descriptors.
128  */
129 #ifndef _SYS_SYSPROTO_H_
130 struct getdtablesize_args {
131 	int	dummy;
132 };
133 #endif
134 /*
135  * MPSAFE
136  */
137 /* ARGSUSED */
138 int
139 getdtablesize(td, uap)
140 	struct thread *td;
141 	struct getdtablesize_args *uap;
142 {
143 	struct proc *p = td->td_proc;
144 
145 	mtx_lock(&Giant);
146 	td->td_retval[0] =
147 	    min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
148 	mtx_unlock(&Giant);
149 	return (0);
150 }
151 
152 /*
153  * Duplicate a file descriptor to a particular value.
154  *
155  * note: keep in mind that a potential race condition exists when closing
156  * descriptors from a shared descriptor table (via rfork).
157  */
158 #ifndef _SYS_SYSPROTO_H_
159 struct dup2_args {
160 	u_int	from;
161 	u_int	to;
162 };
163 #endif
164 /*
165  * MPSAFE
166  */
167 /* ARGSUSED */
168 int
169 dup2(td, uap)
170 	struct thread *td;
171 	struct dup2_args *uap;
172 {
173 
174 	return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to,
175 		    td->td_retval));
176 }
177 
178 /*
179  * Duplicate a file descriptor.
180  */
181 #ifndef _SYS_SYSPROTO_H_
182 struct dup_args {
183 	u_int	fd;
184 };
185 #endif
186 /*
187  * MPSAFE
188  */
189 /* ARGSUSED */
190 int
191 dup(td, uap)
192 	struct thread *td;
193 	struct dup_args *uap;
194 {
195 
196 	return (do_dup(td, DUP_VARIABLE, (int)uap->fd, 0, td->td_retval));
197 }
198 
199 /*
200  * The file control system call.
201  */
202 #ifndef _SYS_SYSPROTO_H_
203 struct fcntl_args {
204 	int	fd;
205 	int	cmd;
206 	long	arg;
207 };
208 #endif
209 /*
210  * MPSAFE
211  */
212 /* ARGSUSED */
213 int
214 fcntl(td, uap)
215 	struct thread *td;
216 	register struct fcntl_args *uap;
217 {
218 	struct flock fl;
219 	intptr_t arg;
220 	int error;
221 
222 	error = 0;
223 	switch (uap->cmd) {
224 	case F_GETLK:
225 	case F_SETLK:
226 	case F_SETLKW:
227 		error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
228 		arg = (intptr_t)&fl;
229 		break;
230 	default:
231 		arg = uap->arg;
232 		break;
233 	}
234 	if (error)
235 		return (error);
236 	error = kern_fcntl(td, uap->fd, uap->cmd, arg);
237 	if (error)
238 		return (error);
239 	if (uap->cmd == F_GETLK)
240 		error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
241 	return (error);
242 }
243 
244 int
245 kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
246 {
247 	register struct filedesc *fdp;
248 	struct flock *flp;
249 	register struct file *fp;
250 	struct proc *p;
251 	register char *pop;
252 	struct vnode *vp;
253 	u_int newmin;
254 	int error, flg, tmp;
255 
256 	error = 0;
257 	flg = F_POSIX;
258 	p = td->td_proc;
259 	fdp = p->p_fd;
260 	mtx_lock(&Giant);
261 	FILEDESC_LOCK(fdp);
262 	if ((unsigned)fd >= fdp->fd_nfiles ||
263 	    (fp = fdp->fd_ofiles[fd]) == NULL) {
264 		FILEDESC_UNLOCK(fdp);
265 		error = EBADF;
266 		goto done2;
267 	}
268 	pop = &fdp->fd_ofileflags[fd];
269 
270 	switch (cmd) {
271 	case F_DUPFD:
272 		FILEDESC_UNLOCK(fdp);
273 		newmin = arg;
274 		if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
275 		    newmin >= maxfilesperproc) {
276 			error = EINVAL;
277 			break;
278 		}
279 		error = do_dup(td, DUP_VARIABLE, fd, newmin, td->td_retval);
280 		break;
281 
282 	case F_GETFD:
283 		td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
284 		FILEDESC_UNLOCK(fdp);
285 		break;
286 
287 	case F_SETFD:
288 		*pop = (*pop &~ UF_EXCLOSE) |
289 		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
290 		FILEDESC_UNLOCK(fdp);
291 		break;
292 
293 	case F_GETFL:
294 		FILE_LOCK(fp);
295 		FILEDESC_UNLOCK(fdp);
296 		td->td_retval[0] = OFLAGS(fp->f_flag);
297 		FILE_UNLOCK(fp);
298 		break;
299 
300 	case F_SETFL:
301 		fhold(fp);
302 		FILEDESC_UNLOCK(fdp);
303 		fp->f_flag &= ~FCNTLFLAGS;
304 		fp->f_flag |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
305 		tmp = fp->f_flag & FNONBLOCK;
306 		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
307 		if (error) {
308 			fdrop(fp, td);
309 			break;
310 		}
311 		tmp = fp->f_flag & FASYNC;
312 		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
313 		if (error == 0) {
314 			fdrop(fp, td);
315 			break;
316 		}
317 		fp->f_flag &= ~FNONBLOCK;
318 		tmp = 0;
319 		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
320 		fdrop(fp, td);
321 		break;
322 
323 	case F_GETOWN:
324 		fhold(fp);
325 		FILEDESC_UNLOCK(fdp);
326 		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
327 		if (error == 0)
328 			td->td_retval[0] = tmp;
329 		fdrop(fp, td);
330 		break;
331 
332 	case F_SETOWN:
333 		fhold(fp);
334 		FILEDESC_UNLOCK(fdp);
335 		tmp = arg;
336 		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
337 		fdrop(fp, td);
338 		break;
339 
340 	case F_SETLKW:
341 		flg |= F_WAIT;
342 		/* FALLTHROUGH F_SETLK */
343 
344 	case F_SETLK:
345 		if (fp->f_type != DTYPE_VNODE) {
346 			FILEDESC_UNLOCK(fdp);
347 			error = EBADF;
348 			break;
349 		}
350 
351 		flp = (struct flock *)arg;
352 		if (flp->l_whence == SEEK_CUR) {
353 			if (fp->f_offset < 0 ||
354 			    (flp->l_start > 0 &&
355 			     fp->f_offset > OFF_MAX - flp->l_start)) {
356 				FILEDESC_UNLOCK(fdp);
357 				error = EOVERFLOW;
358 				break;
359 			}
360 			flp->l_start += fp->f_offset;
361 		}
362 
363 		/*
364 		 * VOP_ADVLOCK() may block.
365 		 */
366 		fhold(fp);
367 		FILEDESC_UNLOCK(fdp);
368 		vp = (struct vnode *)fp->f_data;
369 
370 		switch (flp->l_type) {
371 		case F_RDLCK:
372 			if ((fp->f_flag & FREAD) == 0) {
373 				error = EBADF;
374 				break;
375 			}
376 			PROC_LOCK(p);
377 			p->p_flag |= P_ADVLOCK;
378 			PROC_UNLOCK(p);
379 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
380 			    flp, flg);
381 			break;
382 		case F_WRLCK:
383 			if ((fp->f_flag & FWRITE) == 0) {
384 				error = EBADF;
385 				break;
386 			}
387 			PROC_LOCK(p);
388 			p->p_flag |= P_ADVLOCK;
389 			PROC_UNLOCK(p);
390 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
391 			    flp, flg);
392 			break;
393 		case F_UNLCK:
394 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
395 			    flp, F_POSIX);
396 			break;
397 		default:
398 			error = EINVAL;
399 			break;
400 		}
401 		fdrop(fp, td);
402 		break;
403 
404 	case F_GETLK:
405 		if (fp->f_type != DTYPE_VNODE) {
406 			FILEDESC_UNLOCK(fdp);
407 			error = EBADF;
408 			break;
409 		}
410 		flp = (struct flock *)arg;
411 		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
412 		    flp->l_type != F_UNLCK) {
413 			FILEDESC_UNLOCK(fdp);
414 			error = EINVAL;
415 			break;
416 		}
417 		if (flp->l_whence == SEEK_CUR) {
418 			if ((flp->l_start > 0 &&
419 			    fp->f_offset > OFF_MAX - flp->l_start) ||
420 			    (flp->l_start < 0 &&
421 			     fp->f_offset < OFF_MIN - flp->l_start)) {
422 				FILEDESC_UNLOCK(fdp);
423 				error = EOVERFLOW;
424 				break;
425 			}
426 			flp->l_start += fp->f_offset;
427 		}
428 		/*
429 		 * VOP_ADVLOCK() may block.
430 		 */
431 		fhold(fp);
432 		FILEDESC_UNLOCK(fdp);
433 		vp = (struct vnode *)fp->f_data;
434 		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
435 		    F_POSIX);
436 		fdrop(fp, td);
437 		break;
438 	default:
439 		FILEDESC_UNLOCK(fdp);
440 		error = EINVAL;
441 		break;
442 	}
443 done2:
444 	mtx_unlock(&Giant);
445 	return (error);
446 }
447 
448 /*
449  * Common code for dup, dup2, and fcntl(F_DUPFD).
450  * filedesc must be locked, but will be unlocked as a side effect.
451  */
452 static int
453 do_dup(td, type, old, new, retval)
454 	enum dup_type type;
455 	int old, new;
456 	register_t *retval;
457 	struct thread *td;
458 {
459 	register struct filedesc *fdp;
460 	struct proc *p;
461 	struct file *fp;
462 	struct file *delfp;
463 	int error, newfd;
464 
465 	p = td->td_proc;
466 	fdp = p->p_fd;
467 
468 	/*
469 	 * Verify we have a valid descriptor to dup from and possibly to
470 	 * dup to.
471 	 */
472 	FILEDESC_LOCK(fdp);
473 	if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL ||
474 	    new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
475 	    new >= maxfilesperproc) {
476 		FILEDESC_UNLOCK(fdp);
477 		return (EBADF);
478 	}
479 	if (type == DUP_FIXED && old == new) {
480 		*retval = new;
481 		FILEDESC_UNLOCK(fdp);
482 		return (0);
483 	}
484 	fp = fdp->fd_ofiles[old];
485 	fhold(fp);
486 
487 	/*
488 	 * Expand the table for the new descriptor if needed.  This may
489 	 * block and drop and reacquire the filedesc lock.
490 	 */
491 	if (type == DUP_VARIABLE || new >= fdp->fd_nfiles) {
492 		error = fdalloc(td, new, &newfd);
493 		if (error) {
494 			FILEDESC_UNLOCK(fdp);
495 			return (error);
496 		}
497 	}
498 	if (type == DUP_VARIABLE)
499 		new = newfd;
500 
501 	/*
502 	 * If the old file changed out from under us then treat it as a
503 	 * bad file descriptor.  Userland should do its own locking to
504 	 * avoid this case.
505 	 */
506 	if (fdp->fd_ofiles[old] != fp) {
507 		if (fdp->fd_ofiles[new] == NULL) {
508 			if (new < fdp->fd_freefile)
509 				fdp->fd_freefile = new;
510 			while (fdp->fd_lastfile > 0 &&
511 			    fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
512 				fdp->fd_lastfile--;
513 		}
514 		FILEDESC_UNLOCK(fdp);
515 		fdrop(fp, td);
516 		return (EBADF);
517 	}
518 	KASSERT(old != new, ("new fd is same as old"));
519 
520 	/*
521 	 * Save info on the descriptor being overwritten.  We have
522 	 * to do the unmap now, but we cannot close it without
523 	 * introducing an ownership race for the slot.
524 	 */
525 	delfp = fdp->fd_ofiles[new];
526 	KASSERT(delfp == NULL || type == DUP_FIXED,
527 	    ("dup() picked an open file"));
528 #if 0
529 	if (delfp && (fdp->fd_ofileflags[new] & UF_MAPPED))
530 		(void) munmapfd(td, new);
531 #endif
532 
533 	/*
534 	 * Duplicate the source descriptor, update lastfile
535 	 */
536 	fdp->fd_ofiles[new] = fp;
537  	fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
538 	if (new > fdp->fd_lastfile)
539 		fdp->fd_lastfile = new;
540 	FILEDESC_UNLOCK(fdp);
541 	*retval = new;
542 
543 	/*
544 	 * If we dup'd over a valid file, we now own the reference to it
545 	 * and must dispose of it using closef() semantics (as if a
546 	 * close() were performed on it).
547 	 */
548 	if (delfp) {
549 		mtx_lock(&Giant);
550 		(void) closef(delfp, td);
551 		mtx_unlock(&Giant);
552 	}
553 	return (0);
554 }
555 
556 /*
557  * If sigio is on the list associated with a process or process group,
558  * disable signalling from the device, remove sigio from the list and
559  * free sigio.
560  */
561 void
562 funsetown(sigiop)
563 	struct sigio **sigiop;
564 {
565 	struct sigio *sigio;
566 
567 	SIGIO_LOCK();
568 	sigio = *sigiop;
569 	if (sigio == NULL) {
570 		SIGIO_UNLOCK();
571 		return;
572 	}
573 	*(sigio->sio_myref) = NULL;
574 	if ((sigio)->sio_pgid < 0) {
575 		struct pgrp *pg = (sigio)->sio_pgrp;
576 		PGRP_LOCK(pg);
577 		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
578 			     sigio, sio_pgsigio);
579 		PGRP_UNLOCK(pg);
580 	} else {
581 		struct proc *p = (sigio)->sio_proc;
582 		PROC_LOCK(p);
583 		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
584 			     sigio, sio_pgsigio);
585 		PROC_UNLOCK(p);
586 	}
587 	SIGIO_UNLOCK();
588 	crfree(sigio->sio_ucred);
589 	FREE(sigio, M_SIGIO);
590 }
591 
592 /*
593  * Free a list of sigio structures.
594  * We only need to lock the SIGIO_LOCK because we have made ourselves
595  * inaccessible to callers of fsetown and therefore do not need to lock
596  * the proc or pgrp struct for the list manipulation.
597  */
598 void
599 funsetownlst(sigiolst)
600 	struct sigiolst *sigiolst;
601 {
602 	struct proc *p;
603 	struct pgrp *pg;
604 	struct sigio *sigio;
605 
606 	sigio = SLIST_FIRST(sigiolst);
607 	if (sigio == NULL)
608 		return;
609 	p = NULL;
610 	pg = NULL;
611 
612 	/*
613 	 * Every entry of the list should belong
614 	 * to a single proc or pgrp.
615 	 */
616 	if (sigio->sio_pgid < 0) {
617 		pg = sigio->sio_pgrp;
618 		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
619 	} else /* if (sigio->sio_pgid > 0) */ {
620 		p = sigio->sio_proc;
621 		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
622 	}
623 
624 	SIGIO_LOCK();
625 	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
626 		*(sigio->sio_myref) = NULL;
627 		if (pg != NULL) {
628 			KASSERT(sigio->sio_pgid < 0,
629 			    ("Proc sigio in pgrp sigio list"));
630 			KASSERT(sigio->sio_pgrp == pg,
631 			    ("Bogus pgrp in sigio list"));
632 			PGRP_LOCK(pg);
633 			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
634 			    sio_pgsigio);
635 			PGRP_UNLOCK(pg);
636 		} else /* if (p != NULL) */ {
637 			KASSERT(sigio->sio_pgid > 0,
638 			    ("Pgrp sigio in proc sigio list"));
639 			KASSERT(sigio->sio_proc == p,
640 			    ("Bogus proc in sigio list"));
641 			PROC_LOCK(p);
642 			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
643 			    sio_pgsigio);
644 			PROC_UNLOCK(p);
645 		}
646 		SIGIO_UNLOCK();
647 		crfree(sigio->sio_ucred);
648 		FREE(sigio, M_SIGIO);
649 		SIGIO_LOCK();
650 	}
651 	SIGIO_UNLOCK();
652 }
653 
654 /*
655  * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
656  *
657  * After permission checking, add a sigio structure to the sigio list for
658  * the process or process group.
659  */
660 int
661 fsetown(pgid, sigiop)
662 	pid_t pgid;
663 	struct sigio **sigiop;
664 {
665 	struct proc *proc;
666 	struct pgrp *pgrp;
667 	struct sigio *sigio;
668 	int ret;
669 
670 	if (pgid == 0) {
671 		funsetown(sigiop);
672 		return (0);
673 	}
674 
675 	ret = 0;
676 
677 	/* Allocate and fill in the new sigio out of locks. */
678 	MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, M_WAITOK);
679 	sigio->sio_pgid = pgid;
680 	sigio->sio_ucred = crhold(curthread->td_ucred);
681 	sigio->sio_myref = sigiop;
682 
683 	sx_slock(&proctree_lock);
684 	if (pgid > 0) {
685 		proc = pfind(pgid);
686 		if (proc == NULL) {
687 			ret = ESRCH;
688 			goto fail;
689 		}
690 
691 		/*
692 		 * Policy - Don't allow a process to FSETOWN a process
693 		 * in another session.
694 		 *
695 		 * Remove this test to allow maximum flexibility or
696 		 * restrict FSETOWN to the current process or process
697 		 * group for maximum safety.
698 		 */
699 		PROC_UNLOCK(proc);
700 		if (proc->p_session != curthread->td_proc->p_session) {
701 			ret = EPERM;
702 			goto fail;
703 		}
704 
705 		pgrp = NULL;
706 	} else /* if (pgid < 0) */ {
707 		pgrp = pgfind(-pgid);
708 		if (pgrp == NULL) {
709 			ret = ESRCH;
710 			goto fail;
711 		}
712 		PGRP_UNLOCK(pgrp);
713 
714 		/*
715 		 * Policy - Don't allow a process to FSETOWN a process
716 		 * in another session.
717 		 *
718 		 * Remove this test to allow maximum flexibility or
719 		 * restrict FSETOWN to the current process or process
720 		 * group for maximum safety.
721 		 */
722 		if (pgrp->pg_session != curthread->td_proc->p_session) {
723 			ret = EPERM;
724 			goto fail;
725 		}
726 
727 		proc = NULL;
728 	}
729 	funsetown(sigiop);
730 	if (pgid > 0) {
731 		PROC_LOCK(proc);
732 		/*
733 		 * Since funsetownlst() is called without the proctree
734 		 * locked, we need to check for P_WEXIT.
735 		 * XXX: is ESRCH correct?
736 		 */
737 		if ((proc->p_flag & P_WEXIT) != 0) {
738 			PROC_UNLOCK(proc);
739 			ret = ESRCH;
740 			goto fail;
741 		}
742 		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
743 		sigio->sio_proc = proc;
744 		PROC_UNLOCK(proc);
745 	} else {
746 		PGRP_LOCK(pgrp);
747 		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
748 		sigio->sio_pgrp = pgrp;
749 		PGRP_UNLOCK(pgrp);
750 	}
751 	sx_sunlock(&proctree_lock);
752 	SIGIO_LOCK();
753 	*sigiop = sigio;
754 	SIGIO_UNLOCK();
755 	return (0);
756 
757 fail:
758 	sx_sunlock(&proctree_lock);
759 	crfree(sigio->sio_ucred);
760 	FREE(sigio, M_SIGIO);
761 	return (ret);
762 }
763 
764 /*
765  * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
766  */
767 pid_t
768 fgetown(sigiop)
769 	struct sigio **sigiop;
770 {
771 	pid_t pgid;
772 
773 	SIGIO_LOCK();
774 	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
775 	SIGIO_UNLOCK();
776 	return (pgid);
777 }
778 
779 /*
780  * Close a file descriptor.
781  */
782 #ifndef _SYS_SYSPROTO_H_
783 struct close_args {
784         int     fd;
785 };
786 #endif
787 /*
788  * MPSAFE
789  */
790 /* ARGSUSED */
791 int
792 close(td, uap)
793 	struct thread *td;
794 	struct close_args *uap;
795 {
796 	register struct filedesc *fdp;
797 	register struct file *fp;
798 	int fd, error;
799 
800 	fd = uap->fd;
801 	error = 0;
802 	fdp = td->td_proc->p_fd;
803 	mtx_lock(&Giant);
804 	FILEDESC_LOCK(fdp);
805 	if ((unsigned)fd >= fdp->fd_nfiles ||
806 	    (fp = fdp->fd_ofiles[fd]) == NULL) {
807 		FILEDESC_UNLOCK(fdp);
808 		error = EBADF;
809 		goto done2;
810 	}
811 #if 0
812 	if (fdp->fd_ofileflags[fd] & UF_MAPPED)
813 		(void) munmapfd(td, fd);
814 #endif
815 	fdp->fd_ofiles[fd] = NULL;
816 	fdp->fd_ofileflags[fd] = 0;
817 
818 	/*
819 	 * we now hold the fp reference that used to be owned by the descriptor
820 	 * array.
821 	 */
822 	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
823 		fdp->fd_lastfile--;
824 	if (fd < fdp->fd_freefile)
825 		fdp->fd_freefile = fd;
826 	if (fd < fdp->fd_knlistsize) {
827 		FILEDESC_UNLOCK(fdp);
828 		knote_fdclose(td, fd);
829 	} else
830 		FILEDESC_UNLOCK(fdp);
831 
832 	error = closef(fp, td);
833 done2:
834 	mtx_unlock(&Giant);
835 	return (error);
836 }
837 
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
/*
 * Return status information about a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct ofstat_args {
	int	fd;
	struct	ostat *sb;
};
#endif
/*
 * Compatibility fstat: obtain the file's status with fo_stat(), convert
 * it to the old 'struct ostat' layout with cvtstat() and copy the result
 * out to uap->sb.  Returns 0 or the first error from fget(), fo_stat() or
 * copyout().  Giant is held for the whole call.
 *
 * MPSAFE
 */
/* ARGSUSED */
int
ofstat(struct thread *td, struct ofstat_args *uap)
{
	struct file *fp;
	struct stat st;
	struct ostat ost;
	int error;

	mtx_lock(&Giant);
	error = fget(td, uap->fd, &fp);
	if (error == 0) {
		error = fo_stat(fp, &st, td->td_ucred, td);
		if (error == 0) {
			cvtstat(&st, &ost);
			error = copyout(&ost, uap->sb, sizeof(ost));
		}
		/* Release the reference obtained by fget(). */
		fdrop(fp, td);
	}
	mtx_unlock(&Giant);
	return (error);
}
#endif /* COMPAT_43 || COMPAT_SUNOS */
876 
877 /*
878  * Return status information about a file descriptor.
879  */
880 #ifndef _SYS_SYSPROTO_H_
881 struct fstat_args {
882 	int	fd;
883 	struct	stat *sb;
884 };
885 #endif
886 /*
887  * MPSAFE
888  */
889 /* ARGSUSED */
890 int
891 fstat(td, uap)
892 	struct thread *td;
893 	struct fstat_args *uap;
894 {
895 	struct file *fp;
896 	struct stat ub;
897 	int error;
898 
899 	mtx_lock(&Giant);
900 	if ((error = fget(td, uap->fd, &fp)) != 0)
901 		goto done2;
902 	error = fo_stat(fp, &ub, td->td_ucred, td);
903 	if (error == 0)
904 		error = copyout(&ub, uap->sb, sizeof(ub));
905 	fdrop(fp, td);
906 done2:
907 	mtx_unlock(&Giant);
908 	return (error);
909 }
910 
911 /*
912  * Return status information about a file descriptor.
913  */
914 #ifndef _SYS_SYSPROTO_H_
915 struct nfstat_args {
916 	int	fd;
917 	struct	nstat *sb;
918 };
919 #endif
920 /*
921  * MPSAFE
922  */
923 /* ARGSUSED */
924 int
925 nfstat(td, uap)
926 	struct thread *td;
927 	register struct nfstat_args *uap;
928 {
929 	struct file *fp;
930 	struct stat ub;
931 	struct nstat nub;
932 	int error;
933 
934 	mtx_lock(&Giant);
935 	if ((error = fget(td, uap->fd, &fp)) != 0)
936 		goto done2;
937 	error = fo_stat(fp, &ub, td->td_ucred, td);
938 	if (error == 0) {
939 		cvtnstat(&ub, &nub);
940 		error = copyout(&nub, uap->sb, sizeof(nub));
941 	}
942 	fdrop(fp, td);
943 done2:
944 	mtx_unlock(&Giant);
945 	return (error);
946 }
947 
948 /*
949  * Return pathconf information about a file descriptor.
950  */
951 #ifndef _SYS_SYSPROTO_H_
952 struct fpathconf_args {
953 	int	fd;
954 	int	name;
955 };
956 #endif
957 /*
958  * MPSAFE
959  */
960 /* ARGSUSED */
961 int
962 fpathconf(td, uap)
963 	struct thread *td;
964 	register struct fpathconf_args *uap;
965 {
966 	struct file *fp;
967 	struct vnode *vp;
968 	int error;
969 
970 	if ((error = fget(td, uap->fd, &fp)) != 0)
971 		return (error);
972 	switch (fp->f_type) {
973 	case DTYPE_PIPE:
974 	case DTYPE_SOCKET:
975 		if (uap->name != _PC_PIPE_BUF) {
976 			error = EINVAL;
977 		} else {
978 			td->td_retval[0] = PIPE_BUF;
979 			error = 0;
980 		}
981 		break;
982 	case DTYPE_FIFO:
983 	case DTYPE_VNODE:
984 		vp = (struct vnode *)fp->f_data;
985 		mtx_lock(&Giant);
986 		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
987 		mtx_unlock(&Giant);
988 		break;
989 	default:
990 		error = EOPNOTSUPP;
991 		break;
992 	}
993 	fdrop(fp, td);
994 	return (error);
995 }
996 
997 /*
998  * Allocate a file descriptor for the process.
999  */
1000 static int fdexpand;
1001 SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, "");
1002 
1003 int
1004 fdalloc(td, want, result)
1005 	struct thread *td;
1006 	int want;
1007 	int *result;
1008 {
1009 	struct proc *p = td->td_proc;
1010 	register struct filedesc *fdp = td->td_proc->p_fd;
1011 	register int i;
1012 	int lim, last, nfiles;
1013 	struct file **newofile, **oldofile;
1014 	char *newofileflags;
1015 
1016 	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
1017 
1018 	/*
1019 	 * Search for a free descriptor starting at the higher
1020 	 * of want or fd_freefile.  If that fails, consider
1021 	 * expanding the ofile array.
1022 	 */
1023 	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
1024 	for (;;) {
1025 		last = min(fdp->fd_nfiles, lim);
1026 		i = max(want, fdp->fd_freefile);
1027 		for (; i < last; i++) {
1028 			if (fdp->fd_ofiles[i] == NULL) {
1029 				fdp->fd_ofileflags[i] = 0;
1030 				if (i > fdp->fd_lastfile)
1031 					fdp->fd_lastfile = i;
1032 				if (want <= fdp->fd_freefile)
1033 					fdp->fd_freefile = i;
1034 				*result = i;
1035 				return (0);
1036 			}
1037 		}
1038 
1039 		/*
1040 		 * No space in current array.  Expand?
1041 		 */
1042 		if (i >= lim)
1043 			return (EMFILE);
1044 		if (fdp->fd_nfiles < NDEXTENT)
1045 			nfiles = NDEXTENT;
1046 		else
1047 			nfiles = 2 * fdp->fd_nfiles;
1048 		while (nfiles < want)
1049 			nfiles <<= 1;
1050 		FILEDESC_UNLOCK(fdp);
1051 		newofile = malloc(nfiles * OFILESIZE, M_FILEDESC, M_WAITOK);
1052 
1053 		/*
1054 		 * Deal with file-table extend race that might have
1055 		 * occurred while filedesc was unlocked.
1056 		 */
1057 		FILEDESC_LOCK(fdp);
1058 		if (fdp->fd_nfiles >= nfiles) {
1059 			free(newofile, M_FILEDESC);
1060 			continue;
1061 		}
1062 		newofileflags = (char *) &newofile[nfiles];
1063 		/*
1064 		 * Copy the existing ofile and ofileflags arrays
1065 		 * and zero the new portion of each array.
1066 		 */
1067 		i = fdp->fd_nfiles * sizeof(struct file *);
1068 		bcopy(fdp->fd_ofiles, newofile,	i);
1069 		bzero((char *)newofile + i,
1070 		    nfiles * sizeof(struct file *) - i);
1071 		i = fdp->fd_nfiles * sizeof(char);
1072 		bcopy(fdp->fd_ofileflags, newofileflags, i);
1073 		bzero(newofileflags + i, nfiles * sizeof(char) - i);
1074 		if (fdp->fd_nfiles > NDFILE)
1075 			oldofile = fdp->fd_ofiles;
1076 		else
1077 			oldofile = NULL;
1078 		fdp->fd_ofiles = newofile;
1079 		fdp->fd_ofileflags = newofileflags;
1080 		fdp->fd_nfiles = nfiles;
1081 		fdexpand++;
1082 		if (oldofile != NULL)
1083 			free(oldofile, M_FILEDESC);
1084 	}
1085 	return (0);
1086 }
1087 
1088 /*
1089  * Check to see whether n user file descriptors
1090  * are available to the process p.
1091  */
1092 int
1093 fdavail(td, n)
1094 	struct thread *td;
1095 	register int n;
1096 {
1097 	struct proc *p = td->td_proc;
1098 	register struct filedesc *fdp = td->td_proc->p_fd;
1099 	register struct file **fpp;
1100 	register int i, lim, last;
1101 
1102 	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
1103 
1104 	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
1105 	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
1106 		return (1);
1107 	last = min(fdp->fd_nfiles, lim);
1108 	fpp = &fdp->fd_ofiles[fdp->fd_freefile];
1109 	for (i = last - fdp->fd_freefile; --i >= 0; fpp++) {
1110 		if (*fpp == NULL && --n <= 0)
1111 			return (1);
1112 	}
1113 	return (0);
1114 }
1115 
1116 /*
1117  * Create a new open file structure and allocate
1118  * a file descriptor for the process that refers to it.
1119  */
1120 int
1121 falloc(td, resultfp, resultfd)
1122 	register struct thread *td;
1123 	struct file **resultfp;
1124 	int *resultfd;
1125 {
1126 	struct proc *p = td->td_proc;
1127 	register struct file *fp, *fq;
1128 	int error, i;
1129 
1130 	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
1131 	sx_xlock(&filelist_lock);
1132 	if (nfiles >= maxfiles) {
1133 		sx_xunlock(&filelist_lock);
1134 		uma_zfree(file_zone, fp);
1135 		tablefull("file");
1136 		return (ENFILE);
1137 	}
1138 	nfiles++;
1139 
1140 	/*
1141 	 * If the process has file descriptor zero open, add the new file
1142 	 * descriptor to the list of open files at that point, otherwise
1143 	 * put it at the front of the list of open files.
1144 	 */
1145 	fp->f_mtxp = mtx_pool_alloc();
1146 	fp->f_gcflag = 0;
1147 	fp->f_count = 1;
1148 	fp->f_cred = crhold(td->td_ucred);
1149 	fp->f_ops = &badfileops;
1150 	fp->f_seqcount = 1;
1151 	FILEDESC_LOCK(p->p_fd);
1152 	if ((fq = p->p_fd->fd_ofiles[0])) {
1153 		LIST_INSERT_AFTER(fq, fp, f_list);
1154 	} else {
1155 		LIST_INSERT_HEAD(&filehead, fp, f_list);
1156 	}
1157 	sx_xunlock(&filelist_lock);
1158 	if ((error = fdalloc(td, 0, &i))) {
1159 		FILEDESC_UNLOCK(p->p_fd);
1160 		fdrop(fp, td);
1161 		return (error);
1162 	}
1163 	p->p_fd->fd_ofiles[i] = fp;
1164 	FILEDESC_UNLOCK(p->p_fd);
1165 	if (resultfp)
1166 		*resultfp = fp;
1167 	if (resultfd)
1168 		*resultfd = i;
1169 	return (0);
1170 }
1171 
1172 /*
1173  * Free a file descriptor.
1174  */
1175 void
1176 ffree(fp)
1177 	register struct file *fp;
1178 {
1179 
1180 	KASSERT(fp->f_count == 0, ("ffree: fp_fcount not 0!"));
1181 	sx_xlock(&filelist_lock);
1182 	LIST_REMOVE(fp, f_list);
1183 	nfiles--;
1184 	sx_xunlock(&filelist_lock);
1185 	crfree(fp->f_cred);
1186 	uma_zfree(file_zone, fp);
1187 }
1188 
1189 /*
1190  * Build a new filedesc structure.
1191  */
1192 struct filedesc *
1193 fdinit(td)
1194 	struct thread *td;
1195 {
1196 	register struct filedesc0 *newfdp;
1197 	register struct filedesc *fdp = td->td_proc->p_fd;
1198 
1199 	MALLOC(newfdp, struct filedesc0 *, sizeof(struct filedesc0),
1200 	    M_FILEDESC, M_WAITOK | M_ZERO);
1201 	mtx_init(&newfdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
1202 	FILEDESC_LOCK(&newfdp->fd_fd);
1203 	newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
1204 	if (newfdp->fd_fd.fd_cdir)
1205 		VREF(newfdp->fd_fd.fd_cdir);
1206 	newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
1207 	if (newfdp->fd_fd.fd_rdir)
1208 		VREF(newfdp->fd_fd.fd_rdir);
1209 	newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
1210 	if (newfdp->fd_fd.fd_jdir)
1211 		VREF(newfdp->fd_fd.fd_jdir);
1212 
1213 	/* Create the file descriptor table. */
1214 	newfdp->fd_fd.fd_refcnt = 1;
1215 	newfdp->fd_fd.fd_cmask = cmask;
1216 	newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
1217 	newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
1218 	newfdp->fd_fd.fd_nfiles = NDFILE;
1219 	newfdp->fd_fd.fd_knlistsize = -1;
1220 	FILEDESC_UNLOCK(&newfdp->fd_fd);
1221 
1222 	return (&newfdp->fd_fd);
1223 }
1224 
1225 /*
1226  * Share a filedesc structure.
1227  */
1228 struct filedesc *
1229 fdshare(p)
1230 	struct proc *p;
1231 {
1232 	FILEDESC_LOCK(p->p_fd);
1233 	p->p_fd->fd_refcnt++;
1234 	FILEDESC_UNLOCK(p->p_fd);
1235 	return (p->p_fd);
1236 }
1237 
/*
 * Copy a filedesc structure.
 *
 * Builds a private copy of the calling thread's process descriptor
 * table: the directory vnodes are referenced, the open-file array is
 * duplicated (possibly into a smaller allocation), kqueue descriptors
 * are dropped from the copy, and every remaining file gets an extra
 * hold.
 *
 * Returns NULL if the process has no descriptor table.  Otherwise the
 * caller must enter with the source table locked (asserted below); the
 * lock is dropped around each sleeping allocation and is held again
 * when the function returns.
 */
struct filedesc *
fdcopy(td)
	struct thread *td;
{
	register struct filedesc *newfdp, *fdp;
	register struct file **fpp;
	register int i, j;

	/* Certain daemons might not have file descriptors. */
	fdp = td->td_proc->p_fd;
	if (fdp == NULL)
		return (NULL);

	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);

	/* Drop the lock across the M_WAITOK allocation. */
	FILEDESC_UNLOCK(fdp);
	MALLOC(newfdp, struct filedesc *, sizeof(struct filedesc0),
	    M_FILEDESC, M_WAITOK);
	FILEDESC_LOCK(fdp);
	bcopy(fdp, newfdp, sizeof(struct filedesc));
	FILEDESC_UNLOCK(fdp);
	/* The bcopy also copied the source mutex; give the copy its own. */
	bzero(&newfdp->fd_mtx, sizeof(newfdp->fd_mtx));
	mtx_init(&newfdp->fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
	if (newfdp->fd_cdir)
		VREF(newfdp->fd_cdir);
	if (newfdp->fd_rdir)
		VREF(newfdp->fd_rdir);
	if (newfdp->fd_jdir)
		VREF(newfdp->fd_jdir);
	newfdp->fd_refcnt = 1;

	/*
	 * If the number of open files fits in the internal arrays
	 * of the open file structure, use them, otherwise allocate
	 * additional memory for the number of descriptors currently
	 * in use.
	 */
	FILEDESC_LOCK(fdp);
	/* Re-read the sizes: they may have changed while unlocked. */
	newfdp->fd_lastfile = fdp->fd_lastfile;
	newfdp->fd_nfiles = fdp->fd_nfiles;
	if (newfdp->fd_lastfile < NDFILE) {
		newfdp->fd_ofiles = ((struct filedesc0 *) newfdp)->fd_dfiles;
		newfdp->fd_ofileflags =
		    ((struct filedesc0 *) newfdp)->fd_dfileflags;
		i = NDFILE;
	} else {
		/*
		 * Compute the smallest multiple of NDEXTENT needed
		 * for the file descriptors currently in use,
		 * allowing the table to shrink.
		 */
retry:
		i = newfdp->fd_nfiles;
		while (i > 2 * NDEXTENT && i > newfdp->fd_lastfile * 2)
			i /= 2;
		FILEDESC_UNLOCK(fdp);
		MALLOC(newfdp->fd_ofiles, struct file **, i * OFILESIZE,
		    M_FILEDESC, M_WAITOK);
		FILEDESC_LOCK(fdp);
		/*
		 * Recompute the wanted size (j) from the now-current
		 * source table and compare it with what was allocated (i).
		 */
		newfdp->fd_lastfile = fdp->fd_lastfile;
		newfdp->fd_nfiles = fdp->fd_nfiles;
		j = newfdp->fd_nfiles;
		while (j > 2 * NDEXTENT && j > newfdp->fd_lastfile * 2)
			j /= 2;
		if (i != j) {
			/*
			 * The size of the original table has changed.
			 * Go over once again.
			 */
			FILEDESC_UNLOCK(fdp);
			FREE(newfdp->fd_ofiles, M_FILEDESC);
			FILEDESC_LOCK(fdp);
			newfdp->fd_lastfile = fdp->fd_lastfile;
			newfdp->fd_nfiles = fdp->fd_nfiles;
			goto retry;
		}
		/* The per-descriptor flag bytes follow the pointer array. */
		newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i];
	}
	newfdp->fd_nfiles = i;
	bcopy(fdp->fd_ofiles, newfdp->fd_ofiles, i * sizeof(struct file **));
	bcopy(fdp->fd_ofileflags, newfdp->fd_ofileflags, i * sizeof(char));

	/*
	 * kq descriptors cannot be copied.
	 */
	if (newfdp->fd_knlistsize != -1) {
		/*
		 * Walk downward from the last file, clearing kqueue
		 * slots and lowering the free/last hints as we go.
		 */
		fpp = &newfdp->fd_ofiles[newfdp->fd_lastfile];
		for (i = newfdp->fd_lastfile; i >= 0; i--, fpp--) {
			if (*fpp != NULL && (*fpp)->f_type == DTYPE_KQUEUE) {
				*fpp = NULL;
				if (i < newfdp->fd_freefile)
					newfdp->fd_freefile = i;
			}
			if (*fpp == NULL && i == newfdp->fd_lastfile && i > 0)
				newfdp->fd_lastfile--;
		}
		newfdp->fd_knlist = NULL;
		newfdp->fd_knlistsize = -1;
		newfdp->fd_knhash = NULL;
		newfdp->fd_knhashmask = 0;
	}

	/* Take a hold on every file that made it into the copy. */
	fpp = newfdp->fd_ofiles;
	for (i = newfdp->fd_lastfile; i-- >= 0; fpp++) {
		if (*fpp != NULL)
			fhold(*fpp);
	}
	return (newfdp);
}
1350 
1351 /*
1352  * Release a filedesc structure.
1353  */
1354 void
1355 fdfree(td)
1356 	struct thread *td;
1357 {
1358 	register struct filedesc *fdp;
1359 	struct file **fpp;
1360 	register int i;
1361 
1362 	/* Certain daemons might not have file descriptors. */
1363 	fdp = td->td_proc->p_fd;
1364 	if (fdp == NULL)
1365 		return;
1366 
1367 	FILEDESC_LOCK(fdp);
1368 	if (--fdp->fd_refcnt > 0) {
1369 		FILEDESC_UNLOCK(fdp);
1370 		return;
1371 	}
1372 
1373 	/*
1374 	 * We are the last reference to the structure, so we can
1375 	 * safely assume it will not change out from under us.
1376 	 */
1377 	FILEDESC_UNLOCK(fdp);
1378 	fpp = fdp->fd_ofiles;
1379 	for (i = fdp->fd_lastfile; i-- >= 0; fpp++) {
1380 		if (*fpp)
1381 			(void) closef(*fpp, td);
1382 	}
1383 	td->td_proc->p_fd = NULL;
1384 	if (fdp->fd_nfiles > NDFILE)
1385 		FREE(fdp->fd_ofiles, M_FILEDESC);
1386 	if (fdp->fd_cdir)
1387 		vrele(fdp->fd_cdir);
1388 	if (fdp->fd_rdir)
1389 		vrele(fdp->fd_rdir);
1390 	if (fdp->fd_jdir)
1391 		vrele(fdp->fd_jdir);
1392 	if (fdp->fd_knlist)
1393 		FREE(fdp->fd_knlist, M_KQUEUE);
1394 	if (fdp->fd_knhash)
1395 		FREE(fdp->fd_knhash, M_KQUEUE);
1396 	mtx_destroy(&fdp->fd_mtx);
1397 	FREE(fdp, M_FILEDESC);
1398 }
1399 
1400 /*
1401  * For setugid programs, we don't want to people to use that setugidness
1402  * to generate error messages which write to a file which otherwise would
1403  * otherwise be off-limits to the process.  We check for filesystems where
1404  * the vnode can change out from under us after execve (like [lin]procfs).
1405  *
1406  * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
1407  * sufficient.  We also don't for check setugidness since we know we are.
1408  */
1409 static int
1410 is_unsafe(struct file *fp)
1411 {
1412 	if (fp->f_type == DTYPE_VNODE) {
1413 		struct vnode *vp = (struct vnode *)fp->f_data;
1414 
1415 		if ((vp->v_vflag & VV_PROCDEP) != 0)
1416 			return (1);
1417 	}
1418 	return (0);
1419 }
1420 
1421 /*
1422  * Make this setguid thing safe, if at all possible.
1423  */
1424 void
1425 setugidsafety(td)
1426 	struct thread *td;
1427 {
1428 	struct filedesc *fdp;
1429 	register int i;
1430 
1431 	/* Certain daemons might not have file descriptors. */
1432 	fdp = td->td_proc->p_fd;
1433 	if (fdp == NULL)
1434 		return;
1435 
1436 	/*
1437 	 * Note: fdp->fd_ofiles may be reallocated out from under us while
1438 	 * we are blocked in a close.  Be careful!
1439 	 */
1440 	FILEDESC_LOCK(fdp);
1441 	for (i = 0; i <= fdp->fd_lastfile; i++) {
1442 		if (i > 2)
1443 			break;
1444 		if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) {
1445 			struct file *fp;
1446 
1447 #if 0
1448 			if ((fdp->fd_ofileflags[i] & UF_MAPPED) != 0)
1449 				(void) munmapfd(td, i);
1450 #endif
1451 			if (i < fdp->fd_knlistsize) {
1452 				FILEDESC_UNLOCK(fdp);
1453 				knote_fdclose(td, i);
1454 				FILEDESC_LOCK(fdp);
1455 			}
1456 			/*
1457 			 * NULL-out descriptor prior to close to avoid
1458 			 * a race while close blocks.
1459 			 */
1460 			fp = fdp->fd_ofiles[i];
1461 			fdp->fd_ofiles[i] = NULL;
1462 			fdp->fd_ofileflags[i] = 0;
1463 			if (i < fdp->fd_freefile)
1464 				fdp->fd_freefile = i;
1465 			FILEDESC_UNLOCK(fdp);
1466 			(void) closef(fp, td);
1467 			FILEDESC_LOCK(fdp);
1468 		}
1469 	}
1470 	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
1471 		fdp->fd_lastfile--;
1472 	FILEDESC_UNLOCK(fdp);
1473 }
1474 
1475 /*
1476  * Close any files on exec?
1477  */
1478 void
1479 fdcloseexec(td)
1480 	struct thread *td;
1481 {
1482 	struct filedesc *fdp;
1483 	register int i;
1484 
1485 	/* Certain daemons might not have file descriptors. */
1486 	fdp = td->td_proc->p_fd;
1487 	if (fdp == NULL)
1488 		return;
1489 
1490 	FILEDESC_LOCK(fdp);
1491 
1492 	/*
1493 	 * We cannot cache fd_ofiles or fd_ofileflags since operations
1494 	 * may block and rip them out from under us.
1495 	 */
1496 	for (i = 0; i <= fdp->fd_lastfile; i++) {
1497 		if (fdp->fd_ofiles[i] != NULL &&
1498 		    (fdp->fd_ofileflags[i] & UF_EXCLOSE)) {
1499 			struct file *fp;
1500 
1501 #if 0
1502 			if (fdp->fd_ofileflags[i] & UF_MAPPED)
1503 				(void) munmapfd(td, i);
1504 #endif
1505 			if (i < fdp->fd_knlistsize) {
1506 				FILEDESC_UNLOCK(fdp);
1507 				knote_fdclose(td, i);
1508 				FILEDESC_LOCK(fdp);
1509 			}
1510 			/*
1511 			 * NULL-out descriptor prior to close to avoid
1512 			 * a race while close blocks.
1513 			 */
1514 			fp = fdp->fd_ofiles[i];
1515 			fdp->fd_ofiles[i] = NULL;
1516 			fdp->fd_ofileflags[i] = 0;
1517 			if (i < fdp->fd_freefile)
1518 				fdp->fd_freefile = i;
1519 			FILEDESC_UNLOCK(fdp);
1520 			(void) closef(fp, td);
1521 			FILEDESC_LOCK(fdp);
1522 		}
1523 	}
1524 	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
1525 		fdp->fd_lastfile--;
1526 	FILEDESC_UNLOCK(fdp);
1527 }
1528 
1529 /*
1530  * It is unsafe for set[ug]id processes to be started with file
1531  * descriptors 0..2 closed, as these descriptors are given implicit
1532  * significance in the Standard C library.  fdcheckstd() will create a
1533  * descriptor referencing /dev/null for each of stdin, stdout, and
1534  * stderr that is not already open.
1535  */
1536 int
1537 fdcheckstd(td)
1538 	struct thread *td;
1539 {
1540 	struct nameidata nd;
1541 	struct filedesc *fdp;
1542 	struct file *fp;
1543 	register_t retval;
1544 	int fd, i, error, flags, devnull;
1545 
1546 	fdp = td->td_proc->p_fd;
1547 	if (fdp == NULL)
1548 		return (0);
1549 	devnull = -1;
1550 	error = 0;
1551 	for (i = 0; i < 3; i++) {
1552 		if (fdp->fd_ofiles[i] != NULL)
1553 			continue;
1554 		if (devnull < 0) {
1555 			error = falloc(td, &fp, &fd);
1556 			if (error != 0)
1557 				break;
1558 			KASSERT(fd == i, ("oof, we didn't get our fd"));
1559 			NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/dev/null",
1560 			    td);
1561 			flags = FREAD | FWRITE;
1562 			error = vn_open(&nd, &flags, 0);
1563 			if (error != 0) {
1564 				FILEDESC_LOCK(fdp);
1565 				fdp->fd_ofiles[fd] = NULL;
1566 				FILEDESC_UNLOCK(fdp);
1567 				fdrop(fp, td);
1568 				break;
1569 			}
1570 			NDFREE(&nd, NDF_ONLY_PNBUF);
1571 			fp->f_data = nd.ni_vp;
1572 			fp->f_flag = flags;
1573 			fp->f_ops = &vnops;
1574 			fp->f_type = DTYPE_VNODE;
1575 			VOP_UNLOCK(nd.ni_vp, 0, td);
1576 			devnull = fd;
1577 		} else {
1578 			error = do_dup(td, DUP_FIXED, devnull, i, &retval);
1579 			if (error != 0)
1580 				break;
1581 		}
1582 	}
1583 	return (error);
1584 }
1585 
1586 /*
1587  * Internal form of close.
1588  * Decrement reference count on file structure.
1589  * Note: td may be NULL when closing a file
1590  * that was being passed in a message.
1591  */
1592 int
1593 closef(fp, td)
1594 	register struct file *fp;
1595 	register struct thread *td;
1596 {
1597 	struct vnode *vp;
1598 	struct flock lf;
1599 
1600 	if (fp == NULL)
1601 		return (0);
1602 	/*
1603 	 * POSIX record locking dictates that any close releases ALL
1604 	 * locks owned by this process.  This is handled by setting
1605 	 * a flag in the unlock to free ONLY locks obeying POSIX
1606 	 * semantics, and not to free BSD-style file locks.
1607 	 * If the descriptor was in a message, POSIX-style locks
1608 	 * aren't passed with the descriptor.
1609 	 */
1610 	if (td && (td->td_proc->p_flag & P_ADVLOCK) &&
1611 	    fp->f_type == DTYPE_VNODE) {
1612 		lf.l_whence = SEEK_SET;
1613 		lf.l_start = 0;
1614 		lf.l_len = 0;
1615 		lf.l_type = F_UNLCK;
1616 		vp = (struct vnode *)fp->f_data;
1617 		(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
1618 		    F_UNLCK, &lf, F_POSIX);
1619 	}
1620 	return (fdrop(fp, td));
1621 }
1622 
/*
 * Drop a reference on the struct file passed in.  This is the unlocked
 * entry point: it takes the file lock and lets fdrop_locked() do the
 * work (and release the lock).
 */
int
fdrop(struct file *fp, struct thread *td)
{
	int error;

	FILE_LOCK(fp);
	error = fdrop_locked(fp, td);
	return (error);
}
1636 
1637 /*
1638  * Extract the file pointer associated with the specified descriptor for
1639  * the current user process.
1640  *
1641  * If the descriptor doesn't exist, EBADF is returned.
1642  *
1643  * If the descriptor exists but doesn't match 'flags' then
1644  * return EBADF for read attempts and EINVAL for write attempts.
1645  *
1646  * If 'hold' is set (non-zero) the file's refcount will be bumped on return.
1647  * It should be droped with fdrop().
1648  * If it is not set, then the refcount will not be bumped however the
1649  * thread's filedesc struct will be returned locked (for fgetsock).
1650  *
1651  * If an error occured the non-zero error is returned and *fpp is set to NULL.
1652  * Otherwise *fpp is set and zero is returned.
1653  */
1654 static __inline int
1655 _fget(struct thread *td, int fd, struct file **fpp, int flags, int hold)
1656 {
1657 	struct filedesc *fdp;
1658 	struct file *fp;
1659 
1660 	*fpp = NULL;
1661 	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
1662 		return (EBADF);
1663 	FILEDESC_LOCK(fdp);
1664 	if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) {
1665 		FILEDESC_UNLOCK(fdp);
1666 		return (EBADF);
1667 	}
1668 
1669 	/*
1670 	 * Note: FREAD failures returns EBADF to maintain backwards
1671 	 * compatibility with what routines returned before.
1672 	 *
1673 	 * Only one flag, or 0, may be specified.
1674 	 */
1675 	if (flags == FREAD && (fp->f_flag & FREAD) == 0) {
1676 		FILEDESC_UNLOCK(fdp);
1677 		return (EBADF);
1678 	}
1679 	if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) {
1680 		FILEDESC_UNLOCK(fdp);
1681 		return (EINVAL);
1682 	}
1683 	if (hold) {
1684 		fhold(fp);
1685 		FILEDESC_UNLOCK(fdp);
1686 	}
1687 	*fpp = fp;
1688 	return (0);
1689 }
1690 
/* Look up fd with no access-mode check and return a held file. */
int
fget(struct thread *td, int fd, struct file **fpp)
{
	int error;

	error = _fget(td, fd, fpp, 0, 1);
	return (error);
}
1697 
1698 int
1699 fget_read(struct thread *td, int fd, struct file **fpp)
1700 {
1701 
1702 	return(_fget(td, fd, fpp, FREAD, 1));
1703 }
1704 
1705 int
1706 fget_write(struct thread *td, int fd, struct file **fpp)
1707 {
1708 
1709 	return(_fget(td, fd, fpp, FWRITE, 1));
1710 }
1711 
1712 /*
1713  * Like fget() but loads the underlying vnode, or returns an error if
1714  * the descriptor does not represent a vnode.  Note that pipes use vnodes
1715  * but never have VM objects (so VOP_GETVOBJECT() calls will return an
1716  * error).  The returned vnode will be vref()d.
1717  */
1718 static __inline int
1719 _fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags)
1720 {
1721 	struct file *fp;
1722 	int error;
1723 
1724 	*vpp = NULL;
1725 	if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
1726 		return (error);
1727 	if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) {
1728 		error = EINVAL;
1729 	} else {
1730 		*vpp = (struct vnode *)fp->f_data;
1731 		vref(*vpp);
1732 	}
1733 	FILEDESC_UNLOCK(td->td_proc->p_fd);
1734 	return (error);
1735 }
1736 
/* Return a referenced vnode for fd, requesting no access-mode check. */
int
fgetvp(struct thread *td, int fd, struct vnode **vpp)
{
	int error;

	error = _fgetvp(td, fd, vpp, 0);
	return (error);
}
1743 
1744 int
1745 fgetvp_read(struct thread *td, int fd, struct vnode **vpp)
1746 {
1747 
1748 	return (_fgetvp(td, fd, vpp, FREAD));
1749 }
1750 
1751 int
1752 fgetvp_write(struct thread *td, int fd, struct vnode **vpp)
1753 {
1754 
1755 	return (_fgetvp(td, fd, vpp, FWRITE));
1756 }
1757 
1758 /*
1759  * Like fget() but loads the underlying socket, or returns an error if
1760  * the descriptor does not represent a socket.
1761  *
1762  * We bump the ref count on the returned socket.  XXX Also obtain the SX
1763  * lock in the future.
1764  */
1765 int
1766 fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp)
1767 {
1768 	struct file *fp;
1769 	int error;
1770 
1771 	*spp = NULL;
1772 	if (fflagp != NULL)
1773 		*fflagp = 0;
1774 	if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
1775 		return (error);
1776 	if (fp->f_type != DTYPE_SOCKET) {
1777 		error = ENOTSOCK;
1778 	} else {
1779 		*spp = (struct socket *)fp->f_data;
1780 		if (fflagp)
1781 			*fflagp = fp->f_flag;
1782 		soref(*spp);
1783 	}
1784 	FILEDESC_UNLOCK(td->td_proc->p_fd);
1785 	return (error);
1786 }
1787 
/*
 * Counterpart of fgetsock(): drop the reference count on the socket
 * and XXX release the SX lock in the future.  The last reference
 * closes the socket.
 */
void
fputsock(struct socket *so)
{

	sorele(so);
}
1798 
/*
 * Drop reference on struct file passed in, may call closef if the
 * reference hits zero.
 * Expects struct file locked, and will unlock it.
 *
 * Returns 0 while other references remain.  When the last reference is
 * dropped, returns the result of fo_close() (or 0 for a file that
 * still has badfileops installed) after freeing the file.
 */
int
fdrop_locked(fp, td)
	struct file *fp;
	struct thread *td;
{
	struct flock lf;
	struct vnode *vp;
	int error;

	FILE_LOCK_ASSERT(fp, MA_OWNED);

	/* Fast path: somebody else still holds a reference. */
	if (--fp->f_count > 0) {
		FILE_UNLOCK(fp);
		return (0);
	}
	/* Last reference: the teardown below runs under Giant. */
	mtx_lock(&Giant);
	if (fp->f_count < 0)
		panic("fdrop: count < 0");
	if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
		/*
		 * The file has FHASLOCK set (see flock()); release the
		 * whole-file F_FLOCK lock keyed on fp.
		 */
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		vp = (struct vnode *)fp->f_data;
		FILE_UNLOCK(fp);
		(void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
	} else
		FILE_UNLOCK(fp);
	/* A file still pointing at badfileops has nothing to close. */
	if (fp->f_ops != &badfileops)
		error = fo_close(fp, td);
	else
		error = 0;
	ffree(fp);
	mtx_unlock(&Giant);
	return (error);
}
1840 
/*
 * Apply an advisory lock on a file descriptor.
 *
 * Just attempt to get a record lock of the requested type on
 * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
 *
 * uap->how selects the operation: LOCK_UN unlocks, LOCK_EX takes an
 * exclusive (write) lock, LOCK_SH a shared (read) lock; LOCK_NB makes
 * the request non-blocking.  Returns EOPNOTSUPP if the descriptor is
 * not a vnode and EBADF if none of LOCK_UN/LOCK_EX/LOCK_SH is given.
 */
#ifndef _SYS_SYSPROTO_H_
struct flock_args {
	int	fd;
	int	how;
};
#endif
/*
 * MPSAFE
 */
/* ARGSUSED */
int
flock(td, uap)
	struct thread *td;
	register struct flock_args *uap;
{
	struct file *fp;
	struct vnode *vp;
	struct flock lf;
	int error;

	/* fget() returns a held file; every path below must fdrop() it. */
	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	if (fp->f_type != DTYPE_VNODE) {
		fdrop(fp, td);
		return (EOPNOTSUPP);
	}

	mtx_lock(&Giant);
	vp = (struct vnode *)fp->f_data;
	lf.l_whence = SEEK_SET;
	lf.l_start = 0;
	lf.l_len = 0;
	if (uap->how & LOCK_UN) {
		lf.l_type = F_UNLCK;
		/* Clear FHASLOCK so fdrop_locked() skips its own unlock. */
		FILE_LOCK(fp);
		fp->f_flag &= ~FHASLOCK;
		FILE_UNLOCK(fp);
		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
		goto done2;
	}
	/* LOCK_EX takes precedence if both LOCK_EX and LOCK_SH are set. */
	if (uap->how & LOCK_EX)
		lf.l_type = F_WRLCK;
	else if (uap->how & LOCK_SH)
		lf.l_type = F_RDLCK;
	else {
		error = EBADF;
		goto done2;
	}
	/* FHASLOCK is set before the lock attempt, whatever its outcome. */
	FILE_LOCK(fp);
	fp->f_flag |= FHASLOCK;
	FILE_UNLOCK(fp);
	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
done2:
	fdrop(fp, td);
	mtx_unlock(&Giant);
	return (error);
}
1905 
1906 /*
1907  * File Descriptor pseudo-device driver (/dev/fd/).
1908  *
1909  * Opening minor device N dup()s the file (if any) connected to file
1910  * descriptor N belonging to the calling process.  Note that this driver
1911  * consists of only the ``open()'' routine, because all subsequent
1912  * references to this file will be direct to the other driver.
1913  */
1914 /* ARGSUSED */
1915 static int
1916 fdopen(dev, mode, type, td)
1917 	dev_t dev;
1918 	int mode, type;
1919 	struct thread *td;
1920 {
1921 
1922 	/*
1923 	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
1924 	 * the file descriptor being sought for duplication. The error
1925 	 * return ensures that the vnode for this device will be released
1926 	 * by vn_open. Open will detect this special error and take the
1927 	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
1928 	 * will simply report the error.
1929 	 */
1930 	td->td_dupfd = dev2unit(dev);
1931 	return (ENODEV);
1932 }
1933 
/*
 * Duplicate the specified descriptor to a free descriptor.
 *
 * 'indx' is the destination slot, 'dfd' the descriptor to duplicate,
 * 'mode' the requested open mode and 'error' the code that selects the
 * action: ENODEV dups dfd onto indx, ENXIO moves dfd to indx, and any
 * other value is simply handed back.  Returns EBADF if dfd is out of
 * range or not open, and EACCES (ENODEV case only) if 'mode' asks for
 * access the existing descriptor does not have.
 */
int
dupfdopen(td, fdp, indx, dfd, mode, error)
	struct thread *td;
	struct filedesc *fdp;
	int indx, dfd;
	int mode;
	int error;
{
	register struct file *wfp;
	struct file *fp;

	/*
	 * If the to-be-dup'd fd number is greater than the allowed number
	 * of file descriptors, or the fd to be dup'd has already been
	 * closed, then reject.
	 */
	FILEDESC_LOCK(fdp);
	if (dfd < 0 || dfd >= fdp->fd_nfiles ||
	    (wfp = fdp->fd_ofiles[dfd]) == NULL) {
		FILEDESC_UNLOCK(fdp);
		return (EBADF);
	}

	/*
	 * There are two cases of interest here.
	 *
	 * For ENODEV simply dup (dfd) to file descriptor
	 * (indx) and return.
	 *
	 * For ENXIO steal away the file structure from (dfd) and
	 * store it in (indx).  (dfd) is effectively closed by
	 * this operation.
	 *
	 * Any other error code is just returned.
	 */
	switch (error) {
	case ENODEV:
		/*
		 * Check that the mode the file is being opened for is a
		 * subset of the mode of the existing descriptor.
		 */
		FILE_LOCK(wfp);
		if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
			FILE_UNLOCK(wfp);
			FILEDESC_UNLOCK(fdp);
			return (EACCES);
		}
		/* Remember whatever currently occupies the target slot. */
		fp = fdp->fd_ofiles[indx];
#if 0
		if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED)
			(void) munmapfd(td, indx);
#endif
		fdp->fd_ofiles[indx] = wfp;
		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
		/* The new slot holds its own reference on wfp. */
		fhold_locked(wfp);
		FILE_UNLOCK(wfp);
		if (indx > fdp->fd_lastfile)
			fdp->fd_lastfile = indx;
		/* fdrop_locked() expects the file lock to be held. */
		if (fp != NULL)
			FILE_LOCK(fp);
		FILEDESC_UNLOCK(fdp);
		/*
		 * We now own the reference to fp that the ofiles[] array
		 * used to own.  Release it.
		 */
		if (fp != NULL)
			fdrop_locked(fp, td);
		return (0);

	case ENXIO:
		/*
		 * Steal away the file pointer from dfd and stuff it into indx.
		 */
		fp = fdp->fd_ofiles[indx];
#if 0
		if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED)
			(void) munmapfd(td, indx);
#endif
		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
		fdp->fd_ofiles[dfd] = NULL;
		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
		fdp->fd_ofileflags[dfd] = 0;

		/*
		 * Complete the clean up of the filedesc structure by
		 * recomputing the various hints.
		 */
		if (indx > fdp->fd_lastfile) {
			fdp->fd_lastfile = indx;
		} else {
			while (fdp->fd_lastfile > 0 &&
			   fdp->fd_ofiles[fdp->fd_lastfile] == NULL) {
				fdp->fd_lastfile--;
			}
			if (dfd < fdp->fd_freefile)
				fdp->fd_freefile = dfd;
		}
		/* fdrop_locked() expects the file lock to be held. */
		if (fp != NULL)
			FILE_LOCK(fp);
		FILEDESC_UNLOCK(fdp);

		/*
		 * we now own the reference to fp that the ofiles[] array
		 * used to own.  Release it.
		 */
		if (fp != NULL)
			fdrop_locked(fp, td);
		return (0);

	default:
		FILEDESC_UNLOCK(fdp);
		return (error);
	}
	/* NOTREACHED */
}
2052 
/*
 * Get file structures.
 *
 * sysctl handler behind kern.file.  With no output buffer
 * (req->oldptr == NULL) it reports a size estimate; otherwise it emits
 * one struct xfile per open descriptor of every process on allproc.
 */
static int
sysctl_kern_file(SYSCTL_HANDLER_ARGS)
{
	struct xfile xf;
	struct filedesc *fdp;
	struct file *fp;
	struct proc *p;
	int error, n;

	sysctl_wire_old_buffer(req, 0);
	if (req->oldptr == NULL) {
		/*
		 * Size probe: sum the reference counts of all files on
		 * the global list and report that many xfile records.
		 */
		n = 16;		/* A slight overestimate. */
		sx_slock(&filelist_lock);
		LIST_FOREACH(fp, &filehead, f_list) {
			/*
			 * We should grab the lock, but this is an
			 * estimate, so does it really matter?
			 */
			/* mtx_lock(fp->f_mtxp); */
			n += fp->f_count;
			/* mtx_unlock(f->f_mtxp); */
		}
		sx_sunlock(&filelist_lock);
		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
	}
	error = 0;
	/* xf is reused for every record; only per-entry fields change. */
	bzero(&xf, sizeof(xf));
	xf.xf_size = sizeof(xf);
	sx_slock(&allproc_lock);
	LIST_FOREACH(p, &allproc, p_list) {
		PROC_LOCK(p);
		xf.xf_pid = p->p_pid;
		xf.xf_uid = p->p_ucred->cr_uid;
		/* Skip processes that have no descriptor table. */
		if ((fdp = p->p_fd) == NULL) {
			PROC_UNLOCK(p);
			continue;
		}
		FILEDESC_LOCK(fdp);
		for (n = 0; n < fdp->fd_nfiles; ++n) {
			if ((fp = fdp->fd_ofiles[n]) == NULL)
				continue;
			xf.xf_fd = n;
			xf.xf_file = fp;
/* Copy fp->f_<field> into the matching xf.xf_<field>. */
#define	XF_COPY(field) xf.xf_##field = fp->f_##field
			XF_COPY(type);
			XF_COPY(count);
			XF_COPY(msgcount);
			XF_COPY(offset);
			XF_COPY(data);
			XF_COPY(flag);
#undef XF_COPY
			error = SYSCTL_OUT(req, &xf, sizeof(xf));
			if (error)
				break;
		}
		FILEDESC_UNLOCK(fdp);
		PROC_UNLOCK(p);
		/* Propagate a copy-out failure out of the process loop too. */
		if (error)
			break;
	}
	sx_sunlock(&allproc_lock);
	return (error);
}
2119 
/*
 * sysctl nodes exported under "kern": the read-only opaque file table
 * produced by sysctl_kern_file(), the two writable limits maxfilesperproc
 * and maxfiles, and the read-only system-wide open file count.
 */
SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
    0, 0, sysctl_kern_file, "S,xfile", "Entire file table");

SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
    &maxfilesperproc, 0, "Maximum files allowed open per process");

SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
    &maxfiles, 0, "Maximum number of files");

SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
    &nfiles, 0, "System-wide number of open files");
2131 
2132 static void
2133 fildesc_drvinit(void *unused)
2134 {
2135 	dev_t dev;
2136 
2137 	dev = make_dev(&fildesc_cdevsw, 0, UID_BIN, GID_BIN, 0666, "fd/0");
2138 	make_dev_alias(dev, "stdin");
2139 	dev = make_dev(&fildesc_cdevsw, 1, UID_BIN, GID_BIN, 0666, "fd/1");
2140 	make_dev_alias(dev, "stdout");
2141 	dev = make_dev(&fildesc_cdevsw, 2, UID_BIN, GID_BIN, 0666, "fd/2");
2142 	make_dev_alias(dev, "stderr");
2143 	if (!devfs_present) {
2144 		int fd;
2145 
2146 		for (fd = 3; fd < NUMFDESC; fd++)
2147 			make_dev(&fildesc_cdevsw, fd, UID_BIN, GID_BIN, 0666,
2148 			    "fd/%d", fd);
2149 	}
2150 }
2151 
/*
 * Operations vector made up entirely of the badfo_* stubs below.
 * falloc() installs it in a newly allocated struct file, and _fget()
 * and fdrop_locked() compare f_ops against it to recognize a file that
 * has not been given real operations.
 */
struct fileops badfileops = {
	badfo_readwrite,
	badfo_readwrite,
	badfo_ioctl,
	badfo_poll,
	badfo_kqfilter,
	badfo_stat,
	badfo_close
};
2161 
2162 static int
2163 badfo_readwrite(fp, uio, active_cred, flags, td)
2164 	struct file *fp;
2165 	struct uio *uio;
2166 	struct ucred *active_cred;
2167 	struct thread *td;
2168 	int flags;
2169 {
2170 
2171 	return (EBADF);
2172 }
2173 
2174 static int
2175 badfo_ioctl(fp, com, data, active_cred, td)
2176 	struct file *fp;
2177 	u_long com;
2178 	void *data;
2179 	struct ucred *active_cred;
2180 	struct thread *td;
2181 {
2182 
2183 	return (EBADF);
2184 }
2185 
/* badfileops poll stub: always returns 0. */
static int
badfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{

	return (0);
}
2196 
/* badfileops kqfilter stub: always returns 0. */
static int
badfo_kqfilter(struct file *fp, struct knote *kn)
{

	return (0);
}
2205 
2206 static int
2207 badfo_stat(fp, sb, active_cred, td)
2208 	struct file *fp;
2209 	struct stat *sb;
2210 	struct ucred *active_cred;
2211 	struct thread *td;
2212 {
2213 
2214 	return (EBADF);
2215 }
2216 
2217 static int
2218 badfo_close(fp, td)
2219 	struct file *fp;
2220 	struct thread *td;
2221 {
2222 
2223 	return (EBADF);
2224 }
2225 
/* Register fildesc_drvinit() to run in the SI_SUB_DRIVERS startup stage. */
SYSINIT(fildescdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,
					fildesc_drvinit,NULL)
2228 
2229 static void filelistinit(void *);
2230 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL)
2231 
2232 /* ARGSUSED*/
2233 static void
2234 filelistinit(dummy)
2235 	void *dummy;
2236 {
2237 
2238 	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
2239 	    NULL, NULL, UMA_ALIGN_PTR, 0);
2240 	sx_init(&filelist_lock, "filelist lock");
2241 	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
2242 }
2243