xref: /freebsd/sys/kern/kern_descrip.c (revision 10b9d77bf1ccf2f3affafa6261692cb92cf7e992)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_compat.h"
41 #include "opt_ddb.h"
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 
47 #include <sys/conf.h>
48 #include <sys/domain.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/filedesc.h>
52 #include <sys/filio.h>
53 #include <sys/jail.h>
54 #include <sys/kernel.h>
55 #include <sys/limits.h>
56 #include <sys/lock.h>
57 #include <sys/malloc.h>
58 #include <sys/mount.h>
59 #include <sys/mqueue.h>
60 #include <sys/mutex.h>
61 #include <sys/namei.h>
62 #include <sys/priv.h>
63 #include <sys/proc.h>
64 #include <sys/protosw.h>
65 #include <sys/resourcevar.h>
66 #include <sys/signalvar.h>
67 #include <sys/socketvar.h>
68 #include <sys/stat.h>
69 #include <sys/sx.h>
70 #include <sys/syscallsubr.h>
71 #include <sys/sysctl.h>
72 #include <sys/sysproto.h>
73 #include <sys/tty.h>
74 #include <sys/unistd.h>
75 #include <sys/user.h>
76 #include <sys/vnode.h>
77 #ifdef KTRACE
78 #include <sys/ktrace.h>
79 #endif
80 
81 #include <net/vnet.h>
82 
83 #include <security/audit/audit.h>
84 
85 #include <vm/uma.h>
86 
87 #include <ddb/ddb.h>
88 
/*
 * malloc(9) types declared by this file.  M_SIGIO tags the struct sigio
 * allocations made in fsetown() and released in funsetown() and
 * funsetownlst().  M_FILEDESC and M_FILEDESC_TO_LEADER are not referenced
 * in the portion of the file reviewed here; their descriptions below are
 * the only indication of their use.
 */
static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
		     "file desc to leader structures");
static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");

/*
 * UMA zone handle.  NOTE(review): presumably the zone backing struct file
 * allocations -- confirm against the code that initializes it.
 */
static uma_zone_t file_zone;
95 
96 
/*
 * Flags for do_dup().
 *
 * DUP_FIXED: duplicate onto exactly the descriptor number supplied as
 *	      'new' (dup2() and F_DUP2FD) instead of allocating the lowest
 *	      free descriptor at or above it.
 * DUP_FCNTL: report a negative or over-limit 'new' as EINVAL, as
 *	      fcntl(F_DUPFD) does, rather than EBADF/EMFILE.
 */
#define DUP_FIXED	0x1	/* Force fixed allocation */
#define DUP_FCNTL	0x2	/* fcntl()-style errors */

/* Forward declarations of the file-local helpers defined below. */
static int do_dup(struct thread *td, int flags, int old, int new,
    register_t *retval);
static int	fd_first_free(struct filedesc *, int, int);
static int	fd_last_used(struct filedesc *, int, int);
static void	fdgrowtable(struct filedesc *, int);
static void	fdunused(struct filedesc *fdp, int fd);
static void	fdused(struct filedesc *fdp, int fd);
108 
/*
 * A process is initially started out with NDFILE descriptors stored within
 * this structure, selected to be enough for typical applications based on
 * the historical limit of 20 open files (and the usage of descriptors by
 * shells).  If these descriptors are exhausted, a larger descriptor table
 * may be allocated, up to a process' resource limit; the internal arrays
 * are then unused.
 *
 * The remaining macros describe the in-use bitmap (fd_map), which is an
 * array of NDSLOTTYPE words ("slots") holding one bit per descriptor:
 *
 * NDSLOTSIZE	size of one slot in bytes
 * NDENTRIES	number of descriptor bits held by one slot
 * NDSLOT(x)	index of the slot that holds the bit for descriptor x
 * NDBIT(x)	mask selecting descriptor x's bit within its slot
 * NDSLOTS(x)	number of slots needed to hold x descriptor bits
 */
#define NDFILE		20
#define NDSLOTSIZE	sizeof(NDSLOTTYPE)
#define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
#define NDSLOT(x)	((x) / NDENTRIES)
#define NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
#define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)

/*
 * Storage required per open file descriptor: one struct file pointer
 * plus one byte of per-descriptor flags.
 */
#define OFILESIZE (sizeof(struct file *) + sizeof(char))
128 
/*
 * Storage to hold unused ofiles that need to be reclaimed.
 *
 * Entries are chained on the fd_free list of struct filedesc0.
 */
struct freetable {
	struct file	**ft_table;	/* the retired descriptor array */
	SLIST_ENTRY(freetable) ft_next;	/* linkage on the fd_free list */
};
136 
/*
 * Basic allocation of descriptors:
 * one of the above, plus arrays for NDFILE descriptors.
 *
 * The embedded arrays give every table room for NDFILE descriptors
 * without a separate allocation.
 */
struct filedesc0 {
	struct	filedesc fd_fd;		/* the generic descriptor table */
	/*
	 * ofiles which need to be reclaimed on free.
	 */
	SLIST_HEAD(,freetable) fd_free;
	/*
	 * These arrays are used when the number of open files is
	 * <= NDFILE, and are then pointed to by the pointers above.
	 */
	struct	file *fd_dfiles[NDFILE];	/* initial file pointers */
	char	fd_dfileflags[NDFILE];		/* initial per-fd flags */
	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];	/* initial in-use bitmap */
};
155 
/*
 * Descriptor management.
 */
volatile int openfiles;			/* actual number of open files */
struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
/*
 * Hook invoked by do_dup() and kern_close() when the descriptor being
 * closed refers to a DTYPE_MQUEUE file.  It is not assigned in the portion
 * of the file reviewed here.  NOTE(review): presumably installed by the
 * message queue code -- confirm.
 */
void	(*mq_fdclose)(struct thread *td, int fd, struct file *fp);

/* A mutex to protect the association between a proc and filedesc. */
static struct mtx	fdesc_mtx;
165 
166 /*
167  * Find the first zero bit in the given bitmap, starting at low and not
168  * exceeding size - 1.
169  */
170 static int
171 fd_first_free(struct filedesc *fdp, int low, int size)
172 {
173 	NDSLOTTYPE *map = fdp->fd_map;
174 	NDSLOTTYPE mask;
175 	int off, maxoff;
176 
177 	if (low >= size)
178 		return (low);
179 
180 	off = NDSLOT(low);
181 	if (low % NDENTRIES) {
182 		mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
183 		if ((mask &= ~map[off]) != 0UL)
184 			return (off * NDENTRIES + ffsl(mask) - 1);
185 		++off;
186 	}
187 	for (maxoff = NDSLOTS(size); off < maxoff; ++off)
188 		if (map[off] != ~0UL)
189 			return (off * NDENTRIES + ffsl(~map[off]) - 1);
190 	return (size);
191 }
192 
193 /*
194  * Find the highest non-zero bit in the given bitmap, starting at low and
195  * not exceeding size - 1.
196  */
197 static int
198 fd_last_used(struct filedesc *fdp, int low, int size)
199 {
200 	NDSLOTTYPE *map = fdp->fd_map;
201 	NDSLOTTYPE mask;
202 	int off, minoff;
203 
204 	if (low >= size)
205 		return (-1);
206 
207 	off = NDSLOT(size);
208 	if (size % NDENTRIES) {
209 		mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
210 		if ((mask &= map[off]) != 0)
211 			return (off * NDENTRIES + flsl(mask) - 1);
212 		--off;
213 	}
214 	for (minoff = NDSLOT(low); off >= minoff; --off)
215 		if (map[off] != 0)
216 			return (off * NDENTRIES + flsl(map[off]) - 1);
217 	return (low - 1);
218 }
219 
220 static int
221 fdisused(struct filedesc *fdp, int fd)
222 {
223         KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
224             ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
225 	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
226 }
227 
228 /*
229  * Mark a file descriptor as used.
230  */
231 static void
232 fdused(struct filedesc *fdp, int fd)
233 {
234 
235 	FILEDESC_XLOCK_ASSERT(fdp);
236 	KASSERT(!fdisused(fdp, fd),
237 	    ("fd already used"));
238 
239 	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
240 	if (fd > fdp->fd_lastfile)
241 		fdp->fd_lastfile = fd;
242 	if (fd == fdp->fd_freefile)
243 		fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
244 }
245 
246 /*
247  * Mark a file descriptor as unused.
248  */
249 static void
250 fdunused(struct filedesc *fdp, int fd)
251 {
252 
253 	FILEDESC_XLOCK_ASSERT(fdp);
254 	KASSERT(fdisused(fdp, fd),
255 	    ("fd is already unused"));
256 	KASSERT(fdp->fd_ofiles[fd] == NULL,
257 	    ("fd is still in use"));
258 
259 	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
260 	if (fd < fdp->fd_freefile)
261 		fdp->fd_freefile = fd;
262 	if (fd == fdp->fd_lastfile)
263 		fdp->fd_lastfile = fd_last_used(fdp, 0, fd);
264 }
265 
266 /*
267  * System calls on descriptors.
268  */
269 #ifndef _SYS_SYSPROTO_H_
270 struct getdtablesize_args {
271 	int	dummy;
272 };
273 #endif
274 /* ARGSUSED */
275 int
276 getdtablesize(struct thread *td, struct getdtablesize_args *uap)
277 {
278 	struct proc *p = td->td_proc;
279 
280 	PROC_LOCK(p);
281 	td->td_retval[0] =
282 	    min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
283 	PROC_UNLOCK(p);
284 	return (0);
285 }
286 
287 /*
288  * Duplicate a file descriptor to a particular value.
289  *
290  * Note: keep in mind that a potential race condition exists when closing
291  * descriptors from a shared descriptor table (via rfork).
292  */
293 #ifndef _SYS_SYSPROTO_H_
294 struct dup2_args {
295 	u_int	from;
296 	u_int	to;
297 };
298 #endif
299 /* ARGSUSED */
300 int
301 dup2(struct thread *td, struct dup2_args *uap)
302 {
303 
304 	return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to,
305 		    td->td_retval));
306 }
307 
308 /*
309  * Duplicate a file descriptor.
310  */
311 #ifndef _SYS_SYSPROTO_H_
312 struct dup_args {
313 	u_int	fd;
314 };
315 #endif
316 /* ARGSUSED */
317 int
318 dup(struct thread *td, struct dup_args *uap)
319 {
320 
321 	return (do_dup(td, 0, (int)uap->fd, 0, td->td_retval));
322 }
323 
324 /*
325  * The file control system call.
326  */
327 #ifndef _SYS_SYSPROTO_H_
328 struct fcntl_args {
329 	int	fd;
330 	int	cmd;
331 	long	arg;
332 };
333 #endif
334 /* ARGSUSED */
335 int
336 fcntl(struct thread *td, struct fcntl_args *uap)
337 {
338 	struct flock fl;
339 	struct oflock ofl;
340 	intptr_t arg;
341 	int error;
342 	int cmd;
343 
344 	error = 0;
345 	cmd = uap->cmd;
346 	switch (uap->cmd) {
347 	case F_OGETLK:
348 	case F_OSETLK:
349 	case F_OSETLKW:
350 		/*
351 		 * Convert old flock structure to new.
352 		 */
353 		error = copyin((void *)(intptr_t)uap->arg, &ofl, sizeof(ofl));
354 		fl.l_start = ofl.l_start;
355 		fl.l_len = ofl.l_len;
356 		fl.l_pid = ofl.l_pid;
357 		fl.l_type = ofl.l_type;
358 		fl.l_whence = ofl.l_whence;
359 		fl.l_sysid = 0;
360 
361 		switch (uap->cmd) {
362 		case F_OGETLK:
363 		    cmd = F_GETLK;
364 		    break;
365 		case F_OSETLK:
366 		    cmd = F_SETLK;
367 		    break;
368 		case F_OSETLKW:
369 		    cmd = F_SETLKW;
370 		    break;
371 		}
372 		arg = (intptr_t)&fl;
373 		break;
374         case F_GETLK:
375         case F_SETLK:
376         case F_SETLKW:
377 	case F_SETLK_REMOTE:
378                 error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
379                 arg = (intptr_t)&fl;
380                 break;
381 	default:
382 		arg = uap->arg;
383 		break;
384 	}
385 	if (error)
386 		return (error);
387 	error = kern_fcntl(td, uap->fd, cmd, arg);
388 	if (error)
389 		return (error);
390 	if (uap->cmd == F_OGETLK) {
391 		ofl.l_start = fl.l_start;
392 		ofl.l_len = fl.l_len;
393 		ofl.l_pid = fl.l_pid;
394 		ofl.l_type = fl.l_type;
395 		ofl.l_whence = fl.l_whence;
396 		error = copyout(&ofl, (void *)(intptr_t)uap->arg, sizeof(ofl));
397 	} else if (uap->cmd == F_GETLK) {
398 		error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
399 	}
400 	return (error);
401 }
402 
403 static inline struct file *
404 fdtofp(int fd, struct filedesc *fdp)
405 {
406 	struct file *fp;
407 
408 	FILEDESC_LOCK_ASSERT(fdp);
409 	if ((unsigned)fd >= fdp->fd_nfiles ||
410 	    (fp = fdp->fd_ofiles[fd]) == NULL)
411 		return (NULL);
412 	return (fp);
413 }
414 
/*
 * Kernel-internal implementation of the fcntl commands for descriptor
 * 'fd' of the calling thread's process.
 *
 * 'arg' is the already-decoded third argument: an integer value for most
 * commands, or a pointer to a kernel-resident struct flock for F_GETLK,
 * F_SETLK, F_SETLKW and F_SETLK_REMOTE (the fcntl() system call in this
 * file performs the copyin and copyout).  Commands that produce a value
 * store it in td->td_retval[0].
 *
 * Returns 0 on success or an errno value; EBADF is returned when 'fd' is
 * not an open descriptor and EINVAL for an unrecognized command.
 */
int
kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
{
	struct filedesc *fdp;
	struct flock *flp;
	struct file *fp;
	struct proc *p;
	char *pop;
	struct vnode *vp;
	int error, flg, tmp;
	int vfslocked;
	u_int old, new;
	uint64_t bsize;

	vfslocked = 0;
	error = 0;
	flg = F_POSIX;
	p = td->td_proc;
	fdp = p->p_fd;

	switch (cmd) {
	case F_DUPFD:
		/* Lowest free descriptor >= arg, with fcntl-style errors. */
		tmp = arg;
		error = do_dup(td, DUP_FCNTL, fd, tmp, td->td_retval);
		break;

	case F_DUP2FD:
		/* Duplicate onto exactly descriptor 'arg'. */
		tmp = arg;
		error = do_dup(td, DUP_FIXED, fd, tmp, td->td_retval);
		break;

	case F_GETFD:
		FILEDESC_SLOCK(fdp);
		if ((fp = fdtofp(fd, fdp)) == NULL) {
			FILEDESC_SUNLOCK(fdp);
			error = EBADF;
			break;
		}
		pop = &fdp->fd_ofileflags[fd];
		td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
		FILEDESC_SUNLOCK(fdp);
		break;

	case F_SETFD:
		/* Modifies the per-descriptor flags: exclusive lock needed. */
		FILEDESC_XLOCK(fdp);
		if ((fp = fdtofp(fd, fdp)) == NULL) {
			FILEDESC_XUNLOCK(fdp);
			error = EBADF;
			break;
		}
		pop = &fdp->fd_ofileflags[fd];
		*pop = (*pop &~ UF_EXCLOSE) |
		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
		FILEDESC_XUNLOCK(fdp);
		break;

	case F_GETFL:
		FILEDESC_SLOCK(fdp);
		if ((fp = fdtofp(fd, fdp)) == NULL) {
			FILEDESC_SUNLOCK(fdp);
			error = EBADF;
			break;
		}
		td->td_retval[0] = OFLAGS(fp->f_flag);
		FILEDESC_SUNLOCK(fdp);
		break;

	case F_SETFL:
		FILEDESC_SLOCK(fdp);
		if ((fp = fdtofp(fd, fdp)) == NULL) {
			FILEDESC_SUNLOCK(fdp);
			error = EBADF;
			break;
		}
		/* Hold fp so it is still referenced once the lock is dropped. */
		fhold(fp);
		FILEDESC_SUNLOCK(fdp);
		/*
		 * Replace the FCNTLFLAGS bits of f_flag with the requested
		 * ones, retrying if f_flag changed underneath us.  'flg' is
		 * only used as scratch for the compare-and-set here.
		 */
		do {
			tmp = flg = fp->f_flag;
			tmp &= ~FCNTLFLAGS;
			tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
		} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
		/* Tell the file about the new non-blocking and async state. */
		tmp = fp->f_flag & FNONBLOCK;
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		if (error) {
			fdrop(fp, td);
			break;
		}
		tmp = fp->f_flag & FASYNC;
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		if (error == 0) {
			fdrop(fp, td);
			break;
		}
		/* FIOASYNC failed: back out non-blocking mode. */
		atomic_clear_int(&fp->f_flag, FNONBLOCK);
		tmp = 0;
		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		fdrop(fp, td);
		break;

	case F_GETOWN:
		FILEDESC_SLOCK(fdp);
		if ((fp = fdtofp(fd, fdp)) == NULL) {
			FILEDESC_SUNLOCK(fdp);
			error = EBADF;
			break;
		}
		fhold(fp);
		FILEDESC_SUNLOCK(fdp);
		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
		if (error == 0)
			td->td_retval[0] = tmp;
		fdrop(fp, td);
		break;

	case F_SETOWN:
		FILEDESC_SLOCK(fdp);
		if ((fp = fdtofp(fd, fdp)) == NULL) {
			FILEDESC_SUNLOCK(fdp);
			error = EBADF;
			break;
		}
		fhold(fp);
		FILEDESC_SUNLOCK(fdp);
		tmp = arg;
		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
		fdrop(fp, td);
		break;

	case F_SETLK_REMOTE:
		/* Privileged variant: requires PRIV_NFS_LOCKD. */
		error = priv_check(td, PRIV_NFS_LOCKD);
		if (error)
			return (error);
		flg = F_REMOTE;
		goto do_setlk;

	case F_SETLKW:
		flg |= F_WAIT;
		/* FALLTHROUGH F_SETLK */

	case F_SETLK:
	do_setlk:
		FILEDESC_SLOCK(fdp);
		if ((fp = fdtofp(fd, fdp)) == NULL) {
			FILEDESC_SUNLOCK(fdp);
			error = EBADF;
			break;
		}
		/* Advisory locks are only supported on vnode-backed files. */
		if (fp->f_type != DTYPE_VNODE) {
			FILEDESC_SUNLOCK(fdp);
			error = EBADF;
			break;
		}
		flp = (struct flock *)arg;
		if (flp->l_whence == SEEK_CUR) {
			/*
			 * Make l_start relative to the current file offset,
			 * refusing offsets whose sum would overflow.
			 */
			if (fp->f_offset < 0 ||
			    (flp->l_start > 0 &&
			     fp->f_offset > OFF_MAX - flp->l_start)) {
				FILEDESC_SUNLOCK(fdp);
				error = EOVERFLOW;
				break;
			}
			flp->l_start += fp->f_offset;
		}

		/*
		 * VOP_ADVLOCK() may block.
		 */
		fhold(fp);
		FILEDESC_SUNLOCK(fdp);
		vp = fp->f_vnode;
		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
		switch (flp->l_type) {
		case F_RDLCK:
			/* A read lock needs the file open for reading. */
			if ((fp->f_flag & FREAD) == 0) {
				error = EBADF;
				break;
			}
			/* Record on the leader that advisory locks are held. */
			PROC_LOCK(p->p_leader);
			p->p_leader->p_flag |= P_ADVLOCK;
			PROC_UNLOCK(p->p_leader);
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    flp, flg);
			break;
		case F_WRLCK:
			/* A write lock needs the file open for writing. */
			if ((fp->f_flag & FWRITE) == 0) {
				error = EBADF;
				break;
			}
			PROC_LOCK(p->p_leader);
			p->p_leader->p_flag |= P_ADVLOCK;
			PROC_UNLOCK(p->p_leader);
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    flp, flg);
			break;
		case F_UNLCK:
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
			    flp, flg);
			break;
		case F_UNLCKSYS:
			/*
			 * Temporary api for testing remote lock
			 * infrastructure.
			 */
			if (flg != F_REMOTE) {
				error = EINVAL;
				break;
			}
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
			    F_UNLCKSYS, flp, flg);
			break;
		default:
			error = EINVAL;
			break;
		}
		VFS_UNLOCK_GIANT(vfslocked);
		vfslocked = 0;
		/*
		 * Check for race with close: if the descriptor no longer
		 * refers to fp, release every lock held on the file on
		 * behalf of the leader, since the close that raced with us
		 * may have run before the lock above was established.
		 */
		FILEDESC_SLOCK(fdp);
		if ((unsigned) fd >= fdp->fd_nfiles ||
		    fp != fdp->fd_ofiles[fd]) {
			FILEDESC_SUNLOCK(fdp);
			flp->l_whence = SEEK_SET;
			flp->l_start = 0;
			flp->l_len = 0;
			flp->l_type = F_UNLCK;
			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
					   F_UNLCK, flp, F_POSIX);
			VFS_UNLOCK_GIANT(vfslocked);
			vfslocked = 0;
		} else
			FILEDESC_SUNLOCK(fdp);
		fdrop(fp, td);
		break;

	case F_GETLK:
		FILEDESC_SLOCK(fdp);
		if ((fp = fdtofp(fd, fdp)) == NULL) {
			FILEDESC_SUNLOCK(fdp);
			error = EBADF;
			break;
		}
		if (fp->f_type != DTYPE_VNODE) {
			FILEDESC_SUNLOCK(fdp);
			error = EBADF;
			break;
		}
		flp = (struct flock *)arg;
		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
		    flp->l_type != F_UNLCK) {
			FILEDESC_SUNLOCK(fdp);
			error = EINVAL;
			break;
		}
		if (flp->l_whence == SEEK_CUR) {
			/* Reject offsets whose sum would over- or underflow. */
			if ((flp->l_start > 0 &&
			    fp->f_offset > OFF_MAX - flp->l_start) ||
			    (flp->l_start < 0 &&
			     fp->f_offset < OFF_MIN - flp->l_start)) {
				FILEDESC_SUNLOCK(fdp);
				error = EOVERFLOW;
				break;
			}
			flp->l_start += fp->f_offset;
		}
		/*
		 * VOP_ADVLOCK() may block.
		 */
		fhold(fp);
		FILEDESC_SUNLOCK(fdp);
		vp = fp->f_vnode;
		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
		    F_POSIX);
		VFS_UNLOCK_GIANT(vfslocked);
		vfslocked = 0;
		fdrop(fp, td);
		break;

	case F_RDAHEAD:
		/* Boolean form: any non-zero arg selects a 128KB read-ahead. */
		arg = arg ? 128 * 1024: 0;
		/* FALLTHROUGH */
	case F_READAHEAD:
		FILEDESC_SLOCK(fdp);
		if ((fp = fdtofp(fd, fdp)) == NULL) {
			FILEDESC_SUNLOCK(fdp);
			error = EBADF;
			break;
		}
		if (fp->f_type != DTYPE_VNODE) {
			FILEDESC_SUNLOCK(fdp);
			error = EBADF;
			break;
		}
		fhold(fp);
		FILEDESC_SUNLOCK(fdp);
		if (arg != 0) {
			vp = fp->f_vnode;
			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
			error = vn_lock(vp, LK_SHARED);
			if (error != 0)
				goto readahead_vnlock_fail;
			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
			VOP_UNLOCK(vp, 0);
			/* Round the byte count up to whole I/O blocks. */
			fp->f_seqcount = (arg + bsize - 1) / bsize;
			do {
				new = old = fp->f_flag;
				new |= FRDAHEAD;
			} while (!atomic_cmpset_rel_int(&fp->f_flag, old, new));
readahead_vnlock_fail:
			VFS_UNLOCK_GIANT(vfslocked);
			vfslocked = 0;
		} else {
			/* A zero argument clears the FRDAHEAD flag. */
			do {
				new = old = fp->f_flag;
				new &= ~FRDAHEAD;
			} while (!atomic_cmpset_rel_int(&fp->f_flag, old, new));
		}
		fdrop(fp, td);
		break;

	default:
		error = EINVAL;
		break;
	}
	/* Every path above resets vfslocked to 0 after unlocking. */
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}
743 
/*
 * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
 *
 * Duplicates descriptor 'old' of the calling process.  With DUP_FIXED the
 * copy is installed at exactly 'new', closing whatever was open there;
 * otherwise the copy goes to a descriptor allocated by fdalloc(), which is
 * given 'new' as the lowest acceptable number.  DUP_FCNTL selects
 * fcntl()-style error codes for an out-of-range 'new'.
 *
 * On success 0 is returned and the resulting descriptor number is stored
 * in *retval.  The copy does not inherit the close-on-exec flag.
 */
static int
do_dup(struct thread *td, int flags, int old, int new,
    register_t *retval)
{
	struct filedesc *fdp;
	struct proc *p;
	struct file *fp;
	struct file *delfp;
	int error, holdleaders, maxfd;

	p = td->td_proc;
	fdp = p->p_fd;

	/*
	 * Verify we have a valid descriptor to dup from and possibly to
	 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
	 * return EINVAL when the new descriptor is out of bounds.
	 */
	if (old < 0)
		return (EBADF);
	if (new < 0)
		return (flags & DUP_FCNTL ? EINVAL : EBADF);
	PROC_LOCK(p);
	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
	PROC_UNLOCK(p);
	if (new >= maxfd)
		return (flags & DUP_FCNTL ? EINVAL : EMFILE);

	FILEDESC_XLOCK(fdp);
	if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
		FILEDESC_XUNLOCK(fdp);
		return (EBADF);
	}
	/* Duplicating a descriptor onto itself is a successful no-op. */
	if (flags & DUP_FIXED && old == new) {
		*retval = new;
		FILEDESC_XUNLOCK(fdp);
		return (0);
	}
	fp = fdp->fd_ofiles[old];
	/* This reference becomes the one owned by the new descriptor. */
	fhold(fp);

	/*
	 * If the caller specified a file descriptor, make sure the file
	 * table is large enough to hold it, and grab it.  Otherwise, just
	 * allocate a new descriptor the usual way.  Since the filedesc
	 * lock may be temporarily dropped in the process, we have to look
	 * out for a race.
	 */
	if (flags & DUP_FIXED) {
		if (new >= fdp->fd_nfiles)
			fdgrowtable(fdp, new + 1);
		if (fdp->fd_ofiles[new] == NULL)
			fdused(fdp, new);
	} else {
		if ((error = fdalloc(td, new, &new)) != 0) {
			FILEDESC_XUNLOCK(fdp);
			fdrop(fp, td);
			return (error);
		}
	}

	/*
	 * If the old file changed out from under us then treat it as a
	 * bad file descriptor.  Userland should do its own locking to
	 * avoid this case.
	 */
	if (fdp->fd_ofiles[old] != fp) {
		/* we've allocated a descriptor which we won't use */
		if (fdp->fd_ofiles[new] == NULL)
			fdunused(fdp, new);
		FILEDESC_XUNLOCK(fdp);
		fdrop(fp, td);
		return (EBADF);
	}
	KASSERT(old != new,
	    ("new fd is same as old"));

	/*
	 * Save info on the descriptor being overwritten.  We cannot close
	 * it without introducing an ownership race for the slot, since we
	 * need to drop the filedesc lock to call closef().
	 *
	 * XXX this duplicates parts of close().
	 */
	delfp = fdp->fd_ofiles[new];
	holdleaders = 0;
	if (delfp != NULL) {
		if (td->td_proc->p_fdtol != NULL) {
			/*
			 * Ask fdfree() to sleep to ensure that all relevant
			 * process leaders can be traversed in closef().
			 */
			fdp->fd_holdleaderscount++;
			holdleaders = 1;
		}
	}

	/*
	 * Duplicate the source descriptor
	 */
	fdp->fd_ofiles[new] = fp;
	/* The copy never inherits close-on-exec. */
	fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
	if (new > fdp->fd_lastfile)
		fdp->fd_lastfile = new;
	*retval = new;

	/*
	 * If we dup'd over a valid file, we now own the reference to it
	 * and must dispose of it using closef() semantics (as if a
	 * close() were performed on it).
	 *
	 * XXX this duplicates parts of close().
	 */
	if (delfp != NULL) {
		knote_fdclose(td, new);
		if (delfp->f_type == DTYPE_MQUEUE)
			mq_fdclose(td, new, delfp);
		FILEDESC_XUNLOCK(fdp);
		(void) closef(delfp, td);
		if (holdleaders) {
			/* Drop our hold and wake a waiter once none remain. */
			FILEDESC_XLOCK(fdp);
			fdp->fd_holdleaderscount--;
			if (fdp->fd_holdleaderscount == 0 &&
			    fdp->fd_holdleaderswakeup != 0) {
				fdp->fd_holdleaderswakeup = 0;
				wakeup(&fdp->fd_holdleaderscount);
			}
			FILEDESC_XUNLOCK(fdp);
		}
	} else {
		FILEDESC_XUNLOCK(fdp);
	}
	return (0);
}
881 
882 /*
883  * If sigio is on the list associated with a process or process group,
884  * disable signalling from the device, remove sigio from the list and
885  * free sigio.
886  */
887 void
888 funsetown(struct sigio **sigiop)
889 {
890 	struct sigio *sigio;
891 
892 	SIGIO_LOCK();
893 	sigio = *sigiop;
894 	if (sigio == NULL) {
895 		SIGIO_UNLOCK();
896 		return;
897 	}
898 	*(sigio->sio_myref) = NULL;
899 	if ((sigio)->sio_pgid < 0) {
900 		struct pgrp *pg = (sigio)->sio_pgrp;
901 		PGRP_LOCK(pg);
902 		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
903 			     sigio, sio_pgsigio);
904 		PGRP_UNLOCK(pg);
905 	} else {
906 		struct proc *p = (sigio)->sio_proc;
907 		PROC_LOCK(p);
908 		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
909 			     sigio, sio_pgsigio);
910 		PROC_UNLOCK(p);
911 	}
912 	SIGIO_UNLOCK();
913 	crfree(sigio->sio_ucred);
914 	free(sigio, M_SIGIO);
915 }
916 
917 /*
918  * Free a list of sigio structures.
919  * We only need to lock the SIGIO_LOCK because we have made ourselves
920  * inaccessible to callers of fsetown and therefore do not need to lock
921  * the proc or pgrp struct for the list manipulation.
922  */
923 void
924 funsetownlst(struct sigiolst *sigiolst)
925 {
926 	struct proc *p;
927 	struct pgrp *pg;
928 	struct sigio *sigio;
929 
930 	sigio = SLIST_FIRST(sigiolst);
931 	if (sigio == NULL)
932 		return;
933 	p = NULL;
934 	pg = NULL;
935 
936 	/*
937 	 * Every entry of the list should belong
938 	 * to a single proc or pgrp.
939 	 */
940 	if (sigio->sio_pgid < 0) {
941 		pg = sigio->sio_pgrp;
942 		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
943 	} else /* if (sigio->sio_pgid > 0) */ {
944 		p = sigio->sio_proc;
945 		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
946 	}
947 
948 	SIGIO_LOCK();
949 	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
950 		*(sigio->sio_myref) = NULL;
951 		if (pg != NULL) {
952 			KASSERT(sigio->sio_pgid < 0,
953 			    ("Proc sigio in pgrp sigio list"));
954 			KASSERT(sigio->sio_pgrp == pg,
955 			    ("Bogus pgrp in sigio list"));
956 			PGRP_LOCK(pg);
957 			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
958 			    sio_pgsigio);
959 			PGRP_UNLOCK(pg);
960 		} else /* if (p != NULL) */ {
961 			KASSERT(sigio->sio_pgid > 0,
962 			    ("Pgrp sigio in proc sigio list"));
963 			KASSERT(sigio->sio_proc == p,
964 			    ("Bogus proc in sigio list"));
965 			PROC_LOCK(p);
966 			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
967 			    sio_pgsigio);
968 			PROC_UNLOCK(p);
969 		}
970 		SIGIO_UNLOCK();
971 		crfree(sigio->sio_ucred);
972 		free(sigio, M_SIGIO);
973 		SIGIO_LOCK();
974 	}
975 	SIGIO_UNLOCK();
976 }
977 
/*
 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
 *
 * After permission checking, add a sigio structure to the sigio list for
 * the process or process group.
 *
 * 'pgid' selects the new owner: a positive value names a process, a
 * negative value names the process group -pgid, and zero just removes any
 * existing owner.  'sigiop' is the caller's pointer slot; any sigio it
 * currently references is released and it is set to the new one.
 *
 * Returns 0 on success, ESRCH if the target does not exist (or the target
 * process is exiting), or EPERM if the target is in another session.
 */
int
fsetown(pid_t pgid, struct sigio **sigiop)
{
	struct proc *proc;
	struct pgrp *pgrp;
	struct sigio *sigio;
	int ret;

	if (pgid == 0) {
		funsetown(sigiop);
		return (0);
	}

	ret = 0;

	/* Allocate and fill in the new sigio out of locks. */
	sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
	sigio->sio_pgid = pgid;
	sigio->sio_ucred = crhold(curthread->td_ucred);
	sigio->sio_myref = sigiop;

	/* proctree_lock is held from the lookup until the sigio is linked. */
	sx_slock(&proctree_lock);
	if (pgid > 0) {
		proc = pfind(pgid);
		if (proc == NULL) {
			ret = ESRCH;
			goto fail;
		}

		/*
		 * Policy - Don't allow a process to FSETOWN a process
		 * in another session.
		 *
		 * Remove this test to allow maximum flexibility or
		 * restrict FSETOWN to the current process or process
		 * group for maximum safety.
		 */
		/* The process lock taken by the lookup is released here. */
		PROC_UNLOCK(proc);
		if (proc->p_session != curthread->td_proc->p_session) {
			ret = EPERM;
			goto fail;
		}

		pgrp = NULL;
	} else /* if (pgid < 0) */ {
		pgrp = pgfind(-pgid);
		if (pgrp == NULL) {
			ret = ESRCH;
			goto fail;
		}
		/* The pgrp lock taken by the lookup is released here. */
		PGRP_UNLOCK(pgrp);

		/*
		 * Policy - Don't allow a process to FSETOWN a process
		 * in another session.
		 *
		 * Remove this test to allow maximum flexibility or
		 * restrict FSETOWN to the current process or process
		 * group for maximum safety.
		 */
		if (pgrp->pg_session != curthread->td_proc->p_session) {
			ret = EPERM;
			goto fail;
		}

		proc = NULL;
	}
	/* Drop any previous owner before installing the new one. */
	funsetown(sigiop);
	if (pgid > 0) {
		PROC_LOCK(proc);
		/*
		 * Since funsetownlst() is called without the proctree
		 * locked, we need to check for P_WEXIT.
		 * XXX: is ESRCH correct?
		 */
		if ((proc->p_flag & P_WEXIT) != 0) {
			PROC_UNLOCK(proc);
			ret = ESRCH;
			goto fail;
		}
		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
		sigio->sio_proc = proc;
		PROC_UNLOCK(proc);
	} else {
		PGRP_LOCK(pgrp);
		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
		sigio->sio_pgrp = pgrp;
		PGRP_UNLOCK(pgrp);
	}
	sx_sunlock(&proctree_lock);
	/* Publish the new sigio through the caller's pointer. */
	SIGIO_LOCK();
	*sigiop = sigio;
	SIGIO_UNLOCK();
	return (0);

fail:
	/* Undo the allocation and credential hold made above. */
	sx_sunlock(&proctree_lock);
	crfree(sigio->sio_ucred);
	free(sigio, M_SIGIO);
	return (ret);
}
1085 
1086 /*
1087  * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
1088  */
1089 pid_t
1090 fgetown(sigiop)
1091 	struct sigio **sigiop;
1092 {
1093 	pid_t pgid;
1094 
1095 	SIGIO_LOCK();
1096 	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
1097 	SIGIO_UNLOCK();
1098 	return (pgid);
1099 }
1100 
/*
 * Close a file descriptor.
 *
 * System call entry point; all of the work is done by kern_close(),
 * whose result is returned unchanged.
 */
#ifndef _SYS_SYSPROTO_H_
struct close_args {
	int     fd;
};
#endif
/* ARGSUSED */
int
close(struct thread *td, struct close_args *uap)
{

	return (kern_close(td, uap->fd));
}
1118 
1119 int
1120 kern_close(td, fd)
1121 	struct thread *td;
1122 	int fd;
1123 {
1124 	struct filedesc *fdp;
1125 	struct file *fp;
1126 	int error;
1127 	int holdleaders;
1128 
1129 	error = 0;
1130 	holdleaders = 0;
1131 	fdp = td->td_proc->p_fd;
1132 
1133 	AUDIT_SYSCLOSE(td, fd);
1134 
1135 	FILEDESC_XLOCK(fdp);
1136 	if ((unsigned)fd >= fdp->fd_nfiles ||
1137 	    (fp = fdp->fd_ofiles[fd]) == NULL) {
1138 		FILEDESC_XUNLOCK(fdp);
1139 		return (EBADF);
1140 	}
1141 	fdp->fd_ofiles[fd] = NULL;
1142 	fdp->fd_ofileflags[fd] = 0;
1143 	fdunused(fdp, fd);
1144 	if (td->td_proc->p_fdtol != NULL) {
1145 		/*
1146 		 * Ask fdfree() to sleep to ensure that all relevant
1147 		 * process leaders can be traversed in closef().
1148 		 */
1149 		fdp->fd_holdleaderscount++;
1150 		holdleaders = 1;
1151 	}
1152 
1153 	/*
1154 	 * We now hold the fp reference that used to be owned by the
1155 	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
1156 	 * knote_fdclose to prevent a race of the fd getting opened, a knote
1157 	 * added, and deleteing a knote for the new fd.
1158 	 */
1159 	knote_fdclose(td, fd);
1160 	if (fp->f_type == DTYPE_MQUEUE)
1161 		mq_fdclose(td, fd, fp);
1162 	FILEDESC_XUNLOCK(fdp);
1163 
1164 	error = closef(fp, td);
1165 	if (holdleaders) {
1166 		FILEDESC_XLOCK(fdp);
1167 		fdp->fd_holdleaderscount--;
1168 		if (fdp->fd_holdleaderscount == 0 &&
1169 		    fdp->fd_holdleaderswakeup != 0) {
1170 			fdp->fd_holdleaderswakeup = 0;
1171 			wakeup(&fdp->fd_holdleaderscount);
1172 		}
1173 		FILEDESC_XUNLOCK(fdp);
1174 	}
1175 	return (error);
1176 }
1177 
1178 /*
1179  * Close open file descriptors.
1180  */
1181 #ifndef _SYS_SYSPROTO_H_
1182 struct closefrom_args {
1183 	int	lowfd;
1184 };
1185 #endif
1186 /* ARGSUSED */
1187 int
1188 closefrom(struct thread *td, struct closefrom_args *uap)
1189 {
1190 	struct filedesc *fdp;
1191 	int fd;
1192 
1193 	fdp = td->td_proc->p_fd;
1194 	AUDIT_ARG_FD(uap->lowfd);
1195 
1196 	/*
1197 	 * Treat negative starting file descriptor values identical to
1198 	 * closefrom(0) which closes all files.
1199 	 */
1200 	if (uap->lowfd < 0)
1201 		uap->lowfd = 0;
1202 	FILEDESC_SLOCK(fdp);
1203 	for (fd = uap->lowfd; fd < fdp->fd_nfiles; fd++) {
1204 		if (fdp->fd_ofiles[fd] != NULL) {
1205 			FILEDESC_SUNLOCK(fdp);
1206 			(void)kern_close(td, fd);
1207 			FILEDESC_SLOCK(fdp);
1208 		}
1209 	}
1210 	FILEDESC_SUNLOCK(fdp);
1211 	return (0);
1212 }
1213 
1214 #if defined(COMPAT_43)
1215 /*
1216  * Return status information about a file descriptor.
1217  */
1218 #ifndef _SYS_SYSPROTO_H_
1219 struct ofstat_args {
1220 	int	fd;
1221 	struct	ostat *sb;
1222 };
1223 #endif
1224 /* ARGSUSED */
1225 int
1226 ofstat(struct thread *td, struct ofstat_args *uap)
1227 {
1228 	struct ostat oub;
1229 	struct stat ub;
1230 	int error;
1231 
1232 	error = kern_fstat(td, uap->fd, &ub);
1233 	if (error == 0) {
1234 		cvtstat(&ub, &oub);
1235 		error = copyout(&oub, uap->sb, sizeof(oub));
1236 	}
1237 	return (error);
1238 }
1239 #endif /* COMPAT_43 */
1240 
1241 /*
1242  * Return status information about a file descriptor.
1243  */
1244 #ifndef _SYS_SYSPROTO_H_
1245 struct fstat_args {
1246 	int	fd;
1247 	struct	stat *sb;
1248 };
1249 #endif
1250 /* ARGSUSED */
1251 int
1252 fstat(struct thread *td, struct fstat_args *uap)
1253 {
1254 	struct stat ub;
1255 	int error;
1256 
1257 	error = kern_fstat(td, uap->fd, &ub);
1258 	if (error == 0)
1259 		error = copyout(&ub, uap->sb, sizeof(ub));
1260 	return (error);
1261 }
1262 
1263 int
1264 kern_fstat(struct thread *td, int fd, struct stat *sbp)
1265 {
1266 	struct file *fp;
1267 	int error;
1268 
1269 	AUDIT_ARG_FD(fd);
1270 
1271 	if ((error = fget(td, fd, &fp)) != 0)
1272 		return (error);
1273 
1274 	AUDIT_ARG_FILE(td->td_proc, fp);
1275 
1276 	error = fo_stat(fp, sbp, td->td_ucred, td);
1277 	fdrop(fp, td);
1278 #ifdef KTRACE
1279 	if (error == 0 && KTRPOINT(td, KTR_STRUCT))
1280 		ktrstat(sbp);
1281 #endif
1282 	return (error);
1283 }
1284 
1285 /*
1286  * Return status information about a file descriptor.
1287  */
1288 #ifndef _SYS_SYSPROTO_H_
1289 struct nfstat_args {
1290 	int	fd;
1291 	struct	nstat *sb;
1292 };
1293 #endif
1294 /* ARGSUSED */
1295 int
1296 nfstat(struct thread *td, struct nfstat_args *uap)
1297 {
1298 	struct nstat nub;
1299 	struct stat ub;
1300 	int error;
1301 
1302 	error = kern_fstat(td, uap->fd, &ub);
1303 	if (error == 0) {
1304 		cvtnstat(&ub, &nub);
1305 		error = copyout(&nub, uap->sb, sizeof(nub));
1306 	}
1307 	return (error);
1308 }
1309 
1310 /*
1311  * Return pathconf information about a file descriptor.
1312  */
1313 #ifndef _SYS_SYSPROTO_H_
1314 struct fpathconf_args {
1315 	int	fd;
1316 	int	name;
1317 };
1318 #endif
1319 /* ARGSUSED */
1320 int
1321 fpathconf(struct thread *td, struct fpathconf_args *uap)
1322 {
1323 	struct file *fp;
1324 	struct vnode *vp;
1325 	int error;
1326 
1327 	if ((error = fget(td, uap->fd, &fp)) != 0)
1328 		return (error);
1329 
1330 	/* If asynchronous I/O is available, it works for all descriptors. */
1331 	if (uap->name == _PC_ASYNC_IO) {
1332 		td->td_retval[0] = async_io_version;
1333 		goto out;
1334 	}
1335 	vp = fp->f_vnode;
1336 	if (vp != NULL) {
1337 		int vfslocked;
1338 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1339 		vn_lock(vp, LK_SHARED | LK_RETRY);
1340 		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
1341 		VOP_UNLOCK(vp, 0);
1342 		VFS_UNLOCK_GIANT(vfslocked);
1343 	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
1344 		if (uap->name != _PC_PIPE_BUF) {
1345 			error = EINVAL;
1346 		} else {
1347 			td->td_retval[0] = PIPE_BUF;
1348 		error = 0;
1349 		}
1350 	} else {
1351 		error = EOPNOTSUPP;
1352 	}
1353 out:
1354 	fdrop(fp, td);
1355 	return (error);
1356 }
1357 
1358 /*
1359  * Grow the file table to accommodate (at least) nfd descriptors.  This may
1360  * block and drop the filedesc lock, but it will reacquire it before
1361  * returning.
 *
 * The caller must hold the filedesc lock exclusively (asserted below).  If
 * the table is already large enough, or becomes large enough while the lock
 * is dropped for the allocation, the table is left untouched.
1362  */
1363 static void
1364 fdgrowtable(struct filedesc *fdp, int nfd)
1365 {
1366 	struct filedesc0 *fdp0;
1367 	struct freetable *fo;
1368 	struct file **ntable;
1369 	struct file **otable;
1370 	char *nfileflags;
1371 	int nnfiles, onfiles;
1372 	NDSLOTTYPE *nmap;
1373 
1374 	FILEDESC_XLOCK_ASSERT(fdp);
1375 
1376 	KASSERT(fdp->fd_nfiles > 0,
1377 	    ("zero-length file table"));
1378 
1379 	/* compute the size of the new table */
1380 	onfiles = fdp->fd_nfiles;
1381 	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
1382 	if (nnfiles <= onfiles)
1383 		/* the table is already large enough */
1384 		return;
1385 
1386 	/* allocate a new table and (if required) new bitmaps */
	/*
	 * One allocation holds the nnfiles file pointers, followed by the
	 * per-descriptor flag bytes (nfileflags points just past the last
	 * pointer), plus room for a struct freetable.
	 */
1387 	FILEDESC_XUNLOCK(fdp);
1388 	ntable = malloc((nnfiles * OFILESIZE) + sizeof(struct freetable),
1389 	    M_FILEDESC, M_ZERO | M_WAITOK);
1390 	nfileflags = (char *)&ntable[nnfiles];
1391 	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles))
1392 		nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE,
1393 		    M_FILEDESC, M_ZERO | M_WAITOK);
1394 	else
1395 		nmap = NULL;
1396 	FILEDESC_XLOCK(fdp);
1397 
1398 	/*
1399 	 * We now have new tables ready to go.  Since we dropped the
1400 	 * filedesc lock to call malloc(), watch out for a race.
1401 	 */
1402 	onfiles = fdp->fd_nfiles;
1403 	if (onfiles >= nnfiles) {
1404 		/* we lost the race, but that's OK */
1405 		free(ntable, M_FILEDESC);
1406 		if (nmap != NULL)
1407 			free(nmap, M_FILEDESC);
1408 		return;
1409 	}
	/* Copy the live entries and flags, then switch to the new arrays. */
1410 	bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable));
1411 	bcopy(fdp->fd_ofileflags, nfileflags, onfiles);
1412 	otable = fdp->fd_ofiles;
1413 	fdp->fd_ofileflags = nfileflags;
1414 	fdp->fd_ofiles = ntable;
1415 	/*
1416 	 * We must preserve ofiles until the process exits because we can't
1417 	 * be certain that no threads have references to the old table via
1418 	 * _fget().
	 *
	 * The old table is therefore queued on fd_free instead of being
	 * freed; fddrop() releases the queued tables.  The freetable record
	 * is written into the old allocation itself, just past its onfiles
	 * pointer slots.  A table of NDFILE entries is the initial one that
	 * fdinit() points at storage inside struct filedesc0, so it is not
	 * queued.
1419 	 */
1420 	if (onfiles > NDFILE) {
1421 		fo = (struct freetable *)&otable[onfiles];
1422 		fdp0 = (struct filedesc0 *)fdp;
1423 		fo->ft_table = otable;
1424 		SLIST_INSERT_HEAD(&fdp0->fd_free, fo, ft_next);
1425 	}
	/*
	 * Install the larger bitmap when one was needed.  The previous map
	 * is freed only if it is bigger than the NDFILE-sized initial map.
	 */
1426 	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
1427 		bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap));
1428 		if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
1429 			free(fdp->fd_map, M_FILEDESC);
1430 		fdp->fd_map = nmap;
1431 	}
1432 	fdp->fd_nfiles = nnfiles;
1433 }
1434 
1435 /*
1436  * Allocate a file descriptor for the process.
1437  */
1438 int
1439 fdalloc(struct thread *td, int minfd, int *result)
1440 {
1441 	struct proc *p = td->td_proc;
1442 	struct filedesc *fdp = p->p_fd;
1443 	int fd = -1, maxfd;
1444 
1445 	FILEDESC_XLOCK_ASSERT(fdp);
1446 
1447 	if (fdp->fd_freefile > minfd)
1448 		minfd = fdp->fd_freefile;
1449 
1450 	PROC_LOCK(p);
1451 	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
1452 	PROC_UNLOCK(p);
1453 
1454 	/*
1455 	 * Search the bitmap for a free descriptor.  If none is found, try
1456 	 * to grow the file table.  Keep at it until we either get a file
1457 	 * descriptor or run into process or system limits; fdgrowtable()
1458 	 * may drop the filedesc lock, so we're in a race.
1459 	 */
1460 	for (;;) {
1461 		fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
1462 		if (fd >= maxfd)
1463 			return (EMFILE);
1464 		if (fd < fdp->fd_nfiles)
1465 			break;
1466 		fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd));
1467 	}
1468 
1469 	/*
1470 	 * Perform some sanity checks, then mark the file descriptor as
1471 	 * used and return it to the caller.
1472 	 */
1473 	KASSERT(!fdisused(fdp, fd),
1474 	    ("fd_first_free() returned non-free descriptor"));
1475 	KASSERT(fdp->fd_ofiles[fd] == NULL,
1476 	    ("free descriptor isn't"));
1477 	fdp->fd_ofileflags[fd] = 0; /* XXX needed? */
1478 	fdused(fdp, fd);
1479 	*result = fd;
1480 	return (0);
1481 }
1482 
1483 /*
1484  * Check to see whether n user file descriptors are available to the process
1485  * p.
1486  */
1487 int
1488 fdavail(struct thread *td, int n)
1489 {
1490 	struct proc *p = td->td_proc;
1491 	struct filedesc *fdp = td->td_proc->p_fd;
1492 	struct file **fpp;
1493 	int i, lim, last;
1494 
1495 	FILEDESC_LOCK_ASSERT(fdp);
1496 
1497 	PROC_LOCK(p);
1498 	lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
1499 	PROC_UNLOCK(p);
1500 	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
1501 		return (1);
1502 	last = min(fdp->fd_nfiles, lim);
1503 	fpp = &fdp->fd_ofiles[fdp->fd_freefile];
1504 	for (i = last - fdp->fd_freefile; --i >= 0; fpp++) {
1505 		if (*fpp == NULL && --n <= 0)
1506 			return (1);
1507 	}
1508 	return (0);
1509 }
1510 
1511 /*
1512  * Create a new open file structure and allocate a file decriptor for the
1513  * process that refers to it.  We add one reference to the file for the
1514  * descriptor table and one reference for resultfp. This is to prevent us
1515  * being preempted and the entry in the descriptor table closed after we
1516  * release the FILEDESC lock.
1517  */
1518 int
1519 falloc(struct thread *td, struct file **resultfp, int *resultfd)
1520 {
1521 	struct proc *p = td->td_proc;
1522 	struct file *fp;
1523 	int error, i;
1524 	int maxuserfiles = maxfiles - (maxfiles / 20);
1525 	static struct timeval lastfail;
1526 	static int curfail;
1527 
1528 	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
1529 	if ((openfiles >= maxuserfiles &&
1530 	    priv_check(td, PRIV_MAXFILES) != 0) ||
1531 	    openfiles >= maxfiles) {
1532 		if (ppsratecheck(&lastfail, &curfail, 1)) {
1533 			printf("kern.maxfiles limit exceeded by uid %i, please see tuning(7).\n",
1534 				td->td_ucred->cr_ruid);
1535 		}
1536 		uma_zfree(file_zone, fp);
1537 		return (ENFILE);
1538 	}
1539 	atomic_add_int(&openfiles, 1);
1540 
1541 	/*
1542 	 * If the process has file descriptor zero open, add the new file
1543 	 * descriptor to the list of open files at that point, otherwise
1544 	 * put it at the front of the list of open files.
1545 	 */
1546 	refcount_init(&fp->f_count, 1);
1547 	if (resultfp)
1548 		fhold(fp);
1549 	fp->f_cred = crhold(td->td_ucred);
1550 	fp->f_ops = &badfileops;
1551 	fp->f_data = NULL;
1552 	fp->f_vnode = NULL;
1553 	FILEDESC_XLOCK(p->p_fd);
1554 	if ((error = fdalloc(td, 0, &i))) {
1555 		FILEDESC_XUNLOCK(p->p_fd);
1556 		fdrop(fp, td);
1557 		if (resultfp)
1558 			fdrop(fp, td);
1559 		return (error);
1560 	}
1561 	p->p_fd->fd_ofiles[i] = fp;
1562 	FILEDESC_XUNLOCK(p->p_fd);
1563 	if (resultfp)
1564 		*resultfp = fp;
1565 	if (resultfd)
1566 		*resultfd = i;
1567 	return (0);
1568 }
1569 
1570 /*
1571  * Build a new filedesc structure from another.
1572  * Copy the current, root, and jail root vnode references.
1573  */
1574 struct filedesc *
1575 fdinit(struct filedesc *fdp)
1576 {
1577 	struct filedesc0 *newfdp;
1578 
1579 	newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO);
1580 	FILEDESC_LOCK_INIT(&newfdp->fd_fd);
1581 	if (fdp != NULL) {
1582 		FILEDESC_XLOCK(fdp);
1583 		newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
1584 		if (newfdp->fd_fd.fd_cdir)
1585 			VREF(newfdp->fd_fd.fd_cdir);
1586 		newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
1587 		if (newfdp->fd_fd.fd_rdir)
1588 			VREF(newfdp->fd_fd.fd_rdir);
1589 		newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
1590 		if (newfdp->fd_fd.fd_jdir)
1591 			VREF(newfdp->fd_fd.fd_jdir);
1592 		FILEDESC_XUNLOCK(fdp);
1593 	}
1594 
1595 	/* Create the file descriptor table. */
1596 	newfdp->fd_fd.fd_refcnt = 1;
1597 	newfdp->fd_fd.fd_holdcnt = 1;
1598 	newfdp->fd_fd.fd_cmask = CMASK;
1599 	newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
1600 	newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
1601 	newfdp->fd_fd.fd_nfiles = NDFILE;
1602 	newfdp->fd_fd.fd_map = newfdp->fd_dmap;
1603 	newfdp->fd_fd.fd_lastfile = -1;
1604 	return (&newfdp->fd_fd);
1605 }
1606 
1607 static struct filedesc *
1608 fdhold(struct proc *p)
1609 {
1610 	struct filedesc *fdp;
1611 
1612 	mtx_lock(&fdesc_mtx);
1613 	fdp = p->p_fd;
1614 	if (fdp != NULL)
1615 		fdp->fd_holdcnt++;
1616 	mtx_unlock(&fdesc_mtx);
1617 	return (fdp);
1618 }
1619 
1620 static void
1621 fddrop(struct filedesc *fdp)
1622 {
1623 	struct filedesc0 *fdp0;
1624 	struct freetable *ft;
1625 	int i;
1626 
1627 	mtx_lock(&fdesc_mtx);
1628 	i = --fdp->fd_holdcnt;
1629 	mtx_unlock(&fdesc_mtx);
1630 	if (i > 0)
1631 		return;
1632 
1633 	FILEDESC_LOCK_DESTROY(fdp);
1634 	fdp0 = (struct filedesc0 *)fdp;
1635 	while ((ft = SLIST_FIRST(&fdp0->fd_free)) != NULL) {
1636 		SLIST_REMOVE_HEAD(&fdp0->fd_free, ft_next);
1637 		free(ft->ft_table, M_FILEDESC);
1638 	}
1639 	free(fdp, M_FILEDESC);
1640 }
1641 
1642 /*
1643  * Share a filedesc structure.
1644  */
1645 struct filedesc *
1646 fdshare(struct filedesc *fdp)
1647 {
1648 
1649 	FILEDESC_XLOCK(fdp);
1650 	fdp->fd_refcnt++;
1651 	FILEDESC_XUNLOCK(fdp);
1652 	return (fdp);
1653 }
1654 
1655 /*
1656  * Unshare a filedesc structure, if necessary by making a copy
1657  */
1658 void
1659 fdunshare(struct proc *p, struct thread *td)
1660 {
1661 
1662 	FILEDESC_XLOCK(p->p_fd);
1663 	if (p->p_fd->fd_refcnt > 1) {
1664 		struct filedesc *tmp;
1665 
1666 		FILEDESC_XUNLOCK(p->p_fd);
1667 		tmp = fdcopy(p->p_fd);
1668 		fdfree(td);
1669 		p->p_fd = tmp;
1670 	} else
1671 		FILEDESC_XUNLOCK(p->p_fd);
1672 }
1673 
1674 /*
1675  * Copy a filedesc structure.  A NULL pointer in returns a NULL reference,
1676  * this is to ease callers, not catch errors.
1677  */
1678 struct filedesc *
1679 fdcopy(struct filedesc *fdp)
1680 {
1681 	struct filedesc *newfdp;
1682 	int i;
1683 
1684 	/* Certain daemons might not have file descriptors. */
1685 	if (fdp == NULL)
1686 		return (NULL);
1687 
1688 	newfdp = fdinit(fdp);
1689 	FILEDESC_SLOCK(fdp);
1690 	while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
1691 		FILEDESC_SUNLOCK(fdp);
1692 		FILEDESC_XLOCK(newfdp);
1693 		fdgrowtable(newfdp, fdp->fd_lastfile + 1);
1694 		FILEDESC_XUNLOCK(newfdp);
1695 		FILEDESC_SLOCK(fdp);
1696 	}
1697 	/* copy everything except kqueue descriptors */
1698 	newfdp->fd_freefile = -1;
1699 	for (i = 0; i <= fdp->fd_lastfile; ++i) {
1700 		if (fdisused(fdp, i) &&
1701 		    fdp->fd_ofiles[i]->f_type != DTYPE_KQUEUE &&
1702 		    fdp->fd_ofiles[i]->f_ops != &badfileops) {
1703 			newfdp->fd_ofiles[i] = fdp->fd_ofiles[i];
1704 			newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i];
1705 			fhold(newfdp->fd_ofiles[i]);
1706 			newfdp->fd_lastfile = i;
1707 		} else {
1708 			if (newfdp->fd_freefile == -1)
1709 				newfdp->fd_freefile = i;
1710 		}
1711 	}
1712 	newfdp->fd_cmask = fdp->fd_cmask;
1713 	FILEDESC_SUNLOCK(fdp);
1714 	FILEDESC_XLOCK(newfdp);
1715 	for (i = 0; i <= newfdp->fd_lastfile; ++i)
1716 		if (newfdp->fd_ofiles[i] != NULL)
1717 			fdused(newfdp, i);
1718 	if (newfdp->fd_freefile == -1)
1719 		newfdp->fd_freefile = i;
1720 	FILEDESC_XUNLOCK(newfdp);
1721 	return (newfdp);
1722 }
1723 
1724 /*
1725  * Release a filedesc structure.
 *
 * Drops the reference (fd_refcnt) that td's process holds on its descriptor
 * table.  If other references remain, nothing else is torn down.  On the
 * last reference every open file is closed with closef(), p_fd is cleared,
 * descriptor arrays and bitmaps that are larger than the initial NDFILE
 * ones are freed, the cdir/rdir/jdir vnodes are released, and the hold
 * taken at creation is dropped with fddrop().
 *
 * When the process has a filedesc_to_leader (p_fdtol), that structure's
 * reference is released first; see the comments in the body.
1726  */
1727 void
1728 fdfree(struct thread *td)
1729 {
1730 	struct filedesc *fdp;
1731 	struct file **fpp;
1732 	int i, locked;
1733 	struct filedesc_to_leader *fdtol;
1734 	struct file *fp;
1735 	struct vnode *cdir, *jdir, *rdir, *vp;
1736 	struct flock lf;
1737 
1738 	/* Certain daemons might not have file descriptors. */
1739 	fdp = td->td_proc->p_fd;
1740 	if (fdp == NULL)
1741 		return;
1742 
1743 	/* Check for special need to clear POSIX style locks */
1744 	fdtol = td->td_proc->p_fdtol;
1745 	if (fdtol != NULL) {
1746 		FILEDESC_XLOCK(fdp);
1747 		KASSERT(fdtol->fdl_refcount > 0,
1748 			("filedesc_to_refcount botch: fdl_refcount=%d",
1749 			 fdtol->fdl_refcount));
		/*
		 * Last user of this fdtol and the leader has P_ADVLOCK
		 * set: issue a whole-file F_UNLCK, on behalf of p_leader,
		 * for every vnode-backed descriptor in the table.
		 */
1750 		if (fdtol->fdl_refcount == 1 &&
1751 		    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
1752 			for (i = 0, fpp = fdp->fd_ofiles;
1753 			     i <= fdp->fd_lastfile;
1754 			     i++, fpp++) {
1755 				if (*fpp == NULL ||
1756 				    (*fpp)->f_type != DTYPE_VNODE)
1757 					continue;
1758 				fp = *fpp;
				/*
				 * Hold the file so it stays valid while the
				 * filedesc lock is dropped around the VOP.
				 */
1759 				fhold(fp);
1760 				FILEDESC_XUNLOCK(fdp);
1761 				lf.l_whence = SEEK_SET;
1762 				lf.l_start = 0;
1763 				lf.l_len = 0;
1764 				lf.l_type = F_UNLCK;
1765 				vp = fp->f_vnode;
1766 				locked = VFS_LOCK_GIANT(vp->v_mount);
1767 				(void) VOP_ADVLOCK(vp,
1768 						   (caddr_t)td->td_proc->
1769 						   p_leader,
1770 						   F_UNLCK,
1771 						   &lf,
1772 						   F_POSIX);
1773 				VFS_UNLOCK_GIANT(locked);
1774 				FILEDESC_XLOCK(fdp);
1775 				fdrop(fp, td);
				/*
				 * fd_ofiles may have been replaced while the
				 * lock was dropped; recompute the cursor.
				 */
1776 				fpp = fdp->fd_ofiles + i;
1777 			}
1778 		}
1779 	retry:
		/*
		 * Before giving up the last reference, sleep until nobody
		 * still depends on the leaders.  Each sleep releases the
		 * filedesc lock, so everything is re-checked from "retry".
		 */
1780 		if (fdtol->fdl_refcount == 1) {
1781 			if (fdp->fd_holdleaderscount > 0 &&
1782 			    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
1783 				/*
1784 				 * close() or do_dup() has cleared a reference
1785 				 * in a shared file descriptor table.
1786 				 */
1787 				fdp->fd_holdleaderswakeup = 1;
1788 				sx_sleep(&fdp->fd_holdleaderscount,
1789 				    FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
1790 				goto retry;
1791 			}
1792 			if (fdtol->fdl_holdcount > 0) {
1793 				/*
1794 				 * Ensure that fdtol->fdl_leader remains
1795 				 * valid in closef().
1796 				 */
1797 				fdtol->fdl_wakeup = 1;
1798 				sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
1799 				    "fdlhold", 0);
1800 				goto retry;
1801 			}
1802 		}
		/*
		 * Drop our reference.  If no references or holds are left,
		 * unlink fdtol from its doubly linked ring and free it once
		 * the lock is released; otherwise forget the pointer so the
		 * free below is skipped.
		 */
1803 		fdtol->fdl_refcount--;
1804 		if (fdtol->fdl_refcount == 0 &&
1805 		    fdtol->fdl_holdcount == 0) {
1806 			fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
1807 			fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
1808 		} else
1809 			fdtol = NULL;
1810 		td->td_proc->p_fdtol = NULL;
1811 		FILEDESC_XUNLOCK(fdp);
1812 		if (fdtol != NULL)
1813 			free(fdtol, M_FILEDESC_TO_LEADER);
1814 	}
	/* Drop this process's reference on the table itself. */
1815 	FILEDESC_XLOCK(fdp);
1816 	i = --fdp->fd_refcnt;
1817 	FILEDESC_XUNLOCK(fdp);
1818 	if (i > 0)
1819 		return;
1820 
	/*
	 * Last reference: close every file in slots 0..fd_lastfile.  The
	 * loop body runs fd_lastfile + 1 times; each slot is cleared under
	 * the lock and the file is closed with the lock released.
	 */
1821 	fpp = fdp->fd_ofiles;
1822 	for (i = fdp->fd_lastfile; i-- >= 0; fpp++) {
1823 		if (*fpp) {
1824 			FILEDESC_XLOCK(fdp);
1825 			fp = *fpp;
1826 			*fpp = NULL;
1827 			FILEDESC_XUNLOCK(fdp);
1828 			(void) closef(fp, td);
1829 		}
1830 	}
1831 	FILEDESC_XLOCK(fdp);
1832 
1833 	/* XXX This should happen earlier. */
1834 	mtx_lock(&fdesc_mtx);
1835 	td->td_proc->p_fd = NULL;
1836 	mtx_unlock(&fdesc_mtx);
1837 
	/*
	 * Free the descriptor array and bitmap only when they are larger
	 * than the NDFILE-sized ones that fdinit() sets up inside the
	 * structure.
	 */
1838 	if (fdp->fd_nfiles > NDFILE)
1839 		free(fdp->fd_ofiles, M_FILEDESC);
1840 	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
1841 		free(fdp->fd_map, M_FILEDESC);
1842 
1843 	fdp->fd_nfiles = 0;
1844 
	/*
	 * Detach the directory vnodes under the lock; they are released
	 * after the lock has been dropped.
	 */
1845 	cdir = fdp->fd_cdir;
1846 	fdp->fd_cdir = NULL;
1847 	rdir = fdp->fd_rdir;
1848 	fdp->fd_rdir = NULL;
1849 	jdir = fdp->fd_jdir;
1850 	fdp->fd_jdir = NULL;
1851 	FILEDESC_XUNLOCK(fdp);
1852 
1853 	if (cdir) {
1854 		locked = VFS_LOCK_GIANT(cdir->v_mount);
1855 		vrele(cdir);
1856 		VFS_UNLOCK_GIANT(locked);
1857 	}
1858 	if (rdir) {
1859 		locked = VFS_LOCK_GIANT(rdir->v_mount);
1860 		vrele(rdir);
1861 		VFS_UNLOCK_GIANT(locked);
1862 	}
1863 	if (jdir) {
1864 		locked = VFS_LOCK_GIANT(jdir->v_mount);
1865 		vrele(jdir);
1866 		VFS_UNLOCK_GIANT(locked);
1867 	}
1868 
	/* Drop the hold from fdinit(); the last hold frees the structure. */
1869 	fddrop(fdp);
1870 }
1871 
1872 /*
1873  * For setugid programs, we don't want to people to use that setugidness
1874  * to generate error messages which write to a file which otherwise would
1875  * otherwise be off-limits to the process.  We check for filesystems where
1876  * the vnode can change out from under us after execve (like [lin]procfs).
1877  *
1878  * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
1879  * sufficient.  We also don't check for setugidness since we know we are.
1880  */
1881 static int
1882 is_unsafe(struct file *fp)
1883 {
1884 	if (fp->f_type == DTYPE_VNODE) {
1885 		struct vnode *vp = fp->f_vnode;
1886 
1887 		if ((vp->v_vflag & VV_PROCDEP) != 0)
1888 			return (1);
1889 	}
1890 	return (0);
1891 }
1892 
1893 /*
1894  * Make this setguid thing safe, if at all possible.
1895  */
1896 void
1897 setugidsafety(struct thread *td)
1898 {
1899 	struct filedesc *fdp;
1900 	int i;
1901 
1902 	/* Certain daemons might not have file descriptors. */
1903 	fdp = td->td_proc->p_fd;
1904 	if (fdp == NULL)
1905 		return;
1906 
1907 	/*
1908 	 * Note: fdp->fd_ofiles may be reallocated out from under us while
1909 	 * we are blocked in a close.  Be careful!
1910 	 */
1911 	FILEDESC_XLOCK(fdp);
1912 	for (i = 0; i <= fdp->fd_lastfile; i++) {
1913 		if (i > 2)
1914 			break;
1915 		if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) {
1916 			struct file *fp;
1917 
1918 			knote_fdclose(td, i);
1919 			/*
1920 			 * NULL-out descriptor prior to close to avoid
1921 			 * a race while close blocks.
1922 			 */
1923 			fp = fdp->fd_ofiles[i];
1924 			fdp->fd_ofiles[i] = NULL;
1925 			fdp->fd_ofileflags[i] = 0;
1926 			fdunused(fdp, i);
1927 			FILEDESC_XUNLOCK(fdp);
1928 			(void) closef(fp, td);
1929 			FILEDESC_XLOCK(fdp);
1930 		}
1931 	}
1932 	FILEDESC_XUNLOCK(fdp);
1933 }
1934 
1935 /*
1936  * If a specific file object occupies a specific file descriptor, close the
1937  * file descriptor entry and drop a reference on the file object.  This is a
1938  * convenience function to handle a subsequent error in a function that calls
1939  * falloc() that handles the race that another thread might have closed the
1940  * file descriptor out from under the thread creating the file object.
1941  */
1942 void
1943 fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
1944 {
1945 
1946 	FILEDESC_XLOCK(fdp);
1947 	if (fdp->fd_ofiles[idx] == fp) {
1948 		fdp->fd_ofiles[idx] = NULL;
1949 		fdunused(fdp, idx);
1950 		FILEDESC_XUNLOCK(fdp);
1951 		fdrop(fp, td);
1952 	} else
1953 		FILEDESC_XUNLOCK(fdp);
1954 }
1955 
1956 /*
1957  * Close any files on exec?
1958  */
1959 void
1960 fdcloseexec(struct thread *td)
1961 {
1962 	struct filedesc *fdp;
1963 	int i;
1964 
1965 	/* Certain daemons might not have file descriptors. */
1966 	fdp = td->td_proc->p_fd;
1967 	if (fdp == NULL)
1968 		return;
1969 
1970 	FILEDESC_XLOCK(fdp);
1971 
1972 	/*
1973 	 * We cannot cache fd_ofiles or fd_ofileflags since operations
1974 	 * may block and rip them out from under us.
1975 	 */
1976 	for (i = 0; i <= fdp->fd_lastfile; i++) {
1977 		if (fdp->fd_ofiles[i] != NULL &&
1978 		    (fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE ||
1979 		    (fdp->fd_ofileflags[i] & UF_EXCLOSE))) {
1980 			struct file *fp;
1981 
1982 			knote_fdclose(td, i);
1983 			/*
1984 			 * NULL-out descriptor prior to close to avoid
1985 			 * a race while close blocks.
1986 			 */
1987 			fp = fdp->fd_ofiles[i];
1988 			fdp->fd_ofiles[i] = NULL;
1989 			fdp->fd_ofileflags[i] = 0;
1990 			fdunused(fdp, i);
1991 			if (fp->f_type == DTYPE_MQUEUE)
1992 				mq_fdclose(td, i, fp);
1993 			FILEDESC_XUNLOCK(fdp);
1994 			(void) closef(fp, td);
1995 			FILEDESC_XLOCK(fdp);
1996 		}
1997 	}
1998 	FILEDESC_XUNLOCK(fdp);
1999 }
2000 
2001 /*
2002  * It is unsafe for set[ug]id processes to be started with file
2003  * descriptors 0..2 closed, as these descriptors are given implicit
2004  * significance in the Standard C library.  fdcheckstd() will create a
2005  * descriptor referencing /dev/null for each of stdin, stdout, and
2006  * stderr that is not already open.
2007  */
2008 int
2009 fdcheckstd(struct thread *td)
2010 {
2011 	struct filedesc *fdp;
2012 	register_t retval, save;
2013 	int i, error, devnull;
2014 
2015 	fdp = td->td_proc->p_fd;
2016 	if (fdp == NULL)
2017 		return (0);
2018 	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
2019 	devnull = -1;
2020 	error = 0;
2021 	for (i = 0; i < 3; i++) {
2022 		if (fdp->fd_ofiles[i] != NULL)
2023 			continue;
2024 		if (devnull < 0) {
2025 			save = td->td_retval[0];
2026 			error = kern_open(td, "/dev/null", UIO_SYSSPACE,
2027 			    O_RDWR, 0);
2028 			devnull = td->td_retval[0];
2029 			td->td_retval[0] = save;
2030 			if (error)
2031 				break;
2032 			KASSERT(devnull == i, ("oof, we didn't get our fd"));
2033 		} else {
2034 			error = do_dup(td, DUP_FIXED, devnull, i, &retval);
2035 			if (error != 0)
2036 				break;
2037 		}
2038 	}
2039 	return (error);
2040 }
2041 
2042 /*
2043  * Internal form of close.  Decrement reference count on file structure.
2044  * Note: td may be NULL when closing a file that was being passed in a
2045  * message.
2046  *
2047  * XXXRW: Giant is not required for the caller, but often will be held; this
2048  * makes it moderately likely the Giant will be recursed in the VFS case.
 *
 * For vnode-backed files with a non-NULL td, POSIX advisory locks are
 * released first (see below).  The return value is that of fdrop().
2049  */
2050 int
2051 closef(struct file *fp, struct thread *td)
2052 {
2053 	struct vnode *vp;
2054 	struct flock lf;
2055 	struct filedesc_to_leader *fdtol;
2056 	struct filedesc *fdp;
2057 
2058 	/*
2059 	 * POSIX record locking dictates that any close releases ALL
2060 	 * locks owned by this process.  This is handled by setting
2061 	 * a flag in the unlock to free ONLY locks obeying POSIX
2062 	 * semantics, and not to free BSD-style file locks.
2063 	 * If the descriptor was in a message, POSIX-style locks
2064 	 * aren't passed with the descriptor, and the thread pointer
2065 	 * will be NULL.  Callers should be careful only to pass a
2066 	 * NULL thread pointer when there really is no owning
2067 	 * context that might have locks, or the locks will be
2068 	 * leaked.
2069 	 */
2070 	if (fp->f_type == DTYPE_VNODE && td != NULL) {
2071 		int vfslocked;
2072 
2073 		vp = fp->f_vnode;
2074 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
		/* Whole-file unlock on behalf of this process's leader. */
2075 		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
2076 			lf.l_whence = SEEK_SET;
2077 			lf.l_start = 0;
2078 			lf.l_len = 0;
2079 			lf.l_type = F_UNLCK;
2080 			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
2081 					   F_UNLCK, &lf, F_POSIX);
2082 		}
2083 		fdtol = td->td_proc->p_fdtol;
2084 		if (fdtol != NULL) {
2085 			/*
2086 			 * Handle special case where file descriptor table is
2087 			 * shared between multiple process leaders.
			 *
			 * Walk the ring of filedesc_to_leader entries,
			 * starting after our own and stopping when we come
			 * back to it, and unlock on behalf of every other
			 * leader that has P_ADVLOCK set.
2088 			 */
2089 			fdp = td->td_proc->p_fd;
2090 			FILEDESC_XLOCK(fdp);
2091 			for (fdtol = fdtol->fdl_next;
2092 			     fdtol != td->td_proc->p_fdtol;
2093 			     fdtol = fdtol->fdl_next) {
2094 				if ((fdtol->fdl_leader->p_flag &
2095 				     P_ADVLOCK) == 0)
2096 					continue;
				/*
				 * Hold the entry while the filedesc lock is
				 * dropped for the VOP; fdfree() sleeps while
				 * fdl_holdcount is non-zero.
				 */
2097 				fdtol->fdl_holdcount++;
2098 				FILEDESC_XUNLOCK(fdp);
2099 				lf.l_whence = SEEK_SET;
2100 				lf.l_start = 0;
2101 				lf.l_len = 0;
2102 				lf.l_type = F_UNLCK;
2103 				vp = fp->f_vnode;
2104 				(void) VOP_ADVLOCK(vp,
2105 						   (caddr_t)fdtol->fdl_leader,
2106 						   F_UNLCK, &lf, F_POSIX);
2107 				FILEDESC_XLOCK(fdp);
2108 				fdtol->fdl_holdcount--;
				/* Last hold gone: wake the sleeper in fdfree(). */
2109 				if (fdtol->fdl_holdcount == 0 &&
2110 				    fdtol->fdl_wakeup != 0) {
2111 					fdtol->fdl_wakeup = 0;
2112 					wakeup(fdtol);
2113 				}
2114 			}
2115 			FILEDESC_XUNLOCK(fdp);
2116 		}
2117 		VFS_UNLOCK_GIANT(vfslocked);
2118 	}
2119 	return (fdrop(fp, td));
2120 }
2121 
2122 /*
2123  * Initialize the file pointer with the specified properties.
2124  *
2125  * The ops are set with release semantics to be certain that the flags, type,
2126  * and data are visible when ops is.  This is to prevent ops methods from being
2127  * called with bad data.
2128  */
2129 void
2130 finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
2131 {
2132 	fp->f_data = data;
2133 	fp->f_flag = flag;
2134 	fp->f_type = type;
2135 	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
2136 }
2137 
/*
 * Look up descriptor fd in fdp without taking the filedesc lock.
 *
 * Returns the file with an extra reference (to be released with fdrop()),
 * or NULL if fd is out of range or the slot is empty.
 */
2138 struct file *
2139 fget_unlocked(struct filedesc *fdp, int fd)
2140 {
2141 	struct file *fp;
2142 	u_int count;
2143 
2144 	if (fd < 0 || fd >= fdp->fd_nfiles)
2145 		return (NULL);
2146 	/*
2147 	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
2148 	 * never incrementing a refcount that has already reached 0.  To
2149 	 * accomplish this we have to use a cmpset loop rather than an
2150 	 * atomic_add.  The descriptor must be re-verified once we acquire
2151 	 * a reference to be certain that the identity is still correct and
2152 	 * we did not lose a race due to preemption.
2153 	 */
2154 	for (;;) {
2155 		fp = fdp->fd_ofiles[fd];
2156 		if (fp == NULL)
2157 			break;
2158 		count = fp->f_count;
		/* A count of 0 is never raised; re-read the slot instead. */
2159 		if (count == 0)
2160 			continue;
2161 		/*
2162 		 * Use an acquire barrier to prevent caching of fd_ofiles
2163 		 * so it is refreshed for verification.
2164 		 */
2165 		if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) != 1)
2166 			continue;
		/* Success only if the slot still holds the same file. */
2167 		if (fp == fdp->fd_ofiles[fd])
2168 			break;
		/* The slot changed under us: undo the reference and retry. */
2169 		fdrop(fp, curthread);
2170 	}
2171 
2172 	return (fp);
2173 }
2174 
2175 /*
2176  * Extract the file pointer associated with the specified descriptor for the
2177  * current user process.
2178  *
2179  * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
2180  * returned.
2181  *
2182  * If an error occured the non-zero error is returned and *fpp is set to
2183  * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
2184  * responsible for fdrop().
2185  */
2186 static __inline int
2187 _fget(struct thread *td, int fd, struct file **fpp, int flags)
2188 {
2189 	struct filedesc *fdp;
2190 	struct file *fp;
2191 
2192 	*fpp = NULL;
2193 	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
2194 		return (EBADF);
2195 	if ((fp = fget_unlocked(fdp, fd)) == NULL)
2196 		return (EBADF);
2197 	if (fp->f_ops == &badfileops) {
2198 		fdrop(fp, td);
2199 		return (EBADF);
2200 	}
2201 	/*
2202 	 * FREAD and FWRITE failure return EBADF as per POSIX.
2203 	 *
2204 	 * Only one flag, or 0, may be specified.
2205 	 */
2206 	if ((flags == FREAD && (fp->f_flag & FREAD) == 0) ||
2207 	    (flags == FWRITE && (fp->f_flag & FWRITE) == 0)) {
2208 		fdrop(fp, td);
2209 		return (EBADF);
2210 	}
2211 	*fpp = fp;
2212 	return (0);
2213 }
2214 
/* Look up fd with no access-mode requirement; see _fget(). */
int
fget(struct thread *td, int fd, struct file **fpp)
{
	int error;

	error = _fget(td, fd, fpp, 0);
	return (error);
}
2221 
2222 int
2223 fget_read(struct thread *td, int fd, struct file **fpp)
2224 {
2225 
2226 	return(_fget(td, fd, fpp, FREAD));
2227 }
2228 
2229 int
2230 fget_write(struct thread *td, int fd, struct file **fpp)
2231 {
2232 
2233 	return(_fget(td, fd, fpp, FWRITE));
2234 }
2235 
2236 /*
2237  * Like fget() but loads the underlying vnode, or returns an error if the
2238  * descriptor does not represent a vnode.  Note that pipes use vnodes but
2239  * never have VM objects.  The returned vnode will be vref()'d.
2240  *
2241  * XXX: what about the unused flags ?
2242  */
2243 static __inline int
2244 _fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags)
2245 {
2246 	struct file *fp;
2247 	int error;
2248 
2249 	*vpp = NULL;
2250 	if ((error = _fget(td, fd, &fp, flags)) != 0)
2251 		return (error);
2252 	if (fp->f_vnode == NULL) {
2253 		error = EINVAL;
2254 	} else {
2255 		*vpp = fp->f_vnode;
2256 		vref(*vpp);
2257 	}
2258 	fdrop(fp, td);
2259 
2260 	return (error);
2261 }
2262 
/* Fetch the vnode behind fd with no access-mode requirement. */
int
fgetvp(struct thread *td, int fd, struct vnode **vpp)
{
	int error;

	error = _fgetvp(td, fd, vpp, 0);
	return (error);
}
2269 
2270 int
2271 fgetvp_read(struct thread *td, int fd, struct vnode **vpp)
2272 {
2273 
2274 	return (_fgetvp(td, fd, vpp, FREAD));
2275 }
2276 
2277 #ifdef notyet
2278 int
2279 fgetvp_write(struct thread *td, int fd, struct vnode **vpp)
2280 {
2281 
2282 	return (_fgetvp(td, fd, vpp, FWRITE));
2283 }
2284 #endif
2285 
2286 /*
2287  * Like fget() but loads the underlying socket, or returns an error if the
2288  * descriptor does not represent a socket.
2289  *
2290  * We bump the ref count on the returned socket.  XXX Also obtain the SX lock
2291  * in the future.
2292  *
2293  * Note: fgetsock() and fputsock() are deprecated, as consumers should rely
2294  * on their file descriptor reference to prevent the socket from being free'd
2295  * during use.
2296  */
2297 int
2298 fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp)
2299 {
2300 	struct file *fp;
2301 	int error;
2302 
2303 	*spp = NULL;
2304 	if (fflagp != NULL)
2305 		*fflagp = 0;
2306 	if ((error = _fget(td, fd, &fp, 0)) != 0)
2307 		return (error);
2308 	if (fp->f_type != DTYPE_SOCKET) {
2309 		error = ENOTSOCK;
2310 	} else {
2311 		*spp = fp->f_data;
2312 		if (fflagp)
2313 			*fflagp = fp->f_flag;
2314 		SOCK_LOCK(*spp);
2315 		soref(*spp);
2316 		SOCK_UNLOCK(*spp);
2317 	}
2318 	fdrop(fp, td);
2319 
2320 	return (error);
2321 }
2322 
2323 /*
2324  * Drop the reference count on the socket and XXX release the SX lock in the
2325  * future.  The last reference closes the socket.
2326  *
2327  * Note: fputsock() is deprecated, see comment for fgetsock().
2328  */
2329 void
2330 fputsock(struct socket *so)
2331 {
2332 
2333 	ACCEPT_LOCK();
2334 	SOCK_LOCK(so);
2335 	CURVNET_SET(so->so_vnet);
2336 	sorele(so);
2337 	CURVNET_RESTORE();
2338 }
2339 
2340 /*
2341  * Handle the last reference to a file being closed.
2342  */
2343 int
2344 _fdrop(struct file *fp, struct thread *td)
2345 {
2346 	int error;
2347 
2348 	error = 0;
2349 	if (fp->f_count != 0)
2350 		panic("fdrop: count %d", fp->f_count);
2351 	if (fp->f_ops != &badfileops)
2352 		error = fo_close(fp, td);
2353 	/*
2354 	 * The f_cdevpriv cannot be assigned non-NULL value while we
2355 	 * are destroying the file.
2356 	 */
2357 	if (fp->f_cdevpriv != NULL)
2358 		devfs_fpdrop(fp);
2359 	atomic_subtract_int(&openfiles, 1);
2360 	crfree(fp->f_cred);
2361 	uma_zfree(file_zone, fp);
2362 
2363 	return (error);
2364 }
2365 
2366 /*
2367  * Apply an advisory lock on a file descriptor.
2368  *
2369  * Just attempt to get a record lock of the requested type on the entire file
2370  * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
2371  */
2372 #ifndef _SYS_SYSPROTO_H_
2373 struct flock_args {
2374 	int	fd;
2375 	int	how;
2376 };
2377 #endif
2378 /* ARGSUSED */
2379 int
2380 flock(struct thread *td, struct flock_args *uap)
2381 {
2382 	struct file *fp;
2383 	struct vnode *vp;
2384 	struct flock lf;
2385 	int vfslocked;
2386 	int error;
2387 
2388 	if ((error = fget(td, uap->fd, &fp)) != 0)
2389 		return (error);
2390 	if (fp->f_type != DTYPE_VNODE) {
2391 		fdrop(fp, td);
2392 		return (EOPNOTSUPP);
2393 	}
2394 
2395 	vp = fp->f_vnode;
2396 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
2397 	lf.l_whence = SEEK_SET;
2398 	lf.l_start = 0;
2399 	lf.l_len = 0;
2400 	if (uap->how & LOCK_UN) {
2401 		lf.l_type = F_UNLCK;
2402 		atomic_clear_int(&fp->f_flag, FHASLOCK);
2403 		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
2404 		goto done2;
2405 	}
2406 	if (uap->how & LOCK_EX)
2407 		lf.l_type = F_WRLCK;
2408 	else if (uap->how & LOCK_SH)
2409 		lf.l_type = F_RDLCK;
2410 	else {
2411 		error = EBADF;
2412 		goto done2;
2413 	}
2414 	atomic_set_int(&fp->f_flag, FHASLOCK);
2415 	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
2416 	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
2417 done2:
2418 	fdrop(fp, td);
2419 	VFS_UNLOCK_GIANT(vfslocked);
2420 	return (error);
2421 }
2422 /*
2423  * Duplicate the specified descriptor to a free descriptor.
2424  */
2425 int
2426 dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, int error)
2427 {
2428 	struct file *wfp;
2429 	struct file *fp;
2430 
2431 	/*
2432 	 * If the to-be-dup'd fd number is greater than the allowed number
2433 	 * of file descriptors, or the fd to be dup'd has already been
2434 	 * closed, then reject.
2435 	 */
2436 	FILEDESC_XLOCK(fdp);
2437 	if (dfd < 0 || dfd >= fdp->fd_nfiles ||
2438 	    (wfp = fdp->fd_ofiles[dfd]) == NULL) {
2439 		FILEDESC_XUNLOCK(fdp);
2440 		return (EBADF);
2441 	}
2442 
2443 	/*
2444 	 * There are two cases of interest here.
2445 	 *
2446 	 * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
2447 	 *
2448 	 * For ENXIO steal away the file structure from (dfd) and store it in
2449 	 * (indx).  (dfd) is effectively closed by this operation.
2450 	 *
2451 	 * Any other error code is just returned.
2452 	 */
2453 	switch (error) {
2454 	case ENODEV:
2455 		/*
2456 		 * Check that the mode the file is being opened for is a
2457 		 * subset of the mode of the existing descriptor.
2458 		 */
2459 		if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
2460 			FILEDESC_XUNLOCK(fdp);
2461 			return (EACCES);
2462 		}
2463 		fp = fdp->fd_ofiles[indx];
2464 		fdp->fd_ofiles[indx] = wfp;
2465 		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
2466 		if (fp == NULL)
2467 			fdused(fdp, indx);
2468 		fhold(wfp);
2469 		FILEDESC_XUNLOCK(fdp);
2470 		if (fp != NULL)
2471 			/*
2472 			 * We now own the reference to fp that the ofiles[]
2473 			 * array used to own.  Release it.
2474 			 */
2475 			fdrop(fp, td);
2476 		return (0);
2477 
2478 	case ENXIO:
2479 		/*
2480 		 * Steal away the file pointer from dfd and stuff it into indx.
2481 		 */
2482 		fp = fdp->fd_ofiles[indx];
2483 		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
2484 		fdp->fd_ofiles[dfd] = NULL;
2485 		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
2486 		fdp->fd_ofileflags[dfd] = 0;
2487 		fdunused(fdp, dfd);
2488 		if (fp == NULL)
2489 			fdused(fdp, indx);
2490 		FILEDESC_XUNLOCK(fdp);
2491 
2492 		/*
2493 		 * We now own the reference to fp that the ofiles[] array
2494 		 * used to own.  Release it.
2495 		 */
2496 		if (fp != NULL)
2497 			fdrop(fp, td);
2498 		return (0);
2499 
2500 	default:
2501 		FILEDESC_XUNLOCK(fdp);
2502 		return (error);
2503 	}
2504 	/* NOTREACHED */
2505 }
2506 
2507 /*
2508  * Scan all active processes and prisons to see if any of them have a current
2509  * or root directory of `olddp'. If so, replace them with the new mount point.
2510  */
2511 void
2512 mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
2513 {
2514 	struct filedesc *fdp;
2515 	struct prison *pr;
2516 	struct proc *p;
2517 	int nrele;
2518 
2519 	if (vrefcnt(olddp) == 1)
2520 		return;
2521 	nrele = 0;
2522 	sx_slock(&allproc_lock);
2523 	FOREACH_PROC_IN_SYSTEM(p) {
2524 		fdp = fdhold(p);
2525 		if (fdp == NULL)
2526 			continue;
2527 		FILEDESC_XLOCK(fdp);
2528 		if (fdp->fd_cdir == olddp) {
2529 			vref(newdp);
2530 			fdp->fd_cdir = newdp;
2531 			nrele++;
2532 		}
2533 		if (fdp->fd_rdir == olddp) {
2534 			vref(newdp);
2535 			fdp->fd_rdir = newdp;
2536 			nrele++;
2537 		}
2538 		if (fdp->fd_jdir == olddp) {
2539 			vref(newdp);
2540 			fdp->fd_jdir = newdp;
2541 			nrele++;
2542 		}
2543 		FILEDESC_XUNLOCK(fdp);
2544 		fddrop(fdp);
2545 	}
2546 	sx_sunlock(&allproc_lock);
2547 	if (rootvnode == olddp) {
2548 		vref(newdp);
2549 		rootvnode = newdp;
2550 		nrele++;
2551 	}
2552 	mtx_lock(&prison0.pr_mtx);
2553 	if (prison0.pr_root == olddp) {
2554 		vref(newdp);
2555 		prison0.pr_root = newdp;
2556 		nrele++;
2557 	}
2558 	mtx_unlock(&prison0.pr_mtx);
2559 	sx_slock(&allprison_lock);
2560 	TAILQ_FOREACH(pr, &allprison, pr_list) {
2561 		mtx_lock(&pr->pr_mtx);
2562 		if (pr->pr_root == olddp) {
2563 			vref(newdp);
2564 			pr->pr_root = newdp;
2565 			nrele++;
2566 		}
2567 		mtx_unlock(&pr->pr_mtx);
2568 	}
2569 	sx_sunlock(&allprison_lock);
2570 	while (nrele--)
2571 		vrele(olddp);
2572 }
2573 
2574 struct filedesc_to_leader *
2575 filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
2576 {
2577 	struct filedesc_to_leader *fdtol;
2578 
2579 	fdtol = malloc(sizeof(struct filedesc_to_leader),
2580 	       M_FILEDESC_TO_LEADER,
2581 	       M_WAITOK);
2582 	fdtol->fdl_refcount = 1;
2583 	fdtol->fdl_holdcount = 0;
2584 	fdtol->fdl_wakeup = 0;
2585 	fdtol->fdl_leader = leader;
2586 	if (old != NULL) {
2587 		FILEDESC_XLOCK(fdp);
2588 		fdtol->fdl_next = old->fdl_next;
2589 		fdtol->fdl_prev = old;
2590 		old->fdl_next = fdtol;
2591 		fdtol->fdl_next->fdl_prev = fdtol;
2592 		FILEDESC_XUNLOCK(fdp);
2593 	} else {
2594 		fdtol->fdl_next = fdtol;
2595 		fdtol->fdl_prev = fdtol;
2596 	}
2597 	return (fdtol);
2598 }
2599 
2600 /*
2601  * Get file structures globally.
2602  */
2603 static int
2604 sysctl_kern_file(SYSCTL_HANDLER_ARGS)
2605 {
2606 	struct xfile xf;
2607 	struct filedesc *fdp;
2608 	struct file *fp;
2609 	struct proc *p;
2610 	int error, n;
2611 
2612 	error = sysctl_wire_old_buffer(req, 0);
2613 	if (error != 0)
2614 		return (error);
2615 	if (req->oldptr == NULL) {
2616 		n = 0;
2617 		sx_slock(&allproc_lock);
2618 		FOREACH_PROC_IN_SYSTEM(p) {
2619 			if (p->p_state == PRS_NEW)
2620 				continue;
2621 			fdp = fdhold(p);
2622 			if (fdp == NULL)
2623 				continue;
2624 			/* overestimates sparse tables. */
2625 			if (fdp->fd_lastfile > 0)
2626 				n += fdp->fd_lastfile;
2627 			fddrop(fdp);
2628 		}
2629 		sx_sunlock(&allproc_lock);
2630 		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
2631 	}
2632 	error = 0;
2633 	bzero(&xf, sizeof(xf));
2634 	xf.xf_size = sizeof(xf);
2635 	sx_slock(&allproc_lock);
2636 	FOREACH_PROC_IN_SYSTEM(p) {
2637 		if (p->p_state == PRS_NEW)
2638 			continue;
2639 		PROC_LOCK(p);
2640 		if (p_cansee(req->td, p) != 0) {
2641 			PROC_UNLOCK(p);
2642 			continue;
2643 		}
2644 		xf.xf_pid = p->p_pid;
2645 		xf.xf_uid = p->p_ucred->cr_uid;
2646 		PROC_UNLOCK(p);
2647 		fdp = fdhold(p);
2648 		if (fdp == NULL)
2649 			continue;
2650 		FILEDESC_SLOCK(fdp);
2651 		for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) {
2652 			if ((fp = fdp->fd_ofiles[n]) == NULL)
2653 				continue;
2654 			xf.xf_fd = n;
2655 			xf.xf_file = fp;
2656 			xf.xf_data = fp->f_data;
2657 			xf.xf_vnode = fp->f_vnode;
2658 			xf.xf_type = fp->f_type;
2659 			xf.xf_count = fp->f_count;
2660 			xf.xf_msgcount = 0;
2661 			xf.xf_offset = fp->f_offset;
2662 			xf.xf_flag = fp->f_flag;
2663 			error = SYSCTL_OUT(req, &xf, sizeof(xf));
2664 			if (error)
2665 				break;
2666 		}
2667 		FILEDESC_SUNLOCK(fdp);
2668 		fddrop(fdp);
2669 		if (error)
2670 			break;
2671 	}
2672 	sx_sunlock(&allproc_lock);
2673 	return (error);
2674 }
2675 
2676 SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
2677     0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
2678 
/*
 * When KINFO_OFILE_SIZE is defined, check at compile time that
 * struct kinfo_ofile still has exactly that size.
 */
#if defined(KINFO_OFILE_SIZE)
CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
#endif
2682 
2683 #ifdef COMPAT_FREEBSD7
2684 static int
2685 export_vnode_for_osysctl(struct vnode *vp, int type,
2686     struct kinfo_ofile *kif, struct filedesc *fdp, struct sysctl_req *req)
2687 {
2688 	int error;
2689 	char *fullpath, *freepath;
2690 	int vfslocked;
2691 
2692 	bzero(kif, sizeof(*kif));
2693 	kif->kf_structsize = sizeof(*kif);
2694 
2695 	vref(vp);
2696 	kif->kf_fd = type;
2697 	kif->kf_type = KF_TYPE_VNODE;
2698 	/* This function only handles directories. */
2699 	if (vp->v_type != VDIR) {
2700 		vrele(vp);
2701 		return (ENOTDIR);
2702 	}
2703 	kif->kf_vnode_type = KF_VTYPE_VDIR;
2704 
2705 	/*
2706 	 * This is not a true file descriptor, so we set a bogus refcount
2707 	 * and offset to indicate these fields should be ignored.
2708 	 */
2709 	kif->kf_ref_count = -1;
2710 	kif->kf_offset = -1;
2711 
2712 	freepath = NULL;
2713 	fullpath = "-";
2714 	FILEDESC_SUNLOCK(fdp);
2715 	vn_fullpath(curthread, vp, &fullpath, &freepath);
2716 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
2717 	vrele(vp);
2718 	VFS_UNLOCK_GIANT(vfslocked);
2719 	strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
2720 	if (freepath != NULL)
2721 		free(freepath, M_TEMP);
2722 	error = SYSCTL_OUT(req, kif, sizeof(*kif));
2723 	FILEDESC_SLOCK(fdp);
2724 	return (error);
2725 }
2726 
2727 /*
2728  * Get per-process file descriptors for use by procstat(1), et al.
2729  */
2730 static int
2731 sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
2732 {
2733 	char *fullpath, *freepath;
2734 	struct kinfo_ofile *kif;
2735 	struct filedesc *fdp;
2736 	int error, i, *name;
2737 	struct socket *so;
2738 	struct vnode *vp;
2739 	struct file *fp;
2740 	struct proc *p;
2741 	struct tty *tp;
2742 	int vfslocked;
2743 
2744 	name = (int *)arg1;
2745 	if ((p = pfind((pid_t)name[0])) == NULL)
2746 		return (ESRCH);
2747 	if ((error = p_candebug(curthread, p))) {
2748 		PROC_UNLOCK(p);
2749 		return (error);
2750 	}
2751 	fdp = fdhold(p);
2752 	PROC_UNLOCK(p);
2753 	if (fdp == NULL)
2754 		return (ENOENT);
2755 	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
2756 	FILEDESC_SLOCK(fdp);
2757 	if (fdp->fd_cdir != NULL)
2758 		export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif,
2759 				fdp, req);
2760 	if (fdp->fd_rdir != NULL)
2761 		export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif,
2762 				fdp, req);
2763 	if (fdp->fd_jdir != NULL)
2764 		export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
2765 				fdp, req);
2766 	for (i = 0; i < fdp->fd_nfiles; i++) {
2767 		if ((fp = fdp->fd_ofiles[i]) == NULL)
2768 			continue;
2769 		bzero(kif, sizeof(*kif));
2770 		kif->kf_structsize = sizeof(*kif);
2771 		vp = NULL;
2772 		so = NULL;
2773 		tp = NULL;
2774 		kif->kf_fd = i;
2775 		switch (fp->f_type) {
2776 		case DTYPE_VNODE:
2777 			kif->kf_type = KF_TYPE_VNODE;
2778 			vp = fp->f_vnode;
2779 			break;
2780 
2781 		case DTYPE_SOCKET:
2782 			kif->kf_type = KF_TYPE_SOCKET;
2783 			so = fp->f_data;
2784 			break;
2785 
2786 		case DTYPE_PIPE:
2787 			kif->kf_type = KF_TYPE_PIPE;
2788 			break;
2789 
2790 		case DTYPE_FIFO:
2791 			kif->kf_type = KF_TYPE_FIFO;
2792 			vp = fp->f_vnode;
2793 			break;
2794 
2795 		case DTYPE_KQUEUE:
2796 			kif->kf_type = KF_TYPE_KQUEUE;
2797 			break;
2798 
2799 		case DTYPE_CRYPTO:
2800 			kif->kf_type = KF_TYPE_CRYPTO;
2801 			break;
2802 
2803 		case DTYPE_MQUEUE:
2804 			kif->kf_type = KF_TYPE_MQUEUE;
2805 			break;
2806 
2807 		case DTYPE_SHM:
2808 			kif->kf_type = KF_TYPE_SHM;
2809 			break;
2810 
2811 		case DTYPE_SEM:
2812 			kif->kf_type = KF_TYPE_SEM;
2813 			break;
2814 
2815 		case DTYPE_PTS:
2816 			kif->kf_type = KF_TYPE_PTS;
2817 			tp = fp->f_data;
2818 			break;
2819 
2820 		default:
2821 			kif->kf_type = KF_TYPE_UNKNOWN;
2822 			break;
2823 		}
2824 		kif->kf_ref_count = fp->f_count;
2825 		if (fp->f_flag & FREAD)
2826 			kif->kf_flags |= KF_FLAG_READ;
2827 		if (fp->f_flag & FWRITE)
2828 			kif->kf_flags |= KF_FLAG_WRITE;
2829 		if (fp->f_flag & FAPPEND)
2830 			kif->kf_flags |= KF_FLAG_APPEND;
2831 		if (fp->f_flag & FASYNC)
2832 			kif->kf_flags |= KF_FLAG_ASYNC;
2833 		if (fp->f_flag & FFSYNC)
2834 			kif->kf_flags |= KF_FLAG_FSYNC;
2835 		if (fp->f_flag & FNONBLOCK)
2836 			kif->kf_flags |= KF_FLAG_NONBLOCK;
2837 		if (fp->f_flag & O_DIRECT)
2838 			kif->kf_flags |= KF_FLAG_DIRECT;
2839 		if (fp->f_flag & FHASLOCK)
2840 			kif->kf_flags |= KF_FLAG_HASLOCK;
2841 		kif->kf_offset = fp->f_offset;
2842 		if (vp != NULL) {
2843 			vref(vp);
2844 			switch (vp->v_type) {
2845 			case VNON:
2846 				kif->kf_vnode_type = KF_VTYPE_VNON;
2847 				break;
2848 			case VREG:
2849 				kif->kf_vnode_type = KF_VTYPE_VREG;
2850 				break;
2851 			case VDIR:
2852 				kif->kf_vnode_type = KF_VTYPE_VDIR;
2853 				break;
2854 			case VBLK:
2855 				kif->kf_vnode_type = KF_VTYPE_VBLK;
2856 				break;
2857 			case VCHR:
2858 				kif->kf_vnode_type = KF_VTYPE_VCHR;
2859 				break;
2860 			case VLNK:
2861 				kif->kf_vnode_type = KF_VTYPE_VLNK;
2862 				break;
2863 			case VSOCK:
2864 				kif->kf_vnode_type = KF_VTYPE_VSOCK;
2865 				break;
2866 			case VFIFO:
2867 				kif->kf_vnode_type = KF_VTYPE_VFIFO;
2868 				break;
2869 			case VBAD:
2870 				kif->kf_vnode_type = KF_VTYPE_VBAD;
2871 				break;
2872 			default:
2873 				kif->kf_vnode_type = KF_VTYPE_UNKNOWN;
2874 				break;
2875 			}
2876 			/*
2877 			 * It is OK to drop the filedesc lock here as we will
2878 			 * re-validate and re-evaluate its properties when
2879 			 * the loop continues.
2880 			 */
2881 			freepath = NULL;
2882 			fullpath = "-";
2883 			FILEDESC_SUNLOCK(fdp);
2884 			vn_fullpath(curthread, vp, &fullpath, &freepath);
2885 			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
2886 			vrele(vp);
2887 			VFS_UNLOCK_GIANT(vfslocked);
2888 			strlcpy(kif->kf_path, fullpath,
2889 			    sizeof(kif->kf_path));
2890 			if (freepath != NULL)
2891 				free(freepath, M_TEMP);
2892 			FILEDESC_SLOCK(fdp);
2893 		}
2894 		if (so != NULL) {
2895 			struct sockaddr *sa;
2896 
2897 			if (so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa)
2898 			    == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
2899 				bcopy(sa, &kif->kf_sa_local, sa->sa_len);
2900 				free(sa, M_SONAME);
2901 			}
2902 			if (so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa)
2903 			    == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
2904 				bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
2905 				free(sa, M_SONAME);
2906 			}
2907 			kif->kf_sock_domain =
2908 			    so->so_proto->pr_domain->dom_family;
2909 			kif->kf_sock_type = so->so_type;
2910 			kif->kf_sock_protocol = so->so_proto->pr_protocol;
2911 		}
2912 		if (tp != NULL) {
2913 			strlcpy(kif->kf_path, tty_devname(tp),
2914 			    sizeof(kif->kf_path));
2915 		}
2916 		error = SYSCTL_OUT(req, kif, sizeof(*kif));
2917 		if (error)
2918 			break;
2919 	}
2920 	FILEDESC_SUNLOCK(fdp);
2921 	fddrop(fdp);
2922 	free(kif, M_TEMP);
2923 	return (0);
2924 }
2925 
2926 static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD,
2927     sysctl_kern_proc_ofiledesc, "Process ofiledesc entries");
2928 #endif	/* COMPAT_FREEBSD7 */
2929 
/*
 * When KINFO_FILE_SIZE is defined, check at compile time that
 * struct kinfo_file still has exactly that size.
 */
#if defined(KINFO_FILE_SIZE)
CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
#endif
2933 
2934 static int
2935 export_vnode_for_sysctl(struct vnode *vp, int type,
2936     struct kinfo_file *kif, struct filedesc *fdp, struct sysctl_req *req)
2937 {
2938 	int error;
2939 	char *fullpath, *freepath;
2940 	int vfslocked;
2941 
2942 	bzero(kif, sizeof(*kif));
2943 
2944 	vref(vp);
2945 	kif->kf_fd = type;
2946 	kif->kf_type = KF_TYPE_VNODE;
2947 	/* This function only handles directories. */
2948 	if (vp->v_type != VDIR) {
2949 		vrele(vp);
2950 		return (ENOTDIR);
2951 	}
2952 	kif->kf_vnode_type = KF_VTYPE_VDIR;
2953 
2954 	/*
2955 	 * This is not a true file descriptor, so we set a bogus refcount
2956 	 * and offset to indicate these fields should be ignored.
2957 	 */
2958 	kif->kf_ref_count = -1;
2959 	kif->kf_offset = -1;
2960 
2961 	freepath = NULL;
2962 	fullpath = "-";
2963 	FILEDESC_SUNLOCK(fdp);
2964 	vn_fullpath(curthread, vp, &fullpath, &freepath);
2965 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
2966 	vrele(vp);
2967 	VFS_UNLOCK_GIANT(vfslocked);
2968 	strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
2969 	if (freepath != NULL)
2970 		free(freepath, M_TEMP);
2971 	/* Pack record size down */
2972 	kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
2973 	    strlen(kif->kf_path) + 1;
2974 	kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
2975 	error = SYSCTL_OUT(req, kif, kif->kf_structsize);
2976 	FILEDESC_SLOCK(fdp);
2977 	return (error);
2978 }
2979 
2980 /*
2981  * Get per-process file descriptors for use by procstat(1), et al.
2982  */
2983 static int
2984 sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
2985 {
2986 	char *fullpath, *freepath;
2987 	struct kinfo_file *kif;
2988 	struct filedesc *fdp;
2989 	int error, i, *name;
2990 	struct socket *so;
2991 	struct vnode *vp;
2992 	struct file *fp;
2993 	struct proc *p;
2994 	struct tty *tp;
2995 	int vfslocked;
2996 	size_t oldidx;
2997 
2998 	name = (int *)arg1;
2999 	if ((p = pfind((pid_t)name[0])) == NULL)
3000 		return (ESRCH);
3001 	if ((error = p_candebug(curthread, p))) {
3002 		PROC_UNLOCK(p);
3003 		return (error);
3004 	}
3005 	fdp = fdhold(p);
3006 	PROC_UNLOCK(p);
3007 	if (fdp == NULL)
3008 		return (ENOENT);
3009 	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
3010 	FILEDESC_SLOCK(fdp);
3011 	if (fdp->fd_cdir != NULL)
3012 		export_vnode_for_sysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif,
3013 				fdp, req);
3014 	if (fdp->fd_rdir != NULL)
3015 		export_vnode_for_sysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif,
3016 				fdp, req);
3017 	if (fdp->fd_jdir != NULL)
3018 		export_vnode_for_sysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
3019 				fdp, req);
3020 	for (i = 0; i < fdp->fd_nfiles; i++) {
3021 		if ((fp = fdp->fd_ofiles[i]) == NULL)
3022 			continue;
3023 		bzero(kif, sizeof(*kif));
3024 		vp = NULL;
3025 		so = NULL;
3026 		tp = NULL;
3027 		kif->kf_fd = i;
3028 		switch (fp->f_type) {
3029 		case DTYPE_VNODE:
3030 			kif->kf_type = KF_TYPE_VNODE;
3031 			vp = fp->f_vnode;
3032 			break;
3033 
3034 		case DTYPE_SOCKET:
3035 			kif->kf_type = KF_TYPE_SOCKET;
3036 			so = fp->f_data;
3037 			break;
3038 
3039 		case DTYPE_PIPE:
3040 			kif->kf_type = KF_TYPE_PIPE;
3041 			break;
3042 
3043 		case DTYPE_FIFO:
3044 			kif->kf_type = KF_TYPE_FIFO;
3045 			vp = fp->f_vnode;
3046 			break;
3047 
3048 		case DTYPE_KQUEUE:
3049 			kif->kf_type = KF_TYPE_KQUEUE;
3050 			break;
3051 
3052 		case DTYPE_CRYPTO:
3053 			kif->kf_type = KF_TYPE_CRYPTO;
3054 			break;
3055 
3056 		case DTYPE_MQUEUE:
3057 			kif->kf_type = KF_TYPE_MQUEUE;
3058 			break;
3059 
3060 		case DTYPE_SHM:
3061 			kif->kf_type = KF_TYPE_SHM;
3062 			break;
3063 
3064 		case DTYPE_SEM:
3065 			kif->kf_type = KF_TYPE_SEM;
3066 			break;
3067 
3068 		case DTYPE_PTS:
3069 			kif->kf_type = KF_TYPE_PTS;
3070 			tp = fp->f_data;
3071 			break;
3072 
3073 		default:
3074 			kif->kf_type = KF_TYPE_UNKNOWN;
3075 			break;
3076 		}
3077 		kif->kf_ref_count = fp->f_count;
3078 		if (fp->f_flag & FREAD)
3079 			kif->kf_flags |= KF_FLAG_READ;
3080 		if (fp->f_flag & FWRITE)
3081 			kif->kf_flags |= KF_FLAG_WRITE;
3082 		if (fp->f_flag & FAPPEND)
3083 			kif->kf_flags |= KF_FLAG_APPEND;
3084 		if (fp->f_flag & FASYNC)
3085 			kif->kf_flags |= KF_FLAG_ASYNC;
3086 		if (fp->f_flag & FFSYNC)
3087 			kif->kf_flags |= KF_FLAG_FSYNC;
3088 		if (fp->f_flag & FNONBLOCK)
3089 			kif->kf_flags |= KF_FLAG_NONBLOCK;
3090 		if (fp->f_flag & O_DIRECT)
3091 			kif->kf_flags |= KF_FLAG_DIRECT;
3092 		if (fp->f_flag & FHASLOCK)
3093 			kif->kf_flags |= KF_FLAG_HASLOCK;
3094 		kif->kf_offset = fp->f_offset;
3095 		if (vp != NULL) {
3096 			vref(vp);
3097 			switch (vp->v_type) {
3098 			case VNON:
3099 				kif->kf_vnode_type = KF_VTYPE_VNON;
3100 				break;
3101 			case VREG:
3102 				kif->kf_vnode_type = KF_VTYPE_VREG;
3103 				break;
3104 			case VDIR:
3105 				kif->kf_vnode_type = KF_VTYPE_VDIR;
3106 				break;
3107 			case VBLK:
3108 				kif->kf_vnode_type = KF_VTYPE_VBLK;
3109 				break;
3110 			case VCHR:
3111 				kif->kf_vnode_type = KF_VTYPE_VCHR;
3112 				break;
3113 			case VLNK:
3114 				kif->kf_vnode_type = KF_VTYPE_VLNK;
3115 				break;
3116 			case VSOCK:
3117 				kif->kf_vnode_type = KF_VTYPE_VSOCK;
3118 				break;
3119 			case VFIFO:
3120 				kif->kf_vnode_type = KF_VTYPE_VFIFO;
3121 				break;
3122 			case VBAD:
3123 				kif->kf_vnode_type = KF_VTYPE_VBAD;
3124 				break;
3125 			default:
3126 				kif->kf_vnode_type = KF_VTYPE_UNKNOWN;
3127 				break;
3128 			}
3129 			/*
3130 			 * It is OK to drop the filedesc lock here as we will
3131 			 * re-validate and re-evaluate its properties when
3132 			 * the loop continues.
3133 			 */
3134 			freepath = NULL;
3135 			fullpath = "-";
3136 			FILEDESC_SUNLOCK(fdp);
3137 			vn_fullpath(curthread, vp, &fullpath, &freepath);
3138 			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
3139 			vrele(vp);
3140 			VFS_UNLOCK_GIANT(vfslocked);
3141 			strlcpy(kif->kf_path, fullpath,
3142 			    sizeof(kif->kf_path));
3143 			if (freepath != NULL)
3144 				free(freepath, M_TEMP);
3145 			FILEDESC_SLOCK(fdp);
3146 		}
3147 		if (so != NULL) {
3148 			struct sockaddr *sa;
3149 
3150 			if (so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa)
3151 			    == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
3152 				bcopy(sa, &kif->kf_sa_local, sa->sa_len);
3153 				free(sa, M_SONAME);
3154 			}
3155 			if (so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa)
3156 			    == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
3157 				bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
3158 				free(sa, M_SONAME);
3159 			}
3160 			kif->kf_sock_domain =
3161 			    so->so_proto->pr_domain->dom_family;
3162 			kif->kf_sock_type = so->so_type;
3163 			kif->kf_sock_protocol = so->so_proto->pr_protocol;
3164 		}
3165 		if (tp != NULL) {
3166 			strlcpy(kif->kf_path, tty_devname(tp),
3167 			    sizeof(kif->kf_path));
3168 		}
3169 		/* Pack record size down */
3170 		kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
3171 		    strlen(kif->kf_path) + 1;
3172 		kif->kf_structsize = roundup(kif->kf_structsize,
3173 		    sizeof(uint64_t));
3174 		oldidx = req->oldidx;
3175 		error = SYSCTL_OUT(req, kif, kif->kf_structsize);
3176 		if (error) {
3177 			if (error == ENOMEM) {
3178 				/*
3179 				 * The hack to keep the ABI of sysctl
3180 				 * kern.proc.filedesc intact, but not
3181 				 * to account a partially copied
3182 				 * kinfo_file into the oldidx.
3183 				 */
3184 				req->oldidx = oldidx;
3185 				error = 0;
3186 			}
3187 			break;
3188 		}
3189 	}
3190 	FILEDESC_SUNLOCK(fdp);
3191 	fddrop(fdp);
3192 	free(kif, M_TEMP);
3193 	return (error);
3194 }
3195 
3196 static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD,
3197     sysctl_kern_proc_filedesc, "Process filedesc entries");
3198 
3199 #ifdef DDB
3200 /*
3201  * For the purposes of debugging, generate a human-readable string for the
3202  * file type.
3203  */
3204 static const char *
3205 file_type_to_name(short type)
3206 {
3207 
3208 	switch (type) {
3209 	case 0:
3210 		return ("zero");
3211 	case DTYPE_VNODE:
3212 		return ("vnod");
3213 	case DTYPE_SOCKET:
3214 		return ("sock");
3215 	case DTYPE_PIPE:
3216 		return ("pipe");
3217 	case DTYPE_FIFO:
3218 		return ("fifo");
3219 	case DTYPE_KQUEUE:
3220 		return ("kque");
3221 	case DTYPE_CRYPTO:
3222 		return ("crpt");
3223 	case DTYPE_MQUEUE:
3224 		return ("mque");
3225 	case DTYPE_SHM:
3226 		return ("shm");
3227 	case DTYPE_SEM:
3228 		return ("ksem");
3229 	default:
3230 		return ("unkn");
3231 	}
3232 }
3233 
3234 /*
3235  * For the purposes of debugging, identify a process (if any, perhaps one of
3236  * many) that references the passed file in its file descriptor array. Return
3237  * NULL if none.
3238  */
3239 static struct proc *
3240 file_to_first_proc(struct file *fp)
3241 {
3242 	struct filedesc *fdp;
3243 	struct proc *p;
3244 	int n;
3245 
3246 	FOREACH_PROC_IN_SYSTEM(p) {
3247 		if (p->p_state == PRS_NEW)
3248 			continue;
3249 		fdp = p->p_fd;
3250 		if (fdp == NULL)
3251 			continue;
3252 		for (n = 0; n < fdp->fd_nfiles; n++) {
3253 			if (fp == fdp->fd_ofiles[n])
3254 				return (p);
3255 		}
3256 	}
3257 	return (NULL);
3258 }
3259 
3260 static void
3261 db_print_file(struct file *fp, int header)
3262 {
3263 	struct proc *p;
3264 
3265 	if (header)
3266 		db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
3267 		    "File", "Type", "Data", "Flag", "GCFl", "Count",
3268 		    "MCount", "Vnode", "FPID", "FCmd");
3269 	p = file_to_first_proc(fp);
3270 	db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
3271 	    file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
3272 	    0, fp->f_count, 0, fp->f_vnode,
3273 	    p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
3274 }
3275 
3276 DB_SHOW_COMMAND(file, db_show_file)
3277 {
3278 	struct file *fp;
3279 
3280 	if (!have_addr) {
3281 		db_printf("usage: show file <addr>\n");
3282 		return;
3283 	}
3284 	fp = (struct file *)addr;
3285 	db_print_file(fp, 1);
3286 }
3287 
3288 DB_SHOW_COMMAND(files, db_show_files)
3289 {
3290 	struct filedesc *fdp;
3291 	struct file *fp;
3292 	struct proc *p;
3293 	int header;
3294 	int n;
3295 
3296 	header = 1;
3297 	FOREACH_PROC_IN_SYSTEM(p) {
3298 		if (p->p_state == PRS_NEW)
3299 			continue;
3300 		if ((fdp = p->p_fd) == NULL)
3301 			continue;
3302 		for (n = 0; n < fdp->fd_nfiles; ++n) {
3303 			if ((fp = fdp->fd_ofiles[n]) == NULL)
3304 				continue;
3305 			db_print_file(fp, header);
3306 			header = 0;
3307 		}
3308 	}
3309 }
3310 #endif
3311 
3312 SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
3313     &maxfilesperproc, 0, "Maximum files allowed open per process");
3314 
3315 SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
3316     &maxfiles, 0, "Maximum number of files");
3317 
3318 SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
3319     __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");
3320 
3321 /* ARGSUSED*/
3322 static void
3323 filelistinit(void *dummy)
3324 {
3325 
3326 	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
3327 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
3328 	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
3329 	mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);
3330 }
3331 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
3332 
3333 /*-------------------------------------------------------------------*/
3334 
/*
 * Read/write method of badfileops: always fails with EBADF.
 */
static int
badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	const int error = EBADF;

	return (error);
}
3341 
/*
 * Truncate method of badfileops: always fails with EINVAL.
 */
static int
badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{
	const int error = EINVAL;

	return (error);
}
3348 
3349 static int
3350 badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td)
3351 {
3352 
3353 	return (EBADF);
3354 }
3355 
/*
 * Poll method of badfileops: reports no events by always returning 0.
 */
static int
badfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
	const int revents = 0;

	return (revents);
}
3362 
/*
 * Kqueue filter method of badfileops: always fails with EBADF.
 */
static int
badfo_kqfilter(struct file *fp, struct knote *kn)
{
	const int error = EBADF;

	return (error);
}
3369 
/*
 * Stat method of badfileops: always fails with EBADF.
 */
static int
badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
    struct thread *td)
{
	const int error = EBADF;

	return (error);
}
3376 
/*
 * Close method of badfileops: always fails with EBADF.
 */
static int
badfo_close(struct file *fp, struct thread *td)
{
	const int error = EBADF;

	return (error);
}
3383 
3384 struct fileops badfileops = {
3385 	.fo_read = badfo_readwrite,
3386 	.fo_write = badfo_readwrite,
3387 	.fo_truncate = badfo_truncate,
3388 	.fo_ioctl = badfo_ioctl,
3389 	.fo_poll = badfo_poll,
3390 	.fo_kqfilter = badfo_kqfilter,
3391 	.fo_stat = badfo_stat,
3392 	.fo_close = badfo_close,
3393 };
3394 
3395 
3396 /*-------------------------------------------------------------------*/
3397 
3398 /*
3399  * File Descriptor pseudo-device driver (/dev/fd/).
3400  *
3401  * Opening minor device N dup()s the file (if any) connected to file
3402  * descriptor N belonging to the calling process.  Note that this driver
3403  * consists of only the ``open()'' routine, because all subsequent
3404  * references to this file will be direct to the other driver.
3405  *
3406  * XXX: we could give this one a cloning event handler if necessary.
3407  */
3408 
3409 /* ARGSUSED */
3410 static int
3411 fdopen(struct cdev *dev, int mode, int type, struct thread *td)
3412 {
3413 
3414 	/*
3415 	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
3416 	 * the file descriptor being sought for duplication. The error
3417 	 * return ensures that the vnode for this device will be released
3418 	 * by vn_open. Open will detect this special error and take the
3419 	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
3420 	 * will simply report the error.
3421 	 */
3422 	td->td_dupfd = dev2unit(dev);
3423 	return (ENODEV);
3424 }
3425 
3426 static struct cdevsw fildesc_cdevsw = {
3427 	.d_version =	D_VERSION,
3428 	.d_open =	fdopen,
3429 	.d_name =	"FD",
3430 };
3431 
3432 static void
3433 fildesc_drvinit(void *unused)
3434 {
3435 	struct cdev *dev;
3436 
3437 	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
3438 	    UID_ROOT, GID_WHEEL, 0666, "fd/0");
3439 	make_dev_alias(dev, "stdin");
3440 	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
3441 	    UID_ROOT, GID_WHEEL, 0666, "fd/1");
3442 	make_dev_alias(dev, "stdout");
3443 	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
3444 	    UID_ROOT, GID_WHEEL, 0666, "fd/2");
3445 	make_dev_alias(dev, "stderr");
3446 }
3447 
3448 SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
3449