xref: /freebsd/sys/kern/vfs_syscalls.c (revision 9ccc37e32070303fb293a2a1697ffa71eeb49b25)
1 /*-
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_capsicum.h"
41 #include "opt_compat.h"
42 #include "opt_kdtrace.h"
43 #include "opt_ktrace.h"
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/bio.h>
48 #include <sys/buf.h>
49 #include <sys/capability.h>
50 #include <sys/disk.h>
51 #include <sys/sysent.h>
52 #include <sys/malloc.h>
53 #include <sys/mount.h>
54 #include <sys/mutex.h>
55 #include <sys/sysproto.h>
56 #include <sys/namei.h>
57 #include <sys/filedesc.h>
58 #include <sys/kernel.h>
59 #include <sys/fcntl.h>
60 #include <sys/file.h>
61 #include <sys/filio.h>
62 #include <sys/limits.h>
63 #include <sys/linker.h>
64 #include <sys/sdt.h>
65 #include <sys/stat.h>
66 #include <sys/sx.h>
67 #include <sys/unistd.h>
68 #include <sys/vnode.h>
69 #include <sys/priv.h>
70 #include <sys/proc.h>
71 #include <sys/dirent.h>
72 #include <sys/jail.h>
73 #include <sys/syscallsubr.h>
74 #include <sys/sysctl.h>
75 #ifdef KTRACE
76 #include <sys/ktrace.h>
77 #endif
78 
79 #include <machine/stdarg.h>
80 
81 #include <security/audit/audit.h>
82 #include <security/mac/mac_framework.h>
83 
84 #include <vm/vm.h>
85 #include <vm/vm_object.h>
86 #include <vm/vm_page.h>
87 #include <vm/uma.h>
88 
89 #include <ufs/ufs/quota.h>
90 
/*
 * File-local malloc type, short name "fadvise"; its description string
 * ties it to posix_fadvise(2) bookkeeping.
 */
static MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");

/*
 * SDT provider "vfs" with two probes in the "stat" function namespace,
 * "mode" and "reg".  Both probes declare the same argument types:
 * argument 0 is a "char *" and argument 1 is an "int".
 */
SDT_PROVIDER_DEFINE(vfs);
SDT_PROBE_DEFINE(vfs, , stat, mode, mode);
SDT_PROBE_ARGTYPE(vfs, , stat, mode, 0, "char *");
SDT_PROBE_ARGTYPE(vfs, , stat, mode, 1, "int");
SDT_PROBE_DEFINE(vfs, , stat, reg, reg);
SDT_PROBE_ARGTYPE(vfs, , stat, reg, 0, "char *");
SDT_PROBE_ARGTYPE(vfs, , stat, reg, 1, "int");
100 
/* Forward declarations of helpers private to this file. */
static int chroot_refuse_vdir_fds(struct filedesc *fdp);
static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
static int setfflags(struct thread *td, struct vnode *, int);
static int setutimes(struct thread *td, struct vnode *,
    const struct timespec *, int, int);
static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
    struct thread *td);
108 
/*
 * The module initialization routine for POSIX asynchronous I/O will
 * set this to the version of AIO that it implements.  (Zero means
 * that it is not implemented.)  This value is used here by pathconf()
 * and in kern_descrip.c by fpathconf().
 */
int async_io_version;

/*
 * Only built when DEBUG is defined: an integer knob, initially 0, that is
 * exported read-write as the sysctl debug.syncprt.
 */
#ifdef DEBUG
static int syncprt = 0;
SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
#endif
121 
122 /*
123  * Sync each mounted filesystem.
124  */
125 #ifndef _SYS_SYSPROTO_H_
126 struct sync_args {
127 	int     dummy;
128 };
129 #endif
130 /* ARGSUSED */
131 int
132 sys_sync(td, uap)
133 	struct thread *td;
134 	struct sync_args *uap;
135 {
136 	struct mount *mp, *nmp;
137 	int save, vfslocked;
138 
139 	mtx_lock(&mountlist_mtx);
140 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
141 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
142 			nmp = TAILQ_NEXT(mp, mnt_list);
143 			continue;
144 		}
145 		vfslocked = VFS_LOCK_GIANT(mp);
146 		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
147 		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
148 			save = curthread_pflags_set(TDP_SYNCIO);
149 			vfs_msync(mp, MNT_NOWAIT);
150 			VFS_SYNC(mp, MNT_NOWAIT);
151 			curthread_pflags_restore(save);
152 			vn_finished_write(mp);
153 		}
154 		VFS_UNLOCK_GIANT(vfslocked);
155 		mtx_lock(&mountlist_mtx);
156 		nmp = TAILQ_NEXT(mp, mnt_list);
157 		vfs_unbusy(mp);
158 	}
159 	mtx_unlock(&mountlist_mtx);
160 	return (0);
161 }
162 
163 /*
164  * Change filesystem quotas.
165  */
166 #ifndef _SYS_SYSPROTO_H_
167 struct quotactl_args {
168 	char *path;
169 	int cmd;
170 	int uid;
171 	caddr_t arg;
172 };
173 #endif
174 int
175 sys_quotactl(td, uap)
176 	struct thread *td;
177 	register struct quotactl_args /* {
178 		char *path;
179 		int cmd;
180 		int uid;
181 		caddr_t arg;
182 	} */ *uap;
183 {
184 	struct mount *mp;
185 	int vfslocked;
186 	int error;
187 	struct nameidata nd;
188 
189 	AUDIT_ARG_CMD(uap->cmd);
190 	AUDIT_ARG_UID(uap->uid);
191 	if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
192 		return (EPERM);
193 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
194 	   UIO_USERSPACE, uap->path, td);
195 	if ((error = namei(&nd)) != 0)
196 		return (error);
197 	vfslocked = NDHASGIANT(&nd);
198 	NDFREE(&nd, NDF_ONLY_PNBUF);
199 	mp = nd.ni_vp->v_mount;
200 	vfs_ref(mp);
201 	vput(nd.ni_vp);
202 	error = vfs_busy(mp, 0);
203 	vfs_rel(mp);
204 	if (error) {
205 		VFS_UNLOCK_GIANT(vfslocked);
206 		return (error);
207 	}
208 	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
209 
210 	/*
211 	 * Since quota on operation typically needs to open quota
212 	 * file, the Q_QUOTAON handler needs to unbusy the mount point
213 	 * before calling into namei.  Otherwise, unmount might be
214 	 * started between two vfs_busy() invocations (first is our,
215 	 * second is from mount point cross-walk code in lookup()),
216 	 * causing deadlock.
217 	 *
218 	 * Require that Q_QUOTAON handles the vfs_busy() reference on
219 	 * its own, always returning with ubusied mount point.
220 	 */
221 	if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON)
222 		vfs_unbusy(mp);
223 	VFS_UNLOCK_GIANT(vfslocked);
224 	return (error);
225 }
226 
227 /*
228  * Used by statfs conversion routines to scale the block size up if
229  * necessary so that all of the block counts are <= 'max_size'.  Note
230  * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
231  * value of 'n'.
232  */
233 void
234 statfs_scale_blocks(struct statfs *sf, long max_size)
235 {
236 	uint64_t count;
237 	int shift;
238 
239 	KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
240 
241 	/*
242 	 * Attempt to scale the block counts to give a more accurate
243 	 * overview to userland of the ratio of free space to used
244 	 * space.  To do this, find the largest block count and compute
245 	 * a divisor that lets it fit into a signed integer <= max_size.
246 	 */
247 	if (sf->f_bavail < 0)
248 		count = -sf->f_bavail;
249 	else
250 		count = sf->f_bavail;
251 	count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
252 	if (count <= max_size)
253 		return;
254 
255 	count >>= flsl(max_size);
256 	shift = 0;
257 	while (count > 0) {
258 		shift++;
259 		count >>=1;
260 	}
261 
262 	sf->f_bsize <<= shift;
263 	sf->f_blocks >>= shift;
264 	sf->f_bfree >>= shift;
265 	sf->f_bavail >>= shift;
266 }
267 
268 /*
269  * Get filesystem statistics.
270  */
271 #ifndef _SYS_SYSPROTO_H_
272 struct statfs_args {
273 	char *path;
274 	struct statfs *buf;
275 };
276 #endif
277 int
278 sys_statfs(td, uap)
279 	struct thread *td;
280 	register struct statfs_args /* {
281 		char *path;
282 		struct statfs *buf;
283 	} */ *uap;
284 {
285 	struct statfs sf;
286 	int error;
287 
288 	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
289 	if (error == 0)
290 		error = copyout(&sf, uap->buf, sizeof(sf));
291 	return (error);
292 }
293 
294 int
295 kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
296     struct statfs *buf)
297 {
298 	struct mount *mp;
299 	struct statfs *sp, sb;
300 	int vfslocked;
301 	int error;
302 	struct nameidata nd;
303 
304 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
305 	    AUDITVNODE1, pathseg, path, td);
306 	error = namei(&nd);
307 	if (error)
308 		return (error);
309 	vfslocked = NDHASGIANT(&nd);
310 	mp = nd.ni_vp->v_mount;
311 	vfs_ref(mp);
312 	NDFREE(&nd, NDF_ONLY_PNBUF);
313 	vput(nd.ni_vp);
314 	error = vfs_busy(mp, 0);
315 	vfs_rel(mp);
316 	if (error) {
317 		VFS_UNLOCK_GIANT(vfslocked);
318 		return (error);
319 	}
320 #ifdef MAC
321 	error = mac_mount_check_stat(td->td_ucred, mp);
322 	if (error)
323 		goto out;
324 #endif
325 	/*
326 	 * Set these in case the underlying filesystem fails to do so.
327 	 */
328 	sp = &mp->mnt_stat;
329 	sp->f_version = STATFS_VERSION;
330 	sp->f_namemax = NAME_MAX;
331 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
332 	error = VFS_STATFS(mp, sp);
333 	if (error)
334 		goto out;
335 	if (priv_check(td, PRIV_VFS_GENERATION)) {
336 		bcopy(sp, &sb, sizeof(sb));
337 		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
338 		prison_enforce_statfs(td->td_ucred, mp, &sb);
339 		sp = &sb;
340 	}
341 	*buf = *sp;
342 out:
343 	vfs_unbusy(mp);
344 	VFS_UNLOCK_GIANT(vfslocked);
345 	return (error);
346 }
347 
348 /*
349  * Get filesystem statistics.
350  */
351 #ifndef _SYS_SYSPROTO_H_
352 struct fstatfs_args {
353 	int fd;
354 	struct statfs *buf;
355 };
356 #endif
357 int
358 sys_fstatfs(td, uap)
359 	struct thread *td;
360 	register struct fstatfs_args /* {
361 		int fd;
362 		struct statfs *buf;
363 	} */ *uap;
364 {
365 	struct statfs sf;
366 	int error;
367 
368 	error = kern_fstatfs(td, uap->fd, &sf);
369 	if (error == 0)
370 		error = copyout(&sf, uap->buf, sizeof(sf));
371 	return (error);
372 }
373 
374 int
375 kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
376 {
377 	struct file *fp;
378 	struct mount *mp;
379 	struct statfs *sp, sb;
380 	int vfslocked;
381 	struct vnode *vp;
382 	int error;
383 
384 	AUDIT_ARG_FD(fd);
385 	error = getvnode(td->td_proc->p_fd, fd, CAP_FSTATFS, &fp);
386 	if (error)
387 		return (error);
388 	vp = fp->f_vnode;
389 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
390 	vn_lock(vp, LK_SHARED | LK_RETRY);
391 #ifdef AUDIT
392 	AUDIT_ARG_VNODE1(vp);
393 #endif
394 	mp = vp->v_mount;
395 	if (mp)
396 		vfs_ref(mp);
397 	VOP_UNLOCK(vp, 0);
398 	fdrop(fp, td);
399 	if (mp == NULL) {
400 		error = EBADF;
401 		goto out;
402 	}
403 	error = vfs_busy(mp, 0);
404 	vfs_rel(mp);
405 	if (error) {
406 		VFS_UNLOCK_GIANT(vfslocked);
407 		return (error);
408 	}
409 #ifdef MAC
410 	error = mac_mount_check_stat(td->td_ucred, mp);
411 	if (error)
412 		goto out;
413 #endif
414 	/*
415 	 * Set these in case the underlying filesystem fails to do so.
416 	 */
417 	sp = &mp->mnt_stat;
418 	sp->f_version = STATFS_VERSION;
419 	sp->f_namemax = NAME_MAX;
420 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
421 	error = VFS_STATFS(mp, sp);
422 	if (error)
423 		goto out;
424 	if (priv_check(td, PRIV_VFS_GENERATION)) {
425 		bcopy(sp, &sb, sizeof(sb));
426 		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
427 		prison_enforce_statfs(td->td_ucred, mp, &sb);
428 		sp = &sb;
429 	}
430 	*buf = *sp;
431 out:
432 	if (mp)
433 		vfs_unbusy(mp);
434 	VFS_UNLOCK_GIANT(vfslocked);
435 	return (error);
436 }
437 
438 /*
439  * Get statistics on all filesystems.
440  */
441 #ifndef _SYS_SYSPROTO_H_
442 struct getfsstat_args {
443 	struct statfs *buf;
444 	long bufsize;
445 	int flags;
446 };
447 #endif
448 int
449 sys_getfsstat(td, uap)
450 	struct thread *td;
451 	register struct getfsstat_args /* {
452 		struct statfs *buf;
453 		long bufsize;
454 		int flags;
455 	} */ *uap;
456 {
457 
458 	return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE,
459 	    uap->flags));
460 }
461 
462 /*
463  * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
464  * 	The caller is responsible for freeing memory which will be allocated
465  *	in '*buf'.
466  */
467 int
468 kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
469     enum uio_seg bufseg, int flags)
470 {
471 	struct mount *mp, *nmp;
472 	struct statfs *sfsp, *sp, sb;
473 	size_t count, maxcount;
474 	int vfslocked;
475 	int error;
476 
477 	maxcount = bufsize / sizeof(struct statfs);
478 	if (bufsize == 0)
479 		sfsp = NULL;
480 	else if (bufseg == UIO_USERSPACE)
481 		sfsp = *buf;
482 	else /* if (bufseg == UIO_SYSSPACE) */ {
483 		count = 0;
484 		mtx_lock(&mountlist_mtx);
485 		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
486 			count++;
487 		}
488 		mtx_unlock(&mountlist_mtx);
489 		if (maxcount > count)
490 			maxcount = count;
491 		sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP,
492 		    M_WAITOK);
493 	}
494 	count = 0;
495 	mtx_lock(&mountlist_mtx);
496 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
497 		if (prison_canseemount(td->td_ucred, mp) != 0) {
498 			nmp = TAILQ_NEXT(mp, mnt_list);
499 			continue;
500 		}
501 #ifdef MAC
502 		if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
503 			nmp = TAILQ_NEXT(mp, mnt_list);
504 			continue;
505 		}
506 #endif
507 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
508 			nmp = TAILQ_NEXT(mp, mnt_list);
509 			continue;
510 		}
511 		vfslocked = VFS_LOCK_GIANT(mp);
512 		if (sfsp && count < maxcount) {
513 			sp = &mp->mnt_stat;
514 			/*
515 			 * Set these in case the underlying filesystem
516 			 * fails to do so.
517 			 */
518 			sp->f_version = STATFS_VERSION;
519 			sp->f_namemax = NAME_MAX;
520 			sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
521 			/*
522 			 * If MNT_NOWAIT or MNT_LAZY is specified, do not
523 			 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
524 			 * overrides MNT_WAIT.
525 			 */
526 			if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
527 			    (flags & MNT_WAIT)) &&
528 			    (error = VFS_STATFS(mp, sp))) {
529 				VFS_UNLOCK_GIANT(vfslocked);
530 				mtx_lock(&mountlist_mtx);
531 				nmp = TAILQ_NEXT(mp, mnt_list);
532 				vfs_unbusy(mp);
533 				continue;
534 			}
535 			if (priv_check(td, PRIV_VFS_GENERATION)) {
536 				bcopy(sp, &sb, sizeof(sb));
537 				sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
538 				prison_enforce_statfs(td->td_ucred, mp, &sb);
539 				sp = &sb;
540 			}
541 			if (bufseg == UIO_SYSSPACE)
542 				bcopy(sp, sfsp, sizeof(*sp));
543 			else /* if (bufseg == UIO_USERSPACE) */ {
544 				error = copyout(sp, sfsp, sizeof(*sp));
545 				if (error) {
546 					vfs_unbusy(mp);
547 					VFS_UNLOCK_GIANT(vfslocked);
548 					return (error);
549 				}
550 			}
551 			sfsp++;
552 		}
553 		VFS_UNLOCK_GIANT(vfslocked);
554 		count++;
555 		mtx_lock(&mountlist_mtx);
556 		nmp = TAILQ_NEXT(mp, mnt_list);
557 		vfs_unbusy(mp);
558 	}
559 	mtx_unlock(&mountlist_mtx);
560 	if (sfsp && count > maxcount)
561 		td->td_retval[0] = maxcount;
562 	else
563 		td->td_retval[0] = count;
564 	return (0);
565 }
566 
567 #ifdef COMPAT_FREEBSD4
568 /*
569  * Get old format filesystem statistics.
570  */
571 static void cvtstatfs(struct statfs *, struct ostatfs *);
572 
573 #ifndef _SYS_SYSPROTO_H_
574 struct freebsd4_statfs_args {
575 	char *path;
576 	struct ostatfs *buf;
577 };
578 #endif
579 int
580 freebsd4_statfs(td, uap)
581 	struct thread *td;
582 	struct freebsd4_statfs_args /* {
583 		char *path;
584 		struct ostatfs *buf;
585 	} */ *uap;
586 {
587 	struct ostatfs osb;
588 	struct statfs sf;
589 	int error;
590 
591 	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
592 	if (error)
593 		return (error);
594 	cvtstatfs(&sf, &osb);
595 	return (copyout(&osb, uap->buf, sizeof(osb)));
596 }
597 
598 /*
599  * Get filesystem statistics.
600  */
601 #ifndef _SYS_SYSPROTO_H_
602 struct freebsd4_fstatfs_args {
603 	int fd;
604 	struct ostatfs *buf;
605 };
606 #endif
607 int
608 freebsd4_fstatfs(td, uap)
609 	struct thread *td;
610 	struct freebsd4_fstatfs_args /* {
611 		int fd;
612 		struct ostatfs *buf;
613 	} */ *uap;
614 {
615 	struct ostatfs osb;
616 	struct statfs sf;
617 	int error;
618 
619 	error = kern_fstatfs(td, uap->fd, &sf);
620 	if (error)
621 		return (error);
622 	cvtstatfs(&sf, &osb);
623 	return (copyout(&osb, uap->buf, sizeof(osb)));
624 }
625 
626 /*
627  * Get statistics on all filesystems.
628  */
629 #ifndef _SYS_SYSPROTO_H_
630 struct freebsd4_getfsstat_args {
631 	struct ostatfs *buf;
632 	long bufsize;
633 	int flags;
634 };
635 #endif
636 int
637 freebsd4_getfsstat(td, uap)
638 	struct thread *td;
639 	register struct freebsd4_getfsstat_args /* {
640 		struct ostatfs *buf;
641 		long bufsize;
642 		int flags;
643 	} */ *uap;
644 {
645 	struct statfs *buf, *sp;
646 	struct ostatfs osb;
647 	size_t count, size;
648 	int error;
649 
650 	count = uap->bufsize / sizeof(struct ostatfs);
651 	size = count * sizeof(struct statfs);
652 	error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags);
653 	if (size > 0) {
654 		count = td->td_retval[0];
655 		sp = buf;
656 		while (count > 0 && error == 0) {
657 			cvtstatfs(sp, &osb);
658 			error = copyout(&osb, uap->buf, sizeof(osb));
659 			sp++;
660 			uap->buf++;
661 			count--;
662 		}
663 		free(buf, M_TEMP);
664 	}
665 	return (error);
666 }
667 
668 /*
669  * Implement fstatfs() for (NFS) file handles.
670  */
671 #ifndef _SYS_SYSPROTO_H_
672 struct freebsd4_fhstatfs_args {
673 	struct fhandle *u_fhp;
674 	struct ostatfs *buf;
675 };
676 #endif
677 int
678 freebsd4_fhstatfs(td, uap)
679 	struct thread *td;
680 	struct freebsd4_fhstatfs_args /* {
681 		struct fhandle *u_fhp;
682 		struct ostatfs *buf;
683 	} */ *uap;
684 {
685 	struct ostatfs osb;
686 	struct statfs sf;
687 	fhandle_t fh;
688 	int error;
689 
690 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
691 	if (error)
692 		return (error);
693 	error = kern_fhstatfs(td, fh, &sf);
694 	if (error)
695 		return (error);
696 	cvtstatfs(&sf, &osb);
697 	return (copyout(&osb, uap->buf, sizeof(osb)));
698 }
699 
700 /*
701  * Convert a new format statfs structure to an old format statfs structure.
702  */
703 static void
704 cvtstatfs(nsp, osp)
705 	struct statfs *nsp;
706 	struct ostatfs *osp;
707 {
708 
709 	statfs_scale_blocks(nsp, LONG_MAX);
710 	bzero(osp, sizeof(*osp));
711 	osp->f_bsize = nsp->f_bsize;
712 	osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
713 	osp->f_blocks = nsp->f_blocks;
714 	osp->f_bfree = nsp->f_bfree;
715 	osp->f_bavail = nsp->f_bavail;
716 	osp->f_files = MIN(nsp->f_files, LONG_MAX);
717 	osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
718 	osp->f_owner = nsp->f_owner;
719 	osp->f_type = nsp->f_type;
720 	osp->f_flags = nsp->f_flags;
721 	osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
722 	osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
723 	osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
724 	osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
725 	strlcpy(osp->f_fstypename, nsp->f_fstypename,
726 	    MIN(MFSNAMELEN, OMFSNAMELEN));
727 	strlcpy(osp->f_mntonname, nsp->f_mntonname,
728 	    MIN(MNAMELEN, OMNAMELEN));
729 	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
730 	    MIN(MNAMELEN, OMNAMELEN));
731 	osp->f_fsid = nsp->f_fsid;
732 }
733 #endif /* COMPAT_FREEBSD4 */
734 
735 /*
736  * Change current working directory to a given file descriptor.
737  */
738 #ifndef _SYS_SYSPROTO_H_
739 struct fchdir_args {
740 	int	fd;
741 };
742 #endif
743 int
744 sys_fchdir(td, uap)
745 	struct thread *td;
746 	struct fchdir_args /* {
747 		int fd;
748 	} */ *uap;
749 {
750 	register struct filedesc *fdp = td->td_proc->p_fd;
751 	struct vnode *vp, *tdp, *vpold;
752 	struct mount *mp;
753 	struct file *fp;
754 	int vfslocked;
755 	int error;
756 
757 	AUDIT_ARG_FD(uap->fd);
758 	if ((error = getvnode(fdp, uap->fd, CAP_FCHDIR, &fp)) != 0)
759 		return (error);
760 	vp = fp->f_vnode;
761 	VREF(vp);
762 	fdrop(fp, td);
763 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
764 	vn_lock(vp, LK_SHARED | LK_RETRY);
765 	AUDIT_ARG_VNODE1(vp);
766 	error = change_dir(vp, td);
767 	while (!error && (mp = vp->v_mountedhere) != NULL) {
768 		int tvfslocked;
769 		if (vfs_busy(mp, 0))
770 			continue;
771 		tvfslocked = VFS_LOCK_GIANT(mp);
772 		error = VFS_ROOT(mp, LK_SHARED, &tdp);
773 		vfs_unbusy(mp);
774 		if (error) {
775 			VFS_UNLOCK_GIANT(tvfslocked);
776 			break;
777 		}
778 		vput(vp);
779 		VFS_UNLOCK_GIANT(vfslocked);
780 		vp = tdp;
781 		vfslocked = tvfslocked;
782 	}
783 	if (error) {
784 		vput(vp);
785 		VFS_UNLOCK_GIANT(vfslocked);
786 		return (error);
787 	}
788 	VOP_UNLOCK(vp, 0);
789 	VFS_UNLOCK_GIANT(vfslocked);
790 	FILEDESC_XLOCK(fdp);
791 	vpold = fdp->fd_cdir;
792 	fdp->fd_cdir = vp;
793 	FILEDESC_XUNLOCK(fdp);
794 	vfslocked = VFS_LOCK_GIANT(vpold->v_mount);
795 	vrele(vpold);
796 	VFS_UNLOCK_GIANT(vfslocked);
797 	return (0);
798 }
799 
800 /*
801  * Change current working directory (``.'').
802  */
803 #ifndef _SYS_SYSPROTO_H_
804 struct chdir_args {
805 	char	*path;
806 };
807 #endif
808 int
809 sys_chdir(td, uap)
810 	struct thread *td;
811 	struct chdir_args /* {
812 		char *path;
813 	} */ *uap;
814 {
815 
816 	return (kern_chdir(td, uap->path, UIO_USERSPACE));
817 }
818 
819 int
820 kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
821 {
822 	register struct filedesc *fdp = td->td_proc->p_fd;
823 	int error;
824 	struct nameidata nd;
825 	struct vnode *vp;
826 	int vfslocked;
827 
828 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1 |
829 	    MPSAFE, pathseg, path, td);
830 	if ((error = namei(&nd)) != 0)
831 		return (error);
832 	vfslocked = NDHASGIANT(&nd);
833 	if ((error = change_dir(nd.ni_vp, td)) != 0) {
834 		vput(nd.ni_vp);
835 		VFS_UNLOCK_GIANT(vfslocked);
836 		NDFREE(&nd, NDF_ONLY_PNBUF);
837 		return (error);
838 	}
839 	VOP_UNLOCK(nd.ni_vp, 0);
840 	VFS_UNLOCK_GIANT(vfslocked);
841 	NDFREE(&nd, NDF_ONLY_PNBUF);
842 	FILEDESC_XLOCK(fdp);
843 	vp = fdp->fd_cdir;
844 	fdp->fd_cdir = nd.ni_vp;
845 	FILEDESC_XUNLOCK(fdp);
846 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
847 	vrele(vp);
848 	VFS_UNLOCK_GIANT(vfslocked);
849 	return (0);
850 }
851 
852 /*
853  * Helper function for raised chroot(2) security function:  Refuse if
854  * any filedescriptors are open directories.
855  */
856 static int
857 chroot_refuse_vdir_fds(fdp)
858 	struct filedesc *fdp;
859 {
860 	struct vnode *vp;
861 	struct file *fp;
862 	int fd;
863 
864 	FILEDESC_LOCK_ASSERT(fdp);
865 
866 	for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
867 		fp = fget_locked(fdp, fd);
868 		if (fp == NULL)
869 			continue;
870 		if (fp->f_type == DTYPE_VNODE) {
871 			vp = fp->f_vnode;
872 			if (vp->v_type == VDIR)
873 				return (EPERM);
874 		}
875 	}
876 	return (0);
877 }
878 
/*
 * This sysctl determines if we will allow a process to chroot(2) if it
 * has a directory open:
 *	0: disallowed for all processes.
 *	1: allowed for processes that were not already chroot(2)'ed.
 *	2: allowed for all processes.
 *
 * The default is 1.  The value is consulted by change_root(), which only
 * tests for 0 and 1, so any other value behaves like 2.  It is exported
 * read-write as kern.chroot_allow_open_directories.
 */

static int chroot_allow_open_directories = 1;

SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
     &chroot_allow_open_directories, 0,
     "Allow a process to chroot(2) if it has a directory open");
892 
893 /*
894  * Change notion of root (``/'') directory.
895  */
896 #ifndef _SYS_SYSPROTO_H_
897 struct chroot_args {
898 	char	*path;
899 };
900 #endif
901 int
902 sys_chroot(td, uap)
903 	struct thread *td;
904 	struct chroot_args /* {
905 		char *path;
906 	} */ *uap;
907 {
908 	int error;
909 	struct nameidata nd;
910 	int vfslocked;
911 
912 	error = priv_check(td, PRIV_VFS_CHROOT);
913 	if (error)
914 		return (error);
915 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
916 	    AUDITVNODE1, UIO_USERSPACE, uap->path, td);
917 	error = namei(&nd);
918 	if (error)
919 		goto error;
920 	vfslocked = NDHASGIANT(&nd);
921 	if ((error = change_dir(nd.ni_vp, td)) != 0)
922 		goto e_vunlock;
923 #ifdef MAC
924 	if ((error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp)))
925 		goto e_vunlock;
926 #endif
927 	VOP_UNLOCK(nd.ni_vp, 0);
928 	error = change_root(nd.ni_vp, td);
929 	vrele(nd.ni_vp);
930 	VFS_UNLOCK_GIANT(vfslocked);
931 	NDFREE(&nd, NDF_ONLY_PNBUF);
932 	return (error);
933 e_vunlock:
934 	vput(nd.ni_vp);
935 	VFS_UNLOCK_GIANT(vfslocked);
936 error:
937 	NDFREE(&nd, NDF_ONLY_PNBUF);
938 	return (error);
939 }
940 
941 /*
942  * Common routine for chroot and chdir.  Callers must provide a locked vnode
943  * instance.
944  */
945 int
946 change_dir(vp, td)
947 	struct vnode *vp;
948 	struct thread *td;
949 {
950 	int error;
951 
952 	ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
953 	if (vp->v_type != VDIR)
954 		return (ENOTDIR);
955 #ifdef MAC
956 	error = mac_vnode_check_chdir(td->td_ucred, vp);
957 	if (error)
958 		return (error);
959 #endif
960 	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
961 	return (error);
962 }
963 
964 /*
965  * Common routine for kern_chroot() and jail_attach().  The caller is
966  * responsible for invoking priv_check() and mac_vnode_check_chroot() to
967  * authorize this operation.
968  */
969 int
970 change_root(vp, td)
971 	struct vnode *vp;
972 	struct thread *td;
973 {
974 	struct filedesc *fdp;
975 	struct vnode *oldvp;
976 	int vfslocked;
977 	int error;
978 
979 	VFS_ASSERT_GIANT(vp->v_mount);
980 	fdp = td->td_proc->p_fd;
981 	FILEDESC_XLOCK(fdp);
982 	if (chroot_allow_open_directories == 0 ||
983 	    (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
984 		error = chroot_refuse_vdir_fds(fdp);
985 		if (error) {
986 			FILEDESC_XUNLOCK(fdp);
987 			return (error);
988 		}
989 	}
990 	oldvp = fdp->fd_rdir;
991 	fdp->fd_rdir = vp;
992 	VREF(fdp->fd_rdir);
993 	if (!fdp->fd_jdir) {
994 		fdp->fd_jdir = vp;
995 		VREF(fdp->fd_jdir);
996 	}
997 	FILEDESC_XUNLOCK(fdp);
998 	vfslocked = VFS_LOCK_GIANT(oldvp->v_mount);
999 	vrele(oldvp);
1000 	VFS_UNLOCK_GIANT(vfslocked);
1001 	return (0);
1002 }
1003 
1004 static __inline cap_rights_t
1005 flags_to_rights(int flags)
1006 {
1007 	cap_rights_t rights = 0;
1008 
1009 	switch ((flags & O_ACCMODE)) {
1010 	case O_RDONLY:
1011 		rights |= CAP_READ;
1012 		break;
1013 
1014 	case O_RDWR:
1015 		rights |= CAP_READ;
1016 		/* fall through */
1017 
1018 	case O_WRONLY:
1019 		rights |= CAP_WRITE;
1020 		break;
1021 
1022 	case O_EXEC:
1023 		rights |= CAP_FEXECVE;
1024 		break;
1025 	}
1026 
1027 	if (flags & O_CREAT)
1028 		rights |= CAP_CREATE;
1029 
1030 	if (flags & O_TRUNC)
1031 		rights |= CAP_FTRUNCATE;
1032 
1033 	if ((flags & O_EXLOCK) || (flags & O_SHLOCK))
1034 		rights |= CAP_FLOCK;
1035 
1036 	return (rights);
1037 }
1038 
1039 /*
1040  * Check permissions, allocate an open file structure, and call the device
1041  * open routine if any.
1042  */
1043 #ifndef _SYS_SYSPROTO_H_
1044 struct open_args {
1045 	char	*path;
1046 	int	flags;
1047 	int	mode;
1048 };
1049 #endif
1050 int
1051 sys_open(td, uap)
1052 	struct thread *td;
1053 	register struct open_args /* {
1054 		char *path;
1055 		int flags;
1056 		int mode;
1057 	} */ *uap;
1058 {
1059 
1060 	return (kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode));
1061 }
1062 
1063 #ifndef _SYS_SYSPROTO_H_
1064 struct openat_args {
1065 	int	fd;
1066 	char	*path;
1067 	int	flag;
1068 	int	mode;
1069 };
1070 #endif
1071 int
1072 sys_openat(struct thread *td, struct openat_args *uap)
1073 {
1074 
1075 	return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
1076 	    uap->mode));
1077 }
1078 
1079 int
1080 kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
1081     int mode)
1082 {
1083 
1084 	return (kern_openat(td, AT_FDCWD, path, pathseg, flags, mode));
1085 }
1086 
1087 int
1088 kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1089     int flags, int mode)
1090 {
1091 	struct proc *p = td->td_proc;
1092 	struct filedesc *fdp = p->p_fd;
1093 	struct file *fp;
1094 	struct vnode *vp;
1095 	int cmode;
1096 	struct file *nfp;
1097 	int type, indx = -1, error, error_open;
1098 	struct flock lf;
1099 	struct nameidata nd;
1100 	int vfslocked;
1101 	cap_rights_t rights_needed = CAP_LOOKUP;
1102 
1103 	AUDIT_ARG_FFLAGS(flags);
1104 	AUDIT_ARG_MODE(mode);
1105 	/* XXX: audit dirfd */
1106 	rights_needed |= flags_to_rights(flags);
1107 	/*
1108 	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
1109 	 * may be specified.
1110 	 */
1111 	if (flags & O_EXEC) {
1112 		if (flags & O_ACCMODE)
1113 			return (EINVAL);
1114 	} else if ((flags & O_ACCMODE) == O_ACCMODE)
1115 		return (EINVAL);
1116 	else
1117 		flags = FFLAGS(flags);
1118 
1119 	/*
1120 	 * allocate the file descriptor, but don't install a descriptor yet
1121 	 */
1122 	error = falloc_noinstall(td, &nfp);
1123 	if (error)
1124 		return (error);
1125 	/* An extra reference on `nfp' has been held for us by falloc_noinstall(). */
1126 	fp = nfp;
1127 	/* Set the flags early so the finit in devfs can pick them up. */
1128 	fp->f_flag = flags & FMASK;
1129 	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
1130 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | MPSAFE, pathseg,
1131 	    path, fd, rights_needed, td);
1132 	td->td_dupfd = -1;		/* XXX check for fdopen */
1133 	error = vn_open(&nd, &flags, cmode, fp);
1134 	if (error) {
1135 		/*
1136 		 * If the vn_open replaced the method vector, something
1137 		 * wonderous happened deep below and we just pass it up
1138 		 * pretending we know what we do.
1139 		 */
1140 		if (error == ENXIO && fp->f_ops != &badfileops)
1141 			goto success;
1142 
1143 		/*
1144 		 * handle special fdopen() case.  bleh.  dupfdopen() is
1145 		 * responsible for dropping the old contents of ofiles[indx]
1146 		 * if it succeeds.
1147 		 *
1148 		 * Don't do this for relative (capability) lookups; we don't
1149 		 * understand exactly what would happen, and we don't think
1150 		 * that it ever should.
1151 		 */
1152 		if ((nd.ni_strictrelative == 0) &&
1153 		    (error == ENODEV || error == ENXIO) &&
1154 		    (td->td_dupfd >= 0)) {
1155 			/* XXX from fdopen */
1156 			error_open = error;
1157 			if ((error = finstall(td, fp, &indx, flags)) != 0)
1158 				goto bad_unlocked;
1159 			if ((error = dupfdopen(td, fdp, indx, td->td_dupfd,
1160 			    flags, error_open)) == 0)
1161 				goto success;
1162 		}
1163 		/*
1164 		 * Clean up the descriptor, but only if another thread hadn't
1165 		 * replaced or closed it.
1166 		 */
1167 		if (indx != -1)
1168 			fdclose(fdp, fp, indx, td);
1169 		fdrop(fp, td);
1170 
1171 		if (error == ERESTART)
1172 			error = EINTR;
1173 		return (error);
1174 	}
1175 	td->td_dupfd = 0;
1176 	vfslocked = NDHASGIANT(&nd);
1177 	NDFREE(&nd, NDF_ONLY_PNBUF);
1178 	vp = nd.ni_vp;
1179 
1180 	/*
1181 	 * Store the vnode, for any f_type. Typically, the vnode use
1182 	 * count is decremented by direct call to vn_closefile() for
1183 	 * files that switched type in the cdevsw fdopen() method.
1184 	 */
1185 	fp->f_vnode = vp;
1186 	/*
1187 	 * If the file wasn't claimed by devfs bind it to the normal
1188 	 * vnode operations here.
1189 	 */
1190 	if (fp->f_ops == &badfileops) {
1191 		KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
1192 		fp->f_seqcount = 1;
1193 		finit(fp, flags & FMASK, DTYPE_VNODE, vp, &vnops);
1194 	}
1195 
1196 	VOP_UNLOCK(vp, 0);
1197 	if (fp->f_type == DTYPE_VNODE && (flags & (O_EXLOCK | O_SHLOCK)) != 0) {
1198 		lf.l_whence = SEEK_SET;
1199 		lf.l_start = 0;
1200 		lf.l_len = 0;
1201 		if (flags & O_EXLOCK)
1202 			lf.l_type = F_WRLCK;
1203 		else
1204 			lf.l_type = F_RDLCK;
1205 		type = F_FLOCK;
1206 		if ((flags & FNONBLOCK) == 0)
1207 			type |= F_WAIT;
1208 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
1209 			    type)) != 0)
1210 			goto bad;
1211 		atomic_set_int(&fp->f_flag, FHASLOCK);
1212 	}
1213 	if (flags & O_TRUNC) {
1214 		error = fo_truncate(fp, 0, td->td_ucred, td);
1215 		if (error)
1216 			goto bad;
1217 	}
1218 	VFS_UNLOCK_GIANT(vfslocked);
1219 success:
1220 	/*
1221 	 * If we haven't already installed the FD (for dupfdopen), do so now.
1222 	 */
1223 	if (indx == -1) {
1224 #ifdef CAPABILITIES
1225 		if (nd.ni_strictrelative == 1) {
1226 			/*
1227 			 * We are doing a strict relative lookup; wrap the
1228 			 * result in a capability.
1229 			 */
1230 			if ((error = kern_capwrap(td, fp, nd.ni_baserights,
1231 			    &indx)) != 0)
1232 				goto bad_unlocked;
1233 		} else
1234 #endif
1235 			if ((error = finstall(td, fp, &indx, flags)) != 0)
1236 				goto bad_unlocked;
1237 
1238 	}
1239 
1240 	/*
1241 	 * Release our private reference, leaving the one associated with
1242 	 * the descriptor table intact.
1243 	 */
1244 	fdrop(fp, td);
1245 	td->td_retval[0] = indx;
1246 	return (0);
1247 bad:
1248 	VFS_UNLOCK_GIANT(vfslocked);
1249 bad_unlocked:
1250 	if (indx != -1)
1251 		fdclose(fdp, fp, indx, td);
1252 	fdrop(fp, td);
1253 	td->td_retval[0] = -1;
1254 	return (error);
1255 }
1256 
1257 #ifdef COMPAT_43
1258 /*
1259  * Create a file.
1260  */
1261 #ifndef _SYS_SYSPROTO_H_
1262 struct ocreat_args {
1263 	char	*path;
1264 	int	mode;
1265 };
1266 #endif
1267 int
1268 ocreat(td, uap)
1269 	struct thread *td;
1270 	register struct ocreat_args /* {
1271 		char *path;
1272 		int mode;
1273 	} */ *uap;
1274 {
1275 
1276 	return (kern_open(td, uap->path, UIO_USERSPACE,
1277 	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
1278 }
1279 #endif /* COMPAT_43 */
1280 
1281 /*
1282  * Create a special file.
1283  */
1284 #ifndef _SYS_SYSPROTO_H_
1285 struct mknod_args {
1286 	char	*path;
1287 	int	mode;
1288 	int	dev;
1289 };
1290 #endif
1291 int
1292 sys_mknod(td, uap)
1293 	struct thread *td;
1294 	register struct mknod_args /* {
1295 		char *path;
1296 		int mode;
1297 		int dev;
1298 	} */ *uap;
1299 {
1300 
1301 	return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev));
1302 }
1303 
1304 #ifndef _SYS_SYSPROTO_H_
1305 struct mknodat_args {
1306 	int	fd;
1307 	char	*path;
1308 	mode_t	mode;
1309 	dev_t	dev;
1310 };
1311 #endif
1312 int
1313 sys_mknodat(struct thread *td, struct mknodat_args *uap)
1314 {
1315 
1316 	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
1317 	    uap->dev));
1318 }
1319 
1320 int
1321 kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode,
1322     int dev)
1323 {
1324 
1325 	return (kern_mknodat(td, AT_FDCWD, path, pathseg, mode, dev));
1326 }
1327 
1328 int
1329 kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1330     int mode, int dev)
1331 {
1332 	struct vnode *vp;
1333 	struct mount *mp;
1334 	struct vattr vattr;
1335 	int error;
1336 	int whiteout = 0;
1337 	struct nameidata nd;
1338 	int vfslocked;
1339 
1340 	AUDIT_ARG_MODE(mode);
1341 	AUDIT_ARG_DEV(dev);
1342 	switch (mode & S_IFMT) {
1343 	case S_IFCHR:
1344 	case S_IFBLK:
1345 		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
1346 		break;
1347 	case S_IFMT:
1348 		error = priv_check(td, PRIV_VFS_MKNOD_BAD);
1349 		break;
1350 	case S_IFWHT:
1351 		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
1352 		break;
1353 	case S_IFIFO:
1354 		if (dev == 0)
1355 			return (kern_mkfifoat(td, fd, path, pathseg, mode));
1356 		/* FALLTHROUGH */
1357 	default:
1358 		error = EINVAL;
1359 		break;
1360 	}
1361 	if (error)
1362 		return (error);
1363 restart:
1364 	bwillwrite();
1365 	NDINIT_ATRIGHTS(&nd, CREATE,
1366 	    LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, pathseg, path, fd,
1367 	    CAP_MKFIFO, td);
1368 	if ((error = namei(&nd)) != 0)
1369 		return (error);
1370 	vfslocked = NDHASGIANT(&nd);
1371 	vp = nd.ni_vp;
1372 	if (vp != NULL) {
1373 		NDFREE(&nd, NDF_ONLY_PNBUF);
1374 		if (vp == nd.ni_dvp)
1375 			vrele(nd.ni_dvp);
1376 		else
1377 			vput(nd.ni_dvp);
1378 		vrele(vp);
1379 		VFS_UNLOCK_GIANT(vfslocked);
1380 		return (EEXIST);
1381 	} else {
1382 		VATTR_NULL(&vattr);
1383 		vattr.va_mode = (mode & ALLPERMS) &
1384 		    ~td->td_proc->p_fd->fd_cmask;
1385 		vattr.va_rdev = dev;
1386 		whiteout = 0;
1387 
1388 		switch (mode & S_IFMT) {
1389 		case S_IFMT:	/* used by badsect to flag bad sectors */
1390 			vattr.va_type = VBAD;
1391 			break;
1392 		case S_IFCHR:
1393 			vattr.va_type = VCHR;
1394 			break;
1395 		case S_IFBLK:
1396 			vattr.va_type = VBLK;
1397 			break;
1398 		case S_IFWHT:
1399 			whiteout = 1;
1400 			break;
1401 		default:
1402 			panic("kern_mknod: invalid mode");
1403 		}
1404 	}
1405 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1406 		NDFREE(&nd, NDF_ONLY_PNBUF);
1407 		vput(nd.ni_dvp);
1408 		VFS_UNLOCK_GIANT(vfslocked);
1409 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1410 			return (error);
1411 		goto restart;
1412 	}
1413 #ifdef MAC
1414 	if (error == 0 && !whiteout)
1415 		error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
1416 		    &nd.ni_cnd, &vattr);
1417 #endif
1418 	if (!error) {
1419 		if (whiteout)
1420 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
1421 		else {
1422 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
1423 						&nd.ni_cnd, &vattr);
1424 			if (error == 0)
1425 				vput(nd.ni_vp);
1426 		}
1427 	}
1428 	NDFREE(&nd, NDF_ONLY_PNBUF);
1429 	vput(nd.ni_dvp);
1430 	vn_finished_write(mp);
1431 	VFS_UNLOCK_GIANT(vfslocked);
1432 	return (error);
1433 }
1434 
1435 /*
1436  * Create a named pipe.
1437  */
1438 #ifndef _SYS_SYSPROTO_H_
1439 struct mkfifo_args {
1440 	char	*path;
1441 	int	mode;
1442 };
1443 #endif
1444 int
1445 sys_mkfifo(td, uap)
1446 	struct thread *td;
1447 	register struct mkfifo_args /* {
1448 		char *path;
1449 		int mode;
1450 	} */ *uap;
1451 {
1452 
1453 	return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode));
1454 }
1455 
1456 #ifndef _SYS_SYSPROTO_H_
1457 struct mkfifoat_args {
1458 	int	fd;
1459 	char	*path;
1460 	mode_t	mode;
1461 };
1462 #endif
1463 int
1464 sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
1465 {
1466 
1467 	return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
1468 	    uap->mode));
1469 }
1470 
1471 int
1472 kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode)
1473 {
1474 
1475 	return (kern_mkfifoat(td, AT_FDCWD, path, pathseg, mode));
1476 }
1477 
1478 int
1479 kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1480     int mode)
1481 {
1482 	struct mount *mp;
1483 	struct vattr vattr;
1484 	int error;
1485 	struct nameidata nd;
1486 	int vfslocked;
1487 
1488 	AUDIT_ARG_MODE(mode);
1489 restart:
1490 	bwillwrite();
1491 	NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
1492 	    pathseg, path, fd, td);
1493 	if ((error = namei(&nd)) != 0)
1494 		return (error);
1495 	vfslocked = NDHASGIANT(&nd);
1496 	if (nd.ni_vp != NULL) {
1497 		NDFREE(&nd, NDF_ONLY_PNBUF);
1498 		if (nd.ni_vp == nd.ni_dvp)
1499 			vrele(nd.ni_dvp);
1500 		else
1501 			vput(nd.ni_dvp);
1502 		vrele(nd.ni_vp);
1503 		VFS_UNLOCK_GIANT(vfslocked);
1504 		return (EEXIST);
1505 	}
1506 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1507 		NDFREE(&nd, NDF_ONLY_PNBUF);
1508 		vput(nd.ni_dvp);
1509 		VFS_UNLOCK_GIANT(vfslocked);
1510 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1511 			return (error);
1512 		goto restart;
1513 	}
1514 	VATTR_NULL(&vattr);
1515 	vattr.va_type = VFIFO;
1516 	vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
1517 #ifdef MAC
1518 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1519 	    &vattr);
1520 	if (error)
1521 		goto out;
1522 #endif
1523 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1524 	if (error == 0)
1525 		vput(nd.ni_vp);
1526 #ifdef MAC
1527 out:
1528 #endif
1529 	vput(nd.ni_dvp);
1530 	vn_finished_write(mp);
1531 	VFS_UNLOCK_GIANT(vfslocked);
1532 	NDFREE(&nd, NDF_ONLY_PNBUF);
1533 	return (error);
1534 }
1535 
1536 /*
1537  * Make a hard file link.
1538  */
1539 #ifndef _SYS_SYSPROTO_H_
1540 struct link_args {
1541 	char	*path;
1542 	char	*link;
1543 };
1544 #endif
1545 int
1546 sys_link(td, uap)
1547 	struct thread *td;
1548 	register struct link_args /* {
1549 		char *path;
1550 		char *link;
1551 	} */ *uap;
1552 {
1553 
1554 	return (kern_link(td, uap->path, uap->link, UIO_USERSPACE));
1555 }
1556 
1557 #ifndef _SYS_SYSPROTO_H_
1558 struct linkat_args {
1559 	int	fd1;
1560 	char	*path1;
1561 	int	fd2;
1562 	char	*path2;
1563 	int	flag;
1564 };
1565 #endif
1566 int
1567 sys_linkat(struct thread *td, struct linkat_args *uap)
1568 {
1569 	int flag;
1570 
1571 	flag = uap->flag;
1572 	if (flag & ~AT_SYMLINK_FOLLOW)
1573 		return (EINVAL);
1574 
1575 	return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
1576 	    UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW));
1577 }
1578 
1579 int hardlink_check_uid = 0;
1580 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
1581     &hardlink_check_uid, 0,
1582     "Unprivileged processes cannot create hard links to files owned by other "
1583     "users");
1584 static int hardlink_check_gid = 0;
1585 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
1586     &hardlink_check_gid, 0,
1587     "Unprivileged processes cannot create hard links to files owned by other "
1588     "groups");
1589 
1590 static int
1591 can_hardlink(struct vnode *vp, struct ucred *cred)
1592 {
1593 	struct vattr va;
1594 	int error;
1595 
1596 	if (!hardlink_check_uid && !hardlink_check_gid)
1597 		return (0);
1598 
1599 	error = VOP_GETATTR(vp, &va, cred);
1600 	if (error != 0)
1601 		return (error);
1602 
1603 	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
1604 		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1605 		if (error)
1606 			return (error);
1607 	}
1608 
1609 	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
1610 		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1611 		if (error)
1612 			return (error);
1613 	}
1614 
1615 	return (0);
1616 }
1617 
1618 int
1619 kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg)
1620 {
1621 
1622 	return (kern_linkat(td, AT_FDCWD, AT_FDCWD, path,link, segflg, FOLLOW));
1623 }
1624 
1625 int
1626 kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
1627     enum uio_seg segflg, int follow)
1628 {
1629 	struct vnode *vp;
1630 	struct mount *mp;
1631 	struct nameidata nd;
1632 	int vfslocked;
1633 	int lvfslocked;
1634 	int error;
1635 
1636 	bwillwrite();
1637 	NDINIT_AT(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, segflg, path1,
1638 	    fd1, td);
1639 
1640 	if ((error = namei(&nd)) != 0)
1641 		return (error);
1642 	vfslocked = NDHASGIANT(&nd);
1643 	NDFREE(&nd, NDF_ONLY_PNBUF);
1644 	vp = nd.ni_vp;
1645 	if (vp->v_type == VDIR) {
1646 		vrele(vp);
1647 		VFS_UNLOCK_GIANT(vfslocked);
1648 		return (EPERM);		/* POSIX */
1649 	}
1650 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
1651 		vrele(vp);
1652 		VFS_UNLOCK_GIANT(vfslocked);
1653 		return (error);
1654 	}
1655 	NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE2,
1656 	    segflg, path2, fd2, td);
1657 	if ((error = namei(&nd)) == 0) {
1658 		lvfslocked = NDHASGIANT(&nd);
1659 		if (nd.ni_vp != NULL) {
1660 			if (nd.ni_dvp == nd.ni_vp)
1661 				vrele(nd.ni_dvp);
1662 			else
1663 				vput(nd.ni_dvp);
1664 			vrele(nd.ni_vp);
1665 			error = EEXIST;
1666 		} else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY))
1667 		    == 0) {
1668 			error = can_hardlink(vp, td->td_ucred);
1669 			if (error == 0)
1670 #ifdef MAC
1671 				error = mac_vnode_check_link(td->td_ucred,
1672 				    nd.ni_dvp, vp, &nd.ni_cnd);
1673 			if (error == 0)
1674 #endif
1675 				error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
1676 			VOP_UNLOCK(vp, 0);
1677 			vput(nd.ni_dvp);
1678 		}
1679 		NDFREE(&nd, NDF_ONLY_PNBUF);
1680 		VFS_UNLOCK_GIANT(lvfslocked);
1681 	}
1682 	vrele(vp);
1683 	vn_finished_write(mp);
1684 	VFS_UNLOCK_GIANT(vfslocked);
1685 	return (error);
1686 }
1687 
1688 /*
1689  * Make a symbolic link.
1690  */
1691 #ifndef _SYS_SYSPROTO_H_
1692 struct symlink_args {
1693 	char	*path;
1694 	char	*link;
1695 };
1696 #endif
1697 int
1698 sys_symlink(td, uap)
1699 	struct thread *td;
1700 	register struct symlink_args /* {
1701 		char *path;
1702 		char *link;
1703 	} */ *uap;
1704 {
1705 
1706 	return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE));
1707 }
1708 
1709 #ifndef _SYS_SYSPROTO_H_
1710 struct symlinkat_args {
1711 	char	*path;
1712 	int	fd;
1713 	char	*path2;
1714 };
1715 #endif
1716 int
1717 sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
1718 {
1719 
1720 	return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
1721 	    UIO_USERSPACE));
1722 }
1723 
1724 int
1725 kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg)
1726 {
1727 
1728 	return (kern_symlinkat(td, path, AT_FDCWD, link, segflg));
1729 }
1730 
1731 int
1732 kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
1733     enum uio_seg segflg)
1734 {
1735 	struct mount *mp;
1736 	struct vattr vattr;
1737 	char *syspath;
1738 	int error;
1739 	struct nameidata nd;
1740 	int vfslocked;
1741 
1742 	if (segflg == UIO_SYSSPACE) {
1743 		syspath = path1;
1744 	} else {
1745 		syspath = uma_zalloc(namei_zone, M_WAITOK);
1746 		if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
1747 			goto out;
1748 	}
1749 	AUDIT_ARG_TEXT(syspath);
1750 restart:
1751 	bwillwrite();
1752 	NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
1753 	    segflg, path2, fd, td);
1754 	if ((error = namei(&nd)) != 0)
1755 		goto out;
1756 	vfslocked = NDHASGIANT(&nd);
1757 	if (nd.ni_vp) {
1758 		NDFREE(&nd, NDF_ONLY_PNBUF);
1759 		if (nd.ni_vp == nd.ni_dvp)
1760 			vrele(nd.ni_dvp);
1761 		else
1762 			vput(nd.ni_dvp);
1763 		vrele(nd.ni_vp);
1764 		VFS_UNLOCK_GIANT(vfslocked);
1765 		error = EEXIST;
1766 		goto out;
1767 	}
1768 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1769 		NDFREE(&nd, NDF_ONLY_PNBUF);
1770 		vput(nd.ni_dvp);
1771 		VFS_UNLOCK_GIANT(vfslocked);
1772 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1773 			goto out;
1774 		goto restart;
1775 	}
1776 	VATTR_NULL(&vattr);
1777 	vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
1778 #ifdef MAC
1779 	vattr.va_type = VLNK;
1780 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1781 	    &vattr);
1782 	if (error)
1783 		goto out2;
1784 #endif
1785 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
1786 	if (error == 0)
1787 		vput(nd.ni_vp);
1788 #ifdef MAC
1789 out2:
1790 #endif
1791 	NDFREE(&nd, NDF_ONLY_PNBUF);
1792 	vput(nd.ni_dvp);
1793 	vn_finished_write(mp);
1794 	VFS_UNLOCK_GIANT(vfslocked);
1795 out:
1796 	if (segflg != UIO_SYSSPACE)
1797 		uma_zfree(namei_zone, syspath);
1798 	return (error);
1799 }
1800 
1801 /*
1802  * Delete a whiteout from the filesystem.
1803  */
1804 int
1805 sys_undelete(td, uap)
1806 	struct thread *td;
1807 	register struct undelete_args /* {
1808 		char *path;
1809 	} */ *uap;
1810 {
1811 	int error;
1812 	struct mount *mp;
1813 	struct nameidata nd;
1814 	int vfslocked;
1815 
1816 restart:
1817 	bwillwrite();
1818 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | MPSAFE | AUDITVNODE1,
1819 	    UIO_USERSPACE, uap->path, td);
1820 	error = namei(&nd);
1821 	if (error)
1822 		return (error);
1823 	vfslocked = NDHASGIANT(&nd);
1824 
1825 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
1826 		NDFREE(&nd, NDF_ONLY_PNBUF);
1827 		if (nd.ni_vp == nd.ni_dvp)
1828 			vrele(nd.ni_dvp);
1829 		else
1830 			vput(nd.ni_dvp);
1831 		if (nd.ni_vp)
1832 			vrele(nd.ni_vp);
1833 		VFS_UNLOCK_GIANT(vfslocked);
1834 		return (EEXIST);
1835 	}
1836 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1837 		NDFREE(&nd, NDF_ONLY_PNBUF);
1838 		vput(nd.ni_dvp);
1839 		VFS_UNLOCK_GIANT(vfslocked);
1840 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1841 			return (error);
1842 		goto restart;
1843 	}
1844 	error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
1845 	NDFREE(&nd, NDF_ONLY_PNBUF);
1846 	vput(nd.ni_dvp);
1847 	vn_finished_write(mp);
1848 	VFS_UNLOCK_GIANT(vfslocked);
1849 	return (error);
1850 }
1851 
1852 /*
1853  * Delete a name from the filesystem.
1854  */
1855 #ifndef _SYS_SYSPROTO_H_
1856 struct unlink_args {
1857 	char	*path;
1858 };
1859 #endif
1860 int
1861 sys_unlink(td, uap)
1862 	struct thread *td;
1863 	struct unlink_args /* {
1864 		char *path;
1865 	} */ *uap;
1866 {
1867 
1868 	return (kern_unlink(td, uap->path, UIO_USERSPACE));
1869 }
1870 
1871 #ifndef _SYS_SYSPROTO_H_
1872 struct unlinkat_args {
1873 	int	fd;
1874 	char	*path;
1875 	int	flag;
1876 };
1877 #endif
1878 int
1879 sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
1880 {
1881 	int flag = uap->flag;
1882 	int fd = uap->fd;
1883 	char *path = uap->path;
1884 
1885 	if (flag & ~AT_REMOVEDIR)
1886 		return (EINVAL);
1887 
1888 	if (flag & AT_REMOVEDIR)
1889 		return (kern_rmdirat(td, fd, path, UIO_USERSPACE));
1890 	else
1891 		return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0));
1892 }
1893 
1894 int
1895 kern_unlink(struct thread *td, char *path, enum uio_seg pathseg)
1896 {
1897 
1898 	return (kern_unlinkat(td, AT_FDCWD, path, pathseg, 0));
1899 }
1900 
1901 int
1902 kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1903     ino_t oldinum)
1904 {
1905 	struct mount *mp;
1906 	struct vnode *vp;
1907 	int error;
1908 	struct nameidata nd;
1909 	struct stat sb;
1910 	int vfslocked;
1911 
1912 restart:
1913 	bwillwrite();
1914 	NDINIT_AT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1,
1915 	    pathseg, path, fd, td);
1916 	if ((error = namei(&nd)) != 0)
1917 		return (error == EINVAL ? EPERM : error);
1918 	vfslocked = NDHASGIANT(&nd);
1919 	vp = nd.ni_vp;
1920 	if (vp->v_type == VDIR && oldinum == 0) {
1921 		error = EPERM;		/* POSIX */
1922 	} else if (oldinum != 0 &&
1923 		  ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
1924 		  sb.st_ino != oldinum) {
1925 			error = EIDRM;	/* Identifier removed */
1926 	} else {
1927 		/*
1928 		 * The root of a mounted filesystem cannot be deleted.
1929 		 *
1930 		 * XXX: can this only be a VDIR case?
1931 		 */
1932 		if (vp->v_vflag & VV_ROOT)
1933 			error = EBUSY;
1934 	}
1935 	if (error == 0) {
1936 		if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1937 			NDFREE(&nd, NDF_ONLY_PNBUF);
1938 			vput(nd.ni_dvp);
1939 			if (vp == nd.ni_dvp)
1940 				vrele(vp);
1941 			else
1942 				vput(vp);
1943 			VFS_UNLOCK_GIANT(vfslocked);
1944 			if ((error = vn_start_write(NULL, &mp,
1945 			    V_XSLEEP | PCATCH)) != 0)
1946 				return (error);
1947 			goto restart;
1948 		}
1949 #ifdef MAC
1950 		error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
1951 		    &nd.ni_cnd);
1952 		if (error)
1953 			goto out;
1954 #endif
1955 		error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
1956 #ifdef MAC
1957 out:
1958 #endif
1959 		vn_finished_write(mp);
1960 	}
1961 	NDFREE(&nd, NDF_ONLY_PNBUF);
1962 	vput(nd.ni_dvp);
1963 	if (vp == nd.ni_dvp)
1964 		vrele(vp);
1965 	else
1966 		vput(vp);
1967 	VFS_UNLOCK_GIANT(vfslocked);
1968 	return (error);
1969 }
1970 
1971 /*
1972  * Reposition read/write file offset.
1973  */
1974 #ifndef _SYS_SYSPROTO_H_
1975 struct lseek_args {
1976 	int	fd;
1977 	int	pad;
1978 	off_t	offset;
1979 	int	whence;
1980 };
1981 #endif
1982 int
1983 sys_lseek(td, uap)
1984 	struct thread *td;
1985 	register struct lseek_args /* {
1986 		int fd;
1987 		int pad;
1988 		off_t offset;
1989 		int whence;
1990 	} */ *uap;
1991 {
1992 	struct ucred *cred = td->td_ucred;
1993 	struct file *fp;
1994 	struct vnode *vp;
1995 	struct vattr vattr;
1996 	off_t offset, size;
1997 	int error, noneg;
1998 	int vfslocked;
1999 
2000 	AUDIT_ARG_FD(uap->fd);
2001 	if ((error = fget(td, uap->fd, CAP_SEEK, &fp)) != 0)
2002 		return (error);
2003 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) {
2004 		fdrop(fp, td);
2005 		return (ESPIPE);
2006 	}
2007 	vp = fp->f_vnode;
2008 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
2009 	noneg = (vp->v_type != VCHR);
2010 	offset = uap->offset;
2011 	switch (uap->whence) {
2012 	case L_INCR:
2013 		if (noneg &&
2014 		    (fp->f_offset < 0 ||
2015 		    (offset > 0 && fp->f_offset > OFF_MAX - offset))) {
2016 			error = EOVERFLOW;
2017 			break;
2018 		}
2019 		offset += fp->f_offset;
2020 		break;
2021 	case L_XTND:
2022 		vn_lock(vp, LK_SHARED | LK_RETRY);
2023 		error = VOP_GETATTR(vp, &vattr, cred);
2024 		VOP_UNLOCK(vp, 0);
2025 		if (error)
2026 			break;
2027 
2028 		/*
2029 		 * If the file references a disk device, then fetch
2030 		 * the media size and use that to determine the ending
2031 		 * offset.
2032 		 */
2033 		if (vattr.va_size == 0 && vp->v_type == VCHR &&
2034 		    fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
2035 			vattr.va_size = size;
2036 		if (noneg &&
2037 		    (vattr.va_size > OFF_MAX ||
2038 		    (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
2039 			error = EOVERFLOW;
2040 			break;
2041 		}
2042 		offset += vattr.va_size;
2043 		break;
2044 	case L_SET:
2045 		break;
2046 	case SEEK_DATA:
2047 		error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
2048 		break;
2049 	case SEEK_HOLE:
2050 		error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
2051 		break;
2052 	default:
2053 		error = EINVAL;
2054 	}
2055 	if (error == 0 && noneg && offset < 0)
2056 		error = EINVAL;
2057 	if (error != 0)
2058 		goto drop;
2059 	fp->f_offset = offset;
2060 	VFS_KNOTE_UNLOCKED(vp, 0);
2061 	*(off_t *)(td->td_retval) = fp->f_offset;
2062 drop:
2063 	fdrop(fp, td);
2064 	VFS_UNLOCK_GIANT(vfslocked);
2065 	return (error);
2066 }
2067 
2068 #if defined(COMPAT_43)
2069 /*
2070  * Reposition read/write file offset.
2071  */
2072 #ifndef _SYS_SYSPROTO_H_
2073 struct olseek_args {
2074 	int	fd;
2075 	long	offset;
2076 	int	whence;
2077 };
2078 #endif
2079 int
2080 olseek(td, uap)
2081 	struct thread *td;
2082 	register struct olseek_args /* {
2083 		int fd;
2084 		long offset;
2085 		int whence;
2086 	} */ *uap;
2087 {
2088 	struct lseek_args /* {
2089 		int fd;
2090 		int pad;
2091 		off_t offset;
2092 		int whence;
2093 	} */ nuap;
2094 
2095 	nuap.fd = uap->fd;
2096 	nuap.offset = uap->offset;
2097 	nuap.whence = uap->whence;
2098 	return (sys_lseek(td, &nuap));
2099 }
2100 #endif /* COMPAT_43 */
2101 
2102 /* Version with the 'pad' argument */
2103 int
2104 freebsd6_lseek(td, uap)
2105 	struct thread *td;
2106 	register struct freebsd6_lseek_args *uap;
2107 {
2108 	struct lseek_args ouap;
2109 
2110 	ouap.fd = uap->fd;
2111 	ouap.offset = uap->offset;
2112 	ouap.whence = uap->whence;
2113 	return (sys_lseek(td, &ouap));
2114 }
2115 
2116 /*
2117  * Check access permissions using passed credentials.
2118  */
2119 static int
2120 vn_access(vp, user_flags, cred, td)
2121 	struct vnode	*vp;
2122 	int		user_flags;
2123 	struct ucred	*cred;
2124 	struct thread	*td;
2125 {
2126 	int error;
2127 	accmode_t accmode;
2128 
2129 	/* Flags == 0 means only check for existence. */
2130 	error = 0;
2131 	if (user_flags) {
2132 		accmode = 0;
2133 		if (user_flags & R_OK)
2134 			accmode |= VREAD;
2135 		if (user_flags & W_OK)
2136 			accmode |= VWRITE;
2137 		if (user_flags & X_OK)
2138 			accmode |= VEXEC;
2139 #ifdef MAC
2140 		error = mac_vnode_check_access(cred, vp, accmode);
2141 		if (error)
2142 			return (error);
2143 #endif
2144 		if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
2145 			error = VOP_ACCESS(vp, accmode, cred, td);
2146 	}
2147 	return (error);
2148 }
2149 
2150 /*
2151  * Check access permissions using "real" credentials.
2152  */
2153 #ifndef _SYS_SYSPROTO_H_
2154 struct access_args {
2155 	char	*path;
2156 	int	amode;
2157 };
2158 #endif
2159 int
2160 sys_access(td, uap)
2161 	struct thread *td;
2162 	register struct access_args /* {
2163 		char *path;
2164 		int amode;
2165 	} */ *uap;
2166 {
2167 
2168 	return (kern_access(td, uap->path, UIO_USERSPACE, uap->amode));
2169 }
2170 
2171 #ifndef _SYS_SYSPROTO_H_
2172 struct faccessat_args {
2173 	int	dirfd;
2174 	char	*path;
2175 	int	amode;
2176 	int	flag;
2177 }
2178 #endif
2179 int
2180 sys_faccessat(struct thread *td, struct faccessat_args *uap)
2181 {
2182 
2183 	if (uap->flag & ~AT_EACCESS)
2184 		return (EINVAL);
2185 	return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
2186 	    uap->amode));
2187 }
2188 
2189 int
2190 kern_access(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2191 {
2192 
2193 	return (kern_accessat(td, AT_FDCWD, path, pathseg, 0, amode));
2194 }
2195 
2196 int
2197 kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2198     int flag, int amode)
2199 {
2200 	struct ucred *cred, *tmpcred;
2201 	struct vnode *vp;
2202 	struct nameidata nd;
2203 	int vfslocked;
2204 	int error;
2205 
2206 	/*
2207 	 * Create and modify a temporary credential instead of one that
2208 	 * is potentially shared.
2209 	 */
2210 	if (!(flag & AT_EACCESS)) {
2211 		cred = td->td_ucred;
2212 		tmpcred = crdup(cred);
2213 		tmpcred->cr_uid = cred->cr_ruid;
2214 		tmpcred->cr_groups[0] = cred->cr_rgid;
2215 		td->td_ucred = tmpcred;
2216 	} else
2217 		cred = tmpcred = td->td_ucred;
2218 	AUDIT_ARG_VALUE(amode);
2219 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
2220 	    AUDITVNODE1, pathseg, path, fd, CAP_FSTAT, td);
2221 	if ((error = namei(&nd)) != 0)
2222 		goto out1;
2223 	vfslocked = NDHASGIANT(&nd);
2224 	vp = nd.ni_vp;
2225 
2226 	error = vn_access(vp, amode, tmpcred, td);
2227 	NDFREE(&nd, NDF_ONLY_PNBUF);
2228 	vput(vp);
2229 	VFS_UNLOCK_GIANT(vfslocked);
2230 out1:
2231 	if (!(flag & AT_EACCESS)) {
2232 		td->td_ucred = cred;
2233 		crfree(tmpcred);
2234 	}
2235 	return (error);
2236 }
2237 
2238 /*
2239  * Check access permissions using "effective" credentials.
2240  */
2241 #ifndef _SYS_SYSPROTO_H_
2242 struct eaccess_args {
2243 	char	*path;
2244 	int	amode;
2245 };
2246 #endif
2247 int
2248 sys_eaccess(td, uap)
2249 	struct thread *td;
2250 	register struct eaccess_args /* {
2251 		char *path;
2252 		int amode;
2253 	} */ *uap;
2254 {
2255 
2256 	return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->amode));
2257 }
2258 
2259 int
2260 kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2261 {
2262 
2263 	return (kern_accessat(td, AT_FDCWD, path, pathseg, AT_EACCESS, amode));
2264 }
2265 
2266 #if defined(COMPAT_43)
2267 /*
2268  * Get file status; this version follows links.
2269  */
2270 #ifndef _SYS_SYSPROTO_H_
2271 struct ostat_args {
2272 	char	*path;
2273 	struct ostat *ub;
2274 };
2275 #endif
2276 int
2277 ostat(td, uap)
2278 	struct thread *td;
2279 	register struct ostat_args /* {
2280 		char *path;
2281 		struct ostat *ub;
2282 	} */ *uap;
2283 {
2284 	struct stat sb;
2285 	struct ostat osb;
2286 	int error;
2287 
2288 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2289 	if (error)
2290 		return (error);
2291 	cvtstat(&sb, &osb);
2292 	error = copyout(&osb, uap->ub, sizeof (osb));
2293 	return (error);
2294 }
2295 
2296 /*
2297  * Get file status; this version does not follow links.
2298  */
2299 #ifndef _SYS_SYSPROTO_H_
2300 struct olstat_args {
2301 	char	*path;
2302 	struct ostat *ub;
2303 };
2304 #endif
2305 int
2306 olstat(td, uap)
2307 	struct thread *td;
2308 	register struct olstat_args /* {
2309 		char *path;
2310 		struct ostat *ub;
2311 	} */ *uap;
2312 {
2313 	struct stat sb;
2314 	struct ostat osb;
2315 	int error;
2316 
2317 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2318 	if (error)
2319 		return (error);
2320 	cvtstat(&sb, &osb);
2321 	error = copyout(&osb, uap->ub, sizeof (osb));
2322 	return (error);
2323 }
2324 
2325 /*
2326  * Convert from an old to a new stat structure.
2327  */
2328 void
2329 cvtstat(st, ost)
2330 	struct stat *st;
2331 	struct ostat *ost;
2332 {
2333 
2334 	ost->st_dev = st->st_dev;
2335 	ost->st_ino = st->st_ino;
2336 	ost->st_mode = st->st_mode;
2337 	ost->st_nlink = st->st_nlink;
2338 	ost->st_uid = st->st_uid;
2339 	ost->st_gid = st->st_gid;
2340 	ost->st_rdev = st->st_rdev;
2341 	if (st->st_size < (quad_t)1 << 32)
2342 		ost->st_size = st->st_size;
2343 	else
2344 		ost->st_size = -2;
2345 	ost->st_atim = st->st_atim;
2346 	ost->st_mtim = st->st_mtim;
2347 	ost->st_ctim = st->st_ctim;
2348 	ost->st_blksize = st->st_blksize;
2349 	ost->st_blocks = st->st_blocks;
2350 	ost->st_flags = st->st_flags;
2351 	ost->st_gen = st->st_gen;
2352 }
2353 #endif /* COMPAT_43 */
2354 
2355 /*
2356  * Get file status; this version follows links.
2357  */
2358 #ifndef _SYS_SYSPROTO_H_
2359 struct stat_args {
2360 	char	*path;
2361 	struct stat *ub;
2362 };
2363 #endif
2364 int
2365 sys_stat(td, uap)
2366 	struct thread *td;
2367 	register struct stat_args /* {
2368 		char *path;
2369 		struct stat *ub;
2370 	} */ *uap;
2371 {
2372 	struct stat sb;
2373 	int error;
2374 
2375 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2376 	if (error == 0)
2377 		error = copyout(&sb, uap->ub, sizeof (sb));
2378 	return (error);
2379 }
2380 
2381 #ifndef _SYS_SYSPROTO_H_
2382 struct fstatat_args {
2383 	int	fd;
2384 	char	*path;
2385 	struct stat	*buf;
2386 	int	flag;
2387 }
2388 #endif
2389 int
2390 sys_fstatat(struct thread *td, struct fstatat_args *uap)
2391 {
2392 	struct stat sb;
2393 	int error;
2394 
2395 	error = kern_statat(td, uap->flag, uap->fd, uap->path,
2396 	    UIO_USERSPACE, &sb);
2397 	if (error == 0)
2398 		error = copyout(&sb, uap->buf, sizeof (sb));
2399 	return (error);
2400 }
2401 
2402 int
2403 kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2404 {
2405 
2406 	return (kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp));
2407 }
2408 
2409 int
2410 kern_statat(struct thread *td, int flag, int fd, char *path,
2411     enum uio_seg pathseg, struct stat *sbp)
2412 {
2413 
2414 	return (kern_statat_vnhook(td, flag, fd, path, pathseg, sbp, NULL));
2415 }
2416 
2417 int
2418 kern_statat_vnhook(struct thread *td, int flag, int fd, char *path,
2419     enum uio_seg pathseg, struct stat *sbp,
2420     void (*hook)(struct vnode *vp, struct stat *sbp))
2421 {
2422 	struct nameidata nd;
2423 	struct stat sb;
2424 	int error, vfslocked;
2425 
2426 	if (flag & ~AT_SYMLINK_NOFOLLOW)
2427 		return (EINVAL);
2428 
2429 	NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
2430 	    FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1 | MPSAFE, pathseg,
2431 	    path, fd, CAP_FSTAT, td);
2432 
2433 	if ((error = namei(&nd)) != 0)
2434 		return (error);
2435 	vfslocked = NDHASGIANT(&nd);
2436 	error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
2437 	if (!error) {
2438 		SDT_PROBE(vfs, , stat, mode, path, sb.st_mode, 0, 0, 0);
2439 		if (S_ISREG(sb.st_mode))
2440 			SDT_PROBE(vfs, , stat, reg, path, pathseg, 0, 0, 0);
2441 		if (__predict_false(hook != NULL))
2442 			hook(nd.ni_vp, &sb);
2443 	}
2444 	NDFREE(&nd, NDF_ONLY_PNBUF);
2445 	vput(nd.ni_vp);
2446 	VFS_UNLOCK_GIANT(vfslocked);
2447 	if (error)
2448 		return (error);
2449 	*sbp = sb;
2450 #ifdef KTRACE
2451 	if (KTRPOINT(td, KTR_STRUCT))
2452 		ktrstat(&sb);
2453 #endif
2454 	return (0);
2455 }
2456 
2457 /*
2458  * Get file status; this version does not follow links.
2459  */
2460 #ifndef _SYS_SYSPROTO_H_
2461 struct lstat_args {
2462 	char	*path;
2463 	struct stat *ub;
2464 };
2465 #endif
2466 int
2467 sys_lstat(td, uap)
2468 	struct thread *td;
2469 	register struct lstat_args /* {
2470 		char *path;
2471 		struct stat *ub;
2472 	} */ *uap;
2473 {
2474 	struct stat sb;
2475 	int error;
2476 
2477 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2478 	if (error == 0)
2479 		error = copyout(&sb, uap->ub, sizeof (sb));
2480 	return (error);
2481 }
2482 
2483 int
2484 kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2485 {
2486 
2487 	return (kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg,
2488 	    sbp));
2489 }
2490 
2491 /*
2492  * Implementation of the NetBSD [l]stat() functions.
2493  */
2494 void
2495 cvtnstat(sb, nsb)
2496 	struct stat *sb;
2497 	struct nstat *nsb;
2498 {
2499 	bzero(nsb, sizeof *nsb);
2500 	nsb->st_dev = sb->st_dev;
2501 	nsb->st_ino = sb->st_ino;
2502 	nsb->st_mode = sb->st_mode;
2503 	nsb->st_nlink = sb->st_nlink;
2504 	nsb->st_uid = sb->st_uid;
2505 	nsb->st_gid = sb->st_gid;
2506 	nsb->st_rdev = sb->st_rdev;
2507 	nsb->st_atim = sb->st_atim;
2508 	nsb->st_mtim = sb->st_mtim;
2509 	nsb->st_ctim = sb->st_ctim;
2510 	nsb->st_size = sb->st_size;
2511 	nsb->st_blocks = sb->st_blocks;
2512 	nsb->st_blksize = sb->st_blksize;
2513 	nsb->st_flags = sb->st_flags;
2514 	nsb->st_gen = sb->st_gen;
2515 	nsb->st_birthtim = sb->st_birthtim;
2516 }
2517 
2518 #ifndef _SYS_SYSPROTO_H_
2519 struct nstat_args {
2520 	char	*path;
2521 	struct nstat *ub;
2522 };
2523 #endif
2524 int
2525 sys_nstat(td, uap)
2526 	struct thread *td;
2527 	register struct nstat_args /* {
2528 		char *path;
2529 		struct nstat *ub;
2530 	} */ *uap;
2531 {
2532 	struct stat sb;
2533 	struct nstat nsb;
2534 	int error;
2535 
2536 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2537 	if (error)
2538 		return (error);
2539 	cvtnstat(&sb, &nsb);
2540 	error = copyout(&nsb, uap->ub, sizeof (nsb));
2541 	return (error);
2542 }
2543 
2544 /*
2545  * NetBSD lstat.  Get file status; this version does not follow links.
2546  */
2547 #ifndef _SYS_SYSPROTO_H_
2548 struct lstat_args {
2549 	char	*path;
2550 	struct stat *ub;
2551 };
2552 #endif
2553 int
2554 sys_nlstat(td, uap)
2555 	struct thread *td;
2556 	register struct nlstat_args /* {
2557 		char *path;
2558 		struct nstat *ub;
2559 	} */ *uap;
2560 {
2561 	struct stat sb;
2562 	struct nstat nsb;
2563 	int error;
2564 
2565 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2566 	if (error)
2567 		return (error);
2568 	cvtnstat(&sb, &nsb);
2569 	error = copyout(&nsb, uap->ub, sizeof (nsb));
2570 	return (error);
2571 }
2572 
2573 /*
2574  * Get configurable pathname variables.
2575  */
2576 #ifndef _SYS_SYSPROTO_H_
2577 struct pathconf_args {
2578 	char	*path;
2579 	int	name;
2580 };
2581 #endif
2582 int
2583 sys_pathconf(td, uap)
2584 	struct thread *td;
2585 	register struct pathconf_args /* {
2586 		char *path;
2587 		int name;
2588 	} */ *uap;
2589 {
2590 
2591 	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW));
2592 }
2593 
2594 #ifndef _SYS_SYSPROTO_H_
2595 struct lpathconf_args {
2596 	char	*path;
2597 	int	name;
2598 };
2599 #endif
2600 int
2601 sys_lpathconf(td, uap)
2602 	struct thread *td;
2603 	register struct lpathconf_args /* {
2604 		char *path;
2605 		int name;
2606 	} */ *uap;
2607 {
2608 
2609 	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, NOFOLLOW));
2610 }
2611 
2612 int
2613 kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
2614     u_long flags)
2615 {
2616 	struct nameidata nd;
2617 	int error, vfslocked;
2618 
2619 	NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1 |
2620 	    flags, pathseg, path, td);
2621 	if ((error = namei(&nd)) != 0)
2622 		return (error);
2623 	vfslocked = NDHASGIANT(&nd);
2624 	NDFREE(&nd, NDF_ONLY_PNBUF);
2625 
2626 	/* If asynchronous I/O is available, it works for all files. */
2627 	if (name == _PC_ASYNC_IO)
2628 		td->td_retval[0] = async_io_version;
2629 	else
2630 		error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
2631 	vput(nd.ni_vp);
2632 	VFS_UNLOCK_GIANT(vfslocked);
2633 	return (error);
2634 }
2635 
2636 /*
2637  * Return target name of a symbolic link.
2638  */
2639 #ifndef _SYS_SYSPROTO_H_
2640 struct readlink_args {
2641 	char	*path;
2642 	char	*buf;
2643 	size_t	count;
2644 };
2645 #endif
2646 int
2647 sys_readlink(td, uap)
2648 	struct thread *td;
2649 	register struct readlink_args /* {
2650 		char *path;
2651 		char *buf;
2652 		size_t count;
2653 	} */ *uap;
2654 {
2655 
2656 	return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf,
2657 	    UIO_USERSPACE, uap->count));
2658 }
2659 #ifndef _SYS_SYSPROTO_H_
2660 struct readlinkat_args {
2661 	int	fd;
2662 	char	*path;
2663 	char	*buf;
2664 	size_t	bufsize;
2665 };
2666 #endif
2667 int
2668 sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
2669 {
2670 
2671 	return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
2672 	    uap->buf, UIO_USERSPACE, uap->bufsize));
2673 }
2674 
2675 int
2676 kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf,
2677     enum uio_seg bufseg, size_t count)
2678 {
2679 
2680 	return (kern_readlinkat(td, AT_FDCWD, path, pathseg, buf, bufseg,
2681 	    count));
2682 }
2683 
2684 int
2685 kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2686     char *buf, enum uio_seg bufseg, size_t count)
2687 {
2688 	struct vnode *vp;
2689 	struct iovec aiov;
2690 	struct uio auio;
2691 	int error;
2692 	struct nameidata nd;
2693 	int vfslocked;
2694 
2695 	if (count > IOSIZE_MAX)
2696 		return (EINVAL);
2697 
2698 	NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
2699 	    AUDITVNODE1, pathseg, path, fd, td);
2700 
2701 	if ((error = namei(&nd)) != 0)
2702 		return (error);
2703 	NDFREE(&nd, NDF_ONLY_PNBUF);
2704 	vfslocked = NDHASGIANT(&nd);
2705 	vp = nd.ni_vp;
2706 #ifdef MAC
2707 	error = mac_vnode_check_readlink(td->td_ucred, vp);
2708 	if (error) {
2709 		vput(vp);
2710 		VFS_UNLOCK_GIANT(vfslocked);
2711 		return (error);
2712 	}
2713 #endif
2714 	if (vp->v_type != VLNK)
2715 		error = EINVAL;
2716 	else {
2717 		aiov.iov_base = buf;
2718 		aiov.iov_len = count;
2719 		auio.uio_iov = &aiov;
2720 		auio.uio_iovcnt = 1;
2721 		auio.uio_offset = 0;
2722 		auio.uio_rw = UIO_READ;
2723 		auio.uio_segflg = bufseg;
2724 		auio.uio_td = td;
2725 		auio.uio_resid = count;
2726 		error = VOP_READLINK(vp, &auio, td->td_ucred);
2727 	}
2728 	vput(vp);
2729 	VFS_UNLOCK_GIANT(vfslocked);
2730 	td->td_retval[0] = count - auio.uio_resid;
2731 	return (error);
2732 }
2733 
2734 /*
2735  * Common implementation code for chflags() and fchflags().
2736  */
2737 static int
2738 setfflags(td, vp, flags)
2739 	struct thread *td;
2740 	struct vnode *vp;
2741 	int flags;
2742 {
2743 	int error;
2744 	struct mount *mp;
2745 	struct vattr vattr;
2746 
2747 	/*
2748 	 * Prevent non-root users from setting flags on devices.  When
2749 	 * a device is reused, users can retain ownership of the device
2750 	 * if they are allowed to set flags and programs assume that
2751 	 * chown can't fail when done as root.
2752 	 */
2753 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
2754 		error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
2755 		if (error)
2756 			return (error);
2757 	}
2758 
2759 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2760 		return (error);
2761 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2762 	VATTR_NULL(&vattr);
2763 	vattr.va_flags = flags;
2764 #ifdef MAC
2765 	error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
2766 	if (error == 0)
2767 #endif
2768 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2769 	VOP_UNLOCK(vp, 0);
2770 	vn_finished_write(mp);
2771 	return (error);
2772 }
2773 
2774 /*
2775  * Change flags of a file given a path name.
2776  */
2777 #ifndef _SYS_SYSPROTO_H_
2778 struct chflags_args {
2779 	char	*path;
2780 	int	flags;
2781 };
2782 #endif
2783 int
2784 sys_chflags(td, uap)
2785 	struct thread *td;
2786 	register struct chflags_args /* {
2787 		char *path;
2788 		int flags;
2789 	} */ *uap;
2790 {
2791 	int error;
2792 	struct nameidata nd;
2793 	int vfslocked;
2794 
2795 	AUDIT_ARG_FFLAGS(uap->flags);
2796 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
2797 	    uap->path, td);
2798 	if ((error = namei(&nd)) != 0)
2799 		return (error);
2800 	NDFREE(&nd, NDF_ONLY_PNBUF);
2801 	vfslocked = NDHASGIANT(&nd);
2802 	error = setfflags(td, nd.ni_vp, uap->flags);
2803 	vrele(nd.ni_vp);
2804 	VFS_UNLOCK_GIANT(vfslocked);
2805 	return (error);
2806 }
2807 
2808 /*
2809  * Same as chflags() but doesn't follow symlinks.
2810  */
2811 int
2812 sys_lchflags(td, uap)
2813 	struct thread *td;
2814 	register struct lchflags_args /* {
2815 		char *path;
2816 		int flags;
2817 	} */ *uap;
2818 {
2819 	int error;
2820 	struct nameidata nd;
2821 	int vfslocked;
2822 
2823 	AUDIT_ARG_FFLAGS(uap->flags);
2824 	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
2825 	    uap->path, td);
2826 	if ((error = namei(&nd)) != 0)
2827 		return (error);
2828 	vfslocked = NDHASGIANT(&nd);
2829 	NDFREE(&nd, NDF_ONLY_PNBUF);
2830 	error = setfflags(td, nd.ni_vp, uap->flags);
2831 	vrele(nd.ni_vp);
2832 	VFS_UNLOCK_GIANT(vfslocked);
2833 	return (error);
2834 }
2835 
2836 /*
2837  * Change flags of a file given a file descriptor.
2838  */
2839 #ifndef _SYS_SYSPROTO_H_
2840 struct fchflags_args {
2841 	int	fd;
2842 	int	flags;
2843 };
2844 #endif
2845 int
2846 sys_fchflags(td, uap)
2847 	struct thread *td;
2848 	register struct fchflags_args /* {
2849 		int fd;
2850 		int flags;
2851 	} */ *uap;
2852 {
2853 	struct file *fp;
2854 	int vfslocked;
2855 	int error;
2856 
2857 	AUDIT_ARG_FD(uap->fd);
2858 	AUDIT_ARG_FFLAGS(uap->flags);
2859 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_FCHFLAGS,
2860 	    &fp)) != 0)
2861 		return (error);
2862 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
2863 #ifdef AUDIT
2864 	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2865 	AUDIT_ARG_VNODE1(fp->f_vnode);
2866 	VOP_UNLOCK(fp->f_vnode, 0);
2867 #endif
2868 	error = setfflags(td, fp->f_vnode, uap->flags);
2869 	VFS_UNLOCK_GIANT(vfslocked);
2870 	fdrop(fp, td);
2871 	return (error);
2872 }
2873 
2874 /*
2875  * Common implementation code for chmod(), lchmod() and fchmod().
2876  */
2877 int
2878 setfmode(td, cred, vp, mode)
2879 	struct thread *td;
2880 	struct ucred *cred;
2881 	struct vnode *vp;
2882 	int mode;
2883 {
2884 	int error;
2885 	struct mount *mp;
2886 	struct vattr vattr;
2887 
2888 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2889 		return (error);
2890 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2891 	VATTR_NULL(&vattr);
2892 	vattr.va_mode = mode & ALLPERMS;
2893 #ifdef MAC
2894 	error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
2895 	if (error == 0)
2896 #endif
2897 		error = VOP_SETATTR(vp, &vattr, cred);
2898 	VOP_UNLOCK(vp, 0);
2899 	vn_finished_write(mp);
2900 	return (error);
2901 }
2902 
2903 /*
2904  * Change mode of a file given path name.
2905  */
2906 #ifndef _SYS_SYSPROTO_H_
2907 struct chmod_args {
2908 	char	*path;
2909 	int	mode;
2910 };
2911 #endif
2912 int
2913 sys_chmod(td, uap)
2914 	struct thread *td;
2915 	register struct chmod_args /* {
2916 		char *path;
2917 		int mode;
2918 	} */ *uap;
2919 {
2920 
2921 	return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode));
2922 }
2923 
2924 #ifndef _SYS_SYSPROTO_H_
2925 struct fchmodat_args {
2926 	int	dirfd;
2927 	char	*path;
2928 	mode_t	mode;
2929 	int	flag;
2930 }
2931 #endif
2932 int
2933 sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
2934 {
2935 	int flag = uap->flag;
2936 	int fd = uap->fd;
2937 	char *path = uap->path;
2938 	mode_t mode = uap->mode;
2939 
2940 	if (flag & ~AT_SYMLINK_NOFOLLOW)
2941 		return (EINVAL);
2942 
2943 	return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
2944 }
2945 
2946 int
2947 kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode)
2948 {
2949 
2950 	return (kern_fchmodat(td, AT_FDCWD, path, pathseg, mode, 0));
2951 }
2952 
2953 /*
2954  * Change mode of a file given path name (don't follow links.)
2955  */
2956 #ifndef _SYS_SYSPROTO_H_
2957 struct lchmod_args {
2958 	char	*path;
2959 	int	mode;
2960 };
2961 #endif
2962 int
2963 sys_lchmod(td, uap)
2964 	struct thread *td;
2965 	register struct lchmod_args /* {
2966 		char *path;
2967 		int mode;
2968 	} */ *uap;
2969 {
2970 
2971 	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2972 	    uap->mode, AT_SYMLINK_NOFOLLOW));
2973 }
2974 
2975 
2976 int
2977 kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2978     mode_t mode, int flag)
2979 {
2980 	int error;
2981 	struct nameidata nd;
2982 	int vfslocked;
2983 	int follow;
2984 
2985 	AUDIT_ARG_MODE(mode);
2986 	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2987 	NDINIT_ATRIGHTS(&nd, LOOKUP,  follow | MPSAFE | AUDITVNODE1, pathseg,
2988 	    path, fd, CAP_FCHMOD, td);
2989 	if ((error = namei(&nd)) != 0)
2990 		return (error);
2991 	vfslocked = NDHASGIANT(&nd);
2992 	NDFREE(&nd, NDF_ONLY_PNBUF);
2993 	error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
2994 	vrele(nd.ni_vp);
2995 	VFS_UNLOCK_GIANT(vfslocked);
2996 	return (error);
2997 }
2998 
2999 /*
3000  * Change mode of a file given a file descriptor.
3001  */
3002 #ifndef _SYS_SYSPROTO_H_
3003 struct fchmod_args {
3004 	int	fd;
3005 	int	mode;
3006 };
3007 #endif
3008 int
3009 sys_fchmod(struct thread *td, struct fchmod_args *uap)
3010 {
3011 	struct file *fp;
3012 	int error;
3013 
3014 	AUDIT_ARG_FD(uap->fd);
3015 	AUDIT_ARG_MODE(uap->mode);
3016 
3017 	error = fget(td, uap->fd, CAP_FCHMOD, &fp);
3018 	if (error != 0)
3019 		return (error);
3020 	error = fo_chmod(fp, uap->mode, td->td_ucred, td);
3021 	fdrop(fp, td);
3022 	return (error);
3023 }
3024 
3025 /*
3026  * Common implementation for chown(), lchown(), and fchown()
3027  */
3028 int
3029 setfown(td, cred, vp, uid, gid)
3030 	struct thread *td;
3031 	struct ucred *cred;
3032 	struct vnode *vp;
3033 	uid_t uid;
3034 	gid_t gid;
3035 {
3036 	int error;
3037 	struct mount *mp;
3038 	struct vattr vattr;
3039 
3040 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3041 		return (error);
3042 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3043 	VATTR_NULL(&vattr);
3044 	vattr.va_uid = uid;
3045 	vattr.va_gid = gid;
3046 #ifdef MAC
3047 	error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
3048 	    vattr.va_gid);
3049 	if (error == 0)
3050 #endif
3051 		error = VOP_SETATTR(vp, &vattr, cred);
3052 	VOP_UNLOCK(vp, 0);
3053 	vn_finished_write(mp);
3054 	return (error);
3055 }
3056 
3057 /*
3058  * Set ownership given a path name.
3059  */
3060 #ifndef _SYS_SYSPROTO_H_
3061 struct chown_args {
3062 	char	*path;
3063 	int	uid;
3064 	int	gid;
3065 };
3066 #endif
3067 int
3068 sys_chown(td, uap)
3069 	struct thread *td;
3070 	register struct chown_args /* {
3071 		char *path;
3072 		int uid;
3073 		int gid;
3074 	} */ *uap;
3075 {
3076 
3077 	return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
3078 }
3079 
3080 #ifndef _SYS_SYSPROTO_H_
3081 struct fchownat_args {
3082 	int fd;
3083 	const char * path;
3084 	uid_t uid;
3085 	gid_t gid;
3086 	int flag;
3087 };
3088 #endif
3089 int
3090 sys_fchownat(struct thread *td, struct fchownat_args *uap)
3091 {
3092 	int flag;
3093 
3094 	flag = uap->flag;
3095 	if (flag & ~AT_SYMLINK_NOFOLLOW)
3096 		return (EINVAL);
3097 
3098 	return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
3099 	    uap->gid, uap->flag));
3100 }
3101 
3102 int
3103 kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
3104     int gid)
3105 {
3106 
3107 	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid, 0));
3108 }
3109 
3110 int
3111 kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3112     int uid, int gid, int flag)
3113 {
3114 	struct nameidata nd;
3115 	int error, vfslocked, follow;
3116 
3117 	AUDIT_ARG_OWNER(uid, gid);
3118 	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
3119 	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, pathseg,
3120 	    path, fd, CAP_FCHOWN, td);
3121 
3122 	if ((error = namei(&nd)) != 0)
3123 		return (error);
3124 	vfslocked = NDHASGIANT(&nd);
3125 	NDFREE(&nd, NDF_ONLY_PNBUF);
3126 	error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
3127 	vrele(nd.ni_vp);
3128 	VFS_UNLOCK_GIANT(vfslocked);
3129 	return (error);
3130 }
3131 
3132 /*
3133  * Set ownership given a path name, do not cross symlinks.
3134  */
3135 #ifndef _SYS_SYSPROTO_H_
3136 struct lchown_args {
3137 	char	*path;
3138 	int	uid;
3139 	int	gid;
3140 };
3141 #endif
3142 int
3143 sys_lchown(td, uap)
3144 	struct thread *td;
3145 	register struct lchown_args /* {
3146 		char *path;
3147 		int uid;
3148 		int gid;
3149 	} */ *uap;
3150 {
3151 
3152 	return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
3153 }
3154 
3155 int
3156 kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
3157     int gid)
3158 {
3159 
3160 	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid,
3161 	    AT_SYMLINK_NOFOLLOW));
3162 }
3163 
3164 /*
3165  * Set ownership given a file descriptor.
3166  */
3167 #ifndef _SYS_SYSPROTO_H_
3168 struct fchown_args {
3169 	int	fd;
3170 	int	uid;
3171 	int	gid;
3172 };
3173 #endif
3174 int
3175 sys_fchown(td, uap)
3176 	struct thread *td;
3177 	register struct fchown_args /* {
3178 		int fd;
3179 		int uid;
3180 		int gid;
3181 	} */ *uap;
3182 {
3183 	struct file *fp;
3184 	int error;
3185 
3186 	AUDIT_ARG_FD(uap->fd);
3187 	AUDIT_ARG_OWNER(uap->uid, uap->gid);
3188 	error = fget(td, uap->fd, CAP_FCHOWN, &fp);
3189 	if (error != 0)
3190 		return (error);
3191 	error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
3192 	fdrop(fp, td);
3193 	return (error);
3194 }
3195 
3196 /*
3197  * Common implementation code for utimes(), lutimes(), and futimes().
3198  */
3199 static int
3200 getutimes(usrtvp, tvpseg, tsp)
3201 	const struct timeval *usrtvp;
3202 	enum uio_seg tvpseg;
3203 	struct timespec *tsp;
3204 {
3205 	struct timeval tv[2];
3206 	const struct timeval *tvp;
3207 	int error;
3208 
3209 	if (usrtvp == NULL) {
3210 		vfs_timestamp(&tsp[0]);
3211 		tsp[1] = tsp[0];
3212 	} else {
3213 		if (tvpseg == UIO_SYSSPACE) {
3214 			tvp = usrtvp;
3215 		} else {
3216 			if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
3217 				return (error);
3218 			tvp = tv;
3219 		}
3220 
3221 		if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
3222 		    tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
3223 			return (EINVAL);
3224 		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3225 		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3226 	}
3227 	return (0);
3228 }
3229 
3230 /*
3231  * Common implementation code for utimes(), lutimes(), and futimes().
3232  */
3233 static int
3234 setutimes(td, vp, ts, numtimes, nullflag)
3235 	struct thread *td;
3236 	struct vnode *vp;
3237 	const struct timespec *ts;
3238 	int numtimes;
3239 	int nullflag;
3240 {
3241 	int error, setbirthtime;
3242 	struct mount *mp;
3243 	struct vattr vattr;
3244 
3245 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3246 		return (error);
3247 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3248 	setbirthtime = 0;
3249 	if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
3250 	    timespeccmp(&ts[1], &vattr.va_birthtime, < ))
3251 		setbirthtime = 1;
3252 	VATTR_NULL(&vattr);
3253 	vattr.va_atime = ts[0];
3254 	vattr.va_mtime = ts[1];
3255 	if (setbirthtime)
3256 		vattr.va_birthtime = ts[1];
3257 	if (numtimes > 2)
3258 		vattr.va_birthtime = ts[2];
3259 	if (nullflag)
3260 		vattr.va_vaflags |= VA_UTIMES_NULL;
3261 #ifdef MAC
3262 	error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
3263 	    vattr.va_mtime);
3264 #endif
3265 	if (error == 0)
3266 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3267 	VOP_UNLOCK(vp, 0);
3268 	vn_finished_write(mp);
3269 	return (error);
3270 }
3271 
3272 /*
3273  * Set the access and modification times of a file.
3274  */
3275 #ifndef _SYS_SYSPROTO_H_
3276 struct utimes_args {
3277 	char	*path;
3278 	struct	timeval *tptr;
3279 };
3280 #endif
3281 int
3282 sys_utimes(td, uap)
3283 	struct thread *td;
3284 	register struct utimes_args /* {
3285 		char *path;
3286 		struct timeval *tptr;
3287 	} */ *uap;
3288 {
3289 
3290 	return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3291 	    UIO_USERSPACE));
3292 }
3293 
3294 #ifndef _SYS_SYSPROTO_H_
3295 struct futimesat_args {
3296 	int fd;
3297 	const char * path;
3298 	const struct timeval * times;
3299 };
3300 #endif
3301 int
3302 sys_futimesat(struct thread *td, struct futimesat_args *uap)
3303 {
3304 
3305 	return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
3306 	    uap->times, UIO_USERSPACE));
3307 }
3308 
3309 int
3310 kern_utimes(struct thread *td, char *path, enum uio_seg pathseg,
3311     struct timeval *tptr, enum uio_seg tptrseg)
3312 {
3313 
3314 	return (kern_utimesat(td, AT_FDCWD, path, pathseg, tptr, tptrseg));
3315 }
3316 
3317 int
3318 kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3319     struct timeval *tptr, enum uio_seg tptrseg)
3320 {
3321 	struct nameidata nd;
3322 	struct timespec ts[2];
3323 	int error, vfslocked;
3324 
3325 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3326 		return (error);
3327 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg,
3328 	    path, fd, CAP_FUTIMES, td);
3329 
3330 	if ((error = namei(&nd)) != 0)
3331 		return (error);
3332 	vfslocked = NDHASGIANT(&nd);
3333 	NDFREE(&nd, NDF_ONLY_PNBUF);
3334 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3335 	vrele(nd.ni_vp);
3336 	VFS_UNLOCK_GIANT(vfslocked);
3337 	return (error);
3338 }
3339 
3340 /*
3341  * Set the access and modification times of a file.
3342  */
3343 #ifndef _SYS_SYSPROTO_H_
3344 struct lutimes_args {
3345 	char	*path;
3346 	struct	timeval *tptr;
3347 };
3348 #endif
3349 int
3350 sys_lutimes(td, uap)
3351 	struct thread *td;
3352 	register struct lutimes_args /* {
3353 		char *path;
3354 		struct timeval *tptr;
3355 	} */ *uap;
3356 {
3357 
3358 	return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3359 	    UIO_USERSPACE));
3360 }
3361 
3362 int
3363 kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
3364     struct timeval *tptr, enum uio_seg tptrseg)
3365 {
3366 	struct timespec ts[2];
3367 	int error;
3368 	struct nameidata nd;
3369 	int vfslocked;
3370 
3371 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3372 		return (error);
3373 	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
3374 	if ((error = namei(&nd)) != 0)
3375 		return (error);
3376 	vfslocked = NDHASGIANT(&nd);
3377 	NDFREE(&nd, NDF_ONLY_PNBUF);
3378 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3379 	vrele(nd.ni_vp);
3380 	VFS_UNLOCK_GIANT(vfslocked);
3381 	return (error);
3382 }
3383 
3384 /*
3385  * Set the access and modification times of a file.
3386  */
3387 #ifndef _SYS_SYSPROTO_H_
3388 struct futimes_args {
3389 	int	fd;
3390 	struct	timeval *tptr;
3391 };
3392 #endif
3393 int
3394 sys_futimes(td, uap)
3395 	struct thread *td;
3396 	register struct futimes_args /* {
3397 		int  fd;
3398 		struct timeval *tptr;
3399 	} */ *uap;
3400 {
3401 
3402 	return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
3403 }
3404 
3405 int
3406 kern_futimes(struct thread *td, int fd, struct timeval *tptr,
3407     enum uio_seg tptrseg)
3408 {
3409 	struct timespec ts[2];
3410 	struct file *fp;
3411 	int vfslocked;
3412 	int error;
3413 
3414 	AUDIT_ARG_FD(fd);
3415 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3416 		return (error);
3417 	if ((error = getvnode(td->td_proc->p_fd, fd, CAP_FUTIMES, &fp))
3418 	    != 0)
3419 		return (error);
3420 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
3421 #ifdef AUDIT
3422 	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3423 	AUDIT_ARG_VNODE1(fp->f_vnode);
3424 	VOP_UNLOCK(fp->f_vnode, 0);
3425 #endif
3426 	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
3427 	VFS_UNLOCK_GIANT(vfslocked);
3428 	fdrop(fp, td);
3429 	return (error);
3430 }
3431 
3432 /*
3433  * Truncate a file given its path name.
3434  */
3435 #ifndef _SYS_SYSPROTO_H_
3436 struct truncate_args {
3437 	char	*path;
3438 	int	pad;
3439 	off_t	length;
3440 };
3441 #endif
3442 int
3443 sys_truncate(td, uap)
3444 	struct thread *td;
3445 	register struct truncate_args /* {
3446 		char *path;
3447 		int pad;
3448 		off_t length;
3449 	} */ *uap;
3450 {
3451 
3452 	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3453 }
3454 
3455 int
3456 kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
3457 {
3458 	struct mount *mp;
3459 	struct vnode *vp;
3460 	struct vattr vattr;
3461 	int error;
3462 	struct nameidata nd;
3463 	int vfslocked;
3464 
3465 	if (length < 0)
3466 		return(EINVAL);
3467 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
3468 	if ((error = namei(&nd)) != 0)
3469 		return (error);
3470 	vfslocked = NDHASGIANT(&nd);
3471 	vp = nd.ni_vp;
3472 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
3473 		vrele(vp);
3474 		VFS_UNLOCK_GIANT(vfslocked);
3475 		return (error);
3476 	}
3477 	NDFREE(&nd, NDF_ONLY_PNBUF);
3478 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3479 	if (vp->v_type == VDIR)
3480 		error = EISDIR;
3481 #ifdef MAC
3482 	else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
3483 	}
3484 #endif
3485 	else if ((error = vn_writechk(vp)) == 0 &&
3486 	    (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
3487 		VATTR_NULL(&vattr);
3488 		vattr.va_size = length;
3489 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3490 	}
3491 	vput(vp);
3492 	vn_finished_write(mp);
3493 	VFS_UNLOCK_GIANT(vfslocked);
3494 	return (error);
3495 }
3496 
#if defined(COMPAT_43)
/*
 * Old (COMPAT_43) truncate: truncate a file given its path name.  The
 * `long' length is widened into a struct truncate_args and the call is
 * forwarded to sys_truncate().
 */
#ifndef _SYS_SYSPROTO_H_
struct otruncate_args {
	char	*path;
	long	length;
};
#endif
int
otruncate(struct thread *td, struct otruncate_args *uap)
{
	struct truncate_args nuap;
	int error;

	nuap.path = uap->path;
	nuap.length = uap->length;
	error = sys_truncate(td, &nuap);
	return (error);
}
#endif /* COMPAT_43 */
3526 
3527 /* Versions with the pad argument */
3528 int
3529 freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
3530 {
3531 	struct truncate_args ouap;
3532 
3533 	ouap.path = uap->path;
3534 	ouap.length = uap->length;
3535 	return (sys_truncate(td, &ouap));
3536 }
3537 
3538 int
3539 freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
3540 {
3541 	struct ftruncate_args ouap;
3542 
3543 	ouap.fd = uap->fd;
3544 	ouap.length = uap->length;
3545 	return (sys_ftruncate(td, &ouap));
3546 }
3547 
3548 /*
3549  * Sync an open file.
3550  */
3551 #ifndef _SYS_SYSPROTO_H_
3552 struct fsync_args {
3553 	int	fd;
3554 };
3555 #endif
3556 int
3557 sys_fsync(td, uap)
3558 	struct thread *td;
3559 	struct fsync_args /* {
3560 		int fd;
3561 	} */ *uap;
3562 {
3563 	struct vnode *vp;
3564 	struct mount *mp;
3565 	struct file *fp;
3566 	int vfslocked;
3567 	int error, lock_flags;
3568 
3569 	AUDIT_ARG_FD(uap->fd);
3570 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_FSYNC,
3571 	    &fp)) != 0)
3572 		return (error);
3573 	vp = fp->f_vnode;
3574 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
3575 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3576 		goto drop;
3577 	if (MNT_SHARED_WRITES(mp) ||
3578 	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
3579 		lock_flags = LK_SHARED;
3580 	} else {
3581 		lock_flags = LK_EXCLUSIVE;
3582 	}
3583 	vn_lock(vp, lock_flags | LK_RETRY);
3584 	AUDIT_ARG_VNODE1(vp);
3585 	if (vp->v_object != NULL) {
3586 		VM_OBJECT_LOCK(vp->v_object);
3587 		vm_object_page_clean(vp->v_object, 0, 0, 0);
3588 		VM_OBJECT_UNLOCK(vp->v_object);
3589 	}
3590 	error = VOP_FSYNC(vp, MNT_WAIT, td);
3591 
3592 	VOP_UNLOCK(vp, 0);
3593 	vn_finished_write(mp);
3594 drop:
3595 	VFS_UNLOCK_GIANT(vfslocked);
3596 	fdrop(fp, td);
3597 	return (error);
3598 }
3599 
3600 /*
3601  * Rename files.  Source and destination must either both be directories, or
3602  * both not be directories.  If target is a directory, it must be empty.
3603  */
3604 #ifndef _SYS_SYSPROTO_H_
3605 struct rename_args {
3606 	char	*from;
3607 	char	*to;
3608 };
3609 #endif
3610 int
3611 sys_rename(td, uap)
3612 	struct thread *td;
3613 	register struct rename_args /* {
3614 		char *from;
3615 		char *to;
3616 	} */ *uap;
3617 {
3618 
3619 	return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE));
3620 }
3621 
3622 #ifndef _SYS_SYSPROTO_H_
3623 struct renameat_args {
3624 	int	oldfd;
3625 	char	*old;
3626 	int	newfd;
3627 	char	*new;
3628 };
3629 #endif
3630 int
3631 sys_renameat(struct thread *td, struct renameat_args *uap)
3632 {
3633 
3634 	return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
3635 	    UIO_USERSPACE));
3636 }
3637 
3638 int
3639 kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg)
3640 {
3641 
3642 	return (kern_renameat(td, AT_FDCWD, from, AT_FDCWD, to, pathseg));
3643 }
3644 
3645 int
3646 kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
3647     enum uio_seg pathseg)
3648 {
3649 	struct mount *mp = NULL;
3650 	struct vnode *tvp, *fvp, *tdvp;
3651 	struct nameidata fromnd, tond;
3652 	int tvfslocked;
3653 	int fvfslocked;
3654 	int error;
3655 
3656 	bwillwrite();
3657 #ifdef MAC
3658 	NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
3659 	    MPSAFE | AUDITVNODE1, pathseg, old, oldfd, CAP_DELETE, td);
3660 #else
3661 	NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | MPSAFE |
3662 	    AUDITVNODE1, pathseg, old, oldfd, CAP_DELETE, td);
3663 #endif
3664 
3665 	if ((error = namei(&fromnd)) != 0)
3666 		return (error);
3667 	fvfslocked = NDHASGIANT(&fromnd);
3668 	tvfslocked = 0;
3669 #ifdef MAC
3670 	error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
3671 	    fromnd.ni_vp, &fromnd.ni_cnd);
3672 	VOP_UNLOCK(fromnd.ni_dvp, 0);
3673 	if (fromnd.ni_dvp != fromnd.ni_vp)
3674 		VOP_UNLOCK(fromnd.ni_vp, 0);
3675 #endif
3676 	fvp = fromnd.ni_vp;
3677 	if (error == 0)
3678 		error = vn_start_write(fvp, &mp, V_WAIT | PCATCH);
3679 	if (error != 0) {
3680 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3681 		vrele(fromnd.ni_dvp);
3682 		vrele(fvp);
3683 		goto out1;
3684 	}
3685 	NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
3686 	    SAVESTART | MPSAFE | AUDITVNODE2, pathseg, new, newfd, CAP_CREATE,
3687 	    td);
3688 	if (fromnd.ni_vp->v_type == VDIR)
3689 		tond.ni_cnd.cn_flags |= WILLBEDIR;
3690 	if ((error = namei(&tond)) != 0) {
3691 		/* Translate error code for rename("dir1", "dir2/."). */
3692 		if (error == EISDIR && fvp->v_type == VDIR)
3693 			error = EINVAL;
3694 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3695 		vrele(fromnd.ni_dvp);
3696 		vrele(fvp);
3697 		vn_finished_write(mp);
3698 		goto out1;
3699 	}
3700 	tvfslocked = NDHASGIANT(&tond);
3701 	tdvp = tond.ni_dvp;
3702 	tvp = tond.ni_vp;
3703 	if (tvp != NULL) {
3704 		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
3705 			error = ENOTDIR;
3706 			goto out;
3707 		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
3708 			error = EISDIR;
3709 			goto out;
3710 		}
3711 	}
3712 	if (fvp == tdvp) {
3713 		error = EINVAL;
3714 		goto out;
3715 	}
3716 	/*
3717 	 * If the source is the same as the destination (that is, if they
3718 	 * are links to the same vnode), then there is nothing to do.
3719 	 */
3720 	if (fvp == tvp)
3721 		error = -1;
3722 #ifdef MAC
3723 	else
3724 		error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
3725 		    tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
3726 #endif
3727 out:
3728 	if (!error) {
3729 		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
3730 				   tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
3731 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3732 		NDFREE(&tond, NDF_ONLY_PNBUF);
3733 	} else {
3734 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3735 		NDFREE(&tond, NDF_ONLY_PNBUF);
3736 		if (tvp)
3737 			vput(tvp);
3738 		if (tdvp == tvp)
3739 			vrele(tdvp);
3740 		else
3741 			vput(tdvp);
3742 		vrele(fromnd.ni_dvp);
3743 		vrele(fvp);
3744 	}
3745 	vrele(tond.ni_startdir);
3746 	vn_finished_write(mp);
3747 out1:
3748 	if (fromnd.ni_startdir)
3749 		vrele(fromnd.ni_startdir);
3750 	VFS_UNLOCK_GIANT(fvfslocked);
3751 	VFS_UNLOCK_GIANT(tvfslocked);
3752 	if (error == -1)
3753 		return (0);
3754 	return (error);
3755 }
3756 
3757 /*
3758  * Make a directory file.
3759  */
3760 #ifndef _SYS_SYSPROTO_H_
3761 struct mkdir_args {
3762 	char	*path;
3763 	int	mode;
3764 };
3765 #endif
3766 int
3767 sys_mkdir(td, uap)
3768 	struct thread *td;
3769 	register struct mkdir_args /* {
3770 		char *path;
3771 		int mode;
3772 	} */ *uap;
3773 {
3774 
3775 	return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode));
3776 }
3777 
3778 #ifndef _SYS_SYSPROTO_H_
3779 struct mkdirat_args {
3780 	int	fd;
3781 	char	*path;
3782 	mode_t	mode;
3783 };
3784 #endif
3785 int
3786 sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
3787 {
3788 
3789 	return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
3790 }
3791 
3792 int
3793 kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode)
3794 {
3795 
3796 	return (kern_mkdirat(td, AT_FDCWD, path, segflg, mode));
3797 }
3798 
3799 int
3800 kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
3801     int mode)
3802 {
3803 	struct mount *mp;
3804 	struct vnode *vp;
3805 	struct vattr vattr;
3806 	int error;
3807 	struct nameidata nd;
3808 	int vfslocked;
3809 
3810 	AUDIT_ARG_MODE(mode);
3811 restart:
3812 	bwillwrite();
3813 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE |
3814 	    AUDITVNODE1, segflg, path, fd, CAP_MKDIR, td);
3815 	nd.ni_cnd.cn_flags |= WILLBEDIR;
3816 	if ((error = namei(&nd)) != 0)
3817 		return (error);
3818 	vfslocked = NDHASGIANT(&nd);
3819 	vp = nd.ni_vp;
3820 	if (vp != NULL) {
3821 		NDFREE(&nd, NDF_ONLY_PNBUF);
3822 		/*
3823 		 * XXX namei called with LOCKPARENT but not LOCKLEAF has
3824 		 * the strange behaviour of leaving the vnode unlocked
3825 		 * if the target is the same vnode as the parent.
3826 		 */
3827 		if (vp == nd.ni_dvp)
3828 			vrele(nd.ni_dvp);
3829 		else
3830 			vput(nd.ni_dvp);
3831 		vrele(vp);
3832 		VFS_UNLOCK_GIANT(vfslocked);
3833 		return (EEXIST);
3834 	}
3835 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3836 		NDFREE(&nd, NDF_ONLY_PNBUF);
3837 		vput(nd.ni_dvp);
3838 		VFS_UNLOCK_GIANT(vfslocked);
3839 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3840 			return (error);
3841 		goto restart;
3842 	}
3843 	VATTR_NULL(&vattr);
3844 	vattr.va_type = VDIR;
3845 	vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
3846 #ifdef MAC
3847 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
3848 	    &vattr);
3849 	if (error)
3850 		goto out;
3851 #endif
3852 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
3853 #ifdef MAC
3854 out:
3855 #endif
3856 	NDFREE(&nd, NDF_ONLY_PNBUF);
3857 	vput(nd.ni_dvp);
3858 	if (!error)
3859 		vput(nd.ni_vp);
3860 	vn_finished_write(mp);
3861 	VFS_UNLOCK_GIANT(vfslocked);
3862 	return (error);
3863 }
3864 
3865 /*
3866  * Remove a directory file.
3867  */
3868 #ifndef _SYS_SYSPROTO_H_
3869 struct rmdir_args {
3870 	char	*path;
3871 };
3872 #endif
3873 int
3874 sys_rmdir(td, uap)
3875 	struct thread *td;
3876 	struct rmdir_args /* {
3877 		char *path;
3878 	} */ *uap;
3879 {
3880 
3881 	return (kern_rmdir(td, uap->path, UIO_USERSPACE));
3882 }
3883 
3884 int
3885 kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg)
3886 {
3887 
3888 	return (kern_rmdirat(td, AT_FDCWD, path, pathseg));
3889 }
3890 
3891 int
3892 kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
3893 {
3894 	struct mount *mp;
3895 	struct vnode *vp;
3896 	int error;
3897 	struct nameidata nd;
3898 	int vfslocked;
3899 
3900 restart:
3901 	bwillwrite();
3902 	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE |
3903 	    AUDITVNODE1, pathseg, path, fd, CAP_RMDIR, td);
3904 	if ((error = namei(&nd)) != 0)
3905 		return (error);
3906 	vfslocked = NDHASGIANT(&nd);
3907 	vp = nd.ni_vp;
3908 	if (vp->v_type != VDIR) {
3909 		error = ENOTDIR;
3910 		goto out;
3911 	}
3912 	/*
3913 	 * No rmdir "." please.
3914 	 */
3915 	if (nd.ni_dvp == vp) {
3916 		error = EINVAL;
3917 		goto out;
3918 	}
3919 	/*
3920 	 * The root of a mounted filesystem cannot be deleted.
3921 	 */
3922 	if (vp->v_vflag & VV_ROOT) {
3923 		error = EBUSY;
3924 		goto out;
3925 	}
3926 #ifdef MAC
3927 	error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
3928 	    &nd.ni_cnd);
3929 	if (error)
3930 		goto out;
3931 #endif
3932 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3933 		NDFREE(&nd, NDF_ONLY_PNBUF);
3934 		vput(vp);
3935 		if (nd.ni_dvp == vp)
3936 			vrele(nd.ni_dvp);
3937 		else
3938 			vput(nd.ni_dvp);
3939 		VFS_UNLOCK_GIANT(vfslocked);
3940 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3941 			return (error);
3942 		goto restart;
3943 	}
3944 	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
3945 	vn_finished_write(mp);
3946 out:
3947 	NDFREE(&nd, NDF_ONLY_PNBUF);
3948 	vput(vp);
3949 	if (nd.ni_dvp == vp)
3950 		vrele(nd.ni_dvp);
3951 	else
3952 		vput(nd.ni_dvp);
3953 	VFS_UNLOCK_GIANT(vfslocked);
3954 	return (error);
3955 }
3956 
3957 #ifdef COMPAT_43
3958 /*
3959  * Read a block of directory entries in a filesystem independent format.
3960  */
3961 #ifndef _SYS_SYSPROTO_H_
3962 struct ogetdirentries_args {
3963 	int	fd;
3964 	char	*buf;
3965 	u_int	count;
3966 	long	*basep;
3967 };
3968 #endif
3969 int
3970 ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
3971 {
3972 	long loff;
3973 	int error;
3974 
3975 	error = kern_ogetdirentries(td, uap, &loff);
3976 	if (error == 0)
3977 		error = copyout(&loff, uap->basep, sizeof(long));
3978 	return (error);
3979 }
3980 
3981 int
3982 kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
3983     long *ploff)
3984 {
3985 	struct vnode *vp;
3986 	struct file *fp;
3987 	struct uio auio, kuio;
3988 	struct iovec aiov, kiov;
3989 	struct dirent *dp, *edp;
3990 	caddr_t dirbuf;
3991 	int error, eofflag, readcnt, vfslocked;
3992 	long loff;
3993 
3994 	/* XXX arbitrary sanity limit on `count'. */
3995 	if (uap->count > 64 * 1024)
3996 		return (EINVAL);
3997 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_READ,
3998 	    &fp)) != 0)
3999 		return (error);
4000 	if ((fp->f_flag & FREAD) == 0) {
4001 		fdrop(fp, td);
4002 		return (EBADF);
4003 	}
4004 	vp = fp->f_vnode;
4005 unionread:
4006 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
4007 	if (vp->v_type != VDIR) {
4008 		VFS_UNLOCK_GIANT(vfslocked);
4009 		fdrop(fp, td);
4010 		return (EINVAL);
4011 	}
4012 	aiov.iov_base = uap->buf;
4013 	aiov.iov_len = uap->count;
4014 	auio.uio_iov = &aiov;
4015 	auio.uio_iovcnt = 1;
4016 	auio.uio_rw = UIO_READ;
4017 	auio.uio_segflg = UIO_USERSPACE;
4018 	auio.uio_td = td;
4019 	auio.uio_resid = uap->count;
4020 	vn_lock(vp, LK_SHARED | LK_RETRY);
4021 	loff = auio.uio_offset = fp->f_offset;
4022 #ifdef MAC
4023 	error = mac_vnode_check_readdir(td->td_ucred, vp);
4024 	if (error) {
4025 		VOP_UNLOCK(vp, 0);
4026 		VFS_UNLOCK_GIANT(vfslocked);
4027 		fdrop(fp, td);
4028 		return (error);
4029 	}
4030 #endif
4031 #	if (BYTE_ORDER != LITTLE_ENDIAN)
4032 		if (vp->v_mount->mnt_maxsymlinklen <= 0) {
4033 			error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
4034 			    NULL, NULL);
4035 			fp->f_offset = auio.uio_offset;
4036 		} else
4037 #	endif
4038 	{
4039 		kuio = auio;
4040 		kuio.uio_iov = &kiov;
4041 		kuio.uio_segflg = UIO_SYSSPACE;
4042 		kiov.iov_len = uap->count;
4043 		dirbuf = malloc(uap->count, M_TEMP, M_WAITOK);
4044 		kiov.iov_base = dirbuf;
4045 		error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
4046 			    NULL, NULL);
4047 		fp->f_offset = kuio.uio_offset;
4048 		if (error == 0) {
4049 			readcnt = uap->count - kuio.uio_resid;
4050 			edp = (struct dirent *)&dirbuf[readcnt];
4051 			for (dp = (struct dirent *)dirbuf; dp < edp; ) {
4052 #				if (BYTE_ORDER == LITTLE_ENDIAN)
4053 					/*
4054 					 * The expected low byte of
4055 					 * dp->d_namlen is our dp->d_type.
4056 					 * The high MBZ byte of dp->d_namlen
4057 					 * is our dp->d_namlen.
4058 					 */
4059 					dp->d_type = dp->d_namlen;
4060 					dp->d_namlen = 0;
4061 #				else
4062 					/*
4063 					 * The dp->d_type is the high byte
4064 					 * of the expected dp->d_namlen,
4065 					 * so must be zero'ed.
4066 					 */
4067 					dp->d_type = 0;
4068 #				endif
4069 				if (dp->d_reclen > 0) {
4070 					dp = (struct dirent *)
4071 					    ((char *)dp + dp->d_reclen);
4072 				} else {
4073 					error = EIO;
4074 					break;
4075 				}
4076 			}
4077 			if (dp >= edp)
4078 				error = uiomove(dirbuf, readcnt, &auio);
4079 		}
4080 		free(dirbuf, M_TEMP);
4081 	}
4082 	if (error) {
4083 		VOP_UNLOCK(vp, 0);
4084 		VFS_UNLOCK_GIANT(vfslocked);
4085 		fdrop(fp, td);
4086 		return (error);
4087 	}
4088 	if (uap->count == auio.uio_resid &&
4089 	    (vp->v_vflag & VV_ROOT) &&
4090 	    (vp->v_mount->mnt_flag & MNT_UNION)) {
4091 		struct vnode *tvp = vp;
4092 		vp = vp->v_mount->mnt_vnodecovered;
4093 		VREF(vp);
4094 		fp->f_vnode = vp;
4095 		fp->f_data = vp;
4096 		fp->f_offset = 0;
4097 		vput(tvp);
4098 		VFS_UNLOCK_GIANT(vfslocked);
4099 		goto unionread;
4100 	}
4101 	VOP_UNLOCK(vp, 0);
4102 	VFS_UNLOCK_GIANT(vfslocked);
4103 	fdrop(fp, td);
4104 	td->td_retval[0] = uap->count - auio.uio_resid;
4105 	if (error == 0)
4106 		*ploff = loff;
4107 	return (error);
4108 }
4109 #endif /* COMPAT_43 */
4110 
4111 /*
4112  * Read a block of directory entries in a filesystem independent format.
4113  */
4114 #ifndef _SYS_SYSPROTO_H_
4115 struct getdirentries_args {
4116 	int	fd;
4117 	char	*buf;
4118 	u_int	count;
4119 	long	*basep;
4120 };
4121 #endif
4122 int
4123 sys_getdirentries(td, uap)
4124 	struct thread *td;
4125 	register struct getdirentries_args /* {
4126 		int fd;
4127 		char *buf;
4128 		u_int count;
4129 		long *basep;
4130 	} */ *uap;
4131 {
4132 	long base;
4133 	int error;
4134 
4135 	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base);
4136 	if (error)
4137 		return (error);
4138 	if (uap->basep != NULL)
4139 		error = copyout(&base, uap->basep, sizeof(long));
4140 	return (error);
4141 }
4142 
4143 int
4144 kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
4145     long *basep)
4146 {
4147 	struct vnode *vp;
4148 	struct file *fp;
4149 	struct uio auio;
4150 	struct iovec aiov;
4151 	int vfslocked;
4152 	long loff;
4153 	int error, eofflag;
4154 
4155 	AUDIT_ARG_FD(fd);
4156 	auio.uio_resid = count;
4157 	if (auio.uio_resid > IOSIZE_MAX)
4158 		return (EINVAL);
4159 	if ((error = getvnode(td->td_proc->p_fd, fd, CAP_READ | CAP_SEEK,
4160 	    &fp)) != 0)
4161 		return (error);
4162 	if ((fp->f_flag & FREAD) == 0) {
4163 		fdrop(fp, td);
4164 		return (EBADF);
4165 	}
4166 	vp = fp->f_vnode;
4167 unionread:
4168 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
4169 	if (vp->v_type != VDIR) {
4170 		VFS_UNLOCK_GIANT(vfslocked);
4171 		error = EINVAL;
4172 		goto fail;
4173 	}
4174 	aiov.iov_base = buf;
4175 	aiov.iov_len = count;
4176 	auio.uio_iov = &aiov;
4177 	auio.uio_iovcnt = 1;
4178 	auio.uio_rw = UIO_READ;
4179 	auio.uio_segflg = UIO_USERSPACE;
4180 	auio.uio_td = td;
4181 	vn_lock(vp, LK_SHARED | LK_RETRY);
4182 	AUDIT_ARG_VNODE1(vp);
4183 	loff = auio.uio_offset = fp->f_offset;
4184 #ifdef MAC
4185 	error = mac_vnode_check_readdir(td->td_ucred, vp);
4186 	if (error == 0)
4187 #endif
4188 		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
4189 		    NULL);
4190 	fp->f_offset = auio.uio_offset;
4191 	if (error) {
4192 		VOP_UNLOCK(vp, 0);
4193 		VFS_UNLOCK_GIANT(vfslocked);
4194 		goto fail;
4195 	}
4196 	if (count == auio.uio_resid &&
4197 	    (vp->v_vflag & VV_ROOT) &&
4198 	    (vp->v_mount->mnt_flag & MNT_UNION)) {
4199 		struct vnode *tvp = vp;
4200 		vp = vp->v_mount->mnt_vnodecovered;
4201 		VREF(vp);
4202 		fp->f_vnode = vp;
4203 		fp->f_data = vp;
4204 		fp->f_offset = 0;
4205 		vput(tvp);
4206 		VFS_UNLOCK_GIANT(vfslocked);
4207 		goto unionread;
4208 	}
4209 	VOP_UNLOCK(vp, 0);
4210 	VFS_UNLOCK_GIANT(vfslocked);
4211 	*basep = loff;
4212 	td->td_retval[0] = count - auio.uio_resid;
4213 fail:
4214 	fdrop(fp, td);
4215 	return (error);
4216 }
4217 
4218 #ifndef _SYS_SYSPROTO_H_
4219 struct getdents_args {
4220 	int fd;
4221 	char *buf;
4222 	size_t count;
4223 };
4224 #endif
4225 int
4226 sys_getdents(td, uap)
4227 	struct thread *td;
4228 	register struct getdents_args /* {
4229 		int fd;
4230 		char *buf;
4231 		u_int count;
4232 	} */ *uap;
4233 {
4234 	struct getdirentries_args ap;
4235 	ap.fd = uap->fd;
4236 	ap.buf = uap->buf;
4237 	ap.count = uap->count;
4238 	ap.basep = NULL;
4239 	return (sys_getdirentries(td, &ap));
4240 }
4241 
4242 /*
4243  * Set the mode mask for creation of filesystem nodes.
4244  */
4245 #ifndef _SYS_SYSPROTO_H_
4246 struct umask_args {
4247 	int	newmask;
4248 };
4249 #endif
4250 int
4251 sys_umask(td, uap)
4252 	struct thread *td;
4253 	struct umask_args /* {
4254 		int newmask;
4255 	} */ *uap;
4256 {
4257 	register struct filedesc *fdp;
4258 
4259 	FILEDESC_XLOCK(td->td_proc->p_fd);
4260 	fdp = td->td_proc->p_fd;
4261 	td->td_retval[0] = fdp->fd_cmask;
4262 	fdp->fd_cmask = uap->newmask & ALLPERMS;
4263 	FILEDESC_XUNLOCK(td->td_proc->p_fd);
4264 	return (0);
4265 }
4266 
4267 /*
4268  * Void all references to file by ripping underlying filesystem away from
4269  * vnode.
4270  */
4271 #ifndef _SYS_SYSPROTO_H_
4272 struct revoke_args {
4273 	char	*path;
4274 };
4275 #endif
4276 int
4277 sys_revoke(td, uap)
4278 	struct thread *td;
4279 	register struct revoke_args /* {
4280 		char *path;
4281 	} */ *uap;
4282 {
4283 	struct vnode *vp;
4284 	struct vattr vattr;
4285 	int error;
4286 	struct nameidata nd;
4287 	int vfslocked;
4288 
4289 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
4290 	    UIO_USERSPACE, uap->path, td);
4291 	if ((error = namei(&nd)) != 0)
4292 		return (error);
4293 	vfslocked = NDHASGIANT(&nd);
4294 	vp = nd.ni_vp;
4295 	NDFREE(&nd, NDF_ONLY_PNBUF);
4296 	if (vp->v_type != VCHR || vp->v_rdev == NULL) {
4297 		error = EINVAL;
4298 		goto out;
4299 	}
4300 #ifdef MAC
4301 	error = mac_vnode_check_revoke(td->td_ucred, vp);
4302 	if (error)
4303 		goto out;
4304 #endif
4305 	error = VOP_GETATTR(vp, &vattr, td->td_ucred);
4306 	if (error)
4307 		goto out;
4308 	if (td->td_ucred->cr_uid != vattr.va_uid) {
4309 		error = priv_check(td, PRIV_VFS_ADMIN);
4310 		if (error)
4311 			goto out;
4312 	}
4313 	if (vcount(vp) > 1)
4314 		VOP_REVOKE(vp, REVOKEALL);
4315 out:
4316 	vput(vp);
4317 	VFS_UNLOCK_GIANT(vfslocked);
4318 	return (error);
4319 }
4320 
4321 /*
4322  * Convert a user file descriptor to a kernel file entry and check that, if it
4323  * is a capability, the correct rights are present. A reference on the file
4324  * entry is held upon returning.
4325  */
4326 int
4327 getvnode(struct filedesc *fdp, int fd, cap_rights_t rights,
4328     struct file **fpp)
4329 {
4330 	struct file *fp;
4331 #ifdef CAPABILITIES
4332 	struct file *fp_fromcap;
4333 #endif
4334 	int error;
4335 
4336 	error = 0;
4337 	fp = NULL;
4338 	if ((fdp == NULL) || (fp = fget_unlocked(fdp, fd)) == NULL)
4339 		return (EBADF);
4340 #ifdef CAPABILITIES
4341 	/*
4342 	 * If the file descriptor is for a capability, test rights and use the
4343 	 * file descriptor referenced by the capability.
4344 	 */
4345 	error = cap_funwrap(fp, rights, &fp_fromcap);
4346 	if (error) {
4347 		fdrop(fp, curthread);
4348 		return (error);
4349 	}
4350 	if (fp != fp_fromcap) {
4351 		fhold(fp_fromcap);
4352 		fdrop(fp, curthread);
4353 		fp = fp_fromcap;
4354 	}
4355 #endif /* CAPABILITIES */
4356 
4357 	/*
4358 	 * The file could be not of the vnode type, or it may be not
4359 	 * yet fully initialized, in which case the f_vnode pointer
4360 	 * may be set, but f_ops is still badfileops.  E.g.,
4361 	 * devfs_open() transiently create such situation to
4362 	 * facilitate csw d_fdopen().
4363 	 *
4364 	 * Dupfdopen() handling in kern_openat() installs the
4365 	 * half-baked file into the process descriptor table, allowing
4366 	 * other thread to dereference it. Guard against the race by
4367 	 * checking f_ops.
4368 	 */
4369 	if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
4370 		fdrop(fp, curthread);
4371 		return (EINVAL);
4372 	}
4373 	*fpp = fp;
4374 	return (0);
4375 }
4376 
4377 
4378 /*
4379  * Get an (NFS) file handle.
4380  */
4381 #ifndef _SYS_SYSPROTO_H_
4382 struct lgetfh_args {
4383 	char	*fname;
4384 	fhandle_t *fhp;
4385 };
4386 #endif
4387 int
4388 sys_lgetfh(td, uap)
4389 	struct thread *td;
4390 	register struct lgetfh_args *uap;
4391 {
4392 	struct nameidata nd;
4393 	fhandle_t fh;
4394 	register struct vnode *vp;
4395 	int vfslocked;
4396 	int error;
4397 
4398 	error = priv_check(td, PRIV_VFS_GETFH);
4399 	if (error)
4400 		return (error);
4401 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
4402 	    UIO_USERSPACE, uap->fname, td);
4403 	error = namei(&nd);
4404 	if (error)
4405 		return (error);
4406 	vfslocked = NDHASGIANT(&nd);
4407 	NDFREE(&nd, NDF_ONLY_PNBUF);
4408 	vp = nd.ni_vp;
4409 	bzero(&fh, sizeof(fh));
4410 	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4411 	error = VOP_VPTOFH(vp, &fh.fh_fid);
4412 	vput(vp);
4413 	VFS_UNLOCK_GIANT(vfslocked);
4414 	if (error)
4415 		return (error);
4416 	error = copyout(&fh, uap->fhp, sizeof (fh));
4417 	return (error);
4418 }
4419 
4420 #ifndef _SYS_SYSPROTO_H_
4421 struct getfh_args {
4422 	char	*fname;
4423 	fhandle_t *fhp;
4424 };
4425 #endif
4426 int
4427 sys_getfh(td, uap)
4428 	struct thread *td;
4429 	register struct getfh_args *uap;
4430 {
4431 	struct nameidata nd;
4432 	fhandle_t fh;
4433 	register struct vnode *vp;
4434 	int vfslocked;
4435 	int error;
4436 
4437 	error = priv_check(td, PRIV_VFS_GETFH);
4438 	if (error)
4439 		return (error);
4440 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
4441 	    UIO_USERSPACE, uap->fname, td);
4442 	error = namei(&nd);
4443 	if (error)
4444 		return (error);
4445 	vfslocked = NDHASGIANT(&nd);
4446 	NDFREE(&nd, NDF_ONLY_PNBUF);
4447 	vp = nd.ni_vp;
4448 	bzero(&fh, sizeof(fh));
4449 	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4450 	error = VOP_VPTOFH(vp, &fh.fh_fid);
4451 	vput(vp);
4452 	VFS_UNLOCK_GIANT(vfslocked);
4453 	if (error)
4454 		return (error);
4455 	error = copyout(&fh, uap->fhp, sizeof (fh));
4456 	return (error);
4457 }
4458 
4459 /*
4460  * syscall for the rpc.lockd to use to translate a NFS file handle into an
4461  * open descriptor.
4462  *
4463  * warning: do not remove the priv_check() call or this becomes one giant
4464  * security hole.
4465  */
4466 #ifndef _SYS_SYSPROTO_H_
4467 struct fhopen_args {
4468 	const struct fhandle *u_fhp;
4469 	int flags;
4470 };
4471 #endif
4472 int
4473 sys_fhopen(td, uap)
4474 	struct thread *td;
4475 	struct fhopen_args /* {
4476 		const struct fhandle *u_fhp;
4477 		int flags;
4478 	} */ *uap;
4479 {
4480 	struct proc *p = td->td_proc;
4481 	struct mount *mp;
4482 	struct vnode *vp;
4483 	struct fhandle fhp;
4484 	struct vattr vat;
4485 	struct vattr *vap = &vat;
4486 	struct flock lf;
4487 	struct file *fp;
4488 	register struct filedesc *fdp = p->p_fd;
4489 	int fmode, error, type;
4490 	accmode_t accmode;
4491 	struct file *nfp;
4492 	int vfslocked;
4493 	int indx;
4494 
4495 	error = priv_check(td, PRIV_VFS_FHOPEN);
4496 	if (error)
4497 		return (error);
4498 	fmode = FFLAGS(uap->flags);
4499 	/* why not allow a non-read/write open for our lockd? */
4500 	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
4501 		return (EINVAL);
4502 	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
4503 	if (error)
4504 		return(error);
4505 	/* find the mount point */
4506 	mp = vfs_busyfs(&fhp.fh_fsid);
4507 	if (mp == NULL)
4508 		return (ESTALE);
4509 	vfslocked = VFS_LOCK_GIANT(mp);
4510 	/* now give me my vnode, it gets returned to me locked */
4511 	error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
4512 	vfs_unbusy(mp);
4513 	if (error)
4514 		goto out;
4515 	/*
4516 	 * from now on we have to make sure not
4517 	 * to forget about the vnode
4518 	 * any error that causes an abort must vput(vp)
4519 	 * just set error = err and 'goto bad;'.
4520 	 */
4521 
4522 	/*
4523 	 * from vn_open
4524 	 */
4525 	if (vp->v_type == VLNK) {
4526 		error = EMLINK;
4527 		goto bad;
4528 	}
4529 	if (vp->v_type == VSOCK) {
4530 		error = EOPNOTSUPP;
4531 		goto bad;
4532 	}
4533 	if (vp->v_type != VDIR && fmode & O_DIRECTORY) {
4534 		error = ENOTDIR;
4535 		goto bad;
4536 	}
4537 	accmode = 0;
4538 	if (fmode & (FWRITE | O_TRUNC)) {
4539 		if (vp->v_type == VDIR) {
4540 			error = EISDIR;
4541 			goto bad;
4542 		}
4543 		error = vn_writechk(vp);
4544 		if (error)
4545 			goto bad;
4546 		accmode |= VWRITE;
4547 	}
4548 	if (fmode & FREAD)
4549 		accmode |= VREAD;
4550 	if ((fmode & O_APPEND) && (fmode & FWRITE))
4551 		accmode |= VAPPEND;
4552 #ifdef MAC
4553 	error = mac_vnode_check_open(td->td_ucred, vp, accmode);
4554 	if (error)
4555 		goto bad;
4556 #endif
4557 	if (accmode) {
4558 		error = VOP_ACCESS(vp, accmode, td->td_ucred, td);
4559 		if (error)
4560 			goto bad;
4561 	}
4562 	if (fmode & O_TRUNC) {
4563 		vfs_ref(mp);
4564 		VOP_UNLOCK(vp, 0);				/* XXX */
4565 		if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) {
4566 			vrele(vp);
4567 			vfs_rel(mp);
4568 			goto out;
4569 		}
4570 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
4571 		vfs_rel(mp);
4572 #ifdef MAC
4573 		/*
4574 		 * We don't yet have fp->f_cred, so use td->td_ucred, which
4575 		 * should be right.
4576 		 */
4577 		error = mac_vnode_check_write(td->td_ucred, td->td_ucred, vp);
4578 		if (error == 0) {
4579 #endif
4580 			VATTR_NULL(vap);
4581 			vap->va_size = 0;
4582 			error = VOP_SETATTR(vp, vap, td->td_ucred);
4583 #ifdef MAC
4584 		}
4585 #endif
4586 		vn_finished_write(mp);
4587 		if (error)
4588 			goto bad;
4589 	}
4590 	error = VOP_OPEN(vp, fmode, td->td_ucred, td, NULL);
4591 	if (error)
4592 		goto bad;
4593 
4594 	if (fmode & FWRITE)
4595 		vp->v_writecount++;
4596 
4597 	/*
4598 	 * end of vn_open code
4599 	 */
4600 
4601 	if ((error = falloc(td, &nfp, &indx, fmode)) != 0) {
4602 		if (fmode & FWRITE)
4603 			vp->v_writecount--;
4604 		goto bad;
4605 	}
4606 	/* An extra reference on `nfp' has been held for us by falloc(). */
4607 	fp = nfp;
4608 	nfp->f_vnode = vp;
4609 	finit(nfp, fmode & FMASK, DTYPE_VNODE, vp, &vnops);
4610 	if (fmode & (O_EXLOCK | O_SHLOCK)) {
4611 		lf.l_whence = SEEK_SET;
4612 		lf.l_start = 0;
4613 		lf.l_len = 0;
4614 		if (fmode & O_EXLOCK)
4615 			lf.l_type = F_WRLCK;
4616 		else
4617 			lf.l_type = F_RDLCK;
4618 		type = F_FLOCK;
4619 		if ((fmode & FNONBLOCK) == 0)
4620 			type |= F_WAIT;
4621 		VOP_UNLOCK(vp, 0);
4622 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
4623 			    type)) != 0) {
4624 			/*
4625 			 * The lock request failed.  Normally close the
4626 			 * descriptor but handle the case where someone might
4627 			 * have dup()d or close()d it when we weren't looking.
4628 			 */
4629 			fdclose(fdp, fp, indx, td);
4630 
4631 			/*
4632 			 * release our private reference
4633 			 */
4634 			fdrop(fp, td);
4635 			goto out;
4636 		}
4637 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4638 		atomic_set_int(&fp->f_flag, FHASLOCK);
4639 	}
4640 
4641 	VOP_UNLOCK(vp, 0);
4642 	fdrop(fp, td);
4643 	VFS_UNLOCK_GIANT(vfslocked);
4644 	td->td_retval[0] = indx;
4645 	return (0);
4646 
4647 bad:
4648 	vput(vp);
4649 out:
4650 	VFS_UNLOCK_GIANT(vfslocked);
4651 	return (error);
4652 }
4653 
4654 /*
4655  * Stat an (NFS) file handle.
4656  */
4657 #ifndef _SYS_SYSPROTO_H_
4658 struct fhstat_args {
4659 	struct fhandle *u_fhp;
4660 	struct stat *sb;
4661 };
4662 #endif
4663 int
4664 sys_fhstat(td, uap)
4665 	struct thread *td;
4666 	register struct fhstat_args /* {
4667 		struct fhandle *u_fhp;
4668 		struct stat *sb;
4669 	} */ *uap;
4670 {
4671 	struct stat sb;
4672 	fhandle_t fh;
4673 	struct mount *mp;
4674 	struct vnode *vp;
4675 	int vfslocked;
4676 	int error;
4677 
4678 	error = priv_check(td, PRIV_VFS_FHSTAT);
4679 	if (error)
4680 		return (error);
4681 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4682 	if (error)
4683 		return (error);
4684 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4685 		return (ESTALE);
4686 	vfslocked = VFS_LOCK_GIANT(mp);
4687 	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4688 	vfs_unbusy(mp);
4689 	if (error) {
4690 		VFS_UNLOCK_GIANT(vfslocked);
4691 		return (error);
4692 	}
4693 	error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td);
4694 	vput(vp);
4695 	VFS_UNLOCK_GIANT(vfslocked);
4696 	if (error)
4697 		return (error);
4698 	error = copyout(&sb, uap->sb, sizeof(sb));
4699 	return (error);
4700 }
4701 
4702 /*
4703  * Implement fstatfs() for (NFS) file handles.
4704  */
4705 #ifndef _SYS_SYSPROTO_H_
4706 struct fhstatfs_args {
4707 	struct fhandle *u_fhp;
4708 	struct statfs *buf;
4709 };
4710 #endif
4711 int
4712 sys_fhstatfs(td, uap)
4713 	struct thread *td;
4714 	struct fhstatfs_args /* {
4715 		struct fhandle *u_fhp;
4716 		struct statfs *buf;
4717 	} */ *uap;
4718 {
4719 	struct statfs sf;
4720 	fhandle_t fh;
4721 	int error;
4722 
4723 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4724 	if (error)
4725 		return (error);
4726 	error = kern_fhstatfs(td, fh, &sf);
4727 	if (error)
4728 		return (error);
4729 	return (copyout(&sf, uap->buf, sizeof(sf)));
4730 }
4731 
4732 int
4733 kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
4734 {
4735 	struct statfs *sp;
4736 	struct mount *mp;
4737 	struct vnode *vp;
4738 	int vfslocked;
4739 	int error;
4740 
4741 	error = priv_check(td, PRIV_VFS_FHSTATFS);
4742 	if (error)
4743 		return (error);
4744 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4745 		return (ESTALE);
4746 	vfslocked = VFS_LOCK_GIANT(mp);
4747 	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4748 	if (error) {
4749 		vfs_unbusy(mp);
4750 		VFS_UNLOCK_GIANT(vfslocked);
4751 		return (error);
4752 	}
4753 	vput(vp);
4754 	error = prison_canseemount(td->td_ucred, mp);
4755 	if (error)
4756 		goto out;
4757 #ifdef MAC
4758 	error = mac_mount_check_stat(td->td_ucred, mp);
4759 	if (error)
4760 		goto out;
4761 #endif
4762 	/*
4763 	 * Set these in case the underlying filesystem fails to do so.
4764 	 */
4765 	sp = &mp->mnt_stat;
4766 	sp->f_version = STATFS_VERSION;
4767 	sp->f_namemax = NAME_MAX;
4768 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
4769 	error = VFS_STATFS(mp, sp);
4770 	if (error == 0)
4771 		*buf = *sp;
4772 out:
4773 	vfs_unbusy(mp);
4774 	VFS_UNLOCK_GIANT(vfslocked);
4775 	return (error);
4776 }
4777 
4778 int
4779 kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
4780 {
4781 	struct file *fp;
4782 	struct mount *mp;
4783 	struct vnode *vp;
4784 	off_t olen, ooffset;
4785 	int error, vfslocked;
4786 
4787 	fp = NULL;
4788 	vfslocked = 0;
4789 	error = fget(td, fd, CAP_WRITE, &fp);
4790 	if (error != 0)
4791 		goto out;
4792 
4793 	switch (fp->f_type) {
4794 	case DTYPE_VNODE:
4795 		break;
4796 	case DTYPE_PIPE:
4797 	case DTYPE_FIFO:
4798 		error = ESPIPE;
4799 		goto out;
4800 	default:
4801 		error = ENODEV;
4802 		goto out;
4803 	}
4804 	if ((fp->f_flag & FWRITE) == 0) {
4805 		error = EBADF;
4806 		goto out;
4807 	}
4808 	vp = fp->f_vnode;
4809 	if (vp->v_type != VREG) {
4810 		error = ENODEV;
4811 		goto out;
4812 	}
4813 	if (offset < 0 || len <= 0) {
4814 		error = EINVAL;
4815 		goto out;
4816 	}
4817 	/* Check for wrap. */
4818 	if (offset > OFF_MAX - len) {
4819 		error = EFBIG;
4820 		goto out;
4821 	}
4822 
4823 	/* Allocating blocks may take a long time, so iterate. */
4824 	for (;;) {
4825 		olen = len;
4826 		ooffset = offset;
4827 
4828 		bwillwrite();
4829 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
4830 		mp = NULL;
4831 		error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
4832 		if (error != 0) {
4833 			VFS_UNLOCK_GIANT(vfslocked);
4834 			break;
4835 		}
4836 		error = vn_lock(vp, LK_EXCLUSIVE);
4837 		if (error != 0) {
4838 			vn_finished_write(mp);
4839 			VFS_UNLOCK_GIANT(vfslocked);
4840 			break;
4841 		}
4842 #ifdef MAC
4843 		error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
4844 		if (error == 0)
4845 #endif
4846 			error = VOP_ALLOCATE(vp, &offset, &len);
4847 		VOP_UNLOCK(vp, 0);
4848 		vn_finished_write(mp);
4849 		VFS_UNLOCK_GIANT(vfslocked);
4850 
4851 		if (olen + ooffset != offset + len) {
4852 			panic("offset + len changed from %jx/%jx to %jx/%jx",
4853 			    ooffset, olen, offset, len);
4854 		}
4855 		if (error != 0 || len == 0)
4856 			break;
4857 		KASSERT(olen > len, ("Iteration did not make progress?"));
4858 		maybe_yield();
4859 	}
4860  out:
4861 	if (fp != NULL)
4862 		fdrop(fp, td);
4863 	return (error);
4864 }
4865 
4866 int
4867 sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
4868 {
4869 
4870 	return (kern_posix_fallocate(td, uap->fd, uap->offset, uap->len));
4871 }
4872 
4873 /*
4874  * Unlike madvise(2), we do not make a best effort to remember every
4875  * possible caching hint.  Instead, we remember the last setting with
4876  * the exception that we will allow POSIX_FADV_NORMAL to adjust the
4877  * region of any current setting.
4878  */
4879 int
4880 kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
4881     int advice)
4882 {
4883 	struct fadvise_info *fa, *new;
4884 	struct file *fp;
4885 	struct vnode *vp;
4886 	off_t end;
4887 	int error;
4888 
4889 	if (offset < 0 || len < 0 || offset > OFF_MAX - len)
4890 		return (EINVAL);
4891 	switch (advice) {
4892 	case POSIX_FADV_SEQUENTIAL:
4893 	case POSIX_FADV_RANDOM:
4894 	case POSIX_FADV_NOREUSE:
4895 		new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
4896 		break;
4897 	case POSIX_FADV_NORMAL:
4898 	case POSIX_FADV_WILLNEED:
4899 	case POSIX_FADV_DONTNEED:
4900 		new = NULL;
4901 		break;
4902 	default:
4903 		return (EINVAL);
4904 	}
4905 	/* XXX: CAP_POSIX_FADVISE? */
4906 	error = fget(td, fd, 0, &fp);
4907 	if (error != 0)
4908 		goto out;
4909 
4910 	switch (fp->f_type) {
4911 	case DTYPE_VNODE:
4912 		break;
4913 	case DTYPE_PIPE:
4914 	case DTYPE_FIFO:
4915 		error = ESPIPE;
4916 		goto out;
4917 	default:
4918 		error = ENODEV;
4919 		goto out;
4920 	}
4921 	vp = fp->f_vnode;
4922 	if (vp->v_type != VREG) {
4923 		error = ENODEV;
4924 		goto out;
4925 	}
4926 	if (len == 0)
4927 		end = OFF_MAX;
4928 	else
4929 		end = offset + len - 1;
4930 	switch (advice) {
4931 	case POSIX_FADV_SEQUENTIAL:
4932 	case POSIX_FADV_RANDOM:
4933 	case POSIX_FADV_NOREUSE:
4934 		/*
4935 		 * Try to merge any existing non-standard region with
4936 		 * this new region if possible, otherwise create a new
4937 		 * non-standard region for this request.
4938 		 */
4939 		mtx_pool_lock(mtxpool_sleep, fp);
4940 		fa = fp->f_advice;
4941 		if (fa != NULL && fa->fa_advice == advice &&
4942 		    ((fa->fa_start <= end && fa->fa_end >= offset) ||
4943 		    (end != OFF_MAX && fa->fa_start == end + 1) ||
4944 		    (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
4945 			if (offset < fa->fa_start)
4946 				fa->fa_start = offset;
4947 			if (end > fa->fa_end)
4948 				fa->fa_end = end;
4949 		} else {
4950 			new->fa_advice = advice;
4951 			new->fa_start = offset;
4952 			new->fa_end = end;
4953 			fp->f_advice = new;
4954 			new = fa;
4955 		}
4956 		mtx_pool_unlock(mtxpool_sleep, fp);
4957 		break;
4958 	case POSIX_FADV_NORMAL:
4959 		/*
4960 		 * If a the "normal" region overlaps with an existing
4961 		 * non-standard region, trim or remove the
4962 		 * non-standard region.
4963 		 */
4964 		mtx_pool_lock(mtxpool_sleep, fp);
4965 		fa = fp->f_advice;
4966 		if (fa != NULL) {
4967 			if (offset <= fa->fa_start && end >= fa->fa_end) {
4968 				new = fa;
4969 				fp->f_advice = NULL;
4970 			} else if (offset <= fa->fa_start &&
4971  			    end >= fa->fa_start)
4972 				fa->fa_start = end + 1;
4973 			else if (offset <= fa->fa_end && end >= fa->fa_end)
4974 				fa->fa_end = offset - 1;
4975 			else if (offset >= fa->fa_start && end <= fa->fa_end) {
4976 				/*
4977 				 * If the "normal" region is a middle
4978 				 * portion of the existing
4979 				 * non-standard region, just remove
4980 				 * the whole thing rather than picking
4981 				 * one side or the other to
4982 				 * preserve.
4983 				 */
4984 				new = fa;
4985 				fp->f_advice = NULL;
4986 			}
4987 		}
4988 		mtx_pool_unlock(mtxpool_sleep, fp);
4989 		break;
4990 	case POSIX_FADV_WILLNEED:
4991 	case POSIX_FADV_DONTNEED:
4992 		error = VOP_ADVISE(vp, offset, end, advice);
4993 		break;
4994 	}
4995 out:
4996 	if (fp != NULL)
4997 		fdrop(fp, td);
4998 	free(new, M_FADVISE);
4999 	return (error);
5000 }
5001 
5002 int
5003 sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
5004 {
5005 
5006 	return (kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,
5007 	    uap->advice));
5008 }
5009