xref: /freebsd/sys/kern/vfs_syscalls.c (revision 99429157e8615dc3b7f11afbe3ed92de7476a5db)
1 /*-
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_capsicum.h"
41 #include "opt_compat.h"
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/bio.h>
47 #include <sys/buf.h>
48 #include <sys/capsicum.h>
49 #include <sys/disk.h>
50 #include <sys/sysent.h>
51 #include <sys/malloc.h>
52 #include <sys/mount.h>
53 #include <sys/mutex.h>
54 #include <sys/sysproto.h>
55 #include <sys/namei.h>
56 #include <sys/filedesc.h>
57 #include <sys/kernel.h>
58 #include <sys/fcntl.h>
59 #include <sys/file.h>
60 #include <sys/filio.h>
61 #include <sys/limits.h>
62 #include <sys/linker.h>
63 #include <sys/rwlock.h>
64 #include <sys/sdt.h>
65 #include <sys/stat.h>
66 #include <sys/sx.h>
67 #include <sys/unistd.h>
68 #include <sys/vnode.h>
69 #include <sys/priv.h>
70 #include <sys/proc.h>
71 #include <sys/dirent.h>
72 #include <sys/jail.h>
73 #include <sys/syscallsubr.h>
74 #include <sys/sysctl.h>
75 #ifdef KTRACE
76 #include <sys/ktrace.h>
77 #endif
78 
79 #include <machine/stdarg.h>
80 
81 #include <security/audit/audit.h>
82 #include <security/mac/mac_framework.h>
83 
84 #include <vm/vm.h>
85 #include <vm/vm_object.h>
86 #include <vm/vm_page.h>
87 #include <vm/uma.h>
88 
89 #include <ufs/ufs/quota.h>
90 
91 MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
92 
93 SDT_PROVIDER_DEFINE(vfs);
94 SDT_PROBE_DEFINE2(vfs, , stat, mode, "char *", "int");
95 SDT_PROBE_DEFINE2(vfs, , stat, reg, "char *", "int");
96 
97 static int kern_chflagsat(struct thread *td, int fd, const char *path,
98     enum uio_seg pathseg, u_long flags, int atflag);
99 static int setfflags(struct thread *td, struct vnode *, u_long);
100 static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
101 static int getutimens(const struct timespec *, enum uio_seg,
102     struct timespec *, int *);
103 static int setutimes(struct thread *td, struct vnode *,
104     const struct timespec *, int, int);
105 static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
106     struct thread *td);
107 
108 /*
109  * Sync each mounted filesystem.
110  */
111 #ifndef _SYS_SYSPROTO_H_
112 struct sync_args {
113 	int     dummy;
114 };
115 #endif
116 /* ARGSUSED */
117 int
118 sys_sync(struct thread *td, struct sync_args *uap)
119 {
120 	struct mount *mp, *nmp;
121 	int save;
122 
123 	mtx_lock(&mountlist_mtx);
124 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
125 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
126 			nmp = TAILQ_NEXT(mp, mnt_list);
127 			continue;
128 		}
129 		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
130 		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
131 			save = curthread_pflags_set(TDP_SYNCIO);
132 			vfs_msync(mp, MNT_NOWAIT);
133 			VFS_SYNC(mp, MNT_NOWAIT);
134 			curthread_pflags_restore(save);
135 			vn_finished_write(mp);
136 		}
137 		mtx_lock(&mountlist_mtx);
138 		nmp = TAILQ_NEXT(mp, mnt_list);
139 		vfs_unbusy(mp);
140 	}
141 	mtx_unlock(&mountlist_mtx);
142 	return (0);
143 }
144 
145 /*
146  * Change filesystem quotas.
147  */
148 #ifndef _SYS_SYSPROTO_H_
149 struct quotactl_args {
150 	char *path;
151 	int cmd;
152 	int uid;
153 	caddr_t arg;
154 };
155 #endif
156 int
157 sys_quotactl(struct thread *td, struct quotactl_args *uap)
158 {
159 	struct mount *mp;
160 	struct nameidata nd;
161 	int error;
162 
163 	AUDIT_ARG_CMD(uap->cmd);
164 	AUDIT_ARG_UID(uap->uid);
165 	if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
166 		return (EPERM);
167 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
168 	    uap->path, td);
169 	if ((error = namei(&nd)) != 0)
170 		return (error);
171 	NDFREE(&nd, NDF_ONLY_PNBUF);
172 	mp = nd.ni_vp->v_mount;
173 	vfs_ref(mp);
174 	vput(nd.ni_vp);
175 	error = vfs_busy(mp, 0);
176 	vfs_rel(mp);
177 	if (error != 0)
178 		return (error);
179 	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
180 
181 	/*
182 	 * Since quota on operation typically needs to open quota
183 	 * file, the Q_QUOTAON handler needs to unbusy the mount point
184 	 * before calling into namei.  Otherwise, unmount might be
185 	 * started between two vfs_busy() invocations (first is our,
186 	 * second is from mount point cross-walk code in lookup()),
187 	 * causing deadlock.
188 	 *
189 	 * Require that Q_QUOTAON handles the vfs_busy() reference on
190 	 * its own, always returning with ubusied mount point.
191 	 */
192 	if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON)
193 		vfs_unbusy(mp);
194 	return (error);
195 }
196 
197 /*
198  * Used by statfs conversion routines to scale the block size up if
199  * necessary so that all of the block counts are <= 'max_size'.  Note
200  * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
201  * value of 'n'.
202  */
203 void
204 statfs_scale_blocks(struct statfs *sf, long max_size)
205 {
206 	uint64_t count;
207 	int shift;
208 
209 	KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
210 
211 	/*
212 	 * Attempt to scale the block counts to give a more accurate
213 	 * overview to userland of the ratio of free space to used
214 	 * space.  To do this, find the largest block count and compute
215 	 * a divisor that lets it fit into a signed integer <= max_size.
216 	 */
217 	if (sf->f_bavail < 0)
218 		count = -sf->f_bavail;
219 	else
220 		count = sf->f_bavail;
221 	count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
222 	if (count <= max_size)
223 		return;
224 
225 	count >>= flsl(max_size);
226 	shift = 0;
227 	while (count > 0) {
228 		shift++;
229 		count >>=1;
230 	}
231 
232 	sf->f_bsize <<= shift;
233 	sf->f_blocks >>= shift;
234 	sf->f_bfree >>= shift;
235 	sf->f_bavail >>= shift;
236 }
237 
238 static int
239 kern_do_statfs(struct thread *td, struct mount *mp, struct statfs *buf)
240 {
241 	struct statfs *sp;
242 	int error;
243 
244 	if (mp == NULL)
245 		return (EBADF);
246 	error = vfs_busy(mp, 0);
247 	vfs_rel(mp);
248 	if (error != 0)
249 		return (error);
250 #ifdef MAC
251 	error = mac_mount_check_stat(td->td_ucred, mp);
252 	if (error != 0)
253 		goto out;
254 #endif
255 	/*
256 	 * Set these in case the underlying filesystem fails to do so.
257 	 */
258 	sp = &mp->mnt_stat;
259 	sp->f_version = STATFS_VERSION;
260 	sp->f_namemax = NAME_MAX;
261 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
262 	error = VFS_STATFS(mp, sp);
263 	if (error != 0)
264 		goto out;
265 	*buf = *sp;
266 	if (priv_check(td, PRIV_VFS_GENERATION)) {
267 		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
268 		prison_enforce_statfs(td->td_ucred, mp, buf);
269 	}
270 out:
271 	vfs_unbusy(mp);
272 	return (error);
273 }
274 
275 /*
276  * Get filesystem statistics.
277  */
278 #ifndef _SYS_SYSPROTO_H_
279 struct statfs_args {
280 	char *path;
281 	struct statfs *buf;
282 };
283 #endif
284 int
285 sys_statfs(struct thread *td, struct statfs_args *uap)
286 {
287 	struct statfs *sfp;
288 	int error;
289 
290 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
291 	error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
292 	if (error == 0)
293 		error = copyout(sfp, uap->buf, sizeof(struct statfs));
294 	free(sfp, M_STATFS);
295 	return (error);
296 }
297 
298 int
299 kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
300     struct statfs *buf)
301 {
302 	struct mount *mp;
303 	struct nameidata nd;
304 	int error;
305 
306 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
307 	    pathseg, path, td);
308 	error = namei(&nd);
309 	if (error != 0)
310 		return (error);
311 	mp = nd.ni_vp->v_mount;
312 	vfs_ref(mp);
313 	NDFREE(&nd, NDF_ONLY_PNBUF);
314 	vput(nd.ni_vp);
315 	return (kern_do_statfs(td, mp, buf));
316 }
317 
318 /*
319  * Get filesystem statistics.
320  */
321 #ifndef _SYS_SYSPROTO_H_
322 struct fstatfs_args {
323 	int fd;
324 	struct statfs *buf;
325 };
326 #endif
327 int
328 sys_fstatfs(struct thread *td, struct fstatfs_args *uap)
329 {
330 	struct statfs *sfp;
331 	int error;
332 
333 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
334 	error = kern_fstatfs(td, uap->fd, sfp);
335 	if (error == 0)
336 		error = copyout(sfp, uap->buf, sizeof(struct statfs));
337 	free(sfp, M_STATFS);
338 	return (error);
339 }
340 
341 int
342 kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
343 {
344 	struct file *fp;
345 	struct mount *mp;
346 	struct vnode *vp;
347 	cap_rights_t rights;
348 	int error;
349 
350 	AUDIT_ARG_FD(fd);
351 	error = getvnode(td, fd, cap_rights_init(&rights, CAP_FSTATFS), &fp);
352 	if (error != 0)
353 		return (error);
354 	vp = fp->f_vnode;
355 	vn_lock(vp, LK_SHARED | LK_RETRY);
356 #ifdef AUDIT
357 	AUDIT_ARG_VNODE1(vp);
358 #endif
359 	mp = vp->v_mount;
360 	if (mp != NULL)
361 		vfs_ref(mp);
362 	VOP_UNLOCK(vp, 0);
363 	fdrop(fp, td);
364 	return (kern_do_statfs(td, mp, buf));
365 }
366 
367 /*
368  * Get statistics on all filesystems.
369  */
370 #ifndef _SYS_SYSPROTO_H_
371 struct getfsstat_args {
372 	struct statfs *buf;
373 	long bufsize;
374 	int mode;
375 };
376 #endif
377 int
378 sys_getfsstat(struct thread *td, struct getfsstat_args *uap)
379 {
380 	size_t count;
381 	int error;
382 
383 	if (uap->bufsize < 0 || uap->bufsize > SIZE_MAX)
384 		return (EINVAL);
385 	error = kern_getfsstat(td, &uap->buf, uap->bufsize, &count,
386 	    UIO_USERSPACE, uap->mode);
387 	if (error == 0)
388 		td->td_retval[0] = count;
389 	return (error);
390 }
391 
392 /*
393  * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
394  *	The caller is responsible for freeing memory which will be allocated
395  *	in '*buf'.
396  */
397 int
398 kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
399     size_t *countp, enum uio_seg bufseg, int mode)
400 {
401 	struct mount *mp, *nmp;
402 	struct statfs *sfsp, *sp, *sptmp, *tofree;
403 	size_t count, maxcount;
404 	int error;
405 
406 	switch (mode) {
407 	case MNT_WAIT:
408 	case MNT_NOWAIT:
409 		break;
410 	default:
411 		if (bufseg == UIO_SYSSPACE)
412 			*buf = NULL;
413 		return (EINVAL);
414 	}
415 restart:
416 	maxcount = bufsize / sizeof(struct statfs);
417 	if (bufsize == 0) {
418 		sfsp = NULL;
419 		tofree = NULL;
420 	} else if (bufseg == UIO_USERSPACE) {
421 		sfsp = *buf;
422 		tofree = NULL;
423 	} else /* if (bufseg == UIO_SYSSPACE) */ {
424 		count = 0;
425 		mtx_lock(&mountlist_mtx);
426 		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
427 			count++;
428 		}
429 		mtx_unlock(&mountlist_mtx);
430 		if (maxcount > count)
431 			maxcount = count;
432 		tofree = sfsp = *buf = malloc(maxcount * sizeof(struct statfs),
433 		    M_STATFS, M_WAITOK);
434 	}
435 	count = 0;
436 	mtx_lock(&mountlist_mtx);
437 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
438 		if (prison_canseemount(td->td_ucred, mp) != 0) {
439 			nmp = TAILQ_NEXT(mp, mnt_list);
440 			continue;
441 		}
442 #ifdef MAC
443 		if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
444 			nmp = TAILQ_NEXT(mp, mnt_list);
445 			continue;
446 		}
447 #endif
448 		if (mode == MNT_WAIT) {
449 			if (vfs_busy(mp, MBF_MNTLSTLOCK) != 0) {
450 				/*
451 				 * If vfs_busy() failed, and MBF_NOWAIT
452 				 * wasn't passed, then the mp is gone.
453 				 * Furthermore, because of MBF_MNTLSTLOCK,
454 				 * the mountlist_mtx was dropped.  We have
455 				 * no other choice than to start over.
456 				 */
457 				mtx_unlock(&mountlist_mtx);
458 				free(tofree, M_STATFS);
459 				goto restart;
460 			}
461 		} else {
462 			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
463 				nmp = TAILQ_NEXT(mp, mnt_list);
464 				continue;
465 			}
466 		}
467 		if (sfsp != NULL && count < maxcount) {
468 			sp = &mp->mnt_stat;
469 			/*
470 			 * Set these in case the underlying filesystem
471 			 * fails to do so.
472 			 */
473 			sp->f_version = STATFS_VERSION;
474 			sp->f_namemax = NAME_MAX;
475 			sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
476 			/*
477 			 * If MNT_NOWAIT is specified, do not refresh
478 			 * the fsstat cache.
479 			 */
480 			if (mode != MNT_NOWAIT) {
481 				error = VFS_STATFS(mp, sp);
482 				if (error != 0) {
483 					mtx_lock(&mountlist_mtx);
484 					nmp = TAILQ_NEXT(mp, mnt_list);
485 					vfs_unbusy(mp);
486 					continue;
487 				}
488 			}
489 			if (priv_check(td, PRIV_VFS_GENERATION)) {
490 				sptmp = malloc(sizeof(struct statfs), M_STATFS,
491 				    M_WAITOK);
492 				*sptmp = *sp;
493 				sptmp->f_fsid.val[0] = sptmp->f_fsid.val[1] = 0;
494 				prison_enforce_statfs(td->td_ucred, mp, sptmp);
495 				sp = sptmp;
496 			} else
497 				sptmp = NULL;
498 			if (bufseg == UIO_SYSSPACE) {
499 				bcopy(sp, sfsp, sizeof(*sp));
500 				free(sptmp, M_STATFS);
501 			} else /* if (bufseg == UIO_USERSPACE) */ {
502 				error = copyout(sp, sfsp, sizeof(*sp));
503 				free(sptmp, M_STATFS);
504 				if (error != 0) {
505 					vfs_unbusy(mp);
506 					return (error);
507 				}
508 			}
509 			sfsp++;
510 		}
511 		count++;
512 		mtx_lock(&mountlist_mtx);
513 		nmp = TAILQ_NEXT(mp, mnt_list);
514 		vfs_unbusy(mp);
515 	}
516 	mtx_unlock(&mountlist_mtx);
517 	if (sfsp != NULL && count > maxcount)
518 		*countp = maxcount;
519 	else
520 		*countp = count;
521 	return (0);
522 }
523 
524 #ifdef COMPAT_FREEBSD4
525 /*
526  * Get old format filesystem statistics.
527  */
528 static void cvtstatfs(struct statfs *, struct ostatfs *);
529 
530 #ifndef _SYS_SYSPROTO_H_
531 struct freebsd4_statfs_args {
532 	char *path;
533 	struct ostatfs *buf;
534 };
535 #endif
536 int
537 freebsd4_statfs(struct thread *td, struct freebsd4_statfs_args *uap)
538 {
539 	struct ostatfs osb;
540 	struct statfs *sfp;
541 	int error;
542 
543 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
544 	error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
545 	if (error == 0) {
546 		cvtstatfs(sfp, &osb);
547 		error = copyout(&osb, uap->buf, sizeof(osb));
548 	}
549 	free(sfp, M_STATFS);
550 	return (error);
551 }
552 
553 /*
554  * Get filesystem statistics.
555  */
556 #ifndef _SYS_SYSPROTO_H_
557 struct freebsd4_fstatfs_args {
558 	int fd;
559 	struct ostatfs *buf;
560 };
561 #endif
562 int
563 freebsd4_fstatfs(struct thread *td, struct freebsd4_fstatfs_args *uap)
564 {
565 	struct ostatfs osb;
566 	struct statfs *sfp;
567 	int error;
568 
569 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
570 	error = kern_fstatfs(td, uap->fd, sfp);
571 	if (error == 0) {
572 		cvtstatfs(sfp, &osb);
573 		error = copyout(&osb, uap->buf, sizeof(osb));
574 	}
575 	free(sfp, M_STATFS);
576 	return (error);
577 }
578 
579 /*
580  * Get statistics on all filesystems.
581  */
582 #ifndef _SYS_SYSPROTO_H_
583 struct freebsd4_getfsstat_args {
584 	struct ostatfs *buf;
585 	long bufsize;
586 	int mode;
587 };
588 #endif
589 int
590 freebsd4_getfsstat(struct thread *td, struct freebsd4_getfsstat_args *uap)
591 {
592 	struct statfs *buf, *sp;
593 	struct ostatfs osb;
594 	size_t count, size;
595 	int error;
596 
597 	if (uap->bufsize < 0)
598 		return (EINVAL);
599 	count = uap->bufsize / sizeof(struct ostatfs);
600 	if (count > SIZE_MAX / sizeof(struct statfs))
601 		return (EINVAL);
602 	size = count * sizeof(struct statfs);
603 	error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE,
604 	    uap->mode);
605 	td->td_retval[0] = count;
606 	if (size != 0) {
607 		sp = buf;
608 		while (count != 0 && error == 0) {
609 			cvtstatfs(sp, &osb);
610 			error = copyout(&osb, uap->buf, sizeof(osb));
611 			sp++;
612 			uap->buf++;
613 			count--;
614 		}
615 		free(buf, M_STATFS);
616 	}
617 	return (error);
618 }
619 
620 /*
621  * Implement fstatfs() for (NFS) file handles.
622  */
623 #ifndef _SYS_SYSPROTO_H_
624 struct freebsd4_fhstatfs_args {
625 	struct fhandle *u_fhp;
626 	struct ostatfs *buf;
627 };
628 #endif
629 int
630 freebsd4_fhstatfs(struct thread *td, struct freebsd4_fhstatfs_args *uap)
631 {
632 	struct ostatfs osb;
633 	struct statfs *sfp;
634 	fhandle_t fh;
635 	int error;
636 
637 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
638 	if (error != 0)
639 		return (error);
640 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
641 	error = kern_fhstatfs(td, fh, sfp);
642 	if (error == 0) {
643 		cvtstatfs(sfp, &osb);
644 		error = copyout(&osb, uap->buf, sizeof(osb));
645 	}
646 	free(sfp, M_STATFS);
647 	return (error);
648 }
649 
650 /*
651  * Convert a new format statfs structure to an old format statfs structure.
652  */
653 static void
654 cvtstatfs(struct statfs *nsp, struct ostatfs *osp)
655 {
656 
657 	statfs_scale_blocks(nsp, LONG_MAX);
658 	bzero(osp, sizeof(*osp));
659 	osp->f_bsize = nsp->f_bsize;
660 	osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
661 	osp->f_blocks = nsp->f_blocks;
662 	osp->f_bfree = nsp->f_bfree;
663 	osp->f_bavail = nsp->f_bavail;
664 	osp->f_files = MIN(nsp->f_files, LONG_MAX);
665 	osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
666 	osp->f_owner = nsp->f_owner;
667 	osp->f_type = nsp->f_type;
668 	osp->f_flags = nsp->f_flags;
669 	osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
670 	osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
671 	osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
672 	osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
673 	strlcpy(osp->f_fstypename, nsp->f_fstypename,
674 	    MIN(MFSNAMELEN, OMFSNAMELEN));
675 	strlcpy(osp->f_mntonname, nsp->f_mntonname,
676 	    MIN(MNAMELEN, OMNAMELEN));
677 	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
678 	    MIN(MNAMELEN, OMNAMELEN));
679 	osp->f_fsid = nsp->f_fsid;
680 }
681 #endif /* COMPAT_FREEBSD4 */
682 
683 /*
684  * Change current working directory to a given file descriptor.
685  */
686 #ifndef _SYS_SYSPROTO_H_
687 struct fchdir_args {
688 	int	fd;
689 };
690 #endif
691 int
692 sys_fchdir(struct thread *td, struct fchdir_args *uap)
693 {
694 	struct vnode *vp, *tdp;
695 	struct mount *mp;
696 	struct file *fp;
697 	cap_rights_t rights;
698 	int error;
699 
700 	AUDIT_ARG_FD(uap->fd);
701 	error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_FCHDIR),
702 	    &fp);
703 	if (error != 0)
704 		return (error);
705 	vp = fp->f_vnode;
706 	vrefact(vp);
707 	fdrop(fp, td);
708 	vn_lock(vp, LK_SHARED | LK_RETRY);
709 	AUDIT_ARG_VNODE1(vp);
710 	error = change_dir(vp, td);
711 	while (!error && (mp = vp->v_mountedhere) != NULL) {
712 		if (vfs_busy(mp, 0))
713 			continue;
714 		error = VFS_ROOT(mp, LK_SHARED, &tdp);
715 		vfs_unbusy(mp);
716 		if (error != 0)
717 			break;
718 		vput(vp);
719 		vp = tdp;
720 	}
721 	if (error != 0) {
722 		vput(vp);
723 		return (error);
724 	}
725 	VOP_UNLOCK(vp, 0);
726 	pwd_chdir(td, vp);
727 	return (0);
728 }
729 
730 /*
731  * Change current working directory (``.'').
732  */
733 #ifndef _SYS_SYSPROTO_H_
734 struct chdir_args {
735 	char	*path;
736 };
737 #endif
738 int
739 sys_chdir(struct thread *td, struct chdir_args *uap)
740 {
741 
742 	return (kern_chdir(td, uap->path, UIO_USERSPACE));
743 }
744 
745 int
746 kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
747 {
748 	struct nameidata nd;
749 	int error;
750 
751 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
752 	    pathseg, path, td);
753 	if ((error = namei(&nd)) != 0)
754 		return (error);
755 	if ((error = change_dir(nd.ni_vp, td)) != 0) {
756 		vput(nd.ni_vp);
757 		NDFREE(&nd, NDF_ONLY_PNBUF);
758 		return (error);
759 	}
760 	VOP_UNLOCK(nd.ni_vp, 0);
761 	NDFREE(&nd, NDF_ONLY_PNBUF);
762 	pwd_chdir(td, nd.ni_vp);
763 	return (0);
764 }
765 
766 /*
767  * Change notion of root (``/'') directory.
768  */
769 #ifndef _SYS_SYSPROTO_H_
770 struct chroot_args {
771 	char	*path;
772 };
773 #endif
774 int
775 sys_chroot(struct thread *td, struct chroot_args *uap)
776 {
777 	struct nameidata nd;
778 	int error;
779 
780 	error = priv_check(td, PRIV_VFS_CHROOT);
781 	if (error != 0)
782 		return (error);
783 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
784 	    UIO_USERSPACE, uap->path, td);
785 	error = namei(&nd);
786 	if (error != 0)
787 		goto error;
788 	error = change_dir(nd.ni_vp, td);
789 	if (error != 0)
790 		goto e_vunlock;
791 #ifdef MAC
792 	error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
793 	if (error != 0)
794 		goto e_vunlock;
795 #endif
796 	VOP_UNLOCK(nd.ni_vp, 0);
797 	error = pwd_chroot(td, nd.ni_vp);
798 	vrele(nd.ni_vp);
799 	NDFREE(&nd, NDF_ONLY_PNBUF);
800 	return (error);
801 e_vunlock:
802 	vput(nd.ni_vp);
803 error:
804 	NDFREE(&nd, NDF_ONLY_PNBUF);
805 	return (error);
806 }
807 
808 /*
809  * Common routine for chroot and chdir.  Callers must provide a locked vnode
810  * instance.
811  */
812 int
813 change_dir(struct vnode *vp, struct thread *td)
814 {
815 #ifdef MAC
816 	int error;
817 #endif
818 
819 	ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
820 	if (vp->v_type != VDIR)
821 		return (ENOTDIR);
822 #ifdef MAC
823 	error = mac_vnode_check_chdir(td->td_ucred, vp);
824 	if (error != 0)
825 		return (error);
826 #endif
827 	return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
828 }
829 
830 static __inline void
831 flags_to_rights(int flags, cap_rights_t *rightsp)
832 {
833 
834 	if (flags & O_EXEC) {
835 		cap_rights_set(rightsp, CAP_FEXECVE);
836 	} else {
837 		switch ((flags & O_ACCMODE)) {
838 		case O_RDONLY:
839 			cap_rights_set(rightsp, CAP_READ);
840 			break;
841 		case O_RDWR:
842 			cap_rights_set(rightsp, CAP_READ);
843 			/* FALLTHROUGH */
844 		case O_WRONLY:
845 			cap_rights_set(rightsp, CAP_WRITE);
846 			if (!(flags & (O_APPEND | O_TRUNC)))
847 				cap_rights_set(rightsp, CAP_SEEK);
848 			break;
849 		}
850 	}
851 
852 	if (flags & O_CREAT)
853 		cap_rights_set(rightsp, CAP_CREATE);
854 
855 	if (flags & O_TRUNC)
856 		cap_rights_set(rightsp, CAP_FTRUNCATE);
857 
858 	if (flags & (O_SYNC | O_FSYNC))
859 		cap_rights_set(rightsp, CAP_FSYNC);
860 
861 	if (flags & (O_EXLOCK | O_SHLOCK))
862 		cap_rights_set(rightsp, CAP_FLOCK);
863 }
864 
865 /*
866  * Check permissions, allocate an open file structure, and call the device
867  * open routine if any.
868  */
869 #ifndef _SYS_SYSPROTO_H_
870 struct open_args {
871 	char	*path;
872 	int	flags;
873 	int	mode;
874 };
875 #endif
876 int
877 sys_open(struct thread *td, struct open_args *uap)
878 {
879 
880 	return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
881 	    uap->flags, uap->mode));
882 }
883 
884 #ifndef _SYS_SYSPROTO_H_
885 struct openat_args {
886 	int	fd;
887 	char	*path;
888 	int	flag;
889 	int	mode;
890 };
891 #endif
892 int
893 sys_openat(struct thread *td, struct openat_args *uap)
894 {
895 
896 	AUDIT_ARG_FD(uap->fd);
897 	return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
898 	    uap->mode));
899 }
900 
901 int
902 kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
903     int flags, int mode)
904 {
905 	struct proc *p = td->td_proc;
906 	struct filedesc *fdp = p->p_fd;
907 	struct file *fp;
908 	struct vnode *vp;
909 	struct nameidata nd;
910 	cap_rights_t rights;
911 	int cmode, error, indx;
912 
913 	indx = -1;
914 
915 	AUDIT_ARG_FFLAGS(flags);
916 	AUDIT_ARG_MODE(mode);
917 	cap_rights_init(&rights, CAP_LOOKUP);
918 	flags_to_rights(flags, &rights);
919 	/*
920 	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
921 	 * may be specified.
922 	 */
923 	if (flags & O_EXEC) {
924 		if (flags & O_ACCMODE)
925 			return (EINVAL);
926 	} else if ((flags & O_ACCMODE) == O_ACCMODE) {
927 		return (EINVAL);
928 	} else {
929 		flags = FFLAGS(flags);
930 	}
931 
932 	/*
933 	 * Allocate a file structure. The descriptor to reference it
934 	 * is allocated and set by finstall() below.
935 	 */
936 	error = falloc_noinstall(td, &fp);
937 	if (error != 0)
938 		return (error);
939 	/*
940 	 * An extra reference on `fp' has been held for us by
941 	 * falloc_noinstall().
942 	 */
943 	/* Set the flags early so the finit in devfs can pick them up. */
944 	fp->f_flag = flags & FMASK;
945 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
946 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
947 	    &rights, td);
948 	td->td_dupfd = -1;		/* XXX check for fdopen */
949 	error = vn_open(&nd, &flags, cmode, fp);
950 	if (error != 0) {
951 		/*
952 		 * If the vn_open replaced the method vector, something
953 		 * wonderous happened deep below and we just pass it up
954 		 * pretending we know what we do.
955 		 */
956 		if (error == ENXIO && fp->f_ops != &badfileops)
957 			goto success;
958 
959 		/*
960 		 * Handle special fdopen() case. bleh.
961 		 *
962 		 * Don't do this for relative (capability) lookups; we don't
963 		 * understand exactly what would happen, and we don't think
964 		 * that it ever should.
965 		 */
966 		if ((nd.ni_lcf & NI_LCF_STRICTRELATIVE) == 0 &&
967 		    (error == ENODEV || error == ENXIO) &&
968 		    td->td_dupfd >= 0) {
969 			error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
970 			    &indx);
971 			if (error == 0)
972 				goto success;
973 		}
974 
975 		goto bad;
976 	}
977 	td->td_dupfd = 0;
978 	NDFREE(&nd, NDF_ONLY_PNBUF);
979 	vp = nd.ni_vp;
980 
981 	/*
982 	 * Store the vnode, for any f_type. Typically, the vnode use
983 	 * count is decremented by direct call to vn_closefile() for
984 	 * files that switched type in the cdevsw fdopen() method.
985 	 */
986 	fp->f_vnode = vp;
987 	/*
988 	 * If the file wasn't claimed by devfs bind it to the normal
989 	 * vnode operations here.
990 	 */
991 	if (fp->f_ops == &badfileops) {
992 		KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
993 		fp->f_seqcount = 1;
994 		finit(fp, (flags & FMASK) | (fp->f_flag & FHASLOCK),
995 		    DTYPE_VNODE, vp, &vnops);
996 	}
997 
998 	VOP_UNLOCK(vp, 0);
999 	if (flags & O_TRUNC) {
1000 		error = fo_truncate(fp, 0, td->td_ucred, td);
1001 		if (error != 0)
1002 			goto bad;
1003 	}
1004 success:
1005 	/*
1006 	 * If we haven't already installed the FD (for dupfdopen), do so now.
1007 	 */
1008 	if (indx == -1) {
1009 		struct filecaps *fcaps;
1010 
1011 #ifdef CAPABILITIES
1012 		if ((nd.ni_lcf & NI_LCF_STRICTRELATIVE) != 0)
1013 			fcaps = &nd.ni_filecaps;
1014 		else
1015 #endif
1016 			fcaps = NULL;
1017 		error = finstall(td, fp, &indx, flags, fcaps);
1018 		/* On success finstall() consumes fcaps. */
1019 		if (error != 0) {
1020 			filecaps_free(&nd.ni_filecaps);
1021 			goto bad;
1022 		}
1023 	} else {
1024 		filecaps_free(&nd.ni_filecaps);
1025 	}
1026 
1027 	/*
1028 	 * Release our private reference, leaving the one associated with
1029 	 * the descriptor table intact.
1030 	 */
1031 	fdrop(fp, td);
1032 	td->td_retval[0] = indx;
1033 	return (0);
1034 bad:
1035 	KASSERT(indx == -1, ("indx=%d, should be -1", indx));
1036 	fdrop(fp, td);
1037 	return (error);
1038 }
1039 
#ifdef COMPAT_43
/*
 * Old creat(): create a file.  Equivalent to opening 'path' relative
 * to AT_FDCWD with O_WRONLY | O_CREAT | O_TRUNC.
 */
#ifndef _SYS_SYSPROTO_H_
struct ocreat_args {
	char	*path;
	int	mode;
};
#endif
int
ocreat(struct thread *td, struct ocreat_args *uap)
{
	int error;

	error = kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode);
	return (error);
}
#endif /* COMPAT_43 */
1058 
1059 /*
1060  * Create a special file.
1061  */
1062 #ifndef _SYS_SYSPROTO_H_
1063 struct mknod_args {
1064 	char	*path;
1065 	int	mode;
1066 	int	dev;
1067 };
1068 #endif
1069 int
1070 sys_mknod(struct thread *td, struct mknod_args *uap)
1071 {
1072 
1073 	return (kern_mknodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1074 	    uap->mode, uap->dev));
1075 }
1076 
1077 #ifndef _SYS_SYSPROTO_H_
1078 struct mknodat_args {
1079 	int	fd;
1080 	char	*path;
1081 	mode_t	mode;
1082 	dev_t	dev;
1083 };
1084 #endif
1085 int
1086 sys_mknodat(struct thread *td, struct mknodat_args *uap)
1087 {
1088 
1089 	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
1090 	    uap->dev));
1091 }
1092 
1093 int
1094 kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1095     int mode, int dev)
1096 {
1097 	struct vnode *vp;
1098 	struct mount *mp;
1099 	struct vattr vattr;
1100 	struct nameidata nd;
1101 	cap_rights_t rights;
1102 	int error, whiteout = 0;
1103 
1104 	AUDIT_ARG_MODE(mode);
1105 	AUDIT_ARG_DEV(dev);
1106 	switch (mode & S_IFMT) {
1107 	case S_IFCHR:
1108 	case S_IFBLK:
1109 		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
1110 		if (error == 0 && dev == VNOVAL)
1111 			error = EINVAL;
1112 		break;
1113 	case S_IFMT:
1114 		error = priv_check(td, PRIV_VFS_MKNOD_BAD);
1115 		break;
1116 	case S_IFWHT:
1117 		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
1118 		break;
1119 	case S_IFIFO:
1120 		if (dev == 0)
1121 			return (kern_mkfifoat(td, fd, path, pathseg, mode));
1122 		/* FALLTHROUGH */
1123 	default:
1124 		error = EINVAL;
1125 		break;
1126 	}
1127 	if (error != 0)
1128 		return (error);
1129 restart:
1130 	bwillwrite();
1131 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1132 	    NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKNODAT),
1133 	    td);
1134 	if ((error = namei(&nd)) != 0)
1135 		return (error);
1136 	vp = nd.ni_vp;
1137 	if (vp != NULL) {
1138 		NDFREE(&nd, NDF_ONLY_PNBUF);
1139 		if (vp == nd.ni_dvp)
1140 			vrele(nd.ni_dvp);
1141 		else
1142 			vput(nd.ni_dvp);
1143 		vrele(vp);
1144 		return (EEXIST);
1145 	} else {
1146 		VATTR_NULL(&vattr);
1147 		vattr.va_mode = (mode & ALLPERMS) &
1148 		    ~td->td_proc->p_fd->fd_cmask;
1149 		vattr.va_rdev = dev;
1150 		whiteout = 0;
1151 
1152 		switch (mode & S_IFMT) {
1153 		case S_IFMT:	/* used by badsect to flag bad sectors */
1154 			vattr.va_type = VBAD;
1155 			break;
1156 		case S_IFCHR:
1157 			vattr.va_type = VCHR;
1158 			break;
1159 		case S_IFBLK:
1160 			vattr.va_type = VBLK;
1161 			break;
1162 		case S_IFWHT:
1163 			whiteout = 1;
1164 			break;
1165 		default:
1166 			panic("kern_mknod: invalid mode");
1167 		}
1168 	}
1169 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1170 		NDFREE(&nd, NDF_ONLY_PNBUF);
1171 		vput(nd.ni_dvp);
1172 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1173 			return (error);
1174 		goto restart;
1175 	}
1176 #ifdef MAC
1177 	if (error == 0 && !whiteout)
1178 		error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
1179 		    &nd.ni_cnd, &vattr);
1180 #endif
1181 	if (error == 0) {
1182 		if (whiteout)
1183 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
1184 		else {
1185 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
1186 						&nd.ni_cnd, &vattr);
1187 			if (error == 0)
1188 				vput(nd.ni_vp);
1189 		}
1190 	}
1191 	NDFREE(&nd, NDF_ONLY_PNBUF);
1192 	vput(nd.ni_dvp);
1193 	vn_finished_write(mp);
1194 	return (error);
1195 }
1196 
1197 /*
1198  * Create a named pipe.
1199  */
1200 #ifndef _SYS_SYSPROTO_H_
1201 struct mkfifo_args {
1202 	char	*path;
1203 	int	mode;
1204 };
1205 #endif
1206 int
1207 sys_mkfifo(struct thread *td, struct mkfifo_args *uap)
1208 {
1209 
1210 	return (kern_mkfifoat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1211 	    uap->mode));
1212 }
1213 
1214 #ifndef _SYS_SYSPROTO_H_
1215 struct mkfifoat_args {
1216 	int	fd;
1217 	char	*path;
1218 	mode_t	mode;
1219 };
1220 #endif
1221 int
1222 sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
1223 {
1224 
1225 	return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
1226 	    uap->mode));
1227 }
1228 
1229 int
1230 kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1231     int mode)
1232 {
1233 	struct mount *mp;
1234 	struct vattr vattr;
1235 	struct nameidata nd;
1236 	cap_rights_t rights;
1237 	int error;
1238 
1239 	AUDIT_ARG_MODE(mode);
1240 restart:
1241 	bwillwrite();
1242 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1243 	    NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKFIFOAT),
1244 	    td);
1245 	if ((error = namei(&nd)) != 0)
1246 		return (error);
1247 	if (nd.ni_vp != NULL) {
1248 		NDFREE(&nd, NDF_ONLY_PNBUF);
1249 		if (nd.ni_vp == nd.ni_dvp)
1250 			vrele(nd.ni_dvp);
1251 		else
1252 			vput(nd.ni_dvp);
1253 		vrele(nd.ni_vp);
1254 		return (EEXIST);
1255 	}
1256 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1257 		NDFREE(&nd, NDF_ONLY_PNBUF);
1258 		vput(nd.ni_dvp);
1259 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1260 			return (error);
1261 		goto restart;
1262 	}
1263 	VATTR_NULL(&vattr);
1264 	vattr.va_type = VFIFO;
1265 	vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
1266 #ifdef MAC
1267 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1268 	    &vattr);
1269 	if (error != 0)
1270 		goto out;
1271 #endif
1272 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1273 	if (error == 0)
1274 		vput(nd.ni_vp);
1275 #ifdef MAC
1276 out:
1277 #endif
1278 	vput(nd.ni_dvp);
1279 	vn_finished_write(mp);
1280 	NDFREE(&nd, NDF_ONLY_PNBUF);
1281 	return (error);
1282 }
1283 
1284 /*
1285  * Make a hard file link.
1286  */
1287 #ifndef _SYS_SYSPROTO_H_
1288 struct link_args {
1289 	char	*path;
1290 	char	*link;
1291 };
1292 #endif
1293 int
1294 sys_link(struct thread *td, struct link_args *uap)
1295 {
1296 
1297 	return (kern_linkat(td, AT_FDCWD, AT_FDCWD, uap->path, uap->link,
1298 	    UIO_USERSPACE, FOLLOW));
1299 }
1300 
1301 #ifndef _SYS_SYSPROTO_H_
1302 struct linkat_args {
1303 	int	fd1;
1304 	char	*path1;
1305 	int	fd2;
1306 	char	*path2;
1307 	int	flag;
1308 };
1309 #endif
1310 int
1311 sys_linkat(struct thread *td, struct linkat_args *uap)
1312 {
1313 	int flag;
1314 
1315 	flag = uap->flag;
1316 	if (flag & ~AT_SYMLINK_FOLLOW)
1317 		return (EINVAL);
1318 
1319 	return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
1320 	    UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW));
1321 }
1322 
1323 int hardlink_check_uid = 0;
1324 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
1325     &hardlink_check_uid, 0,
1326     "Unprivileged processes cannot create hard links to files owned by other "
1327     "users");
1328 static int hardlink_check_gid = 0;
1329 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
1330     &hardlink_check_gid, 0,
1331     "Unprivileged processes cannot create hard links to files owned by other "
1332     "groups");
1333 
1334 static int
1335 can_hardlink(struct vnode *vp, struct ucred *cred)
1336 {
1337 	struct vattr va;
1338 	int error;
1339 
1340 	if (!hardlink_check_uid && !hardlink_check_gid)
1341 		return (0);
1342 
1343 	error = VOP_GETATTR(vp, &va, cred);
1344 	if (error != 0)
1345 		return (error);
1346 
1347 	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
1348 		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1349 		if (error != 0)
1350 			return (error);
1351 	}
1352 
1353 	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
1354 		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1355 		if (error != 0)
1356 			return (error);
1357 	}
1358 
1359 	return (0);
1360 }
1361 
1362 int
1363 kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
1364     enum uio_seg segflg, int follow)
1365 {
1366 	struct vnode *vp;
1367 	struct mount *mp;
1368 	struct nameidata nd;
1369 	cap_rights_t rights;
1370 	int error;
1371 
1372 again:
1373 	bwillwrite();
1374 	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, segflg, path1, fd1,
1375 	    cap_rights_init(&rights, CAP_LINKAT_SOURCE), td);
1376 
1377 	if ((error = namei(&nd)) != 0)
1378 		return (error);
1379 	NDFREE(&nd, NDF_ONLY_PNBUF);
1380 	vp = nd.ni_vp;
1381 	if (vp->v_type == VDIR) {
1382 		vrele(vp);
1383 		return (EPERM);		/* POSIX */
1384 	}
1385 	NDINIT_ATRIGHTS(&nd, CREATE,
1386 	    LOCKPARENT | SAVENAME | AUDITVNODE2 | NOCACHE, segflg, path2, fd2,
1387 	    cap_rights_init(&rights, CAP_LINKAT_TARGET), td);
1388 	if ((error = namei(&nd)) == 0) {
1389 		if (nd.ni_vp != NULL) {
1390 			NDFREE(&nd, NDF_ONLY_PNBUF);
1391 			if (nd.ni_dvp == nd.ni_vp)
1392 				vrele(nd.ni_dvp);
1393 			else
1394 				vput(nd.ni_dvp);
1395 			vrele(nd.ni_vp);
1396 			vrele(vp);
1397 			return (EEXIST);
1398 		} else if (nd.ni_dvp->v_mount != vp->v_mount) {
1399 			/*
1400 			 * Cross-device link.  No need to recheck
1401 			 * vp->v_type, since it cannot change, except
1402 			 * to VBAD.
1403 			 */
1404 			NDFREE(&nd, NDF_ONLY_PNBUF);
1405 			vput(nd.ni_dvp);
1406 			vrele(vp);
1407 			return (EXDEV);
1408 		} else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) {
1409 			error = can_hardlink(vp, td->td_ucred);
1410 #ifdef MAC
1411 			if (error == 0)
1412 				error = mac_vnode_check_link(td->td_ucred,
1413 				    nd.ni_dvp, vp, &nd.ni_cnd);
1414 #endif
1415 			if (error != 0) {
1416 				vput(vp);
1417 				vput(nd.ni_dvp);
1418 				NDFREE(&nd, NDF_ONLY_PNBUF);
1419 				return (error);
1420 			}
1421 			error = vn_start_write(vp, &mp, V_NOWAIT);
1422 			if (error != 0) {
1423 				vput(vp);
1424 				vput(nd.ni_dvp);
1425 				NDFREE(&nd, NDF_ONLY_PNBUF);
1426 				error = vn_start_write(NULL, &mp,
1427 				    V_XSLEEP | PCATCH);
1428 				if (error != 0)
1429 					return (error);
1430 				goto again;
1431 			}
1432 			error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
1433 			VOP_UNLOCK(vp, 0);
1434 			vput(nd.ni_dvp);
1435 			vn_finished_write(mp);
1436 			NDFREE(&nd, NDF_ONLY_PNBUF);
1437 		} else {
1438 			vput(nd.ni_dvp);
1439 			NDFREE(&nd, NDF_ONLY_PNBUF);
1440 			vrele(vp);
1441 			goto again;
1442 		}
1443 	}
1444 	vrele(vp);
1445 	return (error);
1446 }
1447 
1448 /*
1449  * Make a symbolic link.
1450  */
1451 #ifndef _SYS_SYSPROTO_H_
1452 struct symlink_args {
1453 	char	*path;
1454 	char	*link;
1455 };
1456 #endif
1457 int
1458 sys_symlink(struct thread *td, struct symlink_args *uap)
1459 {
1460 
1461 	return (kern_symlinkat(td, uap->path, AT_FDCWD, uap->link,
1462 	    UIO_USERSPACE));
1463 }
1464 
1465 #ifndef _SYS_SYSPROTO_H_
1466 struct symlinkat_args {
1467 	char	*path;
1468 	int	fd;
1469 	char	*path2;
1470 };
1471 #endif
1472 int
1473 sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
1474 {
1475 
1476 	return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
1477 	    UIO_USERSPACE));
1478 }
1479 
1480 int
1481 kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
1482     enum uio_seg segflg)
1483 {
1484 	struct mount *mp;
1485 	struct vattr vattr;
1486 	char *syspath;
1487 	struct nameidata nd;
1488 	int error;
1489 	cap_rights_t rights;
1490 
1491 	if (segflg == UIO_SYSSPACE) {
1492 		syspath = path1;
1493 	} else {
1494 		syspath = uma_zalloc(namei_zone, M_WAITOK);
1495 		if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
1496 			goto out;
1497 	}
1498 	AUDIT_ARG_TEXT(syspath);
1499 restart:
1500 	bwillwrite();
1501 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1502 	    NOCACHE, segflg, path2, fd, cap_rights_init(&rights, CAP_SYMLINKAT),
1503 	    td);
1504 	if ((error = namei(&nd)) != 0)
1505 		goto out;
1506 	if (nd.ni_vp) {
1507 		NDFREE(&nd, NDF_ONLY_PNBUF);
1508 		if (nd.ni_vp == nd.ni_dvp)
1509 			vrele(nd.ni_dvp);
1510 		else
1511 			vput(nd.ni_dvp);
1512 		vrele(nd.ni_vp);
1513 		error = EEXIST;
1514 		goto out;
1515 	}
1516 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1517 		NDFREE(&nd, NDF_ONLY_PNBUF);
1518 		vput(nd.ni_dvp);
1519 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1520 			goto out;
1521 		goto restart;
1522 	}
1523 	VATTR_NULL(&vattr);
1524 	vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
1525 #ifdef MAC
1526 	vattr.va_type = VLNK;
1527 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1528 	    &vattr);
1529 	if (error != 0)
1530 		goto out2;
1531 #endif
1532 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
1533 	if (error == 0)
1534 		vput(nd.ni_vp);
1535 #ifdef MAC
1536 out2:
1537 #endif
1538 	NDFREE(&nd, NDF_ONLY_PNBUF);
1539 	vput(nd.ni_dvp);
1540 	vn_finished_write(mp);
1541 out:
1542 	if (segflg != UIO_SYSSPACE)
1543 		uma_zfree(namei_zone, syspath);
1544 	return (error);
1545 }
1546 
1547 /*
1548  * Delete a whiteout from the filesystem.
1549  */
1550 #ifndef _SYS_SYSPROTO_H_
1551 struct undelete_args {
1552 	char *path;
1553 };
1554 #endif
1555 int
1556 sys_undelete(struct thread *td, struct undelete_args *uap)
1557 {
1558 	struct mount *mp;
1559 	struct nameidata nd;
1560 	int error;
1561 
1562 restart:
1563 	bwillwrite();
1564 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
1565 	    UIO_USERSPACE, uap->path, td);
1566 	error = namei(&nd);
1567 	if (error != 0)
1568 		return (error);
1569 
1570 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
1571 		NDFREE(&nd, NDF_ONLY_PNBUF);
1572 		if (nd.ni_vp == nd.ni_dvp)
1573 			vrele(nd.ni_dvp);
1574 		else
1575 			vput(nd.ni_dvp);
1576 		if (nd.ni_vp)
1577 			vrele(nd.ni_vp);
1578 		return (EEXIST);
1579 	}
1580 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1581 		NDFREE(&nd, NDF_ONLY_PNBUF);
1582 		vput(nd.ni_dvp);
1583 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1584 			return (error);
1585 		goto restart;
1586 	}
1587 	error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
1588 	NDFREE(&nd, NDF_ONLY_PNBUF);
1589 	vput(nd.ni_dvp);
1590 	vn_finished_write(mp);
1591 	return (error);
1592 }
1593 
1594 /*
1595  * Delete a name from the filesystem.
1596  */
1597 #ifndef _SYS_SYSPROTO_H_
1598 struct unlink_args {
1599 	char	*path;
1600 };
1601 #endif
1602 int
1603 sys_unlink(struct thread *td, struct unlink_args *uap)
1604 {
1605 
1606 	return (kern_unlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE, 0));
1607 }
1608 
1609 #ifndef _SYS_SYSPROTO_H_
1610 struct unlinkat_args {
1611 	int	fd;
1612 	char	*path;
1613 	int	flag;
1614 };
1615 #endif
1616 int
1617 sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
1618 {
1619 	int flag = uap->flag;
1620 	int fd = uap->fd;
1621 	char *path = uap->path;
1622 
1623 	if (flag & ~AT_REMOVEDIR)
1624 		return (EINVAL);
1625 
1626 	if (flag & AT_REMOVEDIR)
1627 		return (kern_rmdirat(td, fd, path, UIO_USERSPACE));
1628 	else
1629 		return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0));
1630 }
1631 
1632 int
1633 kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1634     ino_t oldinum)
1635 {
1636 	struct mount *mp;
1637 	struct vnode *vp;
1638 	struct nameidata nd;
1639 	struct stat sb;
1640 	cap_rights_t rights;
1641 	int error;
1642 
1643 restart:
1644 	bwillwrite();
1645 	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
1646 	    pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
1647 	if ((error = namei(&nd)) != 0)
1648 		return (error == EINVAL ? EPERM : error);
1649 	vp = nd.ni_vp;
1650 	if (vp->v_type == VDIR && oldinum == 0) {
1651 		error = EPERM;		/* POSIX */
1652 	} else if (oldinum != 0 &&
1653 		  ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
1654 		  sb.st_ino != oldinum) {
1655 			error = EIDRM;	/* Identifier removed */
1656 	} else {
1657 		/*
1658 		 * The root of a mounted filesystem cannot be deleted.
1659 		 *
1660 		 * XXX: can this only be a VDIR case?
1661 		 */
1662 		if (vp->v_vflag & VV_ROOT)
1663 			error = EBUSY;
1664 	}
1665 	if (error == 0) {
1666 		if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1667 			NDFREE(&nd, NDF_ONLY_PNBUF);
1668 			vput(nd.ni_dvp);
1669 			if (vp == nd.ni_dvp)
1670 				vrele(vp);
1671 			else
1672 				vput(vp);
1673 			if ((error = vn_start_write(NULL, &mp,
1674 			    V_XSLEEP | PCATCH)) != 0)
1675 				return (error);
1676 			goto restart;
1677 		}
1678 #ifdef MAC
1679 		error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
1680 		    &nd.ni_cnd);
1681 		if (error != 0)
1682 			goto out;
1683 #endif
1684 		vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
1685 		error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
1686 #ifdef MAC
1687 out:
1688 #endif
1689 		vn_finished_write(mp);
1690 	}
1691 	NDFREE(&nd, NDF_ONLY_PNBUF);
1692 	vput(nd.ni_dvp);
1693 	if (vp == nd.ni_dvp)
1694 		vrele(vp);
1695 	else
1696 		vput(vp);
1697 	return (error);
1698 }
1699 
1700 /*
1701  * Reposition read/write file offset.
1702  */
1703 #ifndef _SYS_SYSPROTO_H_
1704 struct lseek_args {
1705 	int	fd;
1706 	int	pad;
1707 	off_t	offset;
1708 	int	whence;
1709 };
1710 #endif
1711 int
1712 sys_lseek(struct thread *td, struct lseek_args *uap)
1713 {
1714 
1715 	return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
1716 }
1717 
1718 int
1719 kern_lseek(struct thread *td, int fd, off_t offset, int whence)
1720 {
1721 	struct file *fp;
1722 	cap_rights_t rights;
1723 	int error;
1724 
1725 	AUDIT_ARG_FD(fd);
1726 	error = fget(td, fd, cap_rights_init(&rights, CAP_SEEK), &fp);
1727 	if (error != 0)
1728 		return (error);
1729 	error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
1730 	    fo_seek(fp, offset, whence, td) : ESPIPE;
1731 	fdrop(fp, td);
1732 	return (error);
1733 }
1734 
#if defined(COMPAT_43)
/*
 * COMPAT_43 lseek: reposition the read/write file offset.  The offset
 * arrives as a long and is converted to off_t for kern_lseek().
 */
#ifndef _SYS_SYSPROTO_H_
struct olseek_args {
	int	fd;
	long	offset;
	int	whence;
};
#endif
int
olseek(struct thread *td, struct olseek_args *uap)
{
	off_t offset;

	offset = uap->offset;
	return (kern_lseek(td, uap->fd, offset, uap->whence));
}
#endif /* COMPAT_43 */
1753 
#if defined(COMPAT_FREEBSD6)
/*
 * COMPAT_FREEBSD6 lseek: the version with the 'pad' argument, which is
 * not passed on to kern_lseek().
 */
int
freebsd6_lseek(struct thread *td, struct freebsd6_lseek_args *uap)
{
	int error;

	error = kern_lseek(td, uap->fd, uap->offset, uap->whence);
	return (error);
}
#endif
1763 
1764 /*
1765  * Check access permissions using passed credentials.
1766  */
1767 static int
1768 vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
1769      struct thread *td)
1770 {
1771 	accmode_t accmode;
1772 	int error;
1773 
1774 	/* Flags == 0 means only check for existence. */
1775 	if (user_flags == 0)
1776 		return (0);
1777 
1778 	accmode = 0;
1779 	if (user_flags & R_OK)
1780 		accmode |= VREAD;
1781 	if (user_flags & W_OK)
1782 		accmode |= VWRITE;
1783 	if (user_flags & X_OK)
1784 		accmode |= VEXEC;
1785 #ifdef MAC
1786 	error = mac_vnode_check_access(cred, vp, accmode);
1787 	if (error != 0)
1788 		return (error);
1789 #endif
1790 	if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
1791 		error = VOP_ACCESS(vp, accmode, cred, td);
1792 	return (error);
1793 }
1794 
1795 /*
1796  * Check access permissions using "real" credentials.
1797  */
1798 #ifndef _SYS_SYSPROTO_H_
1799 struct access_args {
1800 	char	*path;
1801 	int	amode;
1802 };
1803 #endif
1804 int
1805 sys_access(struct thread *td, struct access_args *uap)
1806 {
1807 
1808 	return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1809 	    0, uap->amode));
1810 }
1811 
1812 #ifndef _SYS_SYSPROTO_H_
1813 struct faccessat_args {
1814 	int	dirfd;
1815 	char	*path;
1816 	int	amode;
1817 	int	flag;
1818 }
1819 #endif
1820 int
1821 sys_faccessat(struct thread *td, struct faccessat_args *uap)
1822 {
1823 
1824 	return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
1825 	    uap->amode));
1826 }
1827 
1828 int
1829 kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1830     int flag, int amode)
1831 {
1832 	struct ucred *cred, *usecred;
1833 	struct vnode *vp;
1834 	struct nameidata nd;
1835 	cap_rights_t rights;
1836 	int error;
1837 
1838 	if (flag & ~AT_EACCESS)
1839 		return (EINVAL);
1840 	if (amode != F_OK && (amode & ~(R_OK | W_OK | X_OK)) != 0)
1841 		return (EINVAL);
1842 
1843 	/*
1844 	 * Create and modify a temporary credential instead of one that
1845 	 * is potentially shared (if we need one).
1846 	 */
1847 	cred = td->td_ucred;
1848 	if ((flag & AT_EACCESS) == 0 &&
1849 	    ((cred->cr_uid != cred->cr_ruid ||
1850 	    cred->cr_rgid != cred->cr_groups[0]))) {
1851 		usecred = crdup(cred);
1852 		usecred->cr_uid = cred->cr_ruid;
1853 		usecred->cr_groups[0] = cred->cr_rgid;
1854 		td->td_ucred = usecred;
1855 	} else
1856 		usecred = cred;
1857 	AUDIT_ARG_VALUE(amode);
1858 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
1859 	    AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FSTAT),
1860 	    td);
1861 	if ((error = namei(&nd)) != 0)
1862 		goto out;
1863 	vp = nd.ni_vp;
1864 
1865 	error = vn_access(vp, amode, usecred, td);
1866 	NDFREE(&nd, NDF_ONLY_PNBUF);
1867 	vput(vp);
1868 out:
1869 	if (usecred != cred) {
1870 		td->td_ucred = cred;
1871 		crfree(usecred);
1872 	}
1873 	return (error);
1874 }
1875 
1876 /*
1877  * Check access permissions using "effective" credentials.
1878  */
1879 #ifndef _SYS_SYSPROTO_H_
1880 struct eaccess_args {
1881 	char	*path;
1882 	int	amode;
1883 };
1884 #endif
1885 int
1886 sys_eaccess(struct thread *td, struct eaccess_args *uap)
1887 {
1888 
1889 	return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1890 	    AT_EACCESS, uap->amode));
1891 }
1892 
1893 #if defined(COMPAT_43)
1894 /*
1895  * Get file status; this version follows links.
1896  */
1897 #ifndef _SYS_SYSPROTO_H_
1898 struct ostat_args {
1899 	char	*path;
1900 	struct ostat *ub;
1901 };
1902 #endif
1903 int
1904 ostat(struct thread *td, struct ostat_args *uap)
1905 {
1906 	struct stat sb;
1907 	struct ostat osb;
1908 	int error;
1909 
1910 	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
1911 	    &sb, NULL);
1912 	if (error != 0)
1913 		return (error);
1914 	cvtstat(&sb, &osb);
1915 	return (copyout(&osb, uap->ub, sizeof (osb)));
1916 }
1917 
1918 /*
1919  * Get file status; this version does not follow links.
1920  */
1921 #ifndef _SYS_SYSPROTO_H_
1922 struct olstat_args {
1923 	char	*path;
1924 	struct ostat *ub;
1925 };
1926 #endif
1927 int
1928 olstat(struct thread *td, struct olstat_args *uap)
1929 {
1930 	struct stat sb;
1931 	struct ostat osb;
1932 	int error;
1933 
1934 	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
1935 	    UIO_USERSPACE, &sb, NULL);
1936 	if (error != 0)
1937 		return (error);
1938 	cvtstat(&sb, &osb);
1939 	return (copyout(&osb, uap->ub, sizeof (osb)));
1940 }
1941 
1942 /*
1943  * Convert from an old to a new stat structure.
1944  */
1945 void
1946 cvtstat(struct stat *st, struct ostat *ost)
1947 {
1948 
1949 	bzero(ost, sizeof(*ost));
1950 	ost->st_dev = st->st_dev;
1951 	ost->st_ino = st->st_ino;
1952 	ost->st_mode = st->st_mode;
1953 	ost->st_nlink = st->st_nlink;
1954 	ost->st_uid = st->st_uid;
1955 	ost->st_gid = st->st_gid;
1956 	ost->st_rdev = st->st_rdev;
1957 	if (st->st_size < (quad_t)1 << 32)
1958 		ost->st_size = st->st_size;
1959 	else
1960 		ost->st_size = -2;
1961 	ost->st_atim = st->st_atim;
1962 	ost->st_mtim = st->st_mtim;
1963 	ost->st_ctim = st->st_ctim;
1964 	ost->st_blksize = st->st_blksize;
1965 	ost->st_blocks = st->st_blocks;
1966 	ost->st_flags = st->st_flags;
1967 	ost->st_gen = st->st_gen;
1968 }
1969 #endif /* COMPAT_43 */
1970 
1971 /*
1972  * Get file status; this version follows links.
1973  */
1974 #ifndef _SYS_SYSPROTO_H_
1975 struct stat_args {
1976 	char	*path;
1977 	struct stat *ub;
1978 };
1979 #endif
1980 int
1981 sys_stat(struct thread *td, struct stat_args *uap)
1982 {
1983 	struct stat sb;
1984 	int error;
1985 
1986 	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
1987 	    &sb, NULL);
1988 	if (error == 0)
1989 		error = copyout(&sb, uap->ub, sizeof (sb));
1990 	return (error);
1991 }
1992 
1993 #ifndef _SYS_SYSPROTO_H_
1994 struct fstatat_args {
1995 	int	fd;
1996 	char	*path;
1997 	struct stat	*buf;
1998 	int	flag;
1999 }
2000 #endif
2001 int
2002 sys_fstatat(struct thread *td, struct fstatat_args *uap)
2003 {
2004 	struct stat sb;
2005 	int error;
2006 
2007 	error = kern_statat(td, uap->flag, uap->fd, uap->path,
2008 	    UIO_USERSPACE, &sb, NULL);
2009 	if (error == 0)
2010 		error = copyout(&sb, uap->buf, sizeof (sb));
2011 	return (error);
2012 }
2013 
2014 int
2015 kern_statat(struct thread *td, int flag, int fd, char *path,
2016     enum uio_seg pathseg, struct stat *sbp,
2017     void (*hook)(struct vnode *vp, struct stat *sbp))
2018 {
2019 	struct nameidata nd;
2020 	struct stat sb;
2021 	cap_rights_t rights;
2022 	int error;
2023 
2024 	if (flag & ~AT_SYMLINK_NOFOLLOW)
2025 		return (EINVAL);
2026 
2027 	NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
2028 	    FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd,
2029 	    cap_rights_init(&rights, CAP_FSTAT), td);
2030 
2031 	if ((error = namei(&nd)) != 0)
2032 		return (error);
2033 	error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
2034 	if (error == 0) {
2035 		SDT_PROBE2(vfs, , stat, mode, path, sb.st_mode);
2036 		if (S_ISREG(sb.st_mode))
2037 			SDT_PROBE2(vfs, , stat, reg, path, pathseg);
2038 		if (__predict_false(hook != NULL))
2039 			hook(nd.ni_vp, &sb);
2040 	}
2041 	NDFREE(&nd, NDF_ONLY_PNBUF);
2042 	vput(nd.ni_vp);
2043 	if (error != 0)
2044 		return (error);
2045 	*sbp = sb;
2046 #ifdef KTRACE
2047 	if (KTRPOINT(td, KTR_STRUCT))
2048 		ktrstat(&sb);
2049 #endif
2050 	return (0);
2051 }
2052 
2053 /*
2054  * Get file status; this version does not follow links.
2055  */
2056 #ifndef _SYS_SYSPROTO_H_
2057 struct lstat_args {
2058 	char	*path;
2059 	struct stat *ub;
2060 };
2061 #endif
2062 int
2063 sys_lstat(struct thread *td, struct lstat_args *uap)
2064 {
2065 	struct stat sb;
2066 	int error;
2067 
2068 	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
2069 	    UIO_USERSPACE, &sb, NULL);
2070 	if (error == 0)
2071 		error = copyout(&sb, uap->ub, sizeof (sb));
2072 	return (error);
2073 }
2074 
2075 /*
2076  * Implementation of the NetBSD [l]stat() functions.
2077  */
2078 void
2079 cvtnstat( struct stat *sb, struct nstat *nsb)
2080 {
2081 
2082 	bzero(nsb, sizeof *nsb);
2083 	nsb->st_dev = sb->st_dev;
2084 	nsb->st_ino = sb->st_ino;
2085 	nsb->st_mode = sb->st_mode;
2086 	nsb->st_nlink = sb->st_nlink;
2087 	nsb->st_uid = sb->st_uid;
2088 	nsb->st_gid = sb->st_gid;
2089 	nsb->st_rdev = sb->st_rdev;
2090 	nsb->st_atim = sb->st_atim;
2091 	nsb->st_mtim = sb->st_mtim;
2092 	nsb->st_ctim = sb->st_ctim;
2093 	nsb->st_size = sb->st_size;
2094 	nsb->st_blocks = sb->st_blocks;
2095 	nsb->st_blksize = sb->st_blksize;
2096 	nsb->st_flags = sb->st_flags;
2097 	nsb->st_gen = sb->st_gen;
2098 	nsb->st_birthtim = sb->st_birthtim;
2099 }
2100 
2101 #ifndef _SYS_SYSPROTO_H_
2102 struct nstat_args {
2103 	char	*path;
2104 	struct nstat *ub;
2105 };
2106 #endif
2107 int
2108 sys_nstat(struct thread *td, struct nstat_args *uap)
2109 {
2110 	struct stat sb;
2111 	struct nstat nsb;
2112 	int error;
2113 
2114 	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
2115 	    &sb, NULL);
2116 	if (error != 0)
2117 		return (error);
2118 	cvtnstat(&sb, &nsb);
2119 	return (copyout(&nsb, uap->ub, sizeof (nsb)));
2120 }
2121 
2122 /*
2123  * NetBSD lstat.  Get file status; this version does not follow links.
2124  */
2125 #ifndef _SYS_SYSPROTO_H_
2126 struct lstat_args {
2127 	char	*path;
2128 	struct stat *ub;
2129 };
2130 #endif
2131 int
2132 sys_nlstat(struct thread *td, struct nlstat_args *uap)
2133 {
2134 	struct stat sb;
2135 	struct nstat nsb;
2136 	int error;
2137 
2138 	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
2139 	    UIO_USERSPACE, &sb, NULL);
2140 	if (error != 0)
2141 		return (error);
2142 	cvtnstat(&sb, &nsb);
2143 	return (copyout(&nsb, uap->ub, sizeof (nsb)));
2144 }
2145 
2146 /*
2147  * Get configurable pathname variables.
2148  */
2149 #ifndef _SYS_SYSPROTO_H_
2150 struct pathconf_args {
2151 	char	*path;
2152 	int	name;
2153 };
2154 #endif
2155 int
2156 sys_pathconf(struct thread *td, struct pathconf_args *uap)
2157 {
2158 
2159 	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW));
2160 }
2161 
2162 #ifndef _SYS_SYSPROTO_H_
2163 struct lpathconf_args {
2164 	char	*path;
2165 	int	name;
2166 };
2167 #endif
2168 int
2169 sys_lpathconf(struct thread *td, struct lpathconf_args *uap)
2170 {
2171 
2172 	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
2173 	    NOFOLLOW));
2174 }
2175 
2176 int
2177 kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
2178     u_long flags)
2179 {
2180 	struct nameidata nd;
2181 	int error;
2182 
2183 	NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
2184 	    pathseg, path, td);
2185 	if ((error = namei(&nd)) != 0)
2186 		return (error);
2187 	NDFREE(&nd, NDF_ONLY_PNBUF);
2188 
2189 	error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
2190 	vput(nd.ni_vp);
2191 	return (error);
2192 }
2193 
2194 /*
2195  * Return target name of a symbolic link.
2196  */
2197 #ifndef _SYS_SYSPROTO_H_
2198 struct readlink_args {
2199 	char	*path;
2200 	char	*buf;
2201 	size_t	count;
2202 };
2203 #endif
2204 int
2205 sys_readlink(struct thread *td, struct readlink_args *uap)
2206 {
2207 
2208 	return (kern_readlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2209 	    uap->buf, UIO_USERSPACE, uap->count));
2210 }
2211 #ifndef _SYS_SYSPROTO_H_
2212 struct readlinkat_args {
2213 	int	fd;
2214 	char	*path;
2215 	char	*buf;
2216 	size_t	bufsize;
2217 };
2218 #endif
2219 int
2220 sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
2221 {
2222 
2223 	return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
2224 	    uap->buf, UIO_USERSPACE, uap->bufsize));
2225 }
2226 
2227 int
2228 kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2229     char *buf, enum uio_seg bufseg, size_t count)
2230 {
2231 	struct vnode *vp;
2232 	struct iovec aiov;
2233 	struct uio auio;
2234 	struct nameidata nd;
2235 	int error;
2236 
2237 	if (count > IOSIZE_MAX)
2238 		return (EINVAL);
2239 
2240 	NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
2241 	    pathseg, path, fd, td);
2242 
2243 	if ((error = namei(&nd)) != 0)
2244 		return (error);
2245 	NDFREE(&nd, NDF_ONLY_PNBUF);
2246 	vp = nd.ni_vp;
2247 #ifdef MAC
2248 	error = mac_vnode_check_readlink(td->td_ucred, vp);
2249 	if (error != 0) {
2250 		vput(vp);
2251 		return (error);
2252 	}
2253 #endif
2254 	if (vp->v_type != VLNK)
2255 		error = EINVAL;
2256 	else {
2257 		aiov.iov_base = buf;
2258 		aiov.iov_len = count;
2259 		auio.uio_iov = &aiov;
2260 		auio.uio_iovcnt = 1;
2261 		auio.uio_offset = 0;
2262 		auio.uio_rw = UIO_READ;
2263 		auio.uio_segflg = bufseg;
2264 		auio.uio_td = td;
2265 		auio.uio_resid = count;
2266 		error = VOP_READLINK(vp, &auio, td->td_ucred);
2267 		td->td_retval[0] = count - auio.uio_resid;
2268 	}
2269 	vput(vp);
2270 	return (error);
2271 }
2272 
2273 /*
2274  * Common implementation code for chflags() and fchflags().
2275  */
2276 static int
2277 setfflags(struct thread *td, struct vnode *vp, u_long flags)
2278 {
2279 	struct mount *mp;
2280 	struct vattr vattr;
2281 	int error;
2282 
2283 	/* We can't support the value matching VNOVAL. */
2284 	if (flags == VNOVAL)
2285 		return (EOPNOTSUPP);
2286 
2287 	/*
2288 	 * Prevent non-root users from setting flags on devices.  When
2289 	 * a device is reused, users can retain ownership of the device
2290 	 * if they are allowed to set flags and programs assume that
2291 	 * chown can't fail when done as root.
2292 	 */
2293 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
2294 		error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
2295 		if (error != 0)
2296 			return (error);
2297 	}
2298 
2299 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2300 		return (error);
2301 	VATTR_NULL(&vattr);
2302 	vattr.va_flags = flags;
2303 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2304 #ifdef MAC
2305 	error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
2306 	if (error == 0)
2307 #endif
2308 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2309 	VOP_UNLOCK(vp, 0);
2310 	vn_finished_write(mp);
2311 	return (error);
2312 }
2313 
2314 /*
2315  * Change flags of a file given a path name.
2316  */
2317 #ifndef _SYS_SYSPROTO_H_
2318 struct chflags_args {
2319 	const char *path;
2320 	u_long	flags;
2321 };
2322 #endif
2323 int
2324 sys_chflags(struct thread *td, struct chflags_args *uap)
2325 {
2326 
2327 	return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2328 	    uap->flags, 0));
2329 }
2330 
2331 #ifndef _SYS_SYSPROTO_H_
2332 struct chflagsat_args {
2333 	int	fd;
2334 	const char *path;
2335 	u_long	flags;
2336 	int	atflag;
2337 }
2338 #endif
2339 int
2340 sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
2341 {
2342 	int fd = uap->fd;
2343 	const char *path = uap->path;
2344 	u_long flags = uap->flags;
2345 	int atflag = uap->atflag;
2346 
2347 	if (atflag & ~AT_SYMLINK_NOFOLLOW)
2348 		return (EINVAL);
2349 
2350 	return (kern_chflagsat(td, fd, path, UIO_USERSPACE, flags, atflag));
2351 }
2352 
2353 /*
2354  * Same as chflags() but doesn't follow symlinks.
2355  */
2356 #ifndef _SYS_SYSPROTO_H_
2357 struct lchflags_args {
2358 	const char *path;
2359 	u_long flags;
2360 };
2361 #endif
2362 int
2363 sys_lchflags(struct thread *td, struct lchflags_args *uap)
2364 {
2365 
2366 	return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2367 	    uap->flags, AT_SYMLINK_NOFOLLOW));
2368 }
2369 
2370 static int
2371 kern_chflagsat(struct thread *td, int fd, const char *path,
2372     enum uio_seg pathseg, u_long flags, int atflag)
2373 {
2374 	struct nameidata nd;
2375 	cap_rights_t rights;
2376 	int error, follow;
2377 
2378 	AUDIT_ARG_FFLAGS(flags);
2379 	follow = (atflag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2380 	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2381 	    cap_rights_init(&rights, CAP_FCHFLAGS), td);
2382 	if ((error = namei(&nd)) != 0)
2383 		return (error);
2384 	NDFREE(&nd, NDF_ONLY_PNBUF);
2385 	error = setfflags(td, nd.ni_vp, flags);
2386 	vrele(nd.ni_vp);
2387 	return (error);
2388 }
2389 
2390 /*
2391  * Change flags of a file given a file descriptor.
2392  */
2393 #ifndef _SYS_SYSPROTO_H_
2394 struct fchflags_args {
2395 	int	fd;
2396 	u_long	flags;
2397 };
2398 #endif
2399 int
2400 sys_fchflags(struct thread *td, struct fchflags_args *uap)
2401 {
2402 	struct file *fp;
2403 	cap_rights_t rights;
2404 	int error;
2405 
2406 	AUDIT_ARG_FD(uap->fd);
2407 	AUDIT_ARG_FFLAGS(uap->flags);
2408 	error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_FCHFLAGS),
2409 	    &fp);
2410 	if (error != 0)
2411 		return (error);
2412 #ifdef AUDIT
2413 	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2414 	AUDIT_ARG_VNODE1(fp->f_vnode);
2415 	VOP_UNLOCK(fp->f_vnode, 0);
2416 #endif
2417 	error = setfflags(td, fp->f_vnode, uap->flags);
2418 	fdrop(fp, td);
2419 	return (error);
2420 }
2421 
2422 /*
2423  * Common implementation code for chmod(), lchmod() and fchmod().
2424  */
2425 int
2426 setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode)
2427 {
2428 	struct mount *mp;
2429 	struct vattr vattr;
2430 	int error;
2431 
2432 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2433 		return (error);
2434 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2435 	VATTR_NULL(&vattr);
2436 	vattr.va_mode = mode & ALLPERMS;
2437 #ifdef MAC
2438 	error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
2439 	if (error == 0)
2440 #endif
2441 		error = VOP_SETATTR(vp, &vattr, cred);
2442 	VOP_UNLOCK(vp, 0);
2443 	vn_finished_write(mp);
2444 	return (error);
2445 }
2446 
2447 /*
2448  * Change mode of a file given path name.
2449  */
2450 #ifndef _SYS_SYSPROTO_H_
2451 struct chmod_args {
2452 	char	*path;
2453 	int	mode;
2454 };
2455 #endif
2456 int
2457 sys_chmod(struct thread *td, struct chmod_args *uap)
2458 {
2459 
2460 	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2461 	    uap->mode, 0));
2462 }
2463 
2464 #ifndef _SYS_SYSPROTO_H_
2465 struct fchmodat_args {
2466 	int	dirfd;
2467 	char	*path;
2468 	mode_t	mode;
2469 	int	flag;
2470 }
2471 #endif
2472 int
2473 sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
2474 {
2475 	int flag = uap->flag;
2476 	int fd = uap->fd;
2477 	char *path = uap->path;
2478 	mode_t mode = uap->mode;
2479 
2480 	if (flag & ~AT_SYMLINK_NOFOLLOW)
2481 		return (EINVAL);
2482 
2483 	return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
2484 }
2485 
2486 /*
2487  * Change mode of a file given path name (don't follow links.)
2488  */
2489 #ifndef _SYS_SYSPROTO_H_
2490 struct lchmod_args {
2491 	char	*path;
2492 	int	mode;
2493 };
2494 #endif
2495 int
2496 sys_lchmod(struct thread *td, struct lchmod_args *uap)
2497 {
2498 
2499 	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2500 	    uap->mode, AT_SYMLINK_NOFOLLOW));
2501 }
2502 
2503 int
2504 kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2505     mode_t mode, int flag)
2506 {
2507 	struct nameidata nd;
2508 	cap_rights_t rights;
2509 	int error, follow;
2510 
2511 	AUDIT_ARG_MODE(mode);
2512 	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2513 	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2514 	    cap_rights_init(&rights, CAP_FCHMOD), td);
2515 	if ((error = namei(&nd)) != 0)
2516 		return (error);
2517 	NDFREE(&nd, NDF_ONLY_PNBUF);
2518 	error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
2519 	vrele(nd.ni_vp);
2520 	return (error);
2521 }
2522 
2523 /*
2524  * Change mode of a file given a file descriptor.
2525  */
2526 #ifndef _SYS_SYSPROTO_H_
2527 struct fchmod_args {
2528 	int	fd;
2529 	int	mode;
2530 };
2531 #endif
2532 int
2533 sys_fchmod(struct thread *td, struct fchmod_args *uap)
2534 {
2535 	struct file *fp;
2536 	cap_rights_t rights;
2537 	int error;
2538 
2539 	AUDIT_ARG_FD(uap->fd);
2540 	AUDIT_ARG_MODE(uap->mode);
2541 
2542 	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHMOD), &fp);
2543 	if (error != 0)
2544 		return (error);
2545 	error = fo_chmod(fp, uap->mode, td->td_ucred, td);
2546 	fdrop(fp, td);
2547 	return (error);
2548 }
2549 
2550 /*
2551  * Common implementation for chown(), lchown(), and fchown()
2552  */
2553 int
2554 setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid,
2555     gid_t gid)
2556 {
2557 	struct mount *mp;
2558 	struct vattr vattr;
2559 	int error;
2560 
2561 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2562 		return (error);
2563 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2564 	VATTR_NULL(&vattr);
2565 	vattr.va_uid = uid;
2566 	vattr.va_gid = gid;
2567 #ifdef MAC
2568 	error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
2569 	    vattr.va_gid);
2570 	if (error == 0)
2571 #endif
2572 		error = VOP_SETATTR(vp, &vattr, cred);
2573 	VOP_UNLOCK(vp, 0);
2574 	vn_finished_write(mp);
2575 	return (error);
2576 }
2577 
2578 /*
2579  * Set ownership given a path name.
2580  */
2581 #ifndef _SYS_SYSPROTO_H_
2582 struct chown_args {
2583 	char	*path;
2584 	int	uid;
2585 	int	gid;
2586 };
2587 #endif
2588 int
2589 sys_chown(struct thread *td, struct chown_args *uap)
2590 {
2591 
2592 	return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid,
2593 	    uap->gid, 0));
2594 }
2595 
2596 #ifndef _SYS_SYSPROTO_H_
2597 struct fchownat_args {
2598 	int fd;
2599 	const char * path;
2600 	uid_t uid;
2601 	gid_t gid;
2602 	int flag;
2603 };
2604 #endif
2605 int
2606 sys_fchownat(struct thread *td, struct fchownat_args *uap)
2607 {
2608 	int flag;
2609 
2610 	flag = uap->flag;
2611 	if (flag & ~AT_SYMLINK_NOFOLLOW)
2612 		return (EINVAL);
2613 
2614 	return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
2615 	    uap->gid, uap->flag));
2616 }
2617 
2618 int
2619 kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2620     int uid, int gid, int flag)
2621 {
2622 	struct nameidata nd;
2623 	cap_rights_t rights;
2624 	int error, follow;
2625 
2626 	AUDIT_ARG_OWNER(uid, gid);
2627 	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2628 	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2629 	    cap_rights_init(&rights, CAP_FCHOWN), td);
2630 
2631 	if ((error = namei(&nd)) != 0)
2632 		return (error);
2633 	NDFREE(&nd, NDF_ONLY_PNBUF);
2634 	error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
2635 	vrele(nd.ni_vp);
2636 	return (error);
2637 }
2638 
2639 /*
2640  * Set ownership given a path name, do not cross symlinks.
2641  */
2642 #ifndef _SYS_SYSPROTO_H_
2643 struct lchown_args {
2644 	char	*path;
2645 	int	uid;
2646 	int	gid;
2647 };
2648 #endif
2649 int
2650 sys_lchown(struct thread *td, struct lchown_args *uap)
2651 {
2652 
2653 	return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2654 	    uap->uid, uap->gid, AT_SYMLINK_NOFOLLOW));
2655 }
2656 
2657 /*
2658  * Set ownership given a file descriptor.
2659  */
2660 #ifndef _SYS_SYSPROTO_H_
2661 struct fchown_args {
2662 	int	fd;
2663 	int	uid;
2664 	int	gid;
2665 };
2666 #endif
2667 int
2668 sys_fchown(struct thread *td, struct fchown_args *uap)
2669 {
2670 	struct file *fp;
2671 	cap_rights_t rights;
2672 	int error;
2673 
2674 	AUDIT_ARG_FD(uap->fd);
2675 	AUDIT_ARG_OWNER(uap->uid, uap->gid);
2676 	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHOWN), &fp);
2677 	if (error != 0)
2678 		return (error);
2679 	error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
2680 	fdrop(fp, td);
2681 	return (error);
2682 }
2683 
2684 /*
2685  * Common implementation code for utimes(), lutimes(), and futimes().
2686  */
2687 static int
2688 getutimes(const struct timeval *usrtvp, enum uio_seg tvpseg,
2689     struct timespec *tsp)
2690 {
2691 	struct timeval tv[2];
2692 	const struct timeval *tvp;
2693 	int error;
2694 
2695 	if (usrtvp == NULL) {
2696 		vfs_timestamp(&tsp[0]);
2697 		tsp[1] = tsp[0];
2698 	} else {
2699 		if (tvpseg == UIO_SYSSPACE) {
2700 			tvp = usrtvp;
2701 		} else {
2702 			if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
2703 				return (error);
2704 			tvp = tv;
2705 		}
2706 
2707 		if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
2708 		    tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
2709 			return (EINVAL);
2710 		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
2711 		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
2712 	}
2713 	return (0);
2714 }
2715 
2716 /*
2717  * Common implementation code for futimens(), utimensat().
2718  */
2719 #define	UTIMENS_NULL	0x1
2720 #define	UTIMENS_EXIT	0x2
2721 static int
2722 getutimens(const struct timespec *usrtsp, enum uio_seg tspseg,
2723     struct timespec *tsp, int *retflags)
2724 {
2725 	struct timespec tsnow;
2726 	int error;
2727 
2728 	vfs_timestamp(&tsnow);
2729 	*retflags = 0;
2730 	if (usrtsp == NULL) {
2731 		tsp[0] = tsnow;
2732 		tsp[1] = tsnow;
2733 		*retflags |= UTIMENS_NULL;
2734 		return (0);
2735 	}
2736 	if (tspseg == UIO_SYSSPACE) {
2737 		tsp[0] = usrtsp[0];
2738 		tsp[1] = usrtsp[1];
2739 	} else if ((error = copyin(usrtsp, tsp, sizeof(*tsp) * 2)) != 0)
2740 		return (error);
2741 	if (tsp[0].tv_nsec == UTIME_OMIT && tsp[1].tv_nsec == UTIME_OMIT)
2742 		*retflags |= UTIMENS_EXIT;
2743 	if (tsp[0].tv_nsec == UTIME_NOW && tsp[1].tv_nsec == UTIME_NOW)
2744 		*retflags |= UTIMENS_NULL;
2745 	if (tsp[0].tv_nsec == UTIME_OMIT)
2746 		tsp[0].tv_sec = VNOVAL;
2747 	else if (tsp[0].tv_nsec == UTIME_NOW)
2748 		tsp[0] = tsnow;
2749 	else if (tsp[0].tv_nsec < 0 || tsp[0].tv_nsec >= 1000000000L)
2750 		return (EINVAL);
2751 	if (tsp[1].tv_nsec == UTIME_OMIT)
2752 		tsp[1].tv_sec = VNOVAL;
2753 	else if (tsp[1].tv_nsec == UTIME_NOW)
2754 		tsp[1] = tsnow;
2755 	else if (tsp[1].tv_nsec < 0 || tsp[1].tv_nsec >= 1000000000L)
2756 		return (EINVAL);
2757 
2758 	return (0);
2759 }
2760 
2761 /*
2762  * Common implementation code for utimes(), lutimes(), futimes(), futimens(),
2763  * and utimensat().
2764  */
2765 static int
2766 setutimes(struct thread *td, struct vnode *vp, const struct timespec *ts,
2767     int numtimes, int nullflag)
2768 {
2769 	struct mount *mp;
2770 	struct vattr vattr;
2771 	int error, setbirthtime;
2772 
2773 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2774 		return (error);
2775 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2776 	setbirthtime = 0;
2777 	if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
2778 	    timespeccmp(&ts[1], &vattr.va_birthtime, < ))
2779 		setbirthtime = 1;
2780 	VATTR_NULL(&vattr);
2781 	vattr.va_atime = ts[0];
2782 	vattr.va_mtime = ts[1];
2783 	if (setbirthtime)
2784 		vattr.va_birthtime = ts[1];
2785 	if (numtimes > 2)
2786 		vattr.va_birthtime = ts[2];
2787 	if (nullflag)
2788 		vattr.va_vaflags |= VA_UTIMES_NULL;
2789 #ifdef MAC
2790 	error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
2791 	    vattr.va_mtime);
2792 #endif
2793 	if (error == 0)
2794 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2795 	VOP_UNLOCK(vp, 0);
2796 	vn_finished_write(mp);
2797 	return (error);
2798 }
2799 
2800 /*
2801  * Set the access and modification times of a file.
2802  */
2803 #ifndef _SYS_SYSPROTO_H_
2804 struct utimes_args {
2805 	char	*path;
2806 	struct	timeval *tptr;
2807 };
2808 #endif
2809 int
2810 sys_utimes(struct thread *td, struct utimes_args *uap)
2811 {
2812 
2813 	return (kern_utimesat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2814 	    uap->tptr, UIO_USERSPACE));
2815 }
2816 
2817 #ifndef _SYS_SYSPROTO_H_
2818 struct futimesat_args {
2819 	int fd;
2820 	const char * path;
2821 	const struct timeval * times;
2822 };
2823 #endif
2824 int
2825 sys_futimesat(struct thread *td, struct futimesat_args *uap)
2826 {
2827 
2828 	return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
2829 	    uap->times, UIO_USERSPACE));
2830 }
2831 
2832 int
2833 kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2834     struct timeval *tptr, enum uio_seg tptrseg)
2835 {
2836 	struct nameidata nd;
2837 	struct timespec ts[2];
2838 	cap_rights_t rights;
2839 	int error;
2840 
2841 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
2842 		return (error);
2843 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
2844 	    cap_rights_init(&rights, CAP_FUTIMES), td);
2845 
2846 	if ((error = namei(&nd)) != 0)
2847 		return (error);
2848 	NDFREE(&nd, NDF_ONLY_PNBUF);
2849 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
2850 	vrele(nd.ni_vp);
2851 	return (error);
2852 }
2853 
2854 /*
2855  * Set the access and modification times of a file.
2856  */
2857 #ifndef _SYS_SYSPROTO_H_
2858 struct lutimes_args {
2859 	char	*path;
2860 	struct	timeval *tptr;
2861 };
2862 #endif
2863 int
2864 sys_lutimes(struct thread *td, struct lutimes_args *uap)
2865 {
2866 
2867 	return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
2868 	    UIO_USERSPACE));
2869 }
2870 
2871 int
2872 kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
2873     struct timeval *tptr, enum uio_seg tptrseg)
2874 {
2875 	struct timespec ts[2];
2876 	struct nameidata nd;
2877 	int error;
2878 
2879 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
2880 		return (error);
2881 	NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td);
2882 	if ((error = namei(&nd)) != 0)
2883 		return (error);
2884 	NDFREE(&nd, NDF_ONLY_PNBUF);
2885 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
2886 	vrele(nd.ni_vp);
2887 	return (error);
2888 }
2889 
2890 /*
2891  * Set the access and modification times of a file.
2892  */
2893 #ifndef _SYS_SYSPROTO_H_
2894 struct futimes_args {
2895 	int	fd;
2896 	struct	timeval *tptr;
2897 };
2898 #endif
2899 int
2900 sys_futimes(struct thread *td, struct futimes_args *uap)
2901 {
2902 
2903 	return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
2904 }
2905 
2906 int
2907 kern_futimes(struct thread *td, int fd, struct timeval *tptr,
2908     enum uio_seg tptrseg)
2909 {
2910 	struct timespec ts[2];
2911 	struct file *fp;
2912 	cap_rights_t rights;
2913 	int error;
2914 
2915 	AUDIT_ARG_FD(fd);
2916 	error = getutimes(tptr, tptrseg, ts);
2917 	if (error != 0)
2918 		return (error);
2919 	error = getvnode(td, fd, cap_rights_init(&rights, CAP_FUTIMES), &fp);
2920 	if (error != 0)
2921 		return (error);
2922 #ifdef AUDIT
2923 	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2924 	AUDIT_ARG_VNODE1(fp->f_vnode);
2925 	VOP_UNLOCK(fp->f_vnode, 0);
2926 #endif
2927 	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
2928 	fdrop(fp, td);
2929 	return (error);
2930 }
2931 
2932 int
2933 sys_futimens(struct thread *td, struct futimens_args *uap)
2934 {
2935 
2936 	return (kern_futimens(td, uap->fd, uap->times, UIO_USERSPACE));
2937 }
2938 
2939 int
2940 kern_futimens(struct thread *td, int fd, struct timespec *tptr,
2941     enum uio_seg tptrseg)
2942 {
2943 	struct timespec ts[2];
2944 	struct file *fp;
2945 	cap_rights_t rights;
2946 	int error, flags;
2947 
2948 	AUDIT_ARG_FD(fd);
2949 	error = getutimens(tptr, tptrseg, ts, &flags);
2950 	if (error != 0)
2951 		return (error);
2952 	if (flags & UTIMENS_EXIT)
2953 		return (0);
2954 	error = getvnode(td, fd, cap_rights_init(&rights, CAP_FUTIMES), &fp);
2955 	if (error != 0)
2956 		return (error);
2957 #ifdef AUDIT
2958 	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2959 	AUDIT_ARG_VNODE1(fp->f_vnode);
2960 	VOP_UNLOCK(fp->f_vnode, 0);
2961 #endif
2962 	error = setutimes(td, fp->f_vnode, ts, 2, flags & UTIMENS_NULL);
2963 	fdrop(fp, td);
2964 	return (error);
2965 }
2966 
2967 int
2968 sys_utimensat(struct thread *td, struct utimensat_args *uap)
2969 {
2970 
2971 	return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE,
2972 	    uap->times, UIO_USERSPACE, uap->flag));
2973 }
2974 
2975 int
2976 kern_utimensat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2977     struct timespec *tptr, enum uio_seg tptrseg, int flag)
2978 {
2979 	struct nameidata nd;
2980 	struct timespec ts[2];
2981 	cap_rights_t rights;
2982 	int error, flags;
2983 
2984 	if (flag & ~AT_SYMLINK_NOFOLLOW)
2985 		return (EINVAL);
2986 
2987 	if ((error = getutimens(tptr, tptrseg, ts, &flags)) != 0)
2988 		return (error);
2989 	NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
2990 	    FOLLOW) | AUDITVNODE1, pathseg, path, fd,
2991 	    cap_rights_init(&rights, CAP_FUTIMES), td);
2992 	if ((error = namei(&nd)) != 0)
2993 		return (error);
2994 	/*
2995 	 * We are allowed to call namei() regardless of 2xUTIME_OMIT.
2996 	 * POSIX states:
2997 	 * "If both tv_nsec fields are UTIME_OMIT... EACCESS may be detected."
2998 	 * "Search permission is denied by a component of the path prefix."
2999 	 */
3000 	NDFREE(&nd, NDF_ONLY_PNBUF);
3001 	if ((flags & UTIMENS_EXIT) == 0)
3002 		error = setutimes(td, nd.ni_vp, ts, 2, flags & UTIMENS_NULL);
3003 	vrele(nd.ni_vp);
3004 	return (error);
3005 }
3006 
3007 /*
3008  * Truncate a file given its path name.
3009  */
3010 #ifndef _SYS_SYSPROTO_H_
3011 struct truncate_args {
3012 	char	*path;
3013 	int	pad;
3014 	off_t	length;
3015 };
3016 #endif
3017 int
3018 sys_truncate(struct thread *td, struct truncate_args *uap)
3019 {
3020 
3021 	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3022 }
3023 
3024 int
3025 kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
3026 {
3027 	struct mount *mp;
3028 	struct vnode *vp;
3029 	void *rl_cookie;
3030 	struct vattr vattr;
3031 	struct nameidata nd;
3032 	int error;
3033 
3034 	if (length < 0)
3035 		return(EINVAL);
3036 	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
3037 	if ((error = namei(&nd)) != 0)
3038 		return (error);
3039 	vp = nd.ni_vp;
3040 	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
3041 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
3042 		vn_rangelock_unlock(vp, rl_cookie);
3043 		vrele(vp);
3044 		return (error);
3045 	}
3046 	NDFREE(&nd, NDF_ONLY_PNBUF);
3047 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3048 	if (vp->v_type == VDIR)
3049 		error = EISDIR;
3050 #ifdef MAC
3051 	else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
3052 	}
3053 #endif
3054 	else if ((error = vn_writechk(vp)) == 0 &&
3055 	    (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
3056 		VATTR_NULL(&vattr);
3057 		vattr.va_size = length;
3058 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3059 	}
3060 	VOP_UNLOCK(vp, 0);
3061 	vn_finished_write(mp);
3062 	vn_rangelock_unlock(vp, rl_cookie);
3063 	vrele(vp);
3064 	return (error);
3065 }
3066 
#if defined(COMPAT_43)
/*
 * COMPAT_43 truncate entry point: same operation as truncate(), with the
 * length supplied as a 'long'.
 */
#ifndef _SYS_SYSPROTO_H_
struct otruncate_args {
	char	*path;
	long	length;
};
#endif
int
otruncate(struct thread *td, struct otruncate_args *uap)
{
	int error;

	error = kern_truncate(td, uap->path, UIO_USERSPACE, uap->length);
	return (error);
}
#endif /* COMPAT_43 */
3084 
3085 #if defined(COMPAT_FREEBSD6)
3086 /* Versions with the pad argument */
3087 int
3088 freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
3089 {
3090 
3091 	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3092 }
3093 
3094 int
3095 freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
3096 {
3097 
3098 	return (kern_ftruncate(td, uap->fd, uap->length));
3099 }
3100 #endif
3101 
3102 int
3103 kern_fsync(struct thread *td, int fd, bool fullsync)
3104 {
3105 	struct vnode *vp;
3106 	struct mount *mp;
3107 	struct file *fp;
3108 	cap_rights_t rights;
3109 	int error, lock_flags;
3110 
3111 	AUDIT_ARG_FD(fd);
3112 	error = getvnode(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp);
3113 	if (error != 0)
3114 		return (error);
3115 	vp = fp->f_vnode;
3116 #if 0
3117 	if (!fullsync)
3118 		/* XXXKIB: compete outstanding aio writes */;
3119 #endif
3120 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
3121 	if (error != 0)
3122 		goto drop;
3123 	if (MNT_SHARED_WRITES(mp) ||
3124 	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
3125 		lock_flags = LK_SHARED;
3126 	} else {
3127 		lock_flags = LK_EXCLUSIVE;
3128 	}
3129 	vn_lock(vp, lock_flags | LK_RETRY);
3130 	AUDIT_ARG_VNODE1(vp);
3131 	if (vp->v_object != NULL) {
3132 		VM_OBJECT_WLOCK(vp->v_object);
3133 		vm_object_page_clean(vp->v_object, 0, 0, 0);
3134 		VM_OBJECT_WUNLOCK(vp->v_object);
3135 	}
3136 	error = fullsync ? VOP_FSYNC(vp, MNT_WAIT, td) : VOP_FDATASYNC(vp, td);
3137 	VOP_UNLOCK(vp, 0);
3138 	vn_finished_write(mp);
3139 drop:
3140 	fdrop(fp, td);
3141 	return (error);
3142 }
3143 
3144 /*
3145  * Sync an open file.
3146  */
3147 #ifndef _SYS_SYSPROTO_H_
3148 struct fsync_args {
3149 	int	fd;
3150 };
3151 #endif
3152 int
3153 sys_fsync(struct thread *td, struct fsync_args *uap)
3154 {
3155 
3156 	return (kern_fsync(td, uap->fd, true));
3157 }
3158 
3159 int
3160 sys_fdatasync(struct thread *td, struct fdatasync_args *uap)
3161 {
3162 
3163 	return (kern_fsync(td, uap->fd, false));
3164 }
3165 
3166 /*
3167  * Rename files.  Source and destination must either both be directories, or
3168  * both not be directories.  If target is a directory, it must be empty.
3169  */
3170 #ifndef _SYS_SYSPROTO_H_
3171 struct rename_args {
3172 	char	*from;
3173 	char	*to;
3174 };
3175 #endif
3176 int
3177 sys_rename(struct thread *td, struct rename_args *uap)
3178 {
3179 
3180 	return (kern_renameat(td, AT_FDCWD, uap->from, AT_FDCWD,
3181 	    uap->to, UIO_USERSPACE));
3182 }
3183 
3184 #ifndef _SYS_SYSPROTO_H_
3185 struct renameat_args {
3186 	int	oldfd;
3187 	char	*old;
3188 	int	newfd;
3189 	char	*new;
3190 };
3191 #endif
3192 int
3193 sys_renameat(struct thread *td, struct renameat_args *uap)
3194 {
3195 
3196 	return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
3197 	    UIO_USERSPACE));
3198 }
3199 
3200 int
3201 kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
3202     enum uio_seg pathseg)
3203 {
3204 	struct mount *mp = NULL;
3205 	struct vnode *tvp, *fvp, *tdvp;
3206 	struct nameidata fromnd, tond;
3207 	cap_rights_t rights;
3208 	int error;
3209 
3210 again:
3211 	bwillwrite();
3212 #ifdef MAC
3213 	NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
3214 	    AUDITVNODE1, pathseg, old, oldfd,
3215 	    cap_rights_init(&rights, CAP_RENAMEAT_SOURCE), td);
3216 #else
3217 	NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1,
3218 	    pathseg, old, oldfd,
3219 	    cap_rights_init(&rights, CAP_RENAMEAT_SOURCE), td);
3220 #endif
3221 
3222 	if ((error = namei(&fromnd)) != 0)
3223 		return (error);
3224 #ifdef MAC
3225 	error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
3226 	    fromnd.ni_vp, &fromnd.ni_cnd);
3227 	VOP_UNLOCK(fromnd.ni_dvp, 0);
3228 	if (fromnd.ni_dvp != fromnd.ni_vp)
3229 		VOP_UNLOCK(fromnd.ni_vp, 0);
3230 #endif
3231 	fvp = fromnd.ni_vp;
3232 	NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
3233 	    SAVESTART | AUDITVNODE2, pathseg, new, newfd,
3234 	    cap_rights_init(&rights, CAP_RENAMEAT_TARGET), td);
3235 	if (fromnd.ni_vp->v_type == VDIR)
3236 		tond.ni_cnd.cn_flags |= WILLBEDIR;
3237 	if ((error = namei(&tond)) != 0) {
3238 		/* Translate error code for rename("dir1", "dir2/."). */
3239 		if (error == EISDIR && fvp->v_type == VDIR)
3240 			error = EINVAL;
3241 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3242 		vrele(fromnd.ni_dvp);
3243 		vrele(fvp);
3244 		goto out1;
3245 	}
3246 	tdvp = tond.ni_dvp;
3247 	tvp = tond.ni_vp;
3248 	error = vn_start_write(fvp, &mp, V_NOWAIT);
3249 	if (error != 0) {
3250 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3251 		NDFREE(&tond, NDF_ONLY_PNBUF);
3252 		if (tvp != NULL)
3253 			vput(tvp);
3254 		if (tdvp == tvp)
3255 			vrele(tdvp);
3256 		else
3257 			vput(tdvp);
3258 		vrele(fromnd.ni_dvp);
3259 		vrele(fvp);
3260 		vrele(tond.ni_startdir);
3261 		if (fromnd.ni_startdir != NULL)
3262 			vrele(fromnd.ni_startdir);
3263 		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
3264 		if (error != 0)
3265 			return (error);
3266 		goto again;
3267 	}
3268 	if (tvp != NULL) {
3269 		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
3270 			error = ENOTDIR;
3271 			goto out;
3272 		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
3273 			error = EISDIR;
3274 			goto out;
3275 		}
3276 #ifdef CAPABILITIES
3277 		if (newfd != AT_FDCWD) {
3278 			/*
3279 			 * If the target already exists we require CAP_UNLINKAT
3280 			 * from 'newfd'.
3281 			 */
3282 			error = cap_check(&tond.ni_filecaps.fc_rights,
3283 			    cap_rights_init(&rights, CAP_UNLINKAT));
3284 			if (error != 0)
3285 				goto out;
3286 		}
3287 #endif
3288 	}
3289 	if (fvp == tdvp) {
3290 		error = EINVAL;
3291 		goto out;
3292 	}
3293 	/*
3294 	 * If the source is the same as the destination (that is, if they
3295 	 * are links to the same vnode), then there is nothing to do.
3296 	 */
3297 	if (fvp == tvp)
3298 		error = -1;
3299 #ifdef MAC
3300 	else
3301 		error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
3302 		    tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
3303 #endif
3304 out:
3305 	if (error == 0) {
3306 		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
3307 		    tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
3308 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3309 		NDFREE(&tond, NDF_ONLY_PNBUF);
3310 	} else {
3311 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3312 		NDFREE(&tond, NDF_ONLY_PNBUF);
3313 		if (tvp != NULL)
3314 			vput(tvp);
3315 		if (tdvp == tvp)
3316 			vrele(tdvp);
3317 		else
3318 			vput(tdvp);
3319 		vrele(fromnd.ni_dvp);
3320 		vrele(fvp);
3321 	}
3322 	vrele(tond.ni_startdir);
3323 	vn_finished_write(mp);
3324 out1:
3325 	if (fromnd.ni_startdir)
3326 		vrele(fromnd.ni_startdir);
3327 	if (error == -1)
3328 		return (0);
3329 	return (error);
3330 }
3331 
3332 /*
3333  * Make a directory file.
3334  */
3335 #ifndef _SYS_SYSPROTO_H_
3336 struct mkdir_args {
3337 	char	*path;
3338 	int	mode;
3339 };
3340 #endif
3341 int
3342 sys_mkdir(struct thread *td, struct mkdir_args *uap)
3343 {
3344 
3345 	return (kern_mkdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
3346 	    uap->mode));
3347 }
3348 
3349 #ifndef _SYS_SYSPROTO_H_
3350 struct mkdirat_args {
3351 	int	fd;
3352 	char	*path;
3353 	mode_t	mode;
3354 };
3355 #endif
3356 int
3357 sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
3358 {
3359 
3360 	return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
3361 }
3362 
3363 int
3364 kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
3365     int mode)
3366 {
3367 	struct mount *mp;
3368 	struct vnode *vp;
3369 	struct vattr vattr;
3370 	struct nameidata nd;
3371 	cap_rights_t rights;
3372 	int error;
3373 
3374 	AUDIT_ARG_MODE(mode);
3375 restart:
3376 	bwillwrite();
3377 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
3378 	    NOCACHE, segflg, path, fd, cap_rights_init(&rights, CAP_MKDIRAT),
3379 	    td);
3380 	nd.ni_cnd.cn_flags |= WILLBEDIR;
3381 	if ((error = namei(&nd)) != 0)
3382 		return (error);
3383 	vp = nd.ni_vp;
3384 	if (vp != NULL) {
3385 		NDFREE(&nd, NDF_ONLY_PNBUF);
3386 		/*
3387 		 * XXX namei called with LOCKPARENT but not LOCKLEAF has
3388 		 * the strange behaviour of leaving the vnode unlocked
3389 		 * if the target is the same vnode as the parent.
3390 		 */
3391 		if (vp == nd.ni_dvp)
3392 			vrele(nd.ni_dvp);
3393 		else
3394 			vput(nd.ni_dvp);
3395 		vrele(vp);
3396 		return (EEXIST);
3397 	}
3398 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3399 		NDFREE(&nd, NDF_ONLY_PNBUF);
3400 		vput(nd.ni_dvp);
3401 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3402 			return (error);
3403 		goto restart;
3404 	}
3405 	VATTR_NULL(&vattr);
3406 	vattr.va_type = VDIR;
3407 	vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
3408 #ifdef MAC
3409 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
3410 	    &vattr);
3411 	if (error != 0)
3412 		goto out;
3413 #endif
3414 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
3415 #ifdef MAC
3416 out:
3417 #endif
3418 	NDFREE(&nd, NDF_ONLY_PNBUF);
3419 	vput(nd.ni_dvp);
3420 	if (error == 0)
3421 		vput(nd.ni_vp);
3422 	vn_finished_write(mp);
3423 	return (error);
3424 }
3425 
3426 /*
3427  * Remove a directory file.
3428  */
3429 #ifndef _SYS_SYSPROTO_H_
3430 struct rmdir_args {
3431 	char	*path;
3432 };
3433 #endif
3434 int
3435 sys_rmdir(struct thread *td, struct rmdir_args *uap)
3436 {
3437 
3438 	return (kern_rmdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE));
3439 }
3440 
3441 int
3442 kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
3443 {
3444 	struct mount *mp;
3445 	struct vnode *vp;
3446 	struct nameidata nd;
3447 	cap_rights_t rights;
3448 	int error;
3449 
3450 restart:
3451 	bwillwrite();
3452 	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
3453 	    pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
3454 	if ((error = namei(&nd)) != 0)
3455 		return (error);
3456 	vp = nd.ni_vp;
3457 	if (vp->v_type != VDIR) {
3458 		error = ENOTDIR;
3459 		goto out;
3460 	}
3461 	/*
3462 	 * No rmdir "." please.
3463 	 */
3464 	if (nd.ni_dvp == vp) {
3465 		error = EINVAL;
3466 		goto out;
3467 	}
3468 	/*
3469 	 * The root of a mounted filesystem cannot be deleted.
3470 	 */
3471 	if (vp->v_vflag & VV_ROOT) {
3472 		error = EBUSY;
3473 		goto out;
3474 	}
3475 #ifdef MAC
3476 	error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
3477 	    &nd.ni_cnd);
3478 	if (error != 0)
3479 		goto out;
3480 #endif
3481 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3482 		NDFREE(&nd, NDF_ONLY_PNBUF);
3483 		vput(vp);
3484 		if (nd.ni_dvp == vp)
3485 			vrele(nd.ni_dvp);
3486 		else
3487 			vput(nd.ni_dvp);
3488 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3489 			return (error);
3490 		goto restart;
3491 	}
3492 	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
3493 	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
3494 	vn_finished_write(mp);
3495 out:
3496 	NDFREE(&nd, NDF_ONLY_PNBUF);
3497 	vput(vp);
3498 	if (nd.ni_dvp == vp)
3499 		vrele(nd.ni_dvp);
3500 	else
3501 		vput(nd.ni_dvp);
3502 	return (error);
3503 }
3504 
3505 #ifdef COMPAT_43
3506 /*
3507  * Read a block of directory entries in a filesystem independent format.
3508  */
3509 #ifndef _SYS_SYSPROTO_H_
3510 struct ogetdirentries_args {
3511 	int	fd;
3512 	char	*buf;
3513 	u_int	count;
3514 	long	*basep;
3515 };
3516 #endif
3517 int
3518 ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
3519 {
3520 	long loff;
3521 	int error;
3522 
3523 	error = kern_ogetdirentries(td, uap, &loff);
3524 	if (error == 0)
3525 		error = copyout(&loff, uap->basep, sizeof(long));
3526 	return (error);
3527 }
3528 
3529 int
3530 kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
3531     long *ploff)
3532 {
3533 	struct vnode *vp;
3534 	struct file *fp;
3535 	struct uio auio, kuio;
3536 	struct iovec aiov, kiov;
3537 	struct dirent *dp, *edp;
3538 	cap_rights_t rights;
3539 	caddr_t dirbuf;
3540 	int error, eofflag, readcnt;
3541 	long loff;
3542 	off_t foffset;
3543 
3544 	/* XXX arbitrary sanity limit on `count'. */
3545 	if (uap->count > 64 * 1024)
3546 		return (EINVAL);
3547 	error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_READ), &fp);
3548 	if (error != 0)
3549 		return (error);
3550 	if ((fp->f_flag & FREAD) == 0) {
3551 		fdrop(fp, td);
3552 		return (EBADF);
3553 	}
3554 	vp = fp->f_vnode;
3555 	foffset = foffset_lock(fp, 0);
3556 unionread:
3557 	if (vp->v_type != VDIR) {
3558 		foffset_unlock(fp, foffset, 0);
3559 		fdrop(fp, td);
3560 		return (EINVAL);
3561 	}
3562 	aiov.iov_base = uap->buf;
3563 	aiov.iov_len = uap->count;
3564 	auio.uio_iov = &aiov;
3565 	auio.uio_iovcnt = 1;
3566 	auio.uio_rw = UIO_READ;
3567 	auio.uio_segflg = UIO_USERSPACE;
3568 	auio.uio_td = td;
3569 	auio.uio_resid = uap->count;
3570 	vn_lock(vp, LK_SHARED | LK_RETRY);
3571 	loff = auio.uio_offset = foffset;
3572 #ifdef MAC
3573 	error = mac_vnode_check_readdir(td->td_ucred, vp);
3574 	if (error != 0) {
3575 		VOP_UNLOCK(vp, 0);
3576 		foffset_unlock(fp, foffset, FOF_NOUPDATE);
3577 		fdrop(fp, td);
3578 		return (error);
3579 	}
3580 #endif
3581 #	if (BYTE_ORDER != LITTLE_ENDIAN)
3582 		if (vp->v_mount->mnt_maxsymlinklen <= 0) {
3583 			error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
3584 			    NULL, NULL);
3585 			foffset = auio.uio_offset;
3586 		} else
3587 #	endif
3588 	{
3589 		kuio = auio;
3590 		kuio.uio_iov = &kiov;
3591 		kuio.uio_segflg = UIO_SYSSPACE;
3592 		kiov.iov_len = uap->count;
3593 		dirbuf = malloc(uap->count, M_TEMP, M_WAITOK);
3594 		kiov.iov_base = dirbuf;
3595 		error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
3596 			    NULL, NULL);
3597 		foffset = kuio.uio_offset;
3598 		if (error == 0) {
3599 			readcnt = uap->count - kuio.uio_resid;
3600 			edp = (struct dirent *)&dirbuf[readcnt];
3601 			for (dp = (struct dirent *)dirbuf; dp < edp; ) {
3602 #				if (BYTE_ORDER == LITTLE_ENDIAN)
3603 					/*
3604 					 * The expected low byte of
3605 					 * dp->d_namlen is our dp->d_type.
3606 					 * The high MBZ byte of dp->d_namlen
3607 					 * is our dp->d_namlen.
3608 					 */
3609 					dp->d_type = dp->d_namlen;
3610 					dp->d_namlen = 0;
3611 #				else
3612 					/*
3613 					 * The dp->d_type is the high byte
3614 					 * of the expected dp->d_namlen,
3615 					 * so must be zero'ed.
3616 					 */
3617 					dp->d_type = 0;
3618 #				endif
3619 				if (dp->d_reclen > 0) {
3620 					dp = (struct dirent *)
3621 					    ((char *)dp + dp->d_reclen);
3622 				} else {
3623 					error = EIO;
3624 					break;
3625 				}
3626 			}
3627 			if (dp >= edp)
3628 				error = uiomove(dirbuf, readcnt, &auio);
3629 		}
3630 		free(dirbuf, M_TEMP);
3631 	}
3632 	if (error != 0) {
3633 		VOP_UNLOCK(vp, 0);
3634 		foffset_unlock(fp, foffset, 0);
3635 		fdrop(fp, td);
3636 		return (error);
3637 	}
3638 	if (uap->count == auio.uio_resid &&
3639 	    (vp->v_vflag & VV_ROOT) &&
3640 	    (vp->v_mount->mnt_flag & MNT_UNION)) {
3641 		struct vnode *tvp = vp;
3642 		vp = vp->v_mount->mnt_vnodecovered;
3643 		VREF(vp);
3644 		fp->f_vnode = vp;
3645 		fp->f_data = vp;
3646 		foffset = 0;
3647 		vput(tvp);
3648 		goto unionread;
3649 	}
3650 	VOP_UNLOCK(vp, 0);
3651 	foffset_unlock(fp, foffset, 0);
3652 	fdrop(fp, td);
3653 	td->td_retval[0] = uap->count - auio.uio_resid;
3654 	if (error == 0)
3655 		*ploff = loff;
3656 	return (error);
3657 }
3658 #endif /* COMPAT_43 */
3659 
3660 /*
3661  * Read a block of directory entries in a filesystem independent format.
3662  */
3663 #ifndef _SYS_SYSPROTO_H_
3664 struct getdirentries_args {
3665 	int	fd;
3666 	char	*buf;
3667 	u_int	count;
3668 	long	*basep;
3669 };
3670 #endif
3671 int
3672 sys_getdirentries(struct thread *td, struct getdirentries_args *uap)
3673 {
3674 	long base;
3675 	int error;
3676 
3677 	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
3678 	    NULL, UIO_USERSPACE);
3679 	if (error != 0)
3680 		return (error);
3681 	if (uap->basep != NULL)
3682 		error = copyout(&base, uap->basep, sizeof(long));
3683 	return (error);
3684 }
3685 
3686 int
3687 kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
3688     long *basep, ssize_t *residp, enum uio_seg bufseg)
3689 {
3690 	struct vnode *vp;
3691 	struct file *fp;
3692 	struct uio auio;
3693 	struct iovec aiov;
3694 	cap_rights_t rights;
3695 	long loff;
3696 	int error, eofflag;
3697 	off_t foffset;
3698 
3699 	AUDIT_ARG_FD(fd);
3700 	if (count > IOSIZE_MAX)
3701 		return (EINVAL);
3702 	auio.uio_resid = count;
3703 	error = getvnode(td, fd, cap_rights_init(&rights, CAP_READ), &fp);
3704 	if (error != 0)
3705 		return (error);
3706 	if ((fp->f_flag & FREAD) == 0) {
3707 		fdrop(fp, td);
3708 		return (EBADF);
3709 	}
3710 	vp = fp->f_vnode;
3711 	foffset = foffset_lock(fp, 0);
3712 unionread:
3713 	if (vp->v_type != VDIR) {
3714 		error = EINVAL;
3715 		goto fail;
3716 	}
3717 	aiov.iov_base = buf;
3718 	aiov.iov_len = count;
3719 	auio.uio_iov = &aiov;
3720 	auio.uio_iovcnt = 1;
3721 	auio.uio_rw = UIO_READ;
3722 	auio.uio_segflg = bufseg;
3723 	auio.uio_td = td;
3724 	vn_lock(vp, LK_SHARED | LK_RETRY);
3725 	AUDIT_ARG_VNODE1(vp);
3726 	loff = auio.uio_offset = foffset;
3727 #ifdef MAC
3728 	error = mac_vnode_check_readdir(td->td_ucred, vp);
3729 	if (error == 0)
3730 #endif
3731 		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
3732 		    NULL);
3733 	foffset = auio.uio_offset;
3734 	if (error != 0) {
3735 		VOP_UNLOCK(vp, 0);
3736 		goto fail;
3737 	}
3738 	if (count == auio.uio_resid &&
3739 	    (vp->v_vflag & VV_ROOT) &&
3740 	    (vp->v_mount->mnt_flag & MNT_UNION)) {
3741 		struct vnode *tvp = vp;
3742 
3743 		vp = vp->v_mount->mnt_vnodecovered;
3744 		VREF(vp);
3745 		fp->f_vnode = vp;
3746 		fp->f_data = vp;
3747 		foffset = 0;
3748 		vput(tvp);
3749 		goto unionread;
3750 	}
3751 	VOP_UNLOCK(vp, 0);
3752 	*basep = loff;
3753 	if (residp != NULL)
3754 		*residp = auio.uio_resid;
3755 	td->td_retval[0] = count - auio.uio_resid;
3756 fail:
3757 	foffset_unlock(fp, foffset, 0);
3758 	fdrop(fp, td);
3759 	return (error);
3760 }
3761 
3762 #ifndef _SYS_SYSPROTO_H_
3763 struct getdents_args {
3764 	int fd;
3765 	char *buf;
3766 	size_t count;
3767 };
3768 #endif
3769 int
3770 sys_getdents(struct thread *td, struct getdents_args *uap)
3771 {
3772 	struct getdirentries_args ap;
3773 
3774 	ap.fd = uap->fd;
3775 	ap.buf = uap->buf;
3776 	ap.count = uap->count;
3777 	ap.basep = NULL;
3778 	return (sys_getdirentries(td, &ap));
3779 }
3780 
3781 /*
3782  * Set the mode mask for creation of filesystem nodes.
3783  */
3784 #ifndef _SYS_SYSPROTO_H_
3785 struct umask_args {
3786 	int	newmask;
3787 };
3788 #endif
3789 int
3790 sys_umask(struct thread *td, struct umask_args *uap)
3791 {
3792 	struct filedesc *fdp;
3793 
3794 	fdp = td->td_proc->p_fd;
3795 	FILEDESC_XLOCK(fdp);
3796 	td->td_retval[0] = fdp->fd_cmask;
3797 	fdp->fd_cmask = uap->newmask & ALLPERMS;
3798 	FILEDESC_XUNLOCK(fdp);
3799 	return (0);
3800 }
3801 
3802 /*
3803  * Void all references to file by ripping underlying filesystem away from
3804  * vnode.
3805  */
3806 #ifndef _SYS_SYSPROTO_H_
3807 struct revoke_args {
3808 	char	*path;
3809 };
3810 #endif
3811 int
3812 sys_revoke(struct thread *td, struct revoke_args *uap)
3813 {
3814 	struct vnode *vp;
3815 	struct vattr vattr;
3816 	struct nameidata nd;
3817 	int error;
3818 
3819 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
3820 	    uap->path, td);
3821 	if ((error = namei(&nd)) != 0)
3822 		return (error);
3823 	vp = nd.ni_vp;
3824 	NDFREE(&nd, NDF_ONLY_PNBUF);
3825 	if (vp->v_type != VCHR || vp->v_rdev == NULL) {
3826 		error = EINVAL;
3827 		goto out;
3828 	}
3829 #ifdef MAC
3830 	error = mac_vnode_check_revoke(td->td_ucred, vp);
3831 	if (error != 0)
3832 		goto out;
3833 #endif
3834 	error = VOP_GETATTR(vp, &vattr, td->td_ucred);
3835 	if (error != 0)
3836 		goto out;
3837 	if (td->td_ucred->cr_uid != vattr.va_uid) {
3838 		error = priv_check(td, PRIV_VFS_ADMIN);
3839 		if (error != 0)
3840 			goto out;
3841 	}
3842 	if (vcount(vp) > 1)
3843 		VOP_REVOKE(vp, REVOKEALL);
3844 out:
3845 	vput(vp);
3846 	return (error);
3847 }
3848 
3849 /*
3850  * Convert a user file descriptor to a kernel file entry and check that, if it
3851  * is a capability, the correct rights are present. A reference on the file
3852  * entry is held upon returning.
3853  */
3854 int
3855 getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
3856 {
3857 	struct file *fp;
3858 	int error;
3859 
3860 	error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp, NULL);
3861 	if (error != 0)
3862 		return (error);
3863 
3864 	/*
3865 	 * The file could be not of the vnode type, or it may be not
3866 	 * yet fully initialized, in which case the f_vnode pointer
3867 	 * may be set, but f_ops is still badfileops.  E.g.,
3868 	 * devfs_open() transiently create such situation to
3869 	 * facilitate csw d_fdopen().
3870 	 *
3871 	 * Dupfdopen() handling in kern_openat() installs the
3872 	 * half-baked file into the process descriptor table, allowing
3873 	 * other thread to dereference it. Guard against the race by
3874 	 * checking f_ops.
3875 	 */
3876 	if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
3877 		fdrop(fp, td);
3878 		return (EINVAL);
3879 	}
3880 	*fpp = fp;
3881 	return (0);
3882 }
3883 
3884 
3885 /*
3886  * Get an (NFS) file handle.
3887  */
3888 #ifndef _SYS_SYSPROTO_H_
3889 struct lgetfh_args {
3890 	char	*fname;
3891 	fhandle_t *fhp;
3892 };
3893 #endif
3894 int
3895 sys_lgetfh(struct thread *td, struct lgetfh_args *uap)
3896 {
3897 	struct nameidata nd;
3898 	fhandle_t fh;
3899 	struct vnode *vp;
3900 	int error;
3901 
3902 	error = priv_check(td, PRIV_VFS_GETFH);
3903 	if (error != 0)
3904 		return (error);
3905 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
3906 	    uap->fname, td);
3907 	error = namei(&nd);
3908 	if (error != 0)
3909 		return (error);
3910 	NDFREE(&nd, NDF_ONLY_PNBUF);
3911 	vp = nd.ni_vp;
3912 	bzero(&fh, sizeof(fh));
3913 	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
3914 	error = VOP_VPTOFH(vp, &fh.fh_fid);
3915 	vput(vp);
3916 	if (error == 0)
3917 		error = copyout(&fh, uap->fhp, sizeof (fh));
3918 	return (error);
3919 }
3920 
3921 #ifndef _SYS_SYSPROTO_H_
3922 struct getfh_args {
3923 	char	*fname;
3924 	fhandle_t *fhp;
3925 };
3926 #endif
3927 int
3928 sys_getfh(struct thread *td, struct getfh_args *uap)
3929 {
3930 	struct nameidata nd;
3931 	fhandle_t fh;
3932 	struct vnode *vp;
3933 	int error;
3934 
3935 	error = priv_check(td, PRIV_VFS_GETFH);
3936 	if (error != 0)
3937 		return (error);
3938 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
3939 	    uap->fname, td);
3940 	error = namei(&nd);
3941 	if (error != 0)
3942 		return (error);
3943 	NDFREE(&nd, NDF_ONLY_PNBUF);
3944 	vp = nd.ni_vp;
3945 	bzero(&fh, sizeof(fh));
3946 	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
3947 	error = VOP_VPTOFH(vp, &fh.fh_fid);
3948 	vput(vp);
3949 	if (error == 0)
3950 		error = copyout(&fh, uap->fhp, sizeof (fh));
3951 	return (error);
3952 }
3953 
3954 /*
3955  * syscall for the rpc.lockd to use to translate a NFS file handle into an
3956  * open descriptor.
3957  *
3958  * warning: do not remove the priv_check() call or this becomes one giant
3959  * security hole.
3960  */
3961 #ifndef _SYS_SYSPROTO_H_
3962 struct fhopen_args {
3963 	const struct fhandle *u_fhp;
3964 	int flags;
3965 };
3966 #endif
3967 int
3968 sys_fhopen(struct thread *td, struct fhopen_args *uap)
3969 {
3970 	struct mount *mp;
3971 	struct vnode *vp;
3972 	struct fhandle fhp;
3973 	struct file *fp;
3974 	int fmode, error;
3975 	int indx;
3976 
3977 	error = priv_check(td, PRIV_VFS_FHOPEN);
3978 	if (error != 0)
3979 		return (error);
3980 	indx = -1;
3981 	fmode = FFLAGS(uap->flags);
3982 	/* why not allow a non-read/write open for our lockd? */
3983 	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
3984 		return (EINVAL);
3985 	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
3986 	if (error != 0)
3987 		return(error);
3988 	/* find the mount point */
3989 	mp = vfs_busyfs(&fhp.fh_fsid);
3990 	if (mp == NULL)
3991 		return (ESTALE);
3992 	/* now give me my vnode, it gets returned to me locked */
3993 	error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
3994 	vfs_unbusy(mp);
3995 	if (error != 0)
3996 		return (error);
3997 
3998 	error = falloc_noinstall(td, &fp);
3999 	if (error != 0) {
4000 		vput(vp);
4001 		return (error);
4002 	}
4003 	/*
4004 	 * An extra reference on `fp' has been held for us by
4005 	 * falloc_noinstall().
4006 	 */
4007 
4008 #ifdef INVARIANTS
4009 	td->td_dupfd = -1;
4010 #endif
4011 	error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
4012 	if (error != 0) {
4013 		KASSERT(fp->f_ops == &badfileops,
4014 		    ("VOP_OPEN in fhopen() set f_ops"));
4015 		KASSERT(td->td_dupfd < 0,
4016 		    ("fhopen() encountered fdopen()"));
4017 
4018 		vput(vp);
4019 		goto bad;
4020 	}
4021 #ifdef INVARIANTS
4022 	td->td_dupfd = 0;
4023 #endif
4024 	fp->f_vnode = vp;
4025 	fp->f_seqcount = 1;
4026 	finit(fp, (fmode & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp,
4027 	    &vnops);
4028 	VOP_UNLOCK(vp, 0);
4029 	if ((fmode & O_TRUNC) != 0) {
4030 		error = fo_truncate(fp, 0, td->td_ucred, td);
4031 		if (error != 0)
4032 			goto bad;
4033 	}
4034 
4035 	error = finstall(td, fp, &indx, fmode, NULL);
4036 bad:
4037 	fdrop(fp, td);
4038 	td->td_retval[0] = indx;
4039 	return (error);
4040 }
4041 
4042 /*
4043  * Stat an (NFS) file handle.
4044  */
4045 #ifndef _SYS_SYSPROTO_H_
4046 struct fhstat_args {
4047 	struct fhandle *u_fhp;
4048 	struct stat *sb;
4049 };
4050 #endif
4051 int
4052 sys_fhstat(struct thread *td, struct fhstat_args *uap)
4053 {
4054 	struct stat sb;
4055 	struct fhandle fh;
4056 	int error;
4057 
4058 	error = copyin(uap->u_fhp, &fh, sizeof(fh));
4059 	if (error != 0)
4060 		return (error);
4061 	error = kern_fhstat(td, fh, &sb);
4062 	if (error == 0)
4063 		error = copyout(&sb, uap->sb, sizeof(sb));
4064 	return (error);
4065 }
4066 
4067 int
4068 kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
4069 {
4070 	struct mount *mp;
4071 	struct vnode *vp;
4072 	int error;
4073 
4074 	error = priv_check(td, PRIV_VFS_FHSTAT);
4075 	if (error != 0)
4076 		return (error);
4077 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4078 		return (ESTALE);
4079 	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4080 	vfs_unbusy(mp);
4081 	if (error != 0)
4082 		return (error);
4083 	error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
4084 	vput(vp);
4085 	return (error);
4086 }
4087 
4088 /*
4089  * Implement fstatfs() for (NFS) file handles.
4090  */
4091 #ifndef _SYS_SYSPROTO_H_
4092 struct fhstatfs_args {
4093 	struct fhandle *u_fhp;
4094 	struct statfs *buf;
4095 };
4096 #endif
4097 int
4098 sys_fhstatfs(struct thread *td, struct fhstatfs_args *uap)
4099 {
4100 	struct statfs *sfp;
4101 	fhandle_t fh;
4102 	int error;
4103 
4104 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4105 	if (error != 0)
4106 		return (error);
4107 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
4108 	error = kern_fhstatfs(td, fh, sfp);
4109 	if (error == 0)
4110 		error = copyout(sfp, uap->buf, sizeof(*sfp));
4111 	free(sfp, M_STATFS);
4112 	return (error);
4113 }
4114 
4115 int
4116 kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
4117 {
4118 	struct statfs *sp;
4119 	struct mount *mp;
4120 	struct vnode *vp;
4121 	int error;
4122 
4123 	error = priv_check(td, PRIV_VFS_FHSTATFS);
4124 	if (error != 0)
4125 		return (error);
4126 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4127 		return (ESTALE);
4128 	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4129 	if (error != 0) {
4130 		vfs_unbusy(mp);
4131 		return (error);
4132 	}
4133 	vput(vp);
4134 	error = prison_canseemount(td->td_ucred, mp);
4135 	if (error != 0)
4136 		goto out;
4137 #ifdef MAC
4138 	error = mac_mount_check_stat(td->td_ucred, mp);
4139 	if (error != 0)
4140 		goto out;
4141 #endif
4142 	/*
4143 	 * Set these in case the underlying filesystem fails to do so.
4144 	 */
4145 	sp = &mp->mnt_stat;
4146 	sp->f_version = STATFS_VERSION;
4147 	sp->f_namemax = NAME_MAX;
4148 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
4149 	error = VFS_STATFS(mp, sp);
4150 	if (error == 0)
4151 		*buf = *sp;
4152 out:
4153 	vfs_unbusy(mp);
4154 	return (error);
4155 }
4156 
4157 int
4158 kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
4159 {
4160 	struct file *fp;
4161 	struct mount *mp;
4162 	struct vnode *vp;
4163 	cap_rights_t rights;
4164 	off_t olen, ooffset;
4165 	int error;
4166 #ifdef AUDIT
4167 	int audited_vnode1 = 0;
4168 #endif
4169 
4170 	AUDIT_ARG_FD(fd);
4171 	if (offset < 0 || len <= 0)
4172 		return (EINVAL);
4173 	/* Check for wrap. */
4174 	if (offset > OFF_MAX - len)
4175 		return (EFBIG);
4176 	AUDIT_ARG_FD(fd);
4177 	error = fget(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
4178 	if (error != 0)
4179 		return (error);
4180 	AUDIT_ARG_FILE(td->td_proc, fp);
4181 	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
4182 		error = ESPIPE;
4183 		goto out;
4184 	}
4185 	if ((fp->f_flag & FWRITE) == 0) {
4186 		error = EBADF;
4187 		goto out;
4188 	}
4189 	if (fp->f_type != DTYPE_VNODE) {
4190 		error = ENODEV;
4191 		goto out;
4192 	}
4193 	vp = fp->f_vnode;
4194 	if (vp->v_type != VREG) {
4195 		error = ENODEV;
4196 		goto out;
4197 	}
4198 
4199 	/* Allocating blocks may take a long time, so iterate. */
4200 	for (;;) {
4201 		olen = len;
4202 		ooffset = offset;
4203 
4204 		bwillwrite();
4205 		mp = NULL;
4206 		error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
4207 		if (error != 0)
4208 			break;
4209 		error = vn_lock(vp, LK_EXCLUSIVE);
4210 		if (error != 0) {
4211 			vn_finished_write(mp);
4212 			break;
4213 		}
4214 #ifdef AUDIT
4215 		if (!audited_vnode1) {
4216 			AUDIT_ARG_VNODE1(vp);
4217 			audited_vnode1 = 1;
4218 		}
4219 #endif
4220 #ifdef MAC
4221 		error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
4222 		if (error == 0)
4223 #endif
4224 			error = VOP_ALLOCATE(vp, &offset, &len);
4225 		VOP_UNLOCK(vp, 0);
4226 		vn_finished_write(mp);
4227 
4228 		if (olen + ooffset != offset + len) {
4229 			panic("offset + len changed from %jx/%jx to %jx/%jx",
4230 			    ooffset, olen, offset, len);
4231 		}
4232 		if (error != 0 || len == 0)
4233 			break;
4234 		KASSERT(olen > len, ("Iteration did not make progress?"));
4235 		maybe_yield();
4236 	}
4237  out:
4238 	fdrop(fp, td);
4239 	return (error);
4240 }
4241 
4242 int
4243 sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
4244 {
4245 	int error;
4246 
4247 	error = kern_posix_fallocate(td, uap->fd, uap->offset, uap->len);
4248 	return (kern_posix_error(td, error));
4249 }
4250 
4251 /*
4252  * Unlike madvise(2), we do not make a best effort to remember every
4253  * possible caching hint.  Instead, we remember the last setting with
4254  * the exception that we will allow POSIX_FADV_NORMAL to adjust the
4255  * region of any current setting.
4256  */
4257 int
4258 kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
4259     int advice)
4260 {
4261 	struct fadvise_info *fa, *new;
4262 	struct file *fp;
4263 	struct vnode *vp;
4264 	cap_rights_t rights;
4265 	off_t end;
4266 	int error;
4267 
4268 	if (offset < 0 || len < 0 || offset > OFF_MAX - len)
4269 		return (EINVAL);
4270 	AUDIT_ARG_VALUE(advice);
4271 	switch (advice) {
4272 	case POSIX_FADV_SEQUENTIAL:
4273 	case POSIX_FADV_RANDOM:
4274 	case POSIX_FADV_NOREUSE:
4275 		new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
4276 		break;
4277 	case POSIX_FADV_NORMAL:
4278 	case POSIX_FADV_WILLNEED:
4279 	case POSIX_FADV_DONTNEED:
4280 		new = NULL;
4281 		break;
4282 	default:
4283 		return (EINVAL);
4284 	}
4285 	/* XXX: CAP_POSIX_FADVISE? */
4286 	AUDIT_ARG_FD(fd);
4287 	error = fget(td, fd, cap_rights_init(&rights), &fp);
4288 	if (error != 0)
4289 		goto out;
4290 	AUDIT_ARG_FILE(td->td_proc, fp);
4291 	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
4292 		error = ESPIPE;
4293 		goto out;
4294 	}
4295 	if (fp->f_type != DTYPE_VNODE) {
4296 		error = ENODEV;
4297 		goto out;
4298 	}
4299 	vp = fp->f_vnode;
4300 	if (vp->v_type != VREG) {
4301 		error = ENODEV;
4302 		goto out;
4303 	}
4304 	if (len == 0)
4305 		end = OFF_MAX;
4306 	else
4307 		end = offset + len - 1;
4308 	switch (advice) {
4309 	case POSIX_FADV_SEQUENTIAL:
4310 	case POSIX_FADV_RANDOM:
4311 	case POSIX_FADV_NOREUSE:
4312 		/*
4313 		 * Try to merge any existing non-standard region with
4314 		 * this new region if possible, otherwise create a new
4315 		 * non-standard region for this request.
4316 		 */
4317 		mtx_pool_lock(mtxpool_sleep, fp);
4318 		fa = fp->f_advice;
4319 		if (fa != NULL && fa->fa_advice == advice &&
4320 		    ((fa->fa_start <= end && fa->fa_end >= offset) ||
4321 		    (end != OFF_MAX && fa->fa_start == end + 1) ||
4322 		    (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
4323 			if (offset < fa->fa_start)
4324 				fa->fa_start = offset;
4325 			if (end > fa->fa_end)
4326 				fa->fa_end = end;
4327 		} else {
4328 			new->fa_advice = advice;
4329 			new->fa_start = offset;
4330 			new->fa_end = end;
4331 			fp->f_advice = new;
4332 			new = fa;
4333 		}
4334 		mtx_pool_unlock(mtxpool_sleep, fp);
4335 		break;
4336 	case POSIX_FADV_NORMAL:
4337 		/*
4338 		 * If a the "normal" region overlaps with an existing
4339 		 * non-standard region, trim or remove the
4340 		 * non-standard region.
4341 		 */
4342 		mtx_pool_lock(mtxpool_sleep, fp);
4343 		fa = fp->f_advice;
4344 		if (fa != NULL) {
4345 			if (offset <= fa->fa_start && end >= fa->fa_end) {
4346 				new = fa;
4347 				fp->f_advice = NULL;
4348 			} else if (offset <= fa->fa_start &&
4349 			    end >= fa->fa_start)
4350 				fa->fa_start = end + 1;
4351 			else if (offset <= fa->fa_end && end >= fa->fa_end)
4352 				fa->fa_end = offset - 1;
4353 			else if (offset >= fa->fa_start && end <= fa->fa_end) {
4354 				/*
4355 				 * If the "normal" region is a middle
4356 				 * portion of the existing
4357 				 * non-standard region, just remove
4358 				 * the whole thing rather than picking
4359 				 * one side or the other to
4360 				 * preserve.
4361 				 */
4362 				new = fa;
4363 				fp->f_advice = NULL;
4364 			}
4365 		}
4366 		mtx_pool_unlock(mtxpool_sleep, fp);
4367 		break;
4368 	case POSIX_FADV_WILLNEED:
4369 	case POSIX_FADV_DONTNEED:
4370 		error = VOP_ADVISE(vp, offset, end, advice);
4371 		break;
4372 	}
4373 out:
4374 	if (fp != NULL)
4375 		fdrop(fp, td);
4376 	free(new, M_FADVISE);
4377 	return (error);
4378 }
4379 
4380 int
4381 sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
4382 {
4383 	int error;
4384 
4385 	error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,
4386 	    uap->advice);
4387 	return (kern_posix_error(td, error));
4388 }
4389