xref: /freebsd/sys/kern/vfs_syscalls.c (revision 9a41df2a0e6408e9b329bbd8b9e37c2b44461a1b)
1 /*-
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_capsicum.h"
41 #include "opt_compat.h"
42 #include "opt_kdtrace.h"
43 #include "opt_ktrace.h"
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/bio.h>
48 #include <sys/buf.h>
49 #include <sys/capability.h>
50 #include <sys/disk.h>
51 #include <sys/sysent.h>
52 #include <sys/malloc.h>
53 #include <sys/mount.h>
54 #include <sys/mutex.h>
55 #include <sys/sysproto.h>
56 #include <sys/namei.h>
57 #include <sys/filedesc.h>
58 #include <sys/kernel.h>
59 #include <sys/fcntl.h>
60 #include <sys/file.h>
61 #include <sys/filio.h>
62 #include <sys/limits.h>
63 #include <sys/linker.h>
64 #include <sys/sdt.h>
65 #include <sys/stat.h>
66 #include <sys/sx.h>
67 #include <sys/unistd.h>
68 #include <sys/vnode.h>
69 #include <sys/priv.h>
70 #include <sys/proc.h>
71 #include <sys/dirent.h>
72 #include <sys/jail.h>
73 #include <sys/syscallsubr.h>
74 #include <sys/sysctl.h>
75 #ifdef KTRACE
76 #include <sys/ktrace.h>
77 #endif
78 
79 #include <machine/stdarg.h>
80 
81 #include <security/audit/audit.h>
82 #include <security/mac/mac_framework.h>
83 
84 #include <vm/vm.h>
85 #include <vm/vm_object.h>
86 #include <vm/vm_page.h>
87 #include <vm/uma.h>
88 
89 #include <ufs/ufs/quota.h>
90 
/* Malloc type used for posix_fadvise(2) bookkeeping. */
MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");

/*
 * Statically defined tracing: a "vfs" provider with two "stat" probes
 * ("mode" and "reg").  Each probe declares two arguments, typed
 * "char *" and "int".
 */
SDT_PROVIDER_DEFINE(vfs);
SDT_PROBE_DEFINE(vfs, , stat, mode, mode);
SDT_PROBE_ARGTYPE(vfs, , stat, mode, 0, "char *");
SDT_PROBE_ARGTYPE(vfs, , stat, mode, 1, "int");
SDT_PROBE_DEFINE(vfs, , stat, reg, reg);
SDT_PROBE_ARGTYPE(vfs, , stat, reg, 0, "char *");
SDT_PROBE_ARGTYPE(vfs, , stat, reg, 1, "int");

/* Forward declarations of helpers private to this file. */
static int chroot_refuse_vdir_fds(struct filedesc *fdp);
static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
static int setfflags(struct thread *td, struct vnode *, int);
static int setutimes(struct thread *td, struct vnode *,
    const struct timespec *, int, int);
static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
    struct thread *td);

/*
 * The module initialization routine for POSIX asynchronous I/O will
 * set this to the version of AIO that it implements.  (Zero means
 * that it is not implemented.)  This value is used here by pathconf()
 * and in kern_descrip.c by fpathconf().
 */
int async_io_version;

#ifdef DEBUG
/*
 * Read/write integer knob exported as debug.syncprt; only present in
 * DEBUG kernels.
 */
static int syncprt = 0;
SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
#endif
121 
122 /*
123  * Sync each mounted filesystem.
124  */
125 #ifndef _SYS_SYSPROTO_H_
126 struct sync_args {
127 	int     dummy;
128 };
129 #endif
130 /* ARGSUSED */
131 int
132 sys_sync(td, uap)
133 	struct thread *td;
134 	struct sync_args *uap;
135 {
136 	struct mount *mp, *nmp;
137 	int save, vfslocked;
138 
139 	mtx_lock(&mountlist_mtx);
140 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
141 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
142 			nmp = TAILQ_NEXT(mp, mnt_list);
143 			continue;
144 		}
145 		vfslocked = VFS_LOCK_GIANT(mp);
146 		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
147 		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
148 			save = curthread_pflags_set(TDP_SYNCIO);
149 			vfs_msync(mp, MNT_NOWAIT);
150 			VFS_SYNC(mp, MNT_NOWAIT);
151 			curthread_pflags_restore(save);
152 			vn_finished_write(mp);
153 		}
154 		VFS_UNLOCK_GIANT(vfslocked);
155 		mtx_lock(&mountlist_mtx);
156 		nmp = TAILQ_NEXT(mp, mnt_list);
157 		vfs_unbusy(mp);
158 	}
159 	mtx_unlock(&mountlist_mtx);
160 	return (0);
161 }
162 
163 /*
164  * Change filesystem quotas.
165  */
166 #ifndef _SYS_SYSPROTO_H_
167 struct quotactl_args {
168 	char *path;
169 	int cmd;
170 	int uid;
171 	caddr_t arg;
172 };
173 #endif
174 int
175 sys_quotactl(td, uap)
176 	struct thread *td;
177 	register struct quotactl_args /* {
178 		char *path;
179 		int cmd;
180 		int uid;
181 		caddr_t arg;
182 	} */ *uap;
183 {
184 	struct mount *mp;
185 	int vfslocked;
186 	int error;
187 	struct nameidata nd;
188 
189 	AUDIT_ARG_CMD(uap->cmd);
190 	AUDIT_ARG_UID(uap->uid);
191 	if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
192 		return (EPERM);
193 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
194 	   UIO_USERSPACE, uap->path, td);
195 	if ((error = namei(&nd)) != 0)
196 		return (error);
197 	vfslocked = NDHASGIANT(&nd);
198 	NDFREE(&nd, NDF_ONLY_PNBUF);
199 	mp = nd.ni_vp->v_mount;
200 	vfs_ref(mp);
201 	vput(nd.ni_vp);
202 	error = vfs_busy(mp, 0);
203 	vfs_rel(mp);
204 	if (error) {
205 		VFS_UNLOCK_GIANT(vfslocked);
206 		return (error);
207 	}
208 	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
209 
210 	/*
211 	 * Since quota on operation typically needs to open quota
212 	 * file, the Q_QUOTAON handler needs to unbusy the mount point
213 	 * before calling into namei.  Otherwise, unmount might be
214 	 * started between two vfs_busy() invocations (first is our,
215 	 * second is from mount point cross-walk code in lookup()),
216 	 * causing deadlock.
217 	 *
218 	 * Require that Q_QUOTAON handles the vfs_busy() reference on
219 	 * its own, always returning with ubusied mount point.
220 	 */
221 	if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON)
222 		vfs_unbusy(mp);
223 	VFS_UNLOCK_GIANT(vfslocked);
224 	return (error);
225 }
226 
227 /*
228  * Used by statfs conversion routines to scale the block size up if
229  * necessary so that all of the block counts are <= 'max_size'.  Note
230  * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
231  * value of 'n'.
232  */
233 void
234 statfs_scale_blocks(struct statfs *sf, long max_size)
235 {
236 	uint64_t count;
237 	int shift;
238 
239 	KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
240 
241 	/*
242 	 * Attempt to scale the block counts to give a more accurate
243 	 * overview to userland of the ratio of free space to used
244 	 * space.  To do this, find the largest block count and compute
245 	 * a divisor that lets it fit into a signed integer <= max_size.
246 	 */
247 	if (sf->f_bavail < 0)
248 		count = -sf->f_bavail;
249 	else
250 		count = sf->f_bavail;
251 	count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
252 	if (count <= max_size)
253 		return;
254 
255 	count >>= flsl(max_size);
256 	shift = 0;
257 	while (count > 0) {
258 		shift++;
259 		count >>=1;
260 	}
261 
262 	sf->f_bsize <<= shift;
263 	sf->f_blocks >>= shift;
264 	sf->f_bfree >>= shift;
265 	sf->f_bavail >>= shift;
266 }
267 
268 /*
269  * Get filesystem statistics.
270  */
271 #ifndef _SYS_SYSPROTO_H_
272 struct statfs_args {
273 	char *path;
274 	struct statfs *buf;
275 };
276 #endif
277 int
278 sys_statfs(td, uap)
279 	struct thread *td;
280 	register struct statfs_args /* {
281 		char *path;
282 		struct statfs *buf;
283 	} */ *uap;
284 {
285 	struct statfs sf;
286 	int error;
287 
288 	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
289 	if (error == 0)
290 		error = copyout(&sf, uap->buf, sizeof(sf));
291 	return (error);
292 }
293 
294 int
295 kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
296     struct statfs *buf)
297 {
298 	struct mount *mp;
299 	struct statfs *sp, sb;
300 	int vfslocked;
301 	int error;
302 	struct nameidata nd;
303 
304 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
305 	    AUDITVNODE1, pathseg, path, td);
306 	error = namei(&nd);
307 	if (error)
308 		return (error);
309 	vfslocked = NDHASGIANT(&nd);
310 	mp = nd.ni_vp->v_mount;
311 	vfs_ref(mp);
312 	NDFREE(&nd, NDF_ONLY_PNBUF);
313 	vput(nd.ni_vp);
314 	error = vfs_busy(mp, 0);
315 	vfs_rel(mp);
316 	if (error) {
317 		VFS_UNLOCK_GIANT(vfslocked);
318 		return (error);
319 	}
320 #ifdef MAC
321 	error = mac_mount_check_stat(td->td_ucred, mp);
322 	if (error)
323 		goto out;
324 #endif
325 	/*
326 	 * Set these in case the underlying filesystem fails to do so.
327 	 */
328 	sp = &mp->mnt_stat;
329 	sp->f_version = STATFS_VERSION;
330 	sp->f_namemax = NAME_MAX;
331 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
332 	error = VFS_STATFS(mp, sp);
333 	if (error)
334 		goto out;
335 	if (priv_check(td, PRIV_VFS_GENERATION)) {
336 		bcopy(sp, &sb, sizeof(sb));
337 		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
338 		prison_enforce_statfs(td->td_ucred, mp, &sb);
339 		sp = &sb;
340 	}
341 	*buf = *sp;
342 out:
343 	vfs_unbusy(mp);
344 	VFS_UNLOCK_GIANT(vfslocked);
345 	return (error);
346 }
347 
348 /*
349  * Get filesystem statistics.
350  */
351 #ifndef _SYS_SYSPROTO_H_
352 struct fstatfs_args {
353 	int fd;
354 	struct statfs *buf;
355 };
356 #endif
357 int
358 sys_fstatfs(td, uap)
359 	struct thread *td;
360 	register struct fstatfs_args /* {
361 		int fd;
362 		struct statfs *buf;
363 	} */ *uap;
364 {
365 	struct statfs sf;
366 	int error;
367 
368 	error = kern_fstatfs(td, uap->fd, &sf);
369 	if (error == 0)
370 		error = copyout(&sf, uap->buf, sizeof(sf));
371 	return (error);
372 }
373 
374 int
375 kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
376 {
377 	struct file *fp;
378 	struct mount *mp;
379 	struct statfs *sp, sb;
380 	int vfslocked;
381 	struct vnode *vp;
382 	int error;
383 
384 	AUDIT_ARG_FD(fd);
385 	error = getvnode(td->td_proc->p_fd, fd, CAP_FSTATFS, &fp);
386 	if (error)
387 		return (error);
388 	vp = fp->f_vnode;
389 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
390 	vn_lock(vp, LK_SHARED | LK_RETRY);
391 #ifdef AUDIT
392 	AUDIT_ARG_VNODE1(vp);
393 #endif
394 	mp = vp->v_mount;
395 	if (mp)
396 		vfs_ref(mp);
397 	VOP_UNLOCK(vp, 0);
398 	fdrop(fp, td);
399 	if (mp == NULL) {
400 		error = EBADF;
401 		goto out;
402 	}
403 	error = vfs_busy(mp, 0);
404 	vfs_rel(mp);
405 	if (error) {
406 		VFS_UNLOCK_GIANT(vfslocked);
407 		return (error);
408 	}
409 #ifdef MAC
410 	error = mac_mount_check_stat(td->td_ucred, mp);
411 	if (error)
412 		goto out;
413 #endif
414 	/*
415 	 * Set these in case the underlying filesystem fails to do so.
416 	 */
417 	sp = &mp->mnt_stat;
418 	sp->f_version = STATFS_VERSION;
419 	sp->f_namemax = NAME_MAX;
420 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
421 	error = VFS_STATFS(mp, sp);
422 	if (error)
423 		goto out;
424 	if (priv_check(td, PRIV_VFS_GENERATION)) {
425 		bcopy(sp, &sb, sizeof(sb));
426 		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
427 		prison_enforce_statfs(td->td_ucred, mp, &sb);
428 		sp = &sb;
429 	}
430 	*buf = *sp;
431 out:
432 	if (mp)
433 		vfs_unbusy(mp);
434 	VFS_UNLOCK_GIANT(vfslocked);
435 	return (error);
436 }
437 
438 /*
439  * Get statistics on all filesystems.
440  */
441 #ifndef _SYS_SYSPROTO_H_
442 struct getfsstat_args {
443 	struct statfs *buf;
444 	long bufsize;
445 	int flags;
446 };
447 #endif
448 int
449 sys_getfsstat(td, uap)
450 	struct thread *td;
451 	register struct getfsstat_args /* {
452 		struct statfs *buf;
453 		long bufsize;
454 		int flags;
455 	} */ *uap;
456 {
457 
458 	return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE,
459 	    uap->flags));
460 }
461 
/*
 * Collect statistics for the mounted filesystems visible to 'td'.
 *
 * 'bufsize' is the capacity of the destination in bytes; at most
 * bufsize / sizeof(struct statfs) entries are written.  With a bufsize
 * of 0 nothing is written and only the mounts are counted.
 *
 * If (bufsize > 0 && bufseg == UIO_USERSPACE)
 *	'*buf' is the user-space destination array.
 * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
 *	The caller is responsible for freeing memory which will be allocated
 *	in '*buf' (it is allocated here with malloc type M_TEMP).
 *
 * On success 0 is returned and td->td_retval[0] holds the entry count,
 * capped at the buffer capacity when a buffer was supplied.  The only
 * error returned is a copyout() failure.
 */
int
kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
    enum uio_seg bufseg, int flags)
{
	struct mount *mp, *nmp;
	struct statfs *sfsp, *sp, sb;
	size_t count, maxcount;
	int vfslocked;
	int error;

	maxcount = bufsize / sizeof(struct statfs);
	if (bufsize == 0)
		sfsp = NULL;
	else if (bufseg == UIO_USERSPACE)
		sfsp = *buf;
	else /* if (bufseg == UIO_SYSSPACE) */ {
		/*
		 * Size the kernel buffer from the current number of mounts,
		 * never exceeding what the caller asked for.
		 */
		count = 0;
		mtx_lock(&mountlist_mtx);
		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
			count++;
		}
		mtx_unlock(&mountlist_mtx);
		if (maxcount > count)
			maxcount = count;
		sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP,
		    M_WAITOK);
	}
	count = 0;
	mtx_lock(&mountlist_mtx);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		/* Skip mounts hidden from the caller's prison. */
		if (prison_canseemount(td->td_ucred, mp) != 0) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
#ifdef MAC
		/* Skip mounts the MAC policy refuses to let the caller stat. */
		if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
#endif
		/* Skip mounts that cannot be busied without sleeping. */
		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		vfslocked = VFS_LOCK_GIANT(mp);
		if (sfsp && count < maxcount) {
			sp = &mp->mnt_stat;
			/*
			 * Set these in case the underlying filesystem
			 * fails to do so.
			 */
			sp->f_version = STATFS_VERSION;
			sp->f_namemax = NAME_MAX;
			sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
			/*
			 * If MNT_NOWAIT or MNT_LAZY is specified, do not
			 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
			 * overrides MNT_WAIT.
			 *
			 * NOTE(review): as written, the expression below
			 * refreshes when neither MNT_LAZY nor MNT_NOWAIT
			 * bits are set, or whenever (flags & MNT_WAIT) is
			 * non-zero, which reads as MNT_WAIT taking
			 * precedence.  Confirm against the MNT_* values
			 * which behaviour is intended.
			 *
			 * A mount whose refresh fails is dropped from the
			 * result and not counted.
			 */
			if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
			    (flags & MNT_WAIT)) &&
			    (error = VFS_STATFS(mp, sp))) {
				VFS_UNLOCK_GIANT(vfslocked);
				mtx_lock(&mountlist_mtx);
				nmp = TAILQ_NEXT(mp, mnt_list);
				vfs_unbusy(mp);
				continue;
			}
			/*
			 * When the privilege check fails, hand out a copy
			 * with the fsid zeroed and prison_enforce_statfs()
			 * applied instead of the mount's own record.
			 */
			if (priv_check(td, PRIV_VFS_GENERATION)) {
				bcopy(sp, &sb, sizeof(sb));
				sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
				prison_enforce_statfs(td->td_ucred, mp, &sb);
				sp = &sb;
			}
			if (bufseg == UIO_SYSSPACE)
				bcopy(sp, sfsp, sizeof(*sp));
			else /* if (bufseg == UIO_USERSPACE) */ {
				error = copyout(sp, sfsp, sizeof(*sp));
				if (error) {
					vfs_unbusy(mp);
					VFS_UNLOCK_GIANT(vfslocked);
					return (error);
				}
			}
			sfsp++;
		}
		VFS_UNLOCK_GIANT(vfslocked);
		/* Counted even when there was no room left to store it. */
		count++;
		/*
		 * Retake the list mutex before reading the successor, and
		 * only then release the busy hold on this mount.
		 */
		mtx_lock(&mountlist_mtx);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	mtx_unlock(&mountlist_mtx);
	if (sfsp && count > maxcount)
		td->td_retval[0] = maxcount;
	else
		td->td_retval[0] = count;
	return (0);
}
566 
567 #ifdef COMPAT_FREEBSD4
568 /*
569  * Get old format filesystem statistics.
570  */
571 static void cvtstatfs(struct statfs *, struct ostatfs *);
572 
573 #ifndef _SYS_SYSPROTO_H_
574 struct freebsd4_statfs_args {
575 	char *path;
576 	struct ostatfs *buf;
577 };
578 #endif
579 int
580 freebsd4_statfs(td, uap)
581 	struct thread *td;
582 	struct freebsd4_statfs_args /* {
583 		char *path;
584 		struct ostatfs *buf;
585 	} */ *uap;
586 {
587 	struct ostatfs osb;
588 	struct statfs sf;
589 	int error;
590 
591 	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
592 	if (error)
593 		return (error);
594 	cvtstatfs(&sf, &osb);
595 	return (copyout(&osb, uap->buf, sizeof(osb)));
596 }
597 
598 /*
599  * Get filesystem statistics.
600  */
601 #ifndef _SYS_SYSPROTO_H_
602 struct freebsd4_fstatfs_args {
603 	int fd;
604 	struct ostatfs *buf;
605 };
606 #endif
607 int
608 freebsd4_fstatfs(td, uap)
609 	struct thread *td;
610 	struct freebsd4_fstatfs_args /* {
611 		int fd;
612 		struct ostatfs *buf;
613 	} */ *uap;
614 {
615 	struct ostatfs osb;
616 	struct statfs sf;
617 	int error;
618 
619 	error = kern_fstatfs(td, uap->fd, &sf);
620 	if (error)
621 		return (error);
622 	cvtstatfs(&sf, &osb);
623 	return (copyout(&osb, uap->buf, sizeof(osb)));
624 }
625 
626 /*
627  * Get statistics on all filesystems.
628  */
629 #ifndef _SYS_SYSPROTO_H_
630 struct freebsd4_getfsstat_args {
631 	struct ostatfs *buf;
632 	long bufsize;
633 	int flags;
634 };
635 #endif
636 int
637 freebsd4_getfsstat(td, uap)
638 	struct thread *td;
639 	register struct freebsd4_getfsstat_args /* {
640 		struct ostatfs *buf;
641 		long bufsize;
642 		int flags;
643 	} */ *uap;
644 {
645 	struct statfs *buf, *sp;
646 	struct ostatfs osb;
647 	size_t count, size;
648 	int error;
649 
650 	count = uap->bufsize / sizeof(struct ostatfs);
651 	size = count * sizeof(struct statfs);
652 	error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags);
653 	if (size > 0) {
654 		count = td->td_retval[0];
655 		sp = buf;
656 		while (count > 0 && error == 0) {
657 			cvtstatfs(sp, &osb);
658 			error = copyout(&osb, uap->buf, sizeof(osb));
659 			sp++;
660 			uap->buf++;
661 			count--;
662 		}
663 		free(buf, M_TEMP);
664 	}
665 	return (error);
666 }
667 
668 /*
669  * Implement fstatfs() for (NFS) file handles.
670  */
671 #ifndef _SYS_SYSPROTO_H_
672 struct freebsd4_fhstatfs_args {
673 	struct fhandle *u_fhp;
674 	struct ostatfs *buf;
675 };
676 #endif
677 int
678 freebsd4_fhstatfs(td, uap)
679 	struct thread *td;
680 	struct freebsd4_fhstatfs_args /* {
681 		struct fhandle *u_fhp;
682 		struct ostatfs *buf;
683 	} */ *uap;
684 {
685 	struct ostatfs osb;
686 	struct statfs sf;
687 	fhandle_t fh;
688 	int error;
689 
690 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
691 	if (error)
692 		return (error);
693 	error = kern_fhstatfs(td, fh, &sf);
694 	if (error)
695 		return (error);
696 	cvtstatfs(&sf, &osb);
697 	return (copyout(&osb, uap->buf, sizeof(osb)));
698 }
699 
700 /*
701  * Convert a new format statfs structure to an old format statfs structure.
702  */
703 static void
704 cvtstatfs(nsp, osp)
705 	struct statfs *nsp;
706 	struct ostatfs *osp;
707 {
708 
709 	statfs_scale_blocks(nsp, LONG_MAX);
710 	bzero(osp, sizeof(*osp));
711 	osp->f_bsize = nsp->f_bsize;
712 	osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
713 	osp->f_blocks = nsp->f_blocks;
714 	osp->f_bfree = nsp->f_bfree;
715 	osp->f_bavail = nsp->f_bavail;
716 	osp->f_files = MIN(nsp->f_files, LONG_MAX);
717 	osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
718 	osp->f_owner = nsp->f_owner;
719 	osp->f_type = nsp->f_type;
720 	osp->f_flags = nsp->f_flags;
721 	osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
722 	osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
723 	osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
724 	osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
725 	strlcpy(osp->f_fstypename, nsp->f_fstypename,
726 	    MIN(MFSNAMELEN, OMFSNAMELEN));
727 	strlcpy(osp->f_mntonname, nsp->f_mntonname,
728 	    MIN(MNAMELEN, OMNAMELEN));
729 	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
730 	    MIN(MNAMELEN, OMNAMELEN));
731 	osp->f_fsid = nsp->f_fsid;
732 }
733 #endif /* COMPAT_FREEBSD4 */
734 
735 /*
736  * Change current working directory to a given file descriptor.
737  */
738 #ifndef _SYS_SYSPROTO_H_
739 struct fchdir_args {
740 	int	fd;
741 };
742 #endif
743 int
744 sys_fchdir(td, uap)
745 	struct thread *td;
746 	struct fchdir_args /* {
747 		int fd;
748 	} */ *uap;
749 {
750 	register struct filedesc *fdp = td->td_proc->p_fd;
751 	struct vnode *vp, *tdp, *vpold;
752 	struct mount *mp;
753 	struct file *fp;
754 	int vfslocked;
755 	int error;
756 
757 	AUDIT_ARG_FD(uap->fd);
758 	if ((error = getvnode(fdp, uap->fd, CAP_FCHDIR, &fp)) != 0)
759 		return (error);
760 	vp = fp->f_vnode;
761 	VREF(vp);
762 	fdrop(fp, td);
763 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
764 	vn_lock(vp, LK_SHARED | LK_RETRY);
765 	AUDIT_ARG_VNODE1(vp);
766 	error = change_dir(vp, td);
767 	while (!error && (mp = vp->v_mountedhere) != NULL) {
768 		int tvfslocked;
769 		if (vfs_busy(mp, 0))
770 			continue;
771 		tvfslocked = VFS_LOCK_GIANT(mp);
772 		error = VFS_ROOT(mp, LK_SHARED, &tdp);
773 		vfs_unbusy(mp);
774 		if (error) {
775 			VFS_UNLOCK_GIANT(tvfslocked);
776 			break;
777 		}
778 		vput(vp);
779 		VFS_UNLOCK_GIANT(vfslocked);
780 		vp = tdp;
781 		vfslocked = tvfslocked;
782 	}
783 	if (error) {
784 		vput(vp);
785 		VFS_UNLOCK_GIANT(vfslocked);
786 		return (error);
787 	}
788 	VOP_UNLOCK(vp, 0);
789 	VFS_UNLOCK_GIANT(vfslocked);
790 	FILEDESC_XLOCK(fdp);
791 	vpold = fdp->fd_cdir;
792 	fdp->fd_cdir = vp;
793 	FILEDESC_XUNLOCK(fdp);
794 	vfslocked = VFS_LOCK_GIANT(vpold->v_mount);
795 	vrele(vpold);
796 	VFS_UNLOCK_GIANT(vfslocked);
797 	return (0);
798 }
799 
800 /*
801  * Change current working directory (``.'').
802  */
803 #ifndef _SYS_SYSPROTO_H_
804 struct chdir_args {
805 	char	*path;
806 };
807 #endif
808 int
809 sys_chdir(td, uap)
810 	struct thread *td;
811 	struct chdir_args /* {
812 		char *path;
813 	} */ *uap;
814 {
815 
816 	return (kern_chdir(td, uap->path, UIO_USERSPACE));
817 }
818 
819 int
820 kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
821 {
822 	register struct filedesc *fdp = td->td_proc->p_fd;
823 	int error;
824 	struct nameidata nd;
825 	struct vnode *vp;
826 	int vfslocked;
827 
828 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1 |
829 	    MPSAFE, pathseg, path, td);
830 	if ((error = namei(&nd)) != 0)
831 		return (error);
832 	vfslocked = NDHASGIANT(&nd);
833 	if ((error = change_dir(nd.ni_vp, td)) != 0) {
834 		vput(nd.ni_vp);
835 		VFS_UNLOCK_GIANT(vfslocked);
836 		NDFREE(&nd, NDF_ONLY_PNBUF);
837 		return (error);
838 	}
839 	VOP_UNLOCK(nd.ni_vp, 0);
840 	VFS_UNLOCK_GIANT(vfslocked);
841 	NDFREE(&nd, NDF_ONLY_PNBUF);
842 	FILEDESC_XLOCK(fdp);
843 	vp = fdp->fd_cdir;
844 	fdp->fd_cdir = nd.ni_vp;
845 	FILEDESC_XUNLOCK(fdp);
846 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
847 	vrele(vp);
848 	VFS_UNLOCK_GIANT(vfslocked);
849 	return (0);
850 }
851 
852 /*
853  * Helper function for raised chroot(2) security function:  Refuse if
854  * any filedescriptors are open directories.
855  */
856 static int
857 chroot_refuse_vdir_fds(fdp)
858 	struct filedesc *fdp;
859 {
860 	struct vnode *vp;
861 	struct file *fp;
862 	int fd;
863 
864 	FILEDESC_LOCK_ASSERT(fdp);
865 
866 	for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
867 		fp = fget_locked(fdp, fd);
868 		if (fp == NULL)
869 			continue;
870 		if (fp->f_type == DTYPE_VNODE) {
871 			vp = fp->f_vnode;
872 			if (vp->v_type == VDIR)
873 				return (EPERM);
874 		}
875 	}
876 	return (0);
877 }
878 
/*
 * This sysctl determines if we will allow a process to chroot(2) if it
 * has a directory open:
 *	0: disallowed for all processes.
 *	1: allowed for processes that were not already chroot(2)'ed.
 *	2: allowed for all processes.
 *
 * The value is consulted by change_root(), which calls
 * chroot_refuse_vdir_fds() when the policy requires the check.  It is
 * exported read/write as kern.chroot_allow_open_directories and
 * defaults to 1.
 */

static int chroot_allow_open_directories = 1;

SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
     &chroot_allow_open_directories, 0,
     "Allow a process to chroot(2) if it has a directory open");
892 
893 /*
894  * Change notion of root (``/'') directory.
895  */
896 #ifndef _SYS_SYSPROTO_H_
897 struct chroot_args {
898 	char	*path;
899 };
900 #endif
901 int
902 sys_chroot(td, uap)
903 	struct thread *td;
904 	struct chroot_args /* {
905 		char *path;
906 	} */ *uap;
907 {
908 	int error;
909 	struct nameidata nd;
910 	int vfslocked;
911 
912 	error = priv_check(td, PRIV_VFS_CHROOT);
913 	if (error)
914 		return (error);
915 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
916 	    AUDITVNODE1, UIO_USERSPACE, uap->path, td);
917 	error = namei(&nd);
918 	if (error)
919 		goto error;
920 	vfslocked = NDHASGIANT(&nd);
921 	if ((error = change_dir(nd.ni_vp, td)) != 0)
922 		goto e_vunlock;
923 #ifdef MAC
924 	if ((error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp)))
925 		goto e_vunlock;
926 #endif
927 	VOP_UNLOCK(nd.ni_vp, 0);
928 	error = change_root(nd.ni_vp, td);
929 	vrele(nd.ni_vp);
930 	VFS_UNLOCK_GIANT(vfslocked);
931 	NDFREE(&nd, NDF_ONLY_PNBUF);
932 	return (error);
933 e_vunlock:
934 	vput(nd.ni_vp);
935 	VFS_UNLOCK_GIANT(vfslocked);
936 error:
937 	NDFREE(&nd, NDF_ONLY_PNBUF);
938 	return (error);
939 }
940 
941 /*
942  * Common routine for chroot and chdir.  Callers must provide a locked vnode
943  * instance.
944  */
945 int
946 change_dir(vp, td)
947 	struct vnode *vp;
948 	struct thread *td;
949 {
950 	int error;
951 
952 	ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
953 	if (vp->v_type != VDIR)
954 		return (ENOTDIR);
955 #ifdef MAC
956 	error = mac_vnode_check_chdir(td->td_ucred, vp);
957 	if (error)
958 		return (error);
959 #endif
960 	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
961 	return (error);
962 }
963 
964 /*
965  * Common routine for kern_chroot() and jail_attach().  The caller is
966  * responsible for invoking priv_check() and mac_vnode_check_chroot() to
967  * authorize this operation.
968  */
969 int
970 change_root(vp, td)
971 	struct vnode *vp;
972 	struct thread *td;
973 {
974 	struct filedesc *fdp;
975 	struct vnode *oldvp;
976 	int vfslocked;
977 	int error;
978 
979 	VFS_ASSERT_GIANT(vp->v_mount);
980 	fdp = td->td_proc->p_fd;
981 	FILEDESC_XLOCK(fdp);
982 	if (chroot_allow_open_directories == 0 ||
983 	    (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
984 		error = chroot_refuse_vdir_fds(fdp);
985 		if (error) {
986 			FILEDESC_XUNLOCK(fdp);
987 			return (error);
988 		}
989 	}
990 	oldvp = fdp->fd_rdir;
991 	fdp->fd_rdir = vp;
992 	VREF(fdp->fd_rdir);
993 	if (!fdp->fd_jdir) {
994 		fdp->fd_jdir = vp;
995 		VREF(fdp->fd_jdir);
996 	}
997 	FILEDESC_XUNLOCK(fdp);
998 	vfslocked = VFS_LOCK_GIANT(oldvp->v_mount);
999 	vrele(oldvp);
1000 	VFS_UNLOCK_GIANT(vfslocked);
1001 	return (0);
1002 }
1003 
1004 static __inline cap_rights_t
1005 flags_to_rights(int flags)
1006 {
1007 	cap_rights_t rights = 0;
1008 
1009 	if (flags & O_EXEC) {
1010 		rights |= CAP_FEXECVE;
1011 	} else {
1012 		switch ((flags & O_ACCMODE)) {
1013 		case O_RDONLY:
1014 			rights |= CAP_READ;
1015 			break;
1016 		case O_RDWR:
1017 			rights |= CAP_READ;
1018 			/* FALLTHROUGH */
1019 		case O_WRONLY:
1020 			rights |= CAP_WRITE;
1021 			break;
1022 		}
1023 	}
1024 
1025 	if (flags & O_CREAT)
1026 		rights |= CAP_CREATE;
1027 
1028 	if (flags & O_TRUNC)
1029 		rights |= CAP_FTRUNCATE;
1030 
1031 	if ((flags & O_EXLOCK) || (flags & O_SHLOCK))
1032 		rights |= CAP_FLOCK;
1033 
1034 	return (rights);
1035 }
1036 
1037 /*
1038  * Check permissions, allocate an open file structure, and call the device
1039  * open routine if any.
1040  */
1041 #ifndef _SYS_SYSPROTO_H_
1042 struct open_args {
1043 	char	*path;
1044 	int	flags;
1045 	int	mode;
1046 };
1047 #endif
1048 int
1049 sys_open(td, uap)
1050 	struct thread *td;
1051 	register struct open_args /* {
1052 		char *path;
1053 		int flags;
1054 		int mode;
1055 	} */ *uap;
1056 {
1057 
1058 	return (kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode));
1059 }
1060 
1061 #ifndef _SYS_SYSPROTO_H_
1062 struct openat_args {
1063 	int	fd;
1064 	char	*path;
1065 	int	flag;
1066 	int	mode;
1067 };
1068 #endif
1069 int
1070 sys_openat(struct thread *td, struct openat_args *uap)
1071 {
1072 
1073 	return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
1074 	    uap->mode));
1075 }
1076 
1077 int
1078 kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
1079     int mode)
1080 {
1081 
1082 	return (kern_openat(td, AT_FDCWD, path, pathseg, flags, mode));
1083 }
1084 
/*
 * Common implementation of open(2) and openat(2).
 *
 * Opens 'path' (in address space 'pathseg') with directory descriptor
 * 'fd', open flags 'flags' and creation mode 'mode'.  On success the new
 * descriptor number is stored in td->td_retval[0] and 0 is returned;
 * otherwise an errno value is returned and no descriptor is installed.
 * EINVAL is returned when O_EXEC is combined with any access-mode bit or
 * when the access mode equals O_ACCMODE.  ERESTART is reported to the
 * caller as EINTR.
 */
int
kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
    int flags, int mode)
{
	struct proc *p = td->td_proc;
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	struct vnode *vp;
	int cmode;
	/* indx stays -1 until a descriptor has actually been installed. */
	int indx = -1, error;
	struct nameidata nd;
	int vfslocked;
	cap_rights_t rights_needed = CAP_LOOKUP;

	AUDIT_ARG_FFLAGS(flags);
	AUDIT_ARG_MODE(mode);
	/* XXX: audit dirfd */
	/* Rights are derived from the flags as the caller passed them. */
	rights_needed |= flags_to_rights(flags);
	/*
	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
	 * may be specified.
	 */
	if (flags & O_EXEC) {
		if (flags & O_ACCMODE)
			return (EINVAL);
	} else if ((flags & O_ACCMODE) == O_ACCMODE) {
		return (EINVAL);
	} else {
		flags = FFLAGS(flags);
	}

	/*
	 * Allocate the file descriptor, but don't install a descriptor yet.
	 */
	error = falloc_noinstall(td, &fp);
	if (error)
		return (error);
	/*
	 * An extra reference on `fp' has been held for us by
	 * falloc_noinstall().
	 */
	/* Set the flags early so the finit in devfs can pick them up. */
	fp->f_flag = flags & FMASK;
	/* Apply the process's creation mask and strip the sticky bit. */
	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | MPSAFE, pathseg,
	    path, fd, rights_needed, td);
	td->td_dupfd = -1;		/* XXX check for fdopen */
	error = vn_open(&nd, &flags, cmode, fp);
	if (error) {
		/*
		 * If the vn_open replaced the method vector, something
		 * wondrous happened deep below and we just pass it up
		 * pretending we know what we do.
		 */
		if (error == ENXIO && fp->f_ops != &badfileops)
			goto success;

		/*
		 * Handle special fdopen() case. bleh.
		 *
		 * Don't do this for relative (capability) lookups; we don't
		 * understand exactly what would happen, and we don't think
		 * that it ever should.
		 */
		if (nd.ni_strictrelative == 0 &&
		    (error == ENODEV || error == ENXIO) &&
		    td->td_dupfd >= 0) {
			error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
			    &indx);
			if (error == 0)
				goto success;
		}

		if (error == ERESTART)
			error = EINTR;
		goto bad_unlocked;
	}
	td->td_dupfd = 0;
	vfslocked = NDHASGIANT(&nd);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vp = nd.ni_vp;

	/*
	 * Store the vnode, for any f_type. Typically, the vnode use
	 * count is decremented by direct call to vn_closefile() for
	 * files that switched type in the cdevsw fdopen() method.
	 */
	fp->f_vnode = vp;
	/*
	 * If the file wasn't claimed by devfs bind it to the normal
	 * vnode operations here.
	 */
	if (fp->f_ops == &badfileops) {
		KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
		fp->f_seqcount = 1;
		finit(fp, (flags & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE,
		    vp, &vnops);
	}

	VOP_UNLOCK(vp, 0);
	if (flags & O_TRUNC) {
		error = fo_truncate(fp, 0, td->td_ucred, td);
		if (error)
			goto bad;
	}
	VFS_UNLOCK_GIANT(vfslocked);
success:
	/*
	 * If we haven't already installed the FD (for dupfdopen), do so now.
	 */
	if (indx == -1) {
#ifdef CAPABILITIES
		if (nd.ni_strictrelative == 1) {
			/*
			 * We are doing a strict relative lookup; wrap the
			 * result in a capability.
			 */
			if ((error = kern_capwrap(td, fp, nd.ni_baserights,
			    &indx)) != 0)
				goto bad_unlocked;
		} else
#endif
			if ((error = finstall(td, fp, &indx, flags)) != 0)
				goto bad_unlocked;

	}

	/*
	 * Release our private reference, leaving the one associated with
	 * the descriptor table intact.
	 */
	fdrop(fp, td);
	td->td_retval[0] = indx;
	return (0);
bad:
	/* Reached only on the path where vfslocked has been set. */
	VFS_UNLOCK_GIANT(vfslocked);
bad_unlocked:
	KASSERT(indx == -1, ("indx=%d, should be -1", indx));
	/* Drop the reference taken by falloc_noinstall(). */
	fdrop(fp, td);
	return (error);
}
1226 
1227 #ifdef COMPAT_43
1228 /*
1229  * Create a file.
1230  */
1231 #ifndef _SYS_SYSPROTO_H_
1232 struct ocreat_args {
1233 	char	*path;
1234 	int	mode;
1235 };
1236 #endif
1237 int
1238 ocreat(td, uap)
1239 	struct thread *td;
1240 	register struct ocreat_args /* {
1241 		char *path;
1242 		int mode;
1243 	} */ *uap;
1244 {
1245 
1246 	return (kern_open(td, uap->path, UIO_USERSPACE,
1247 	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
1248 }
1249 #endif /* COMPAT_43 */
1250 
1251 /*
1252  * Create a special file.
1253  */
1254 #ifndef _SYS_SYSPROTO_H_
1255 struct mknod_args {
1256 	char	*path;
1257 	int	mode;
1258 	int	dev;
1259 };
1260 #endif
1261 int
1262 sys_mknod(td, uap)
1263 	struct thread *td;
1264 	register struct mknod_args /* {
1265 		char *path;
1266 		int mode;
1267 		int dev;
1268 	} */ *uap;
1269 {
1270 
1271 	return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev));
1272 }
1273 
1274 #ifndef _SYS_SYSPROTO_H_
1275 struct mknodat_args {
1276 	int	fd;
1277 	char	*path;
1278 	mode_t	mode;
1279 	dev_t	dev;
1280 };
1281 #endif
1282 int
1283 sys_mknodat(struct thread *td, struct mknodat_args *uap)
1284 {
1285 
1286 	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
1287 	    uap->dev));
1288 }
1289 
1290 int
1291 kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode,
1292     int dev)
1293 {
1294 
1295 	return (kern_mknodat(td, AT_FDCWD, path, pathseg, mode, dev));
1296 }
1297 
1298 int
1299 kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1300     int mode, int dev)
1301 {
1302 	struct vnode *vp;
1303 	struct mount *mp;
1304 	struct vattr vattr;
1305 	int error;
1306 	int whiteout = 0;
1307 	struct nameidata nd;
1308 	int vfslocked;
1309 
1310 	AUDIT_ARG_MODE(mode);
1311 	AUDIT_ARG_DEV(dev);
1312 	switch (mode & S_IFMT) {
1313 	case S_IFCHR:
1314 	case S_IFBLK:
1315 		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
1316 		break;
1317 	case S_IFMT:
1318 		error = priv_check(td, PRIV_VFS_MKNOD_BAD);
1319 		break;
1320 	case S_IFWHT:
1321 		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
1322 		break;
1323 	case S_IFIFO:
1324 		if (dev == 0)
1325 			return (kern_mkfifoat(td, fd, path, pathseg, mode));
1326 		/* FALLTHROUGH */
1327 	default:
1328 		error = EINVAL;
1329 		break;
1330 	}
1331 	if (error)
1332 		return (error);
1333 restart:
1334 	bwillwrite();
1335 	NDINIT_ATRIGHTS(&nd, CREATE,
1336 	    LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, pathseg, path, fd,
1337 	    CAP_MKNOD, td);
1338 	if ((error = namei(&nd)) != 0)
1339 		return (error);
1340 	vfslocked = NDHASGIANT(&nd);
1341 	vp = nd.ni_vp;
1342 	if (vp != NULL) {
1343 		NDFREE(&nd, NDF_ONLY_PNBUF);
1344 		if (vp == nd.ni_dvp)
1345 			vrele(nd.ni_dvp);
1346 		else
1347 			vput(nd.ni_dvp);
1348 		vrele(vp);
1349 		VFS_UNLOCK_GIANT(vfslocked);
1350 		return (EEXIST);
1351 	} else {
1352 		VATTR_NULL(&vattr);
1353 		vattr.va_mode = (mode & ALLPERMS) &
1354 		    ~td->td_proc->p_fd->fd_cmask;
1355 		vattr.va_rdev = dev;
1356 		whiteout = 0;
1357 
1358 		switch (mode & S_IFMT) {
1359 		case S_IFMT:	/* used by badsect to flag bad sectors */
1360 			vattr.va_type = VBAD;
1361 			break;
1362 		case S_IFCHR:
1363 			vattr.va_type = VCHR;
1364 			break;
1365 		case S_IFBLK:
1366 			vattr.va_type = VBLK;
1367 			break;
1368 		case S_IFWHT:
1369 			whiteout = 1;
1370 			break;
1371 		default:
1372 			panic("kern_mknod: invalid mode");
1373 		}
1374 	}
1375 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1376 		NDFREE(&nd, NDF_ONLY_PNBUF);
1377 		vput(nd.ni_dvp);
1378 		VFS_UNLOCK_GIANT(vfslocked);
1379 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1380 			return (error);
1381 		goto restart;
1382 	}
1383 #ifdef MAC
1384 	if (error == 0 && !whiteout)
1385 		error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
1386 		    &nd.ni_cnd, &vattr);
1387 #endif
1388 	if (!error) {
1389 		if (whiteout)
1390 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
1391 		else {
1392 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
1393 						&nd.ni_cnd, &vattr);
1394 			if (error == 0)
1395 				vput(nd.ni_vp);
1396 		}
1397 	}
1398 	NDFREE(&nd, NDF_ONLY_PNBUF);
1399 	vput(nd.ni_dvp);
1400 	vn_finished_write(mp);
1401 	VFS_UNLOCK_GIANT(vfslocked);
1402 	return (error);
1403 }
1404 
1405 /*
1406  * Create a named pipe.
1407  */
1408 #ifndef _SYS_SYSPROTO_H_
1409 struct mkfifo_args {
1410 	char	*path;
1411 	int	mode;
1412 };
1413 #endif
1414 int
1415 sys_mkfifo(td, uap)
1416 	struct thread *td;
1417 	register struct mkfifo_args /* {
1418 		char *path;
1419 		int mode;
1420 	} */ *uap;
1421 {
1422 
1423 	return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode));
1424 }
1425 
1426 #ifndef _SYS_SYSPROTO_H_
1427 struct mkfifoat_args {
1428 	int	fd;
1429 	char	*path;
1430 	mode_t	mode;
1431 };
1432 #endif
1433 int
1434 sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
1435 {
1436 
1437 	return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
1438 	    uap->mode));
1439 }
1440 
1441 int
1442 kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode)
1443 {
1444 
1445 	return (kern_mkfifoat(td, AT_FDCWD, path, pathseg, mode));
1446 }
1447 
1448 int
1449 kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1450     int mode)
1451 {
1452 	struct mount *mp;
1453 	struct vattr vattr;
1454 	int error;
1455 	struct nameidata nd;
1456 	int vfslocked;
1457 
1458 	AUDIT_ARG_MODE(mode);
1459 restart:
1460 	bwillwrite();
1461 	NDINIT_ATRIGHTS(&nd, CREATE,
1462 	    LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, pathseg, path, fd,
1463 	    CAP_MKFIFO, td);
1464 	if ((error = namei(&nd)) != 0)
1465 		return (error);
1466 	vfslocked = NDHASGIANT(&nd);
1467 	if (nd.ni_vp != NULL) {
1468 		NDFREE(&nd, NDF_ONLY_PNBUF);
1469 		if (nd.ni_vp == nd.ni_dvp)
1470 			vrele(nd.ni_dvp);
1471 		else
1472 			vput(nd.ni_dvp);
1473 		vrele(nd.ni_vp);
1474 		VFS_UNLOCK_GIANT(vfslocked);
1475 		return (EEXIST);
1476 	}
1477 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1478 		NDFREE(&nd, NDF_ONLY_PNBUF);
1479 		vput(nd.ni_dvp);
1480 		VFS_UNLOCK_GIANT(vfslocked);
1481 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1482 			return (error);
1483 		goto restart;
1484 	}
1485 	VATTR_NULL(&vattr);
1486 	vattr.va_type = VFIFO;
1487 	vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
1488 #ifdef MAC
1489 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1490 	    &vattr);
1491 	if (error)
1492 		goto out;
1493 #endif
1494 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1495 	if (error == 0)
1496 		vput(nd.ni_vp);
1497 #ifdef MAC
1498 out:
1499 #endif
1500 	vput(nd.ni_dvp);
1501 	vn_finished_write(mp);
1502 	VFS_UNLOCK_GIANT(vfslocked);
1503 	NDFREE(&nd, NDF_ONLY_PNBUF);
1504 	return (error);
1505 }
1506 
1507 /*
1508  * Make a hard file link.
1509  */
1510 #ifndef _SYS_SYSPROTO_H_
1511 struct link_args {
1512 	char	*path;
1513 	char	*link;
1514 };
1515 #endif
1516 int
1517 sys_link(td, uap)
1518 	struct thread *td;
1519 	register struct link_args /* {
1520 		char *path;
1521 		char *link;
1522 	} */ *uap;
1523 {
1524 
1525 	return (kern_link(td, uap->path, uap->link, UIO_USERSPACE));
1526 }
1527 
1528 #ifndef _SYS_SYSPROTO_H_
1529 struct linkat_args {
1530 	int	fd1;
1531 	char	*path1;
1532 	int	fd2;
1533 	char	*path2;
1534 	int	flag;
1535 };
1536 #endif
1537 int
1538 sys_linkat(struct thread *td, struct linkat_args *uap)
1539 {
1540 	int flag;
1541 
1542 	flag = uap->flag;
1543 	if (flag & ~AT_SYMLINK_FOLLOW)
1544 		return (EINVAL);
1545 
1546 	return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
1547 	    UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW));
1548 }
1549 
1550 int hardlink_check_uid = 0;
1551 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
1552     &hardlink_check_uid, 0,
1553     "Unprivileged processes cannot create hard links to files owned by other "
1554     "users");
1555 static int hardlink_check_gid = 0;
1556 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
1557     &hardlink_check_gid, 0,
1558     "Unprivileged processes cannot create hard links to files owned by other "
1559     "groups");
1560 
1561 static int
1562 can_hardlink(struct vnode *vp, struct ucred *cred)
1563 {
1564 	struct vattr va;
1565 	int error;
1566 
1567 	if (!hardlink_check_uid && !hardlink_check_gid)
1568 		return (0);
1569 
1570 	error = VOP_GETATTR(vp, &va, cred);
1571 	if (error != 0)
1572 		return (error);
1573 
1574 	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
1575 		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1576 		if (error)
1577 			return (error);
1578 	}
1579 
1580 	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
1581 		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1582 		if (error)
1583 			return (error);
1584 	}
1585 
1586 	return (0);
1587 }
1588 
1589 int
1590 kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg)
1591 {
1592 
1593 	return (kern_linkat(td, AT_FDCWD, AT_FDCWD, path,link, segflg, FOLLOW));
1594 }
1595 
1596 int
1597 kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
1598     enum uio_seg segflg, int follow)
1599 {
1600 	struct vnode *vp;
1601 	struct mount *mp;
1602 	struct nameidata nd;
1603 	int vfslocked;
1604 	int lvfslocked;
1605 	int error;
1606 
1607 	bwillwrite();
1608 	NDINIT_AT(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, segflg, path1,
1609 	    fd1, td);
1610 
1611 	if ((error = namei(&nd)) != 0)
1612 		return (error);
1613 	vfslocked = NDHASGIANT(&nd);
1614 	NDFREE(&nd, NDF_ONLY_PNBUF);
1615 	vp = nd.ni_vp;
1616 	if (vp->v_type == VDIR) {
1617 		vrele(vp);
1618 		VFS_UNLOCK_GIANT(vfslocked);
1619 		return (EPERM);		/* POSIX */
1620 	}
1621 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
1622 		vrele(vp);
1623 		VFS_UNLOCK_GIANT(vfslocked);
1624 		return (error);
1625 	}
1626 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE |
1627 	    AUDITVNODE2, segflg, path2, fd2, CAP_CREATE, td);
1628 	if ((error = namei(&nd)) == 0) {
1629 		lvfslocked = NDHASGIANT(&nd);
1630 		if (nd.ni_vp != NULL) {
1631 			if (nd.ni_dvp == nd.ni_vp)
1632 				vrele(nd.ni_dvp);
1633 			else
1634 				vput(nd.ni_dvp);
1635 			vrele(nd.ni_vp);
1636 			error = EEXIST;
1637 		} else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY))
1638 		    == 0) {
1639 			error = can_hardlink(vp, td->td_ucred);
1640 			if (error == 0)
1641 #ifdef MAC
1642 				error = mac_vnode_check_link(td->td_ucred,
1643 				    nd.ni_dvp, vp, &nd.ni_cnd);
1644 			if (error == 0)
1645 #endif
1646 				error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
1647 			VOP_UNLOCK(vp, 0);
1648 			vput(nd.ni_dvp);
1649 		}
1650 		NDFREE(&nd, NDF_ONLY_PNBUF);
1651 		VFS_UNLOCK_GIANT(lvfslocked);
1652 	}
1653 	vrele(vp);
1654 	vn_finished_write(mp);
1655 	VFS_UNLOCK_GIANT(vfslocked);
1656 	return (error);
1657 }
1658 
1659 /*
1660  * Make a symbolic link.
1661  */
1662 #ifndef _SYS_SYSPROTO_H_
1663 struct symlink_args {
1664 	char	*path;
1665 	char	*link;
1666 };
1667 #endif
1668 int
1669 sys_symlink(td, uap)
1670 	struct thread *td;
1671 	register struct symlink_args /* {
1672 		char *path;
1673 		char *link;
1674 	} */ *uap;
1675 {
1676 
1677 	return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE));
1678 }
1679 
1680 #ifndef _SYS_SYSPROTO_H_
1681 struct symlinkat_args {
1682 	char	*path;
1683 	int	fd;
1684 	char	*path2;
1685 };
1686 #endif
1687 int
1688 sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
1689 {
1690 
1691 	return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
1692 	    UIO_USERSPACE));
1693 }
1694 
1695 int
1696 kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg)
1697 {
1698 
1699 	return (kern_symlinkat(td, path, AT_FDCWD, link, segflg));
1700 }
1701 
1702 int
1703 kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
1704     enum uio_seg segflg)
1705 {
1706 	struct mount *mp;
1707 	struct vattr vattr;
1708 	char *syspath;
1709 	int error;
1710 	struct nameidata nd;
1711 	int vfslocked;
1712 
1713 	if (segflg == UIO_SYSSPACE) {
1714 		syspath = path1;
1715 	} else {
1716 		syspath = uma_zalloc(namei_zone, M_WAITOK);
1717 		if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
1718 			goto out;
1719 	}
1720 	AUDIT_ARG_TEXT(syspath);
1721 restart:
1722 	bwillwrite();
1723 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE |
1724 	    AUDITVNODE1, segflg, path2, fd, CAP_CREATE, td);
1725 	if ((error = namei(&nd)) != 0)
1726 		goto out;
1727 	vfslocked = NDHASGIANT(&nd);
1728 	if (nd.ni_vp) {
1729 		NDFREE(&nd, NDF_ONLY_PNBUF);
1730 		if (nd.ni_vp == nd.ni_dvp)
1731 			vrele(nd.ni_dvp);
1732 		else
1733 			vput(nd.ni_dvp);
1734 		vrele(nd.ni_vp);
1735 		VFS_UNLOCK_GIANT(vfslocked);
1736 		error = EEXIST;
1737 		goto out;
1738 	}
1739 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1740 		NDFREE(&nd, NDF_ONLY_PNBUF);
1741 		vput(nd.ni_dvp);
1742 		VFS_UNLOCK_GIANT(vfslocked);
1743 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1744 			goto out;
1745 		goto restart;
1746 	}
1747 	VATTR_NULL(&vattr);
1748 	vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
1749 #ifdef MAC
1750 	vattr.va_type = VLNK;
1751 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1752 	    &vattr);
1753 	if (error)
1754 		goto out2;
1755 #endif
1756 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
1757 	if (error == 0)
1758 		vput(nd.ni_vp);
1759 #ifdef MAC
1760 out2:
1761 #endif
1762 	NDFREE(&nd, NDF_ONLY_PNBUF);
1763 	vput(nd.ni_dvp);
1764 	vn_finished_write(mp);
1765 	VFS_UNLOCK_GIANT(vfslocked);
1766 out:
1767 	if (segflg != UIO_SYSSPACE)
1768 		uma_zfree(namei_zone, syspath);
1769 	return (error);
1770 }
1771 
1772 /*
1773  * Delete a whiteout from the filesystem.
1774  */
1775 int
1776 sys_undelete(td, uap)
1777 	struct thread *td;
1778 	register struct undelete_args /* {
1779 		char *path;
1780 	} */ *uap;
1781 {
1782 	int error;
1783 	struct mount *mp;
1784 	struct nameidata nd;
1785 	int vfslocked;
1786 
1787 restart:
1788 	bwillwrite();
1789 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | MPSAFE | AUDITVNODE1,
1790 	    UIO_USERSPACE, uap->path, td);
1791 	error = namei(&nd);
1792 	if (error)
1793 		return (error);
1794 	vfslocked = NDHASGIANT(&nd);
1795 
1796 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
1797 		NDFREE(&nd, NDF_ONLY_PNBUF);
1798 		if (nd.ni_vp == nd.ni_dvp)
1799 			vrele(nd.ni_dvp);
1800 		else
1801 			vput(nd.ni_dvp);
1802 		if (nd.ni_vp)
1803 			vrele(nd.ni_vp);
1804 		VFS_UNLOCK_GIANT(vfslocked);
1805 		return (EEXIST);
1806 	}
1807 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1808 		NDFREE(&nd, NDF_ONLY_PNBUF);
1809 		vput(nd.ni_dvp);
1810 		VFS_UNLOCK_GIANT(vfslocked);
1811 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1812 			return (error);
1813 		goto restart;
1814 	}
1815 	error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
1816 	NDFREE(&nd, NDF_ONLY_PNBUF);
1817 	vput(nd.ni_dvp);
1818 	vn_finished_write(mp);
1819 	VFS_UNLOCK_GIANT(vfslocked);
1820 	return (error);
1821 }
1822 
1823 /*
1824  * Delete a name from the filesystem.
1825  */
1826 #ifndef _SYS_SYSPROTO_H_
1827 struct unlink_args {
1828 	char	*path;
1829 };
1830 #endif
1831 int
1832 sys_unlink(td, uap)
1833 	struct thread *td;
1834 	struct unlink_args /* {
1835 		char *path;
1836 	} */ *uap;
1837 {
1838 
1839 	return (kern_unlink(td, uap->path, UIO_USERSPACE));
1840 }
1841 
1842 #ifndef _SYS_SYSPROTO_H_
1843 struct unlinkat_args {
1844 	int	fd;
1845 	char	*path;
1846 	int	flag;
1847 };
1848 #endif
1849 int
1850 sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
1851 {
1852 	int flag = uap->flag;
1853 	int fd = uap->fd;
1854 	char *path = uap->path;
1855 
1856 	if (flag & ~AT_REMOVEDIR)
1857 		return (EINVAL);
1858 
1859 	if (flag & AT_REMOVEDIR)
1860 		return (kern_rmdirat(td, fd, path, UIO_USERSPACE));
1861 	else
1862 		return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0));
1863 }
1864 
1865 int
1866 kern_unlink(struct thread *td, char *path, enum uio_seg pathseg)
1867 {
1868 
1869 	return (kern_unlinkat(td, AT_FDCWD, path, pathseg, 0));
1870 }
1871 
1872 int
1873 kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1874     ino_t oldinum)
1875 {
1876 	struct mount *mp;
1877 	struct vnode *vp;
1878 	int error;
1879 	struct nameidata nd;
1880 	struct stat sb;
1881 	int vfslocked;
1882 
1883 restart:
1884 	bwillwrite();
1885 	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE |
1886 	    AUDITVNODE1, pathseg, path, fd, CAP_DELETE, td);
1887 	if ((error = namei(&nd)) != 0)
1888 		return (error == EINVAL ? EPERM : error);
1889 	vfslocked = NDHASGIANT(&nd);
1890 	vp = nd.ni_vp;
1891 	if (vp->v_type == VDIR && oldinum == 0) {
1892 		error = EPERM;		/* POSIX */
1893 	} else if (oldinum != 0 &&
1894 		  ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
1895 		  sb.st_ino != oldinum) {
1896 			error = EIDRM;	/* Identifier removed */
1897 	} else {
1898 		/*
1899 		 * The root of a mounted filesystem cannot be deleted.
1900 		 *
1901 		 * XXX: can this only be a VDIR case?
1902 		 */
1903 		if (vp->v_vflag & VV_ROOT)
1904 			error = EBUSY;
1905 	}
1906 	if (error == 0) {
1907 		if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1908 			NDFREE(&nd, NDF_ONLY_PNBUF);
1909 			vput(nd.ni_dvp);
1910 			if (vp == nd.ni_dvp)
1911 				vrele(vp);
1912 			else
1913 				vput(vp);
1914 			VFS_UNLOCK_GIANT(vfslocked);
1915 			if ((error = vn_start_write(NULL, &mp,
1916 			    V_XSLEEP | PCATCH)) != 0)
1917 				return (error);
1918 			goto restart;
1919 		}
1920 #ifdef MAC
1921 		error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
1922 		    &nd.ni_cnd);
1923 		if (error)
1924 			goto out;
1925 #endif
1926 		error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
1927 #ifdef MAC
1928 out:
1929 #endif
1930 		vn_finished_write(mp);
1931 	}
1932 	NDFREE(&nd, NDF_ONLY_PNBUF);
1933 	vput(nd.ni_dvp);
1934 	if (vp == nd.ni_dvp)
1935 		vrele(vp);
1936 	else
1937 		vput(vp);
1938 	VFS_UNLOCK_GIANT(vfslocked);
1939 	return (error);
1940 }
1941 
1942 /*
1943  * Reposition read/write file offset.
1944  */
1945 #ifndef _SYS_SYSPROTO_H_
1946 struct lseek_args {
1947 	int	fd;
1948 	int	pad;
1949 	off_t	offset;
1950 	int	whence;
1951 };
1952 #endif
1953 int
1954 sys_lseek(td, uap)
1955 	struct thread *td;
1956 	register struct lseek_args /* {
1957 		int fd;
1958 		int pad;
1959 		off_t offset;
1960 		int whence;
1961 	} */ *uap;
1962 {
1963 	struct ucred *cred = td->td_ucred;
1964 	struct file *fp;
1965 	struct vnode *vp;
1966 	struct vattr vattr;
1967 	off_t foffset, offset, size;
1968 	int error, noneg;
1969 	int vfslocked;
1970 
1971 	AUDIT_ARG_FD(uap->fd);
1972 	if ((error = fget(td, uap->fd, CAP_SEEK, &fp)) != 0)
1973 		return (error);
1974 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) {
1975 		fdrop(fp, td);
1976 		return (ESPIPE);
1977 	}
1978 	vp = fp->f_vnode;
1979 	foffset = foffset_lock(fp, 0);
1980 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1981 	noneg = (vp->v_type != VCHR);
1982 	offset = uap->offset;
1983 	switch (uap->whence) {
1984 	case L_INCR:
1985 		if (noneg &&
1986 		    (foffset < 0 ||
1987 		    (offset > 0 && foffset > OFF_MAX - offset))) {
1988 			error = EOVERFLOW;
1989 			break;
1990 		}
1991 		offset += foffset;
1992 		break;
1993 	case L_XTND:
1994 		vn_lock(vp, LK_SHARED | LK_RETRY);
1995 		error = VOP_GETATTR(vp, &vattr, cred);
1996 		VOP_UNLOCK(vp, 0);
1997 		if (error)
1998 			break;
1999 
2000 		/*
2001 		 * If the file references a disk device, then fetch
2002 		 * the media size and use that to determine the ending
2003 		 * offset.
2004 		 */
2005 		if (vattr.va_size == 0 && vp->v_type == VCHR &&
2006 		    fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
2007 			vattr.va_size = size;
2008 		if (noneg &&
2009 		    (vattr.va_size > OFF_MAX ||
2010 		    (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
2011 			error = EOVERFLOW;
2012 			break;
2013 		}
2014 		offset += vattr.va_size;
2015 		break;
2016 	case L_SET:
2017 		break;
2018 	case SEEK_DATA:
2019 		error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
2020 		break;
2021 	case SEEK_HOLE:
2022 		error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
2023 		break;
2024 	default:
2025 		error = EINVAL;
2026 	}
2027 	if (error == 0 && noneg && offset < 0)
2028 		error = EINVAL;
2029 	if (error != 0)
2030 		goto drop;
2031 	VFS_KNOTE_UNLOCKED(vp, 0);
2032 	*(off_t *)(td->td_retval) = offset;
2033 drop:
2034 	fdrop(fp, td);
2035 	VFS_UNLOCK_GIANT(vfslocked);
2036 	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
2037 	return (error);
2038 }
2039 
2040 #if defined(COMPAT_43)
2041 /*
2042  * Reposition read/write file offset.
2043  */
2044 #ifndef _SYS_SYSPROTO_H_
2045 struct olseek_args {
2046 	int	fd;
2047 	long	offset;
2048 	int	whence;
2049 };
2050 #endif
2051 int
2052 olseek(td, uap)
2053 	struct thread *td;
2054 	register struct olseek_args /* {
2055 		int fd;
2056 		long offset;
2057 		int whence;
2058 	} */ *uap;
2059 {
2060 	struct lseek_args /* {
2061 		int fd;
2062 		int pad;
2063 		off_t offset;
2064 		int whence;
2065 	} */ nuap;
2066 
2067 	nuap.fd = uap->fd;
2068 	nuap.offset = uap->offset;
2069 	nuap.whence = uap->whence;
2070 	return (sys_lseek(td, &nuap));
2071 }
2072 #endif /* COMPAT_43 */
2073 
2074 /* Version with the 'pad' argument */
2075 int
2076 freebsd6_lseek(td, uap)
2077 	struct thread *td;
2078 	register struct freebsd6_lseek_args *uap;
2079 {
2080 	struct lseek_args ouap;
2081 
2082 	ouap.fd = uap->fd;
2083 	ouap.offset = uap->offset;
2084 	ouap.whence = uap->whence;
2085 	return (sys_lseek(td, &ouap));
2086 }
2087 
2088 /*
2089  * Check access permissions using passed credentials.
2090  */
2091 static int
2092 vn_access(vp, user_flags, cred, td)
2093 	struct vnode	*vp;
2094 	int		user_flags;
2095 	struct ucred	*cred;
2096 	struct thread	*td;
2097 {
2098 	int error;
2099 	accmode_t accmode;
2100 
2101 	/* Flags == 0 means only check for existence. */
2102 	error = 0;
2103 	if (user_flags) {
2104 		accmode = 0;
2105 		if (user_flags & R_OK)
2106 			accmode |= VREAD;
2107 		if (user_flags & W_OK)
2108 			accmode |= VWRITE;
2109 		if (user_flags & X_OK)
2110 			accmode |= VEXEC;
2111 #ifdef MAC
2112 		error = mac_vnode_check_access(cred, vp, accmode);
2113 		if (error)
2114 			return (error);
2115 #endif
2116 		if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
2117 			error = VOP_ACCESS(vp, accmode, cred, td);
2118 	}
2119 	return (error);
2120 }
2121 
2122 /*
2123  * Check access permissions using "real" credentials.
2124  */
2125 #ifndef _SYS_SYSPROTO_H_
2126 struct access_args {
2127 	char	*path;
2128 	int	amode;
2129 };
2130 #endif
2131 int
2132 sys_access(td, uap)
2133 	struct thread *td;
2134 	register struct access_args /* {
2135 		char *path;
2136 		int amode;
2137 	} */ *uap;
2138 {
2139 
2140 	return (kern_access(td, uap->path, UIO_USERSPACE, uap->amode));
2141 }
2142 
2143 #ifndef _SYS_SYSPROTO_H_
2144 struct faccessat_args {
2145 	int	dirfd;
2146 	char	*path;
2147 	int	amode;
2148 	int	flag;
2149 }
2150 #endif
2151 int
2152 sys_faccessat(struct thread *td, struct faccessat_args *uap)
2153 {
2154 
2155 	if (uap->flag & ~AT_EACCESS)
2156 		return (EINVAL);
2157 	return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
2158 	    uap->amode));
2159 }
2160 
2161 int
2162 kern_access(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2163 {
2164 
2165 	return (kern_accessat(td, AT_FDCWD, path, pathseg, 0, amode));
2166 }
2167 
2168 int
2169 kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2170     int flag, int amode)
2171 {
2172 	struct ucred *cred, *tmpcred;
2173 	struct vnode *vp;
2174 	struct nameidata nd;
2175 	int vfslocked;
2176 	int error;
2177 
2178 	/*
2179 	 * Create and modify a temporary credential instead of one that
2180 	 * is potentially shared.
2181 	 */
2182 	if (!(flag & AT_EACCESS)) {
2183 		cred = td->td_ucred;
2184 		tmpcred = crdup(cred);
2185 		tmpcred->cr_uid = cred->cr_ruid;
2186 		tmpcred->cr_groups[0] = cred->cr_rgid;
2187 		td->td_ucred = tmpcred;
2188 	} else
2189 		cred = tmpcred = td->td_ucred;
2190 	AUDIT_ARG_VALUE(amode);
2191 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
2192 	    AUDITVNODE1, pathseg, path, fd, CAP_FSTAT, td);
2193 	if ((error = namei(&nd)) != 0)
2194 		goto out1;
2195 	vfslocked = NDHASGIANT(&nd);
2196 	vp = nd.ni_vp;
2197 
2198 	error = vn_access(vp, amode, tmpcred, td);
2199 	NDFREE(&nd, NDF_ONLY_PNBUF);
2200 	vput(vp);
2201 	VFS_UNLOCK_GIANT(vfslocked);
2202 out1:
2203 	if (!(flag & AT_EACCESS)) {
2204 		td->td_ucred = cred;
2205 		crfree(tmpcred);
2206 	}
2207 	return (error);
2208 }
2209 
2210 /*
2211  * Check access permissions using "effective" credentials.
2212  */
2213 #ifndef _SYS_SYSPROTO_H_
2214 struct eaccess_args {
2215 	char	*path;
2216 	int	amode;
2217 };
2218 #endif
2219 int
2220 sys_eaccess(td, uap)
2221 	struct thread *td;
2222 	register struct eaccess_args /* {
2223 		char *path;
2224 		int amode;
2225 	} */ *uap;
2226 {
2227 
2228 	return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->amode));
2229 }
2230 
2231 int
2232 kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2233 {
2234 
2235 	return (kern_accessat(td, AT_FDCWD, path, pathseg, AT_EACCESS, amode));
2236 }
2237 
2238 #if defined(COMPAT_43)
2239 /*
2240  * Get file status; this version follows links.
2241  */
2242 #ifndef _SYS_SYSPROTO_H_
2243 struct ostat_args {
2244 	char	*path;
2245 	struct ostat *ub;
2246 };
2247 #endif
2248 int
2249 ostat(td, uap)
2250 	struct thread *td;
2251 	register struct ostat_args /* {
2252 		char *path;
2253 		struct ostat *ub;
2254 	} */ *uap;
2255 {
2256 	struct stat sb;
2257 	struct ostat osb;
2258 	int error;
2259 
2260 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2261 	if (error)
2262 		return (error);
2263 	cvtstat(&sb, &osb);
2264 	error = copyout(&osb, uap->ub, sizeof (osb));
2265 	return (error);
2266 }
2267 
2268 /*
2269  * Get file status; this version does not follow links.
2270  */
2271 #ifndef _SYS_SYSPROTO_H_
2272 struct olstat_args {
2273 	char	*path;
2274 	struct ostat *ub;
2275 };
2276 #endif
2277 int
2278 olstat(td, uap)
2279 	struct thread *td;
2280 	register struct olstat_args /* {
2281 		char *path;
2282 		struct ostat *ub;
2283 	} */ *uap;
2284 {
2285 	struct stat sb;
2286 	struct ostat osb;
2287 	int error;
2288 
2289 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2290 	if (error)
2291 		return (error);
2292 	cvtstat(&sb, &osb);
2293 	error = copyout(&osb, uap->ub, sizeof (osb));
2294 	return (error);
2295 }
2296 
2297 /*
2298  * Convert from an old to a new stat structure.
2299  */
2300 void
2301 cvtstat(st, ost)
2302 	struct stat *st;
2303 	struct ostat *ost;
2304 {
2305 
2306 	ost->st_dev = st->st_dev;
2307 	ost->st_ino = st->st_ino;
2308 	ost->st_mode = st->st_mode;
2309 	ost->st_nlink = st->st_nlink;
2310 	ost->st_uid = st->st_uid;
2311 	ost->st_gid = st->st_gid;
2312 	ost->st_rdev = st->st_rdev;
2313 	if (st->st_size < (quad_t)1 << 32)
2314 		ost->st_size = st->st_size;
2315 	else
2316 		ost->st_size = -2;
2317 	ost->st_atim = st->st_atim;
2318 	ost->st_mtim = st->st_mtim;
2319 	ost->st_ctim = st->st_ctim;
2320 	ost->st_blksize = st->st_blksize;
2321 	ost->st_blocks = st->st_blocks;
2322 	ost->st_flags = st->st_flags;
2323 	ost->st_gen = st->st_gen;
2324 }
2325 #endif /* COMPAT_43 */
2326 
2327 /*
2328  * Get file status; this version follows links.
2329  */
2330 #ifndef _SYS_SYSPROTO_H_
2331 struct stat_args {
2332 	char	*path;
2333 	struct stat *ub;
2334 };
2335 #endif
2336 int
2337 sys_stat(td, uap)
2338 	struct thread *td;
2339 	register struct stat_args /* {
2340 		char *path;
2341 		struct stat *ub;
2342 	} */ *uap;
2343 {
2344 	struct stat sb;
2345 	int error;
2346 
2347 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2348 	if (error == 0)
2349 		error = copyout(&sb, uap->ub, sizeof (sb));
2350 	return (error);
2351 }
2352 
2353 #ifndef _SYS_SYSPROTO_H_
2354 struct fstatat_args {
2355 	int	fd;
2356 	char	*path;
2357 	struct stat	*buf;
2358 	int	flag;
2359 }
2360 #endif
2361 int
2362 sys_fstatat(struct thread *td, struct fstatat_args *uap)
2363 {
2364 	struct stat sb;
2365 	int error;
2366 
2367 	error = kern_statat(td, uap->flag, uap->fd, uap->path,
2368 	    UIO_USERSPACE, &sb);
2369 	if (error == 0)
2370 		error = copyout(&sb, uap->buf, sizeof (sb));
2371 	return (error);
2372 }
2373 
2374 int
2375 kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2376 {
2377 
2378 	return (kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp));
2379 }
2380 
2381 int
2382 kern_statat(struct thread *td, int flag, int fd, char *path,
2383     enum uio_seg pathseg, struct stat *sbp)
2384 {
2385 
2386 	return (kern_statat_vnhook(td, flag, fd, path, pathseg, sbp, NULL));
2387 }
2388 
2389 int
2390 kern_statat_vnhook(struct thread *td, int flag, int fd, char *path,
2391     enum uio_seg pathseg, struct stat *sbp,
2392     void (*hook)(struct vnode *vp, struct stat *sbp))
2393 {
2394 	struct nameidata nd;
2395 	struct stat sb;
2396 	int error, vfslocked;
2397 
2398 	if (flag & ~AT_SYMLINK_NOFOLLOW)
2399 		return (EINVAL);
2400 
2401 	NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
2402 	    FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1 | MPSAFE, pathseg,
2403 	    path, fd, CAP_FSTAT, td);
2404 
2405 	if ((error = namei(&nd)) != 0)
2406 		return (error);
2407 	vfslocked = NDHASGIANT(&nd);
2408 	error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
2409 	if (!error) {
2410 		SDT_PROBE(vfs, , stat, mode, path, sb.st_mode, 0, 0, 0);
2411 		if (S_ISREG(sb.st_mode))
2412 			SDT_PROBE(vfs, , stat, reg, path, pathseg, 0, 0, 0);
2413 		if (__predict_false(hook != NULL))
2414 			hook(nd.ni_vp, &sb);
2415 	}
2416 	NDFREE(&nd, NDF_ONLY_PNBUF);
2417 	vput(nd.ni_vp);
2418 	VFS_UNLOCK_GIANT(vfslocked);
2419 	if (error)
2420 		return (error);
2421 	*sbp = sb;
2422 #ifdef KTRACE
2423 	if (KTRPOINT(td, KTR_STRUCT))
2424 		ktrstat(&sb);
2425 #endif
2426 	return (0);
2427 }
2428 
2429 /*
2430  * Get file status; this version does not follow links.
2431  */
2432 #ifndef _SYS_SYSPROTO_H_
2433 struct lstat_args {
2434 	char	*path;
2435 	struct stat *ub;
2436 };
2437 #endif
2438 int
2439 sys_lstat(td, uap)
2440 	struct thread *td;
2441 	register struct lstat_args /* {
2442 		char *path;
2443 		struct stat *ub;
2444 	} */ *uap;
2445 {
2446 	struct stat sb;
2447 	int error;
2448 
2449 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2450 	if (error == 0)
2451 		error = copyout(&sb, uap->ub, sizeof (sb));
2452 	return (error);
2453 }
2454 
2455 int
2456 kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2457 {
2458 
2459 	return (kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg,
2460 	    sbp));
2461 }
2462 
2463 /*
2464  * Implementation of the NetBSD [l]stat() functions.
2465  */
2466 void
2467 cvtnstat(sb, nsb)
2468 	struct stat *sb;
2469 	struct nstat *nsb;
2470 {
2471 	bzero(nsb, sizeof *nsb);
2472 	nsb->st_dev = sb->st_dev;
2473 	nsb->st_ino = sb->st_ino;
2474 	nsb->st_mode = sb->st_mode;
2475 	nsb->st_nlink = sb->st_nlink;
2476 	nsb->st_uid = sb->st_uid;
2477 	nsb->st_gid = sb->st_gid;
2478 	nsb->st_rdev = sb->st_rdev;
2479 	nsb->st_atim = sb->st_atim;
2480 	nsb->st_mtim = sb->st_mtim;
2481 	nsb->st_ctim = sb->st_ctim;
2482 	nsb->st_size = sb->st_size;
2483 	nsb->st_blocks = sb->st_blocks;
2484 	nsb->st_blksize = sb->st_blksize;
2485 	nsb->st_flags = sb->st_flags;
2486 	nsb->st_gen = sb->st_gen;
2487 	nsb->st_birthtim = sb->st_birthtim;
2488 }
2489 
2490 #ifndef _SYS_SYSPROTO_H_
2491 struct nstat_args {
2492 	char	*path;
2493 	struct nstat *ub;
2494 };
2495 #endif
2496 int
2497 sys_nstat(td, uap)
2498 	struct thread *td;
2499 	register struct nstat_args /* {
2500 		char *path;
2501 		struct nstat *ub;
2502 	} */ *uap;
2503 {
2504 	struct stat sb;
2505 	struct nstat nsb;
2506 	int error;
2507 
2508 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2509 	if (error)
2510 		return (error);
2511 	cvtnstat(&sb, &nsb);
2512 	error = copyout(&nsb, uap->ub, sizeof (nsb));
2513 	return (error);
2514 }
2515 
2516 /*
2517  * NetBSD lstat.  Get file status; this version does not follow links.
2518  */
2519 #ifndef _SYS_SYSPROTO_H_
2520 struct lstat_args {
2521 	char	*path;
2522 	struct stat *ub;
2523 };
2524 #endif
2525 int
2526 sys_nlstat(td, uap)
2527 	struct thread *td;
2528 	register struct nlstat_args /* {
2529 		char *path;
2530 		struct nstat *ub;
2531 	} */ *uap;
2532 {
2533 	struct stat sb;
2534 	struct nstat nsb;
2535 	int error;
2536 
2537 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2538 	if (error)
2539 		return (error);
2540 	cvtnstat(&sb, &nsb);
2541 	error = copyout(&nsb, uap->ub, sizeof (nsb));
2542 	return (error);
2543 }
2544 
2545 /*
2546  * Get configurable pathname variables.
2547  */
2548 #ifndef _SYS_SYSPROTO_H_
2549 struct pathconf_args {
2550 	char	*path;
2551 	int	name;
2552 };
2553 #endif
2554 int
2555 sys_pathconf(td, uap)
2556 	struct thread *td;
2557 	register struct pathconf_args /* {
2558 		char *path;
2559 		int name;
2560 	} */ *uap;
2561 {
2562 
2563 	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW));
2564 }
2565 
2566 #ifndef _SYS_SYSPROTO_H_
2567 struct lpathconf_args {
2568 	char	*path;
2569 	int	name;
2570 };
2571 #endif
2572 int
2573 sys_lpathconf(td, uap)
2574 	struct thread *td;
2575 	register struct lpathconf_args /* {
2576 		char *path;
2577 		int name;
2578 	} */ *uap;
2579 {
2580 
2581 	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, NOFOLLOW));
2582 }
2583 
2584 int
2585 kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
2586     u_long flags)
2587 {
2588 	struct nameidata nd;
2589 	int error, vfslocked;
2590 
2591 	NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1 |
2592 	    flags, pathseg, path, td);
2593 	if ((error = namei(&nd)) != 0)
2594 		return (error);
2595 	vfslocked = NDHASGIANT(&nd);
2596 	NDFREE(&nd, NDF_ONLY_PNBUF);
2597 
2598 	/* If asynchronous I/O is available, it works for all files. */
2599 	if (name == _PC_ASYNC_IO)
2600 		td->td_retval[0] = async_io_version;
2601 	else
2602 		error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
2603 	vput(nd.ni_vp);
2604 	VFS_UNLOCK_GIANT(vfslocked);
2605 	return (error);
2606 }
2607 
2608 /*
2609  * Return target name of a symbolic link.
2610  */
2611 #ifndef _SYS_SYSPROTO_H_
2612 struct readlink_args {
2613 	char	*path;
2614 	char	*buf;
2615 	size_t	count;
2616 };
2617 #endif
2618 int
2619 sys_readlink(td, uap)
2620 	struct thread *td;
2621 	register struct readlink_args /* {
2622 		char *path;
2623 		char *buf;
2624 		size_t count;
2625 	} */ *uap;
2626 {
2627 
2628 	return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf,
2629 	    UIO_USERSPACE, uap->count));
2630 }
2631 #ifndef _SYS_SYSPROTO_H_
2632 struct readlinkat_args {
2633 	int	fd;
2634 	char	*path;
2635 	char	*buf;
2636 	size_t	bufsize;
2637 };
2638 #endif
2639 int
2640 sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
2641 {
2642 
2643 	return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
2644 	    uap->buf, UIO_USERSPACE, uap->bufsize));
2645 }
2646 
2647 int
2648 kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf,
2649     enum uio_seg bufseg, size_t count)
2650 {
2651 
2652 	return (kern_readlinkat(td, AT_FDCWD, path, pathseg, buf, bufseg,
2653 	    count));
2654 }
2655 
2656 int
2657 kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2658     char *buf, enum uio_seg bufseg, size_t count)
2659 {
2660 	struct vnode *vp;
2661 	struct iovec aiov;
2662 	struct uio auio;
2663 	int error;
2664 	struct nameidata nd;
2665 	int vfslocked;
2666 
2667 	if (count > IOSIZE_MAX)
2668 		return (EINVAL);
2669 
2670 	NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
2671 	    AUDITVNODE1, pathseg, path, fd, td);
2672 
2673 	if ((error = namei(&nd)) != 0)
2674 		return (error);
2675 	NDFREE(&nd, NDF_ONLY_PNBUF);
2676 	vfslocked = NDHASGIANT(&nd);
2677 	vp = nd.ni_vp;
2678 #ifdef MAC
2679 	error = mac_vnode_check_readlink(td->td_ucred, vp);
2680 	if (error) {
2681 		vput(vp);
2682 		VFS_UNLOCK_GIANT(vfslocked);
2683 		return (error);
2684 	}
2685 #endif
2686 	if (vp->v_type != VLNK)
2687 		error = EINVAL;
2688 	else {
2689 		aiov.iov_base = buf;
2690 		aiov.iov_len = count;
2691 		auio.uio_iov = &aiov;
2692 		auio.uio_iovcnt = 1;
2693 		auio.uio_offset = 0;
2694 		auio.uio_rw = UIO_READ;
2695 		auio.uio_segflg = bufseg;
2696 		auio.uio_td = td;
2697 		auio.uio_resid = count;
2698 		error = VOP_READLINK(vp, &auio, td->td_ucred);
2699 	}
2700 	vput(vp);
2701 	VFS_UNLOCK_GIANT(vfslocked);
2702 	td->td_retval[0] = count - auio.uio_resid;
2703 	return (error);
2704 }
2705 
2706 /*
2707  * Common implementation code for chflags() and fchflags().
2708  */
2709 static int
2710 setfflags(td, vp, flags)
2711 	struct thread *td;
2712 	struct vnode *vp;
2713 	int flags;
2714 {
2715 	int error;
2716 	struct mount *mp;
2717 	struct vattr vattr;
2718 
2719 	/* We can't support the value matching VNOVAL. */
2720 	if (flags == VNOVAL)
2721 		return (EOPNOTSUPP);
2722 
2723 	/*
2724 	 * Prevent non-root users from setting flags on devices.  When
2725 	 * a device is reused, users can retain ownership of the device
2726 	 * if they are allowed to set flags and programs assume that
2727 	 * chown can't fail when done as root.
2728 	 */
2729 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
2730 		error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
2731 		if (error)
2732 			return (error);
2733 	}
2734 
2735 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2736 		return (error);
2737 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2738 	VATTR_NULL(&vattr);
2739 	vattr.va_flags = flags;
2740 #ifdef MAC
2741 	error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
2742 	if (error == 0)
2743 #endif
2744 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2745 	VOP_UNLOCK(vp, 0);
2746 	vn_finished_write(mp);
2747 	return (error);
2748 }
2749 
2750 /*
2751  * Change flags of a file given a path name.
2752  */
2753 #ifndef _SYS_SYSPROTO_H_
2754 struct chflags_args {
2755 	char	*path;
2756 	int	flags;
2757 };
2758 #endif
2759 int
2760 sys_chflags(td, uap)
2761 	struct thread *td;
2762 	register struct chflags_args /* {
2763 		char *path;
2764 		int flags;
2765 	} */ *uap;
2766 {
2767 	int error;
2768 	struct nameidata nd;
2769 	int vfslocked;
2770 
2771 	AUDIT_ARG_FFLAGS(uap->flags);
2772 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
2773 	    uap->path, td);
2774 	if ((error = namei(&nd)) != 0)
2775 		return (error);
2776 	NDFREE(&nd, NDF_ONLY_PNBUF);
2777 	vfslocked = NDHASGIANT(&nd);
2778 	error = setfflags(td, nd.ni_vp, uap->flags);
2779 	vrele(nd.ni_vp);
2780 	VFS_UNLOCK_GIANT(vfslocked);
2781 	return (error);
2782 }
2783 
2784 /*
2785  * Same as chflags() but doesn't follow symlinks.
2786  */
2787 int
2788 sys_lchflags(td, uap)
2789 	struct thread *td;
2790 	register struct lchflags_args /* {
2791 		char *path;
2792 		int flags;
2793 	} */ *uap;
2794 {
2795 	int error;
2796 	struct nameidata nd;
2797 	int vfslocked;
2798 
2799 	AUDIT_ARG_FFLAGS(uap->flags);
2800 	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
2801 	    uap->path, td);
2802 	if ((error = namei(&nd)) != 0)
2803 		return (error);
2804 	vfslocked = NDHASGIANT(&nd);
2805 	NDFREE(&nd, NDF_ONLY_PNBUF);
2806 	error = setfflags(td, nd.ni_vp, uap->flags);
2807 	vrele(nd.ni_vp);
2808 	VFS_UNLOCK_GIANT(vfslocked);
2809 	return (error);
2810 }
2811 
2812 /*
2813  * Change flags of a file given a file descriptor.
2814  */
2815 #ifndef _SYS_SYSPROTO_H_
2816 struct fchflags_args {
2817 	int	fd;
2818 	int	flags;
2819 };
2820 #endif
2821 int
2822 sys_fchflags(td, uap)
2823 	struct thread *td;
2824 	register struct fchflags_args /* {
2825 		int fd;
2826 		int flags;
2827 	} */ *uap;
2828 {
2829 	struct file *fp;
2830 	int vfslocked;
2831 	int error;
2832 
2833 	AUDIT_ARG_FD(uap->fd);
2834 	AUDIT_ARG_FFLAGS(uap->flags);
2835 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_FCHFLAGS,
2836 	    &fp)) != 0)
2837 		return (error);
2838 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
2839 #ifdef AUDIT
2840 	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2841 	AUDIT_ARG_VNODE1(fp->f_vnode);
2842 	VOP_UNLOCK(fp->f_vnode, 0);
2843 #endif
2844 	error = setfflags(td, fp->f_vnode, uap->flags);
2845 	VFS_UNLOCK_GIANT(vfslocked);
2846 	fdrop(fp, td);
2847 	return (error);
2848 }
2849 
2850 /*
2851  * Common implementation code for chmod(), lchmod() and fchmod().
2852  */
2853 int
2854 setfmode(td, cred, vp, mode)
2855 	struct thread *td;
2856 	struct ucred *cred;
2857 	struct vnode *vp;
2858 	int mode;
2859 {
2860 	int error;
2861 	struct mount *mp;
2862 	struct vattr vattr;
2863 
2864 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2865 		return (error);
2866 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2867 	VATTR_NULL(&vattr);
2868 	vattr.va_mode = mode & ALLPERMS;
2869 #ifdef MAC
2870 	error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
2871 	if (error == 0)
2872 #endif
2873 		error = VOP_SETATTR(vp, &vattr, cred);
2874 	VOP_UNLOCK(vp, 0);
2875 	vn_finished_write(mp);
2876 	return (error);
2877 }
2878 
2879 /*
2880  * Change mode of a file given path name.
2881  */
2882 #ifndef _SYS_SYSPROTO_H_
2883 struct chmod_args {
2884 	char	*path;
2885 	int	mode;
2886 };
2887 #endif
2888 int
2889 sys_chmod(td, uap)
2890 	struct thread *td;
2891 	register struct chmod_args /* {
2892 		char *path;
2893 		int mode;
2894 	} */ *uap;
2895 {
2896 
2897 	return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode));
2898 }
2899 
2900 #ifndef _SYS_SYSPROTO_H_
2901 struct fchmodat_args {
2902 	int	dirfd;
2903 	char	*path;
2904 	mode_t	mode;
2905 	int	flag;
2906 }
2907 #endif
2908 int
2909 sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
2910 {
2911 	int flag = uap->flag;
2912 	int fd = uap->fd;
2913 	char *path = uap->path;
2914 	mode_t mode = uap->mode;
2915 
2916 	if (flag & ~AT_SYMLINK_NOFOLLOW)
2917 		return (EINVAL);
2918 
2919 	return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
2920 }
2921 
2922 int
2923 kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode)
2924 {
2925 
2926 	return (kern_fchmodat(td, AT_FDCWD, path, pathseg, mode, 0));
2927 }
2928 
2929 /*
2930  * Change mode of a file given path name (don't follow links.)
2931  */
2932 #ifndef _SYS_SYSPROTO_H_
2933 struct lchmod_args {
2934 	char	*path;
2935 	int	mode;
2936 };
2937 #endif
2938 int
2939 sys_lchmod(td, uap)
2940 	struct thread *td;
2941 	register struct lchmod_args /* {
2942 		char *path;
2943 		int mode;
2944 	} */ *uap;
2945 {
2946 
2947 	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2948 	    uap->mode, AT_SYMLINK_NOFOLLOW));
2949 }
2950 
2951 
2952 int
2953 kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2954     mode_t mode, int flag)
2955 {
2956 	int error;
2957 	struct nameidata nd;
2958 	int vfslocked;
2959 	int follow;
2960 
2961 	AUDIT_ARG_MODE(mode);
2962 	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2963 	NDINIT_ATRIGHTS(&nd, LOOKUP,  follow | MPSAFE | AUDITVNODE1, pathseg,
2964 	    path, fd, CAP_FCHMOD, td);
2965 	if ((error = namei(&nd)) != 0)
2966 		return (error);
2967 	vfslocked = NDHASGIANT(&nd);
2968 	NDFREE(&nd, NDF_ONLY_PNBUF);
2969 	error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
2970 	vrele(nd.ni_vp);
2971 	VFS_UNLOCK_GIANT(vfslocked);
2972 	return (error);
2973 }
2974 
2975 /*
2976  * Change mode of a file given a file descriptor.
2977  */
2978 #ifndef _SYS_SYSPROTO_H_
2979 struct fchmod_args {
2980 	int	fd;
2981 	int	mode;
2982 };
2983 #endif
2984 int
2985 sys_fchmod(struct thread *td, struct fchmod_args *uap)
2986 {
2987 	struct file *fp;
2988 	int error;
2989 
2990 	AUDIT_ARG_FD(uap->fd);
2991 	AUDIT_ARG_MODE(uap->mode);
2992 
2993 	error = fget(td, uap->fd, CAP_FCHMOD, &fp);
2994 	if (error != 0)
2995 		return (error);
2996 	error = fo_chmod(fp, uap->mode, td->td_ucred, td);
2997 	fdrop(fp, td);
2998 	return (error);
2999 }
3000 
3001 /*
3002  * Common implementation for chown(), lchown(), and fchown()
3003  */
3004 int
3005 setfown(td, cred, vp, uid, gid)
3006 	struct thread *td;
3007 	struct ucred *cred;
3008 	struct vnode *vp;
3009 	uid_t uid;
3010 	gid_t gid;
3011 {
3012 	int error;
3013 	struct mount *mp;
3014 	struct vattr vattr;
3015 
3016 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3017 		return (error);
3018 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3019 	VATTR_NULL(&vattr);
3020 	vattr.va_uid = uid;
3021 	vattr.va_gid = gid;
3022 #ifdef MAC
3023 	error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
3024 	    vattr.va_gid);
3025 	if (error == 0)
3026 #endif
3027 		error = VOP_SETATTR(vp, &vattr, cred);
3028 	VOP_UNLOCK(vp, 0);
3029 	vn_finished_write(mp);
3030 	return (error);
3031 }
3032 
3033 /*
3034  * Set ownership given a path name.
3035  */
3036 #ifndef _SYS_SYSPROTO_H_
3037 struct chown_args {
3038 	char	*path;
3039 	int	uid;
3040 	int	gid;
3041 };
3042 #endif
3043 int
3044 sys_chown(td, uap)
3045 	struct thread *td;
3046 	register struct chown_args /* {
3047 		char *path;
3048 		int uid;
3049 		int gid;
3050 	} */ *uap;
3051 {
3052 
3053 	return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
3054 }
3055 
3056 #ifndef _SYS_SYSPROTO_H_
3057 struct fchownat_args {
3058 	int fd;
3059 	const char * path;
3060 	uid_t uid;
3061 	gid_t gid;
3062 	int flag;
3063 };
3064 #endif
3065 int
3066 sys_fchownat(struct thread *td, struct fchownat_args *uap)
3067 {
3068 	int flag;
3069 
3070 	flag = uap->flag;
3071 	if (flag & ~AT_SYMLINK_NOFOLLOW)
3072 		return (EINVAL);
3073 
3074 	return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
3075 	    uap->gid, uap->flag));
3076 }
3077 
3078 int
3079 kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
3080     int gid)
3081 {
3082 
3083 	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid, 0));
3084 }
3085 
3086 int
3087 kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3088     int uid, int gid, int flag)
3089 {
3090 	struct nameidata nd;
3091 	int error, vfslocked, follow;
3092 
3093 	AUDIT_ARG_OWNER(uid, gid);
3094 	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
3095 	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, pathseg,
3096 	    path, fd, CAP_FCHOWN, td);
3097 
3098 	if ((error = namei(&nd)) != 0)
3099 		return (error);
3100 	vfslocked = NDHASGIANT(&nd);
3101 	NDFREE(&nd, NDF_ONLY_PNBUF);
3102 	error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
3103 	vrele(nd.ni_vp);
3104 	VFS_UNLOCK_GIANT(vfslocked);
3105 	return (error);
3106 }
3107 
3108 /*
3109  * Set ownership given a path name, do not cross symlinks.
3110  */
3111 #ifndef _SYS_SYSPROTO_H_
3112 struct lchown_args {
3113 	char	*path;
3114 	int	uid;
3115 	int	gid;
3116 };
3117 #endif
3118 int
3119 sys_lchown(td, uap)
3120 	struct thread *td;
3121 	register struct lchown_args /* {
3122 		char *path;
3123 		int uid;
3124 		int gid;
3125 	} */ *uap;
3126 {
3127 
3128 	return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
3129 }
3130 
3131 int
3132 kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
3133     int gid)
3134 {
3135 
3136 	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid,
3137 	    AT_SYMLINK_NOFOLLOW));
3138 }
3139 
3140 /*
3141  * Set ownership given a file descriptor.
3142  */
3143 #ifndef _SYS_SYSPROTO_H_
3144 struct fchown_args {
3145 	int	fd;
3146 	int	uid;
3147 	int	gid;
3148 };
3149 #endif
3150 int
3151 sys_fchown(td, uap)
3152 	struct thread *td;
3153 	register struct fchown_args /* {
3154 		int fd;
3155 		int uid;
3156 		int gid;
3157 	} */ *uap;
3158 {
3159 	struct file *fp;
3160 	int error;
3161 
3162 	AUDIT_ARG_FD(uap->fd);
3163 	AUDIT_ARG_OWNER(uap->uid, uap->gid);
3164 	error = fget(td, uap->fd, CAP_FCHOWN, &fp);
3165 	if (error != 0)
3166 		return (error);
3167 	error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
3168 	fdrop(fp, td);
3169 	return (error);
3170 }
3171 
3172 /*
3173  * Common implementation code for utimes(), lutimes(), and futimes().
3174  */
3175 static int
3176 getutimes(usrtvp, tvpseg, tsp)
3177 	const struct timeval *usrtvp;
3178 	enum uio_seg tvpseg;
3179 	struct timespec *tsp;
3180 {
3181 	struct timeval tv[2];
3182 	const struct timeval *tvp;
3183 	int error;
3184 
3185 	if (usrtvp == NULL) {
3186 		vfs_timestamp(&tsp[0]);
3187 		tsp[1] = tsp[0];
3188 	} else {
3189 		if (tvpseg == UIO_SYSSPACE) {
3190 			tvp = usrtvp;
3191 		} else {
3192 			if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
3193 				return (error);
3194 			tvp = tv;
3195 		}
3196 
3197 		if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
3198 		    tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
3199 			return (EINVAL);
3200 		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3201 		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3202 	}
3203 	return (0);
3204 }
3205 
3206 /*
3207  * Common implementation code for utimes(), lutimes(), and futimes().
3208  */
3209 static int
3210 setutimes(td, vp, ts, numtimes, nullflag)
3211 	struct thread *td;
3212 	struct vnode *vp;
3213 	const struct timespec *ts;
3214 	int numtimes;
3215 	int nullflag;
3216 {
3217 	int error, setbirthtime;
3218 	struct mount *mp;
3219 	struct vattr vattr;
3220 
3221 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3222 		return (error);
3223 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3224 	setbirthtime = 0;
3225 	if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
3226 	    timespeccmp(&ts[1], &vattr.va_birthtime, < ))
3227 		setbirthtime = 1;
3228 	VATTR_NULL(&vattr);
3229 	vattr.va_atime = ts[0];
3230 	vattr.va_mtime = ts[1];
3231 	if (setbirthtime)
3232 		vattr.va_birthtime = ts[1];
3233 	if (numtimes > 2)
3234 		vattr.va_birthtime = ts[2];
3235 	if (nullflag)
3236 		vattr.va_vaflags |= VA_UTIMES_NULL;
3237 #ifdef MAC
3238 	error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
3239 	    vattr.va_mtime);
3240 #endif
3241 	if (error == 0)
3242 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3243 	VOP_UNLOCK(vp, 0);
3244 	vn_finished_write(mp);
3245 	return (error);
3246 }
3247 
3248 /*
3249  * Set the access and modification times of a file.
3250  */
3251 #ifndef _SYS_SYSPROTO_H_
3252 struct utimes_args {
3253 	char	*path;
3254 	struct	timeval *tptr;
3255 };
3256 #endif
3257 int
3258 sys_utimes(td, uap)
3259 	struct thread *td;
3260 	register struct utimes_args /* {
3261 		char *path;
3262 		struct timeval *tptr;
3263 	} */ *uap;
3264 {
3265 
3266 	return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3267 	    UIO_USERSPACE));
3268 }
3269 
3270 #ifndef _SYS_SYSPROTO_H_
3271 struct futimesat_args {
3272 	int fd;
3273 	const char * path;
3274 	const struct timeval * times;
3275 };
3276 #endif
3277 int
3278 sys_futimesat(struct thread *td, struct futimesat_args *uap)
3279 {
3280 
3281 	return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
3282 	    uap->times, UIO_USERSPACE));
3283 }
3284 
3285 int
3286 kern_utimes(struct thread *td, char *path, enum uio_seg pathseg,
3287     struct timeval *tptr, enum uio_seg tptrseg)
3288 {
3289 
3290 	return (kern_utimesat(td, AT_FDCWD, path, pathseg, tptr, tptrseg));
3291 }
3292 
3293 int
3294 kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3295     struct timeval *tptr, enum uio_seg tptrseg)
3296 {
3297 	struct nameidata nd;
3298 	struct timespec ts[2];
3299 	int error, vfslocked;
3300 
3301 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3302 		return (error);
3303 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg,
3304 	    path, fd, CAP_FUTIMES, td);
3305 
3306 	if ((error = namei(&nd)) != 0)
3307 		return (error);
3308 	vfslocked = NDHASGIANT(&nd);
3309 	NDFREE(&nd, NDF_ONLY_PNBUF);
3310 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3311 	vrele(nd.ni_vp);
3312 	VFS_UNLOCK_GIANT(vfslocked);
3313 	return (error);
3314 }
3315 
3316 /*
3317  * Set the access and modification times of a file.
3318  */
3319 #ifndef _SYS_SYSPROTO_H_
3320 struct lutimes_args {
3321 	char	*path;
3322 	struct	timeval *tptr;
3323 };
3324 #endif
3325 int
3326 sys_lutimes(td, uap)
3327 	struct thread *td;
3328 	register struct lutimes_args /* {
3329 		char *path;
3330 		struct timeval *tptr;
3331 	} */ *uap;
3332 {
3333 
3334 	return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3335 	    UIO_USERSPACE));
3336 }
3337 
3338 int
3339 kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
3340     struct timeval *tptr, enum uio_seg tptrseg)
3341 {
3342 	struct timespec ts[2];
3343 	int error;
3344 	struct nameidata nd;
3345 	int vfslocked;
3346 
3347 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3348 		return (error);
3349 	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
3350 	if ((error = namei(&nd)) != 0)
3351 		return (error);
3352 	vfslocked = NDHASGIANT(&nd);
3353 	NDFREE(&nd, NDF_ONLY_PNBUF);
3354 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3355 	vrele(nd.ni_vp);
3356 	VFS_UNLOCK_GIANT(vfslocked);
3357 	return (error);
3358 }
3359 
3360 /*
3361  * Set the access and modification times of a file.
3362  */
3363 #ifndef _SYS_SYSPROTO_H_
3364 struct futimes_args {
3365 	int	fd;
3366 	struct	timeval *tptr;
3367 };
3368 #endif
3369 int
3370 sys_futimes(td, uap)
3371 	struct thread *td;
3372 	register struct futimes_args /* {
3373 		int  fd;
3374 		struct timeval *tptr;
3375 	} */ *uap;
3376 {
3377 
3378 	return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
3379 }
3380 
3381 int
3382 kern_futimes(struct thread *td, int fd, struct timeval *tptr,
3383     enum uio_seg tptrseg)
3384 {
3385 	struct timespec ts[2];
3386 	struct file *fp;
3387 	int vfslocked;
3388 	int error;
3389 
3390 	AUDIT_ARG_FD(fd);
3391 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3392 		return (error);
3393 	if ((error = getvnode(td->td_proc->p_fd, fd, CAP_FUTIMES, &fp))
3394 	    != 0)
3395 		return (error);
3396 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
3397 #ifdef AUDIT
3398 	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3399 	AUDIT_ARG_VNODE1(fp->f_vnode);
3400 	VOP_UNLOCK(fp->f_vnode, 0);
3401 #endif
3402 	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
3403 	VFS_UNLOCK_GIANT(vfslocked);
3404 	fdrop(fp, td);
3405 	return (error);
3406 }
3407 
3408 /*
3409  * Truncate a file given its path name.
3410  */
3411 #ifndef _SYS_SYSPROTO_H_
3412 struct truncate_args {
3413 	char	*path;
3414 	int	pad;
3415 	off_t	length;
3416 };
3417 #endif
3418 int
3419 sys_truncate(td, uap)
3420 	struct thread *td;
3421 	register struct truncate_args /* {
3422 		char *path;
3423 		int pad;
3424 		off_t length;
3425 	} */ *uap;
3426 {
3427 
3428 	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3429 }
3430 
3431 int
3432 kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
3433 {
3434 	struct mount *mp;
3435 	struct vnode *vp;
3436 	void *rl_cookie;
3437 	struct vattr vattr;
3438 	struct nameidata nd;
3439 	int error, vfslocked;
3440 
3441 	if (length < 0)
3442 		return(EINVAL);
3443 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
3444 	if ((error = namei(&nd)) != 0)
3445 		return (error);
3446 	vfslocked = NDHASGIANT(&nd);
3447 	vp = nd.ni_vp;
3448 	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
3449 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
3450 		vn_rangelock_unlock(vp, rl_cookie);
3451 		vrele(vp);
3452 		VFS_UNLOCK_GIANT(vfslocked);
3453 		return (error);
3454 	}
3455 	NDFREE(&nd, NDF_ONLY_PNBUF);
3456 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3457 	if (vp->v_type == VDIR)
3458 		error = EISDIR;
3459 #ifdef MAC
3460 	else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
3461 	}
3462 #endif
3463 	else if ((error = vn_writechk(vp)) == 0 &&
3464 	    (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
3465 		VATTR_NULL(&vattr);
3466 		vattr.va_size = length;
3467 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3468 	}
3469 	VOP_UNLOCK(vp, 0);
3470 	vn_finished_write(mp);
3471 	vn_rangelock_unlock(vp, rl_cookie);
3472 	vrele(vp);
3473 	VFS_UNLOCK_GIANT(vfslocked);
3474 	return (error);
3475 }
3476 
#if defined(COMPAT_43)
/*
 * Old truncate: truncate a file given its path name.  The "long" length
 * is repackaged into a struct truncate_args and handed to sys_truncate().
 */
#ifndef _SYS_SYSPROTO_H_
struct otruncate_args {
	char	*path;
	long	length;
};
#endif
int
otruncate(struct thread *td, struct otruncate_args *uap)
{
	struct truncate_args nuap;	/* path, pad, length */

	nuap.path = uap->path;
	nuap.length = uap->length;
	return (sys_truncate(td, &nuap));
}
#endif /* COMPAT_43 */
3506 
3507 /* Versions with the pad argument */
3508 int
3509 freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
3510 {
3511 	struct truncate_args ouap;
3512 
3513 	ouap.path = uap->path;
3514 	ouap.length = uap->length;
3515 	return (sys_truncate(td, &ouap));
3516 }
3517 
3518 int
3519 freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
3520 {
3521 	struct ftruncate_args ouap;
3522 
3523 	ouap.fd = uap->fd;
3524 	ouap.length = uap->length;
3525 	return (sys_ftruncate(td, &ouap));
3526 }
3527 
3528 /*
3529  * Sync an open file.
3530  */
3531 #ifndef _SYS_SYSPROTO_H_
3532 struct fsync_args {
3533 	int	fd;
3534 };
3535 #endif
3536 int
3537 sys_fsync(td, uap)
3538 	struct thread *td;
3539 	struct fsync_args /* {
3540 		int fd;
3541 	} */ *uap;
3542 {
3543 	struct vnode *vp;
3544 	struct mount *mp;
3545 	struct file *fp;
3546 	int vfslocked;
3547 	int error, lock_flags;
3548 
3549 	AUDIT_ARG_FD(uap->fd);
3550 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_FSYNC,
3551 	    &fp)) != 0)
3552 		return (error);
3553 	vp = fp->f_vnode;
3554 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
3555 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3556 		goto drop;
3557 	if (MNT_SHARED_WRITES(mp) ||
3558 	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
3559 		lock_flags = LK_SHARED;
3560 	} else {
3561 		lock_flags = LK_EXCLUSIVE;
3562 	}
3563 	vn_lock(vp, lock_flags | LK_RETRY);
3564 	AUDIT_ARG_VNODE1(vp);
3565 	if (vp->v_object != NULL) {
3566 		VM_OBJECT_LOCK(vp->v_object);
3567 		vm_object_page_clean(vp->v_object, 0, 0, 0);
3568 		VM_OBJECT_UNLOCK(vp->v_object);
3569 	}
3570 	error = VOP_FSYNC(vp, MNT_WAIT, td);
3571 
3572 	VOP_UNLOCK(vp, 0);
3573 	vn_finished_write(mp);
3574 drop:
3575 	VFS_UNLOCK_GIANT(vfslocked);
3576 	fdrop(fp, td);
3577 	return (error);
3578 }
3579 
3580 /*
3581  * Rename files.  Source and destination must either both be directories, or
3582  * both not be directories.  If target is a directory, it must be empty.
3583  */
3584 #ifndef _SYS_SYSPROTO_H_
3585 struct rename_args {
3586 	char	*from;
3587 	char	*to;
3588 };
3589 #endif
3590 int
3591 sys_rename(td, uap)
3592 	struct thread *td;
3593 	register struct rename_args /* {
3594 		char *from;
3595 		char *to;
3596 	} */ *uap;
3597 {
3598 
3599 	return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE));
3600 }
3601 
3602 #ifndef _SYS_SYSPROTO_H_
3603 struct renameat_args {
3604 	int	oldfd;
3605 	char	*old;
3606 	int	newfd;
3607 	char	*new;
3608 };
3609 #endif
3610 int
3611 sys_renameat(struct thread *td, struct renameat_args *uap)
3612 {
3613 
3614 	return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
3615 	    UIO_USERSPACE));
3616 }
3617 
3618 int
3619 kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg)
3620 {
3621 
3622 	return (kern_renameat(td, AT_FDCWD, from, AT_FDCWD, to, pathseg));
3623 }
3624 
/*
 * Rename "old" (resolved relative to "oldfd") to "new" (resolved relative
 * to "newfd").
 *
 * The source is looked up first (DELETE), then the target (RENAME).  An
 * existing target whose directory-ness differs from the source yields
 * ENOTDIR or EISDIR; a source that is itself the target's parent directory
 * yields EINVAL.  Internally error == -1 marks "source and target are the
 * same vnode": VOP_RENAME() is skipped and 0 is returned to the caller.
 */
int
kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
    enum uio_seg pathseg)
{
	struct mount *mp = NULL;
	struct vnode *tvp, *fvp, *tdvp;
	struct nameidata fromnd, tond;
	int tvfslocked;
	int fvfslocked;
	int error;

	bwillwrite();
	/*
	 * With MAC the source parent and leaf are requested locked; they are
	 * unlocked again right after the MAC check below.  Without MAC only
	 * WANTPARENT is requested.
	 */
#ifdef MAC
	NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
	    MPSAFE | AUDITVNODE1, pathseg, old, oldfd, CAP_DELETE, td);
#else
	NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | MPSAFE |
	    AUDITVNODE1, pathseg, old, oldfd, CAP_DELETE, td);
#endif

	if ((error = namei(&fromnd)) != 0)
		return (error);
	fvfslocked = NDHASGIANT(&fromnd);
	/* Lets out1 call VFS_UNLOCK_GIANT(tvfslocked) even if tond is unused. */
	tvfslocked = 0;
#ifdef MAC
	error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
	    fromnd.ni_vp, &fromnd.ni_cnd);
	VOP_UNLOCK(fromnd.ni_dvp, 0);
	if (fromnd.ni_dvp != fromnd.ni_vp)
		VOP_UNLOCK(fromnd.ni_vp, 0);
#endif
	fvp = fromnd.ni_vp;
	if (error == 0)
		error = vn_start_write(fvp, &mp, V_WAIT | PCATCH);
	if (error != 0) {
		/* MAC denial or vn_start_write() failure: drop source refs. */
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
		goto out1;
	}
	NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
	    SAVESTART | MPSAFE | AUDITVNODE2, pathseg, new, newfd, CAP_CREATE,
	    td);
	if (fromnd.ni_vp->v_type == VDIR)
		tond.ni_cnd.cn_flags |= WILLBEDIR;
	if ((error = namei(&tond)) != 0) {
		/* Translate error code for rename("dir1", "dir2/."). */
		if (error == EISDIR && fvp->v_type == VDIR)
			error = EINVAL;
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
		vn_finished_write(mp);
		goto out1;
	}
	tvfslocked = NDHASGIANT(&tond);
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;
	/* tvp is NULL when the target name does not exist yet. */
	if (tvp != NULL) {
		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
			error = ENOTDIR;
			goto out;
		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
			error = EISDIR;
			goto out;
		}
	}
	/* The source may not be the directory the target lives in. */
	if (fvp == tdvp) {
		error = EINVAL;
		goto out;
	}
	/*
	 * If the source is the same as the destination (that is, if they
	 * are links to the same vnode), then there is nothing to do.
	 */
	if (fvp == tvp)
		error = -1;
#ifdef MAC
	else
		error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
		    tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
#endif
out:
	if (!error) {
		/*
		 * NOTE(review): no vnode is released here on this path;
		 * presumably VOP_RENAME() consumes the references and locks
		 * -- confirm against VOP_RENAME(9).
		 */
		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
				   tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		NDFREE(&tond, NDF_ONLY_PNBUF);
	} else {
		/* Error or "same vnode" (-1): release everything ourselves. */
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		NDFREE(&tond, NDF_ONLY_PNBUF);
		if (tvp)
			vput(tvp);
		/* When tdvp == tvp the vput() above already dropped the lock. */
		if (tdvp == tvp)
			vrele(tdvp);
		else
			vput(tdvp);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
	}
	vrele(tond.ni_startdir);
	vn_finished_write(mp);
out1:
	if (fromnd.ni_startdir)
		vrele(fromnd.ni_startdir);
	VFS_UNLOCK_GIANT(fvfslocked);
	VFS_UNLOCK_GIANT(tvfslocked);
	/* -1 is the internal "nothing to do" marker, not an errno value. */
	if (error == -1)
		return (0);
	return (error);
}
3736 
3737 /*
3738  * Make a directory file.
3739  */
3740 #ifndef _SYS_SYSPROTO_H_
3741 struct mkdir_args {
3742 	char	*path;
3743 	int	mode;
3744 };
3745 #endif
3746 int
3747 sys_mkdir(td, uap)
3748 	struct thread *td;
3749 	register struct mkdir_args /* {
3750 		char *path;
3751 		int mode;
3752 	} */ *uap;
3753 {
3754 
3755 	return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode));
3756 }
3757 
3758 #ifndef _SYS_SYSPROTO_H_
3759 struct mkdirat_args {
3760 	int	fd;
3761 	char	*path;
3762 	mode_t	mode;
3763 };
3764 #endif
3765 int
3766 sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
3767 {
3768 
3769 	return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
3770 }
3771 
3772 int
3773 kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode)
3774 {
3775 
3776 	return (kern_mkdirat(td, AT_FDCWD, path, segflg, mode));
3777 }
3778 
3779 int
3780 kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
3781     int mode)
3782 {
3783 	struct mount *mp;
3784 	struct vnode *vp;
3785 	struct vattr vattr;
3786 	int error;
3787 	struct nameidata nd;
3788 	int vfslocked;
3789 
3790 	AUDIT_ARG_MODE(mode);
3791 restart:
3792 	bwillwrite();
3793 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE |
3794 	    AUDITVNODE1, segflg, path, fd, CAP_MKDIR, td);
3795 	nd.ni_cnd.cn_flags |= WILLBEDIR;
3796 	if ((error = namei(&nd)) != 0)
3797 		return (error);
3798 	vfslocked = NDHASGIANT(&nd);
3799 	vp = nd.ni_vp;
3800 	if (vp != NULL) {
3801 		NDFREE(&nd, NDF_ONLY_PNBUF);
3802 		/*
3803 		 * XXX namei called with LOCKPARENT but not LOCKLEAF has
3804 		 * the strange behaviour of leaving the vnode unlocked
3805 		 * if the target is the same vnode as the parent.
3806 		 */
3807 		if (vp == nd.ni_dvp)
3808 			vrele(nd.ni_dvp);
3809 		else
3810 			vput(nd.ni_dvp);
3811 		vrele(vp);
3812 		VFS_UNLOCK_GIANT(vfslocked);
3813 		return (EEXIST);
3814 	}
3815 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3816 		NDFREE(&nd, NDF_ONLY_PNBUF);
3817 		vput(nd.ni_dvp);
3818 		VFS_UNLOCK_GIANT(vfslocked);
3819 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3820 			return (error);
3821 		goto restart;
3822 	}
3823 	VATTR_NULL(&vattr);
3824 	vattr.va_type = VDIR;
3825 	vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
3826 #ifdef MAC
3827 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
3828 	    &vattr);
3829 	if (error)
3830 		goto out;
3831 #endif
3832 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
3833 #ifdef MAC
3834 out:
3835 #endif
3836 	NDFREE(&nd, NDF_ONLY_PNBUF);
3837 	vput(nd.ni_dvp);
3838 	if (!error)
3839 		vput(nd.ni_vp);
3840 	vn_finished_write(mp);
3841 	VFS_UNLOCK_GIANT(vfslocked);
3842 	return (error);
3843 }
3844 
3845 /*
3846  * Remove a directory file.
3847  */
3848 #ifndef _SYS_SYSPROTO_H_
3849 struct rmdir_args {
3850 	char	*path;
3851 };
3852 #endif
3853 int
3854 sys_rmdir(td, uap)
3855 	struct thread *td;
3856 	struct rmdir_args /* {
3857 		char *path;
3858 	} */ *uap;
3859 {
3860 
3861 	return (kern_rmdir(td, uap->path, UIO_USERSPACE));
3862 }
3863 
3864 int
3865 kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg)
3866 {
3867 
3868 	return (kern_rmdirat(td, AT_FDCWD, path, pathseg));
3869 }
3870 
3871 int
3872 kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
3873 {
3874 	struct mount *mp;
3875 	struct vnode *vp;
3876 	int error;
3877 	struct nameidata nd;
3878 	int vfslocked;
3879 
3880 restart:
3881 	bwillwrite();
3882 	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE |
3883 	    AUDITVNODE1, pathseg, path, fd, CAP_RMDIR, td);
3884 	if ((error = namei(&nd)) != 0)
3885 		return (error);
3886 	vfslocked = NDHASGIANT(&nd);
3887 	vp = nd.ni_vp;
3888 	if (vp->v_type != VDIR) {
3889 		error = ENOTDIR;
3890 		goto out;
3891 	}
3892 	/*
3893 	 * No rmdir "." please.
3894 	 */
3895 	if (nd.ni_dvp == vp) {
3896 		error = EINVAL;
3897 		goto out;
3898 	}
3899 	/*
3900 	 * The root of a mounted filesystem cannot be deleted.
3901 	 */
3902 	if (vp->v_vflag & VV_ROOT) {
3903 		error = EBUSY;
3904 		goto out;
3905 	}
3906 #ifdef MAC
3907 	error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
3908 	    &nd.ni_cnd);
3909 	if (error)
3910 		goto out;
3911 #endif
3912 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3913 		NDFREE(&nd, NDF_ONLY_PNBUF);
3914 		vput(vp);
3915 		if (nd.ni_dvp == vp)
3916 			vrele(nd.ni_dvp);
3917 		else
3918 			vput(nd.ni_dvp);
3919 		VFS_UNLOCK_GIANT(vfslocked);
3920 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3921 			return (error);
3922 		goto restart;
3923 	}
3924 	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
3925 	vn_finished_write(mp);
3926 out:
3927 	NDFREE(&nd, NDF_ONLY_PNBUF);
3928 	vput(vp);
3929 	if (nd.ni_dvp == vp)
3930 		vrele(nd.ni_dvp);
3931 	else
3932 		vput(nd.ni_dvp);
3933 	VFS_UNLOCK_GIANT(vfslocked);
3934 	return (error);
3935 }
3936 
3937 #ifdef COMPAT_43
3938 /*
3939  * Read a block of directory entries in a filesystem independent format.
3940  */
3941 #ifndef _SYS_SYSPROTO_H_
3942 struct ogetdirentries_args {
3943 	int	fd;
3944 	char	*buf;
3945 	u_int	count;
3946 	long	*basep;
3947 };
3948 #endif
3949 int
3950 ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
3951 {
3952 	long loff;
3953 	int error;
3954 
3955 	error = kern_ogetdirentries(td, uap, &loff);
3956 	if (error == 0)
3957 		error = copyout(&loff, uap->basep, sizeof(long));
3958 	return (error);
3959 }
3960 
3961 int
3962 kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
3963     long *ploff)
3964 {
3965 	struct vnode *vp;
3966 	struct file *fp;
3967 	struct uio auio, kuio;
3968 	struct iovec aiov, kiov;
3969 	struct dirent *dp, *edp;
3970 	caddr_t dirbuf;
3971 	int error, eofflag, readcnt, vfslocked;
3972 	long loff;
3973 	off_t foffset;
3974 
3975 	/* XXX arbitrary sanity limit on `count'. */
3976 	if (uap->count > 64 * 1024)
3977 		return (EINVAL);
3978 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_READ,
3979 	    &fp)) != 0)
3980 		return (error);
3981 	if ((fp->f_flag & FREAD) == 0) {
3982 		fdrop(fp, td);
3983 		return (EBADF);
3984 	}
3985 	vp = fp->f_vnode;
3986 	foffset = foffset_lock(fp, 0);
3987 unionread:
3988 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
3989 	if (vp->v_type != VDIR) {
3990 		VFS_UNLOCK_GIANT(vfslocked);
3991 		foffset_unlock(fp, foffset, 0);
3992 		fdrop(fp, td);
3993 		return (EINVAL);
3994 	}
3995 	aiov.iov_base = uap->buf;
3996 	aiov.iov_len = uap->count;
3997 	auio.uio_iov = &aiov;
3998 	auio.uio_iovcnt = 1;
3999 	auio.uio_rw = UIO_READ;
4000 	auio.uio_segflg = UIO_USERSPACE;
4001 	auio.uio_td = td;
4002 	auio.uio_resid = uap->count;
4003 	vn_lock(vp, LK_SHARED | LK_RETRY);
4004 	loff = auio.uio_offset = foffset;
4005 #ifdef MAC
4006 	error = mac_vnode_check_readdir(td->td_ucred, vp);
4007 	if (error) {
4008 		VOP_UNLOCK(vp, 0);
4009 		VFS_UNLOCK_GIANT(vfslocked);
4010 		foffset_unlock(fp, foffset, FOF_NOUPDATE);
4011 		fdrop(fp, td);
4012 		return (error);
4013 	}
4014 #endif
4015 #	if (BYTE_ORDER != LITTLE_ENDIAN)
4016 		if (vp->v_mount->mnt_maxsymlinklen <= 0) {
4017 			error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
4018 			    NULL, NULL);
4019 			foffset = auio.uio_offset;
4020 		} else
4021 #	endif
4022 	{
4023 		kuio = auio;
4024 		kuio.uio_iov = &kiov;
4025 		kuio.uio_segflg = UIO_SYSSPACE;
4026 		kiov.iov_len = uap->count;
4027 		dirbuf = malloc(uap->count, M_TEMP, M_WAITOK);
4028 		kiov.iov_base = dirbuf;
4029 		error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
4030 			    NULL, NULL);
4031 		foffset = kuio.uio_offset;
4032 		if (error == 0) {
4033 			readcnt = uap->count - kuio.uio_resid;
4034 			edp = (struct dirent *)&dirbuf[readcnt];
4035 			for (dp = (struct dirent *)dirbuf; dp < edp; ) {
4036 #				if (BYTE_ORDER == LITTLE_ENDIAN)
4037 					/*
4038 					 * The expected low byte of
4039 					 * dp->d_namlen is our dp->d_type.
4040 					 * The high MBZ byte of dp->d_namlen
4041 					 * is our dp->d_namlen.
4042 					 */
4043 					dp->d_type = dp->d_namlen;
4044 					dp->d_namlen = 0;
4045 #				else
4046 					/*
4047 					 * The dp->d_type is the high byte
4048 					 * of the expected dp->d_namlen,
4049 					 * so must be zero'ed.
4050 					 */
4051 					dp->d_type = 0;
4052 #				endif
4053 				if (dp->d_reclen > 0) {
4054 					dp = (struct dirent *)
4055 					    ((char *)dp + dp->d_reclen);
4056 				} else {
4057 					error = EIO;
4058 					break;
4059 				}
4060 			}
4061 			if (dp >= edp)
4062 				error = uiomove(dirbuf, readcnt, &auio);
4063 		}
4064 		free(dirbuf, M_TEMP);
4065 	}
4066 	if (error) {
4067 		VOP_UNLOCK(vp, 0);
4068 		VFS_UNLOCK_GIANT(vfslocked);
4069 		foffset_unlock(fp, foffset, 0);
4070 		fdrop(fp, td);
4071 		return (error);
4072 	}
4073 	if (uap->count == auio.uio_resid &&
4074 	    (vp->v_vflag & VV_ROOT) &&
4075 	    (vp->v_mount->mnt_flag & MNT_UNION)) {
4076 		struct vnode *tvp = vp;
4077 		vp = vp->v_mount->mnt_vnodecovered;
4078 		VREF(vp);
4079 		fp->f_vnode = vp;
4080 		fp->f_data = vp;
4081 		foffset = 0;
4082 		vput(tvp);
4083 		VFS_UNLOCK_GIANT(vfslocked);
4084 		goto unionread;
4085 	}
4086 	VOP_UNLOCK(vp, 0);
4087 	VFS_UNLOCK_GIANT(vfslocked);
4088 	foffset_unlock(fp, foffset, 0);
4089 	fdrop(fp, td);
4090 	td->td_retval[0] = uap->count - auio.uio_resid;
4091 	if (error == 0)
4092 		*ploff = loff;
4093 	return (error);
4094 }
4095 #endif /* COMPAT_43 */
4096 
4097 /*
4098  * Read a block of directory entries in a filesystem independent format.
4099  */
4100 #ifndef _SYS_SYSPROTO_H_
4101 struct getdirentries_args {
4102 	int	fd;
4103 	char	*buf;
4104 	u_int	count;
4105 	long	*basep;
4106 };
4107 #endif
4108 int
4109 sys_getdirentries(td, uap)
4110 	struct thread *td;
4111 	register struct getdirentries_args /* {
4112 		int fd;
4113 		char *buf;
4114 		u_int count;
4115 		long *basep;
4116 	} */ *uap;
4117 {
4118 	long base;
4119 	int error;
4120 
4121 	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
4122 	    NULL, UIO_USERSPACE);
4123 	if (error)
4124 		return (error);
4125 	if (uap->basep != NULL)
4126 		error = copyout(&base, uap->basep, sizeof(long));
4127 	return (error);
4128 }
4129 
4130 int
4131 kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
4132     long *basep, ssize_t *residp, enum uio_seg bufseg)
4133 {
4134 	struct vnode *vp;
4135 	struct file *fp;
4136 	struct uio auio;
4137 	struct iovec aiov;
4138 	int vfslocked;
4139 	long loff;
4140 	int error, eofflag;
4141 	off_t foffset;
4142 
4143 	AUDIT_ARG_FD(fd);
4144 	if (count > IOSIZE_MAX)
4145 		return (EINVAL);
4146 	auio.uio_resid = count;
4147 	if ((error = getvnode(td->td_proc->p_fd, fd, CAP_READ | CAP_SEEK,
4148 	    &fp)) != 0)
4149 		return (error);
4150 	if ((fp->f_flag & FREAD) == 0) {
4151 		fdrop(fp, td);
4152 		return (EBADF);
4153 	}
4154 	vp = fp->f_vnode;
4155 	foffset = foffset_lock(fp, 0);
4156 unionread:
4157 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
4158 	if (vp->v_type != VDIR) {
4159 		VFS_UNLOCK_GIANT(vfslocked);
4160 		error = EINVAL;
4161 		goto fail;
4162 	}
4163 	aiov.iov_base = buf;
4164 	aiov.iov_len = count;
4165 	auio.uio_iov = &aiov;
4166 	auio.uio_iovcnt = 1;
4167 	auio.uio_rw = UIO_READ;
4168 	auio.uio_segflg = bufseg;
4169 	auio.uio_td = td;
4170 	vn_lock(vp, LK_SHARED | LK_RETRY);
4171 	AUDIT_ARG_VNODE1(vp);
4172 	loff = auio.uio_offset = foffset;
4173 #ifdef MAC
4174 	error = mac_vnode_check_readdir(td->td_ucred, vp);
4175 	if (error == 0)
4176 #endif
4177 		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
4178 		    NULL);
4179 	foffset = auio.uio_offset;
4180 	if (error) {
4181 		VOP_UNLOCK(vp, 0);
4182 		VFS_UNLOCK_GIANT(vfslocked);
4183 		goto fail;
4184 	}
4185 	if (count == auio.uio_resid &&
4186 	    (vp->v_vflag & VV_ROOT) &&
4187 	    (vp->v_mount->mnt_flag & MNT_UNION)) {
4188 		struct vnode *tvp = vp;
4189 		vp = vp->v_mount->mnt_vnodecovered;
4190 		VREF(vp);
4191 		fp->f_vnode = vp;
4192 		fp->f_data = vp;
4193 		foffset = 0;
4194 		vput(tvp);
4195 		VFS_UNLOCK_GIANT(vfslocked);
4196 		goto unionread;
4197 	}
4198 	VOP_UNLOCK(vp, 0);
4199 	VFS_UNLOCK_GIANT(vfslocked);
4200 	*basep = loff;
4201 	if (residp != NULL)
4202 		*residp = auio.uio_resid;
4203 	td->td_retval[0] = count - auio.uio_resid;
4204 fail:
4205 	foffset_unlock(fp, foffset, 0);
4206 	fdrop(fp, td);
4207 	return (error);
4208 }
4209 
4210 #ifndef _SYS_SYSPROTO_H_
4211 struct getdents_args {
4212 	int fd;
4213 	char *buf;
4214 	size_t count;
4215 };
4216 #endif
4217 int
4218 sys_getdents(td, uap)
4219 	struct thread *td;
4220 	register struct getdents_args /* {
4221 		int fd;
4222 		char *buf;
4223 		u_int count;
4224 	} */ *uap;
4225 {
4226 	struct getdirentries_args ap;
4227 	ap.fd = uap->fd;
4228 	ap.buf = uap->buf;
4229 	ap.count = uap->count;
4230 	ap.basep = NULL;
4231 	return (sys_getdirentries(td, &ap));
4232 }
4233 
4234 /*
4235  * Set the mode mask for creation of filesystem nodes.
4236  */
4237 #ifndef _SYS_SYSPROTO_H_
4238 struct umask_args {
4239 	int	newmask;
4240 };
4241 #endif
4242 int
4243 sys_umask(td, uap)
4244 	struct thread *td;
4245 	struct umask_args /* {
4246 		int newmask;
4247 	} */ *uap;
4248 {
4249 	register struct filedesc *fdp;
4250 
4251 	FILEDESC_XLOCK(td->td_proc->p_fd);
4252 	fdp = td->td_proc->p_fd;
4253 	td->td_retval[0] = fdp->fd_cmask;
4254 	fdp->fd_cmask = uap->newmask & ALLPERMS;
4255 	FILEDESC_XUNLOCK(td->td_proc->p_fd);
4256 	return (0);
4257 }
4258 
4259 /*
4260  * Void all references to file by ripping underlying filesystem away from
4261  * vnode.
4262  */
4263 #ifndef _SYS_SYSPROTO_H_
4264 struct revoke_args {
4265 	char	*path;
4266 };
4267 #endif
4268 int
4269 sys_revoke(td, uap)
4270 	struct thread *td;
4271 	register struct revoke_args /* {
4272 		char *path;
4273 	} */ *uap;
4274 {
4275 	struct vnode *vp;
4276 	struct vattr vattr;
4277 	int error;
4278 	struct nameidata nd;
4279 	int vfslocked;
4280 
4281 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
4282 	    UIO_USERSPACE, uap->path, td);
4283 	if ((error = namei(&nd)) != 0)
4284 		return (error);
4285 	vfslocked = NDHASGIANT(&nd);
4286 	vp = nd.ni_vp;
4287 	NDFREE(&nd, NDF_ONLY_PNBUF);
4288 	if (vp->v_type != VCHR || vp->v_rdev == NULL) {
4289 		error = EINVAL;
4290 		goto out;
4291 	}
4292 #ifdef MAC
4293 	error = mac_vnode_check_revoke(td->td_ucred, vp);
4294 	if (error)
4295 		goto out;
4296 #endif
4297 	error = VOP_GETATTR(vp, &vattr, td->td_ucred);
4298 	if (error)
4299 		goto out;
4300 	if (td->td_ucred->cr_uid != vattr.va_uid) {
4301 		error = priv_check(td, PRIV_VFS_ADMIN);
4302 		if (error)
4303 			goto out;
4304 	}
4305 	if (vcount(vp) > 1)
4306 		VOP_REVOKE(vp, REVOKEALL);
4307 out:
4308 	vput(vp);
4309 	VFS_UNLOCK_GIANT(vfslocked);
4310 	return (error);
4311 }
4312 
4313 /*
4314  * Convert a user file descriptor to a kernel file entry and check that, if it
4315  * is a capability, the correct rights are present. A reference on the file
4316  * entry is held upon returning.
4317  */
4318 int
4319 getvnode(struct filedesc *fdp, int fd, cap_rights_t rights,
4320     struct file **fpp)
4321 {
4322 	struct file *fp;
4323 #ifdef CAPABILITIES
4324 	struct file *fp_fromcap;
4325 	int error;
4326 #endif
4327 
4328 	if (fdp == NULL || (fp = fget_unlocked(fdp, fd)) == NULL)
4329 		return (EBADF);
4330 #ifdef CAPABILITIES
4331 	/*
4332 	 * If the file descriptor is for a capability, test rights and use the
4333 	 * file descriptor referenced by the capability.
4334 	 */
4335 	error = cap_funwrap(fp, rights, &fp_fromcap);
4336 	if (error) {
4337 		fdrop(fp, curthread);
4338 		return (error);
4339 	}
4340 	if (fp != fp_fromcap) {
4341 		fhold(fp_fromcap);
4342 		fdrop(fp, curthread);
4343 		fp = fp_fromcap;
4344 	}
4345 #endif /* CAPABILITIES */
4346 
4347 	/*
4348 	 * The file could be not of the vnode type, or it may be not
4349 	 * yet fully initialized, in which case the f_vnode pointer
4350 	 * may be set, but f_ops is still badfileops.  E.g.,
4351 	 * devfs_open() transiently create such situation to
4352 	 * facilitate csw d_fdopen().
4353 	 *
4354 	 * Dupfdopen() handling in kern_openat() installs the
4355 	 * half-baked file into the process descriptor table, allowing
4356 	 * other thread to dereference it. Guard against the race by
4357 	 * checking f_ops.
4358 	 */
4359 	if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
4360 		fdrop(fp, curthread);
4361 		return (EINVAL);
4362 	}
4363 	*fpp = fp;
4364 	return (0);
4365 }
4366 
4367 
4368 /*
4369  * Get an (NFS) file handle.
4370  */
4371 #ifndef _SYS_SYSPROTO_H_
4372 struct lgetfh_args {
4373 	char	*fname;
4374 	fhandle_t *fhp;
4375 };
4376 #endif
4377 int
4378 sys_lgetfh(td, uap)
4379 	struct thread *td;
4380 	register struct lgetfh_args *uap;
4381 {
4382 	struct nameidata nd;
4383 	fhandle_t fh;
4384 	register struct vnode *vp;
4385 	int vfslocked;
4386 	int error;
4387 
4388 	error = priv_check(td, PRIV_VFS_GETFH);
4389 	if (error)
4390 		return (error);
4391 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
4392 	    UIO_USERSPACE, uap->fname, td);
4393 	error = namei(&nd);
4394 	if (error)
4395 		return (error);
4396 	vfslocked = NDHASGIANT(&nd);
4397 	NDFREE(&nd, NDF_ONLY_PNBUF);
4398 	vp = nd.ni_vp;
4399 	bzero(&fh, sizeof(fh));
4400 	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4401 	error = VOP_VPTOFH(vp, &fh.fh_fid);
4402 	vput(vp);
4403 	VFS_UNLOCK_GIANT(vfslocked);
4404 	if (error)
4405 		return (error);
4406 	error = copyout(&fh, uap->fhp, sizeof (fh));
4407 	return (error);
4408 }
4409 
4410 #ifndef _SYS_SYSPROTO_H_
4411 struct getfh_args {
4412 	char	*fname;
4413 	fhandle_t *fhp;
4414 };
4415 #endif
4416 int
4417 sys_getfh(td, uap)
4418 	struct thread *td;
4419 	register struct getfh_args *uap;
4420 {
4421 	struct nameidata nd;
4422 	fhandle_t fh;
4423 	register struct vnode *vp;
4424 	int vfslocked;
4425 	int error;
4426 
4427 	error = priv_check(td, PRIV_VFS_GETFH);
4428 	if (error)
4429 		return (error);
4430 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
4431 	    UIO_USERSPACE, uap->fname, td);
4432 	error = namei(&nd);
4433 	if (error)
4434 		return (error);
4435 	vfslocked = NDHASGIANT(&nd);
4436 	NDFREE(&nd, NDF_ONLY_PNBUF);
4437 	vp = nd.ni_vp;
4438 	bzero(&fh, sizeof(fh));
4439 	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4440 	error = VOP_VPTOFH(vp, &fh.fh_fid);
4441 	vput(vp);
4442 	VFS_UNLOCK_GIANT(vfslocked);
4443 	if (error)
4444 		return (error);
4445 	error = copyout(&fh, uap->fhp, sizeof (fh));
4446 	return (error);
4447 }
4448 
4449 /*
4450  * syscall for the rpc.lockd to use to translate a NFS file handle into an
4451  * open descriptor.
4452  *
4453  * warning: do not remove the priv_check() call or this becomes one giant
4454  * security hole.
4455  */
4456 #ifndef _SYS_SYSPROTO_H_
4457 struct fhopen_args {
4458 	const struct fhandle *u_fhp;
4459 	int flags;
4460 };
4461 #endif
4462 int
4463 sys_fhopen(td, uap)
4464 	struct thread *td;
4465 	struct fhopen_args /* {
4466 		const struct fhandle *u_fhp;
4467 		int flags;
4468 	} */ *uap;
4469 {
4470 	struct mount *mp;
4471 	struct vnode *vp;
4472 	struct fhandle fhp;
4473 	struct file *fp;
4474 	int fmode, error;
4475 	int vfslocked;
4476 	int indx;
4477 
4478 	error = priv_check(td, PRIV_VFS_FHOPEN);
4479 	if (error)
4480 		return (error);
4481 	indx = -1;
4482 	fmode = FFLAGS(uap->flags);
4483 	/* why not allow a non-read/write open for our lockd? */
4484 	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
4485 		return (EINVAL);
4486 	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
4487 	if (error)
4488 		return(error);
4489 	/* find the mount point */
4490 	mp = vfs_busyfs(&fhp.fh_fsid);
4491 	if (mp == NULL)
4492 		return (ESTALE);
4493 	vfslocked = VFS_LOCK_GIANT(mp);
4494 	/* now give me my vnode, it gets returned to me locked */
4495 	error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
4496 	vfs_unbusy(mp);
4497 	if (error) {
4498 		VFS_UNLOCK_GIANT(vfslocked);
4499 		return (error);
4500 	}
4501 
4502 	error = falloc_noinstall(td, &fp);
4503 	if (error) {
4504 		vput(vp);
4505 		VFS_UNLOCK_GIANT(vfslocked);
4506 		return (error);
4507 	}
4508 	/*
4509 	 * An extra reference on `fp' has been held for us by
4510 	 * falloc_noinstall().
4511 	 */
4512 
4513 #ifdef INVARIANTS
4514 	td->td_dupfd = -1;
4515 #endif
4516 	error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
4517 	if (error) {
4518 		KASSERT(fp->f_ops == &badfileops,
4519 		    ("VOP_OPEN in fhopen() set f_ops"));
4520 		KASSERT(td->td_dupfd < 0,
4521 		    ("fhopen() encountered fdopen()"));
4522 
4523 		vput(vp);
4524 		goto bad;
4525 	}
4526 #ifdef INVARIANTS
4527 	td->td_dupfd = 0;
4528 #endif
4529 	fp->f_vnode = vp;
4530 	fp->f_seqcount = 1;
4531 	finit(fp, (fmode & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp,
4532 	    &vnops);
4533 	VOP_UNLOCK(vp, 0);
4534 	if (fmode & O_TRUNC) {
4535 		error = fo_truncate(fp, 0, td->td_ucred, td);
4536 		if (error)
4537 			goto bad;
4538 	}
4539 
4540 	error = finstall(td, fp, &indx, fmode);
4541 bad:
4542 	VFS_UNLOCK_GIANT(vfslocked);
4543 	fdrop(fp, td);
4544 	td->td_retval[0] = indx;
4545 	return (error);
4546 }
4547 
4548 /*
4549  * Stat an (NFS) file handle.
4550  */
4551 #ifndef _SYS_SYSPROTO_H_
4552 struct fhstat_args {
4553 	struct fhandle *u_fhp;
4554 	struct stat *sb;
4555 };
4556 #endif
4557 int
4558 sys_fhstat(td, uap)
4559 	struct thread *td;
4560 	register struct fhstat_args /* {
4561 		struct fhandle *u_fhp;
4562 		struct stat *sb;
4563 	} */ *uap;
4564 {
4565 	struct stat sb;
4566 	struct fhandle fh;
4567 	int error;
4568 
4569 	error = copyin(uap->u_fhp, &fh, sizeof(fh));
4570 	if (error != 0)
4571 		return (error);
4572 	error = kern_fhstat(td, fh, &sb);
4573 	if (error != 0)
4574 		return (error);
4575 	error = copyout(&sb, uap->sb, sizeof(sb));
4576 	return (error);
4577 }
4578 
4579 int
4580 kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
4581 {
4582 	struct mount *mp;
4583 	struct vnode *vp;
4584 	int vfslocked;
4585 	int error;
4586 
4587 	error = priv_check(td, PRIV_VFS_FHSTAT);
4588 	if (error)
4589 		return (error);
4590 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4591 		return (ESTALE);
4592 	vfslocked = VFS_LOCK_GIANT(mp);
4593 	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4594 	vfs_unbusy(mp);
4595 	if (error) {
4596 		VFS_UNLOCK_GIANT(vfslocked);
4597 		return (error);
4598 	}
4599 	error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
4600 	vput(vp);
4601 	VFS_UNLOCK_GIANT(vfslocked);
4602 	return (error);
4603 }
4604 
4605 /*
4606  * Implement fstatfs() for (NFS) file handles.
4607  */
4608 #ifndef _SYS_SYSPROTO_H_
4609 struct fhstatfs_args {
4610 	struct fhandle *u_fhp;
4611 	struct statfs *buf;
4612 };
4613 #endif
4614 int
4615 sys_fhstatfs(td, uap)
4616 	struct thread *td;
4617 	struct fhstatfs_args /* {
4618 		struct fhandle *u_fhp;
4619 		struct statfs *buf;
4620 	} */ *uap;
4621 {
4622 	struct statfs sf;
4623 	fhandle_t fh;
4624 	int error;
4625 
4626 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4627 	if (error)
4628 		return (error);
4629 	error = kern_fhstatfs(td, fh, &sf);
4630 	if (error)
4631 		return (error);
4632 	return (copyout(&sf, uap->buf, sizeof(sf)));
4633 }
4634 
4635 int
4636 kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
4637 {
4638 	struct statfs *sp;
4639 	struct mount *mp;
4640 	struct vnode *vp;
4641 	int vfslocked;
4642 	int error;
4643 
4644 	error = priv_check(td, PRIV_VFS_FHSTATFS);
4645 	if (error)
4646 		return (error);
4647 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4648 		return (ESTALE);
4649 	vfslocked = VFS_LOCK_GIANT(mp);
4650 	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4651 	if (error) {
4652 		vfs_unbusy(mp);
4653 		VFS_UNLOCK_GIANT(vfslocked);
4654 		return (error);
4655 	}
4656 	vput(vp);
4657 	error = prison_canseemount(td->td_ucred, mp);
4658 	if (error)
4659 		goto out;
4660 #ifdef MAC
4661 	error = mac_mount_check_stat(td->td_ucred, mp);
4662 	if (error)
4663 		goto out;
4664 #endif
4665 	/*
4666 	 * Set these in case the underlying filesystem fails to do so.
4667 	 */
4668 	sp = &mp->mnt_stat;
4669 	sp->f_version = STATFS_VERSION;
4670 	sp->f_namemax = NAME_MAX;
4671 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
4672 	error = VFS_STATFS(mp, sp);
4673 	if (error == 0)
4674 		*buf = *sp;
4675 out:
4676 	vfs_unbusy(mp);
4677 	VFS_UNLOCK_GIANT(vfslocked);
4678 	return (error);
4679 }
4680 
4681 int
4682 kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
4683 {
4684 	struct file *fp;
4685 	struct mount *mp;
4686 	struct vnode *vp;
4687 	off_t olen, ooffset;
4688 	int error, vfslocked;
4689 
4690 	fp = NULL;
4691 	vfslocked = 0;
4692 	error = fget(td, fd, CAP_WRITE, &fp);
4693 	if (error != 0)
4694 		goto out;
4695 
4696 	switch (fp->f_type) {
4697 	case DTYPE_VNODE:
4698 		break;
4699 	case DTYPE_PIPE:
4700 	case DTYPE_FIFO:
4701 		error = ESPIPE;
4702 		goto out;
4703 	default:
4704 		error = ENODEV;
4705 		goto out;
4706 	}
4707 	if ((fp->f_flag & FWRITE) == 0) {
4708 		error = EBADF;
4709 		goto out;
4710 	}
4711 	vp = fp->f_vnode;
4712 	if (vp->v_type != VREG) {
4713 		error = ENODEV;
4714 		goto out;
4715 	}
4716 	if (offset < 0 || len <= 0) {
4717 		error = EINVAL;
4718 		goto out;
4719 	}
4720 	/* Check for wrap. */
4721 	if (offset > OFF_MAX - len) {
4722 		error = EFBIG;
4723 		goto out;
4724 	}
4725 
4726 	/* Allocating blocks may take a long time, so iterate. */
4727 	for (;;) {
4728 		olen = len;
4729 		ooffset = offset;
4730 
4731 		bwillwrite();
4732 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
4733 		mp = NULL;
4734 		error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
4735 		if (error != 0) {
4736 			VFS_UNLOCK_GIANT(vfslocked);
4737 			break;
4738 		}
4739 		error = vn_lock(vp, LK_EXCLUSIVE);
4740 		if (error != 0) {
4741 			vn_finished_write(mp);
4742 			VFS_UNLOCK_GIANT(vfslocked);
4743 			break;
4744 		}
4745 #ifdef MAC
4746 		error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
4747 		if (error == 0)
4748 #endif
4749 			error = VOP_ALLOCATE(vp, &offset, &len);
4750 		VOP_UNLOCK(vp, 0);
4751 		vn_finished_write(mp);
4752 		VFS_UNLOCK_GIANT(vfslocked);
4753 
4754 		if (olen + ooffset != offset + len) {
4755 			panic("offset + len changed from %jx/%jx to %jx/%jx",
4756 			    ooffset, olen, offset, len);
4757 		}
4758 		if (error != 0 || len == 0)
4759 			break;
4760 		KASSERT(olen > len, ("Iteration did not make progress?"));
4761 		maybe_yield();
4762 	}
4763  out:
4764 	if (fp != NULL)
4765 		fdrop(fp, td);
4766 	return (error);
4767 }
4768 
4769 int
4770 sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
4771 {
4772 
4773 	return (kern_posix_fallocate(td, uap->fd, uap->offset, uap->len));
4774 }
4775 
4776 /*
4777  * Unlike madvise(2), we do not make a best effort to remember every
4778  * possible caching hint.  Instead, we remember the last setting with
4779  * the exception that we will allow POSIX_FADV_NORMAL to adjust the
4780  * region of any current setting.
4781  */
4782 int
4783 kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
4784     int advice)
4785 {
4786 	struct fadvise_info *fa, *new;
4787 	struct file *fp;
4788 	struct vnode *vp;
4789 	off_t end;
4790 	int error;
4791 
4792 	if (offset < 0 || len < 0 || offset > OFF_MAX - len)
4793 		return (EINVAL);
4794 	switch (advice) {
4795 	case POSIX_FADV_SEQUENTIAL:
4796 	case POSIX_FADV_RANDOM:
4797 	case POSIX_FADV_NOREUSE:
4798 		new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
4799 		break;
4800 	case POSIX_FADV_NORMAL:
4801 	case POSIX_FADV_WILLNEED:
4802 	case POSIX_FADV_DONTNEED:
4803 		new = NULL;
4804 		break;
4805 	default:
4806 		return (EINVAL);
4807 	}
4808 	/* XXX: CAP_POSIX_FADVISE? */
4809 	error = fget(td, fd, 0, &fp);
4810 	if (error != 0)
4811 		goto out;
4812 
4813 	switch (fp->f_type) {
4814 	case DTYPE_VNODE:
4815 		break;
4816 	case DTYPE_PIPE:
4817 	case DTYPE_FIFO:
4818 		error = ESPIPE;
4819 		goto out;
4820 	default:
4821 		error = ENODEV;
4822 		goto out;
4823 	}
4824 	vp = fp->f_vnode;
4825 	if (vp->v_type != VREG) {
4826 		error = ENODEV;
4827 		goto out;
4828 	}
4829 	if (len == 0)
4830 		end = OFF_MAX;
4831 	else
4832 		end = offset + len - 1;
4833 	switch (advice) {
4834 	case POSIX_FADV_SEQUENTIAL:
4835 	case POSIX_FADV_RANDOM:
4836 	case POSIX_FADV_NOREUSE:
4837 		/*
4838 		 * Try to merge any existing non-standard region with
4839 		 * this new region if possible, otherwise create a new
4840 		 * non-standard region for this request.
4841 		 */
4842 		mtx_pool_lock(mtxpool_sleep, fp);
4843 		fa = fp->f_advice;
4844 		if (fa != NULL && fa->fa_advice == advice &&
4845 		    ((fa->fa_start <= end && fa->fa_end >= offset) ||
4846 		    (end != OFF_MAX && fa->fa_start == end + 1) ||
4847 		    (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
4848 			if (offset < fa->fa_start)
4849 				fa->fa_start = offset;
4850 			if (end > fa->fa_end)
4851 				fa->fa_end = end;
4852 		} else {
4853 			new->fa_advice = advice;
4854 			new->fa_start = offset;
4855 			new->fa_end = end;
4856 			new->fa_prevstart = 0;
4857 			new->fa_prevend = 0;
4858 			fp->f_advice = new;
4859 			new = fa;
4860 		}
4861 		mtx_pool_unlock(mtxpool_sleep, fp);
4862 		break;
4863 	case POSIX_FADV_NORMAL:
4864 		/*
4865 		 * If a the "normal" region overlaps with an existing
4866 		 * non-standard region, trim or remove the
4867 		 * non-standard region.
4868 		 */
4869 		mtx_pool_lock(mtxpool_sleep, fp);
4870 		fa = fp->f_advice;
4871 		if (fa != NULL) {
4872 			if (offset <= fa->fa_start && end >= fa->fa_end) {
4873 				new = fa;
4874 				fp->f_advice = NULL;
4875 			} else if (offset <= fa->fa_start &&
4876  			    end >= fa->fa_start)
4877 				fa->fa_start = end + 1;
4878 			else if (offset <= fa->fa_end && end >= fa->fa_end)
4879 				fa->fa_end = offset - 1;
4880 			else if (offset >= fa->fa_start && end <= fa->fa_end) {
4881 				/*
4882 				 * If the "normal" region is a middle
4883 				 * portion of the existing
4884 				 * non-standard region, just remove
4885 				 * the whole thing rather than picking
4886 				 * one side or the other to
4887 				 * preserve.
4888 				 */
4889 				new = fa;
4890 				fp->f_advice = NULL;
4891 			}
4892 		}
4893 		mtx_pool_unlock(mtxpool_sleep, fp);
4894 		break;
4895 	case POSIX_FADV_WILLNEED:
4896 	case POSIX_FADV_DONTNEED:
4897 		error = VOP_ADVISE(vp, offset, end, advice);
4898 		break;
4899 	}
4900 out:
4901 	if (fp != NULL)
4902 		fdrop(fp, td);
4903 	free(new, M_FADVISE);
4904 	return (error);
4905 }
4906 
4907 int
4908 sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
4909 {
4910 
4911 	return (kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,
4912 	    uap->advice));
4913 }
4914