xref: /freebsd/sys/kern/vfs_syscalls.c (revision 9a14aa017b21c292740c00ee098195cd46642730)
1 /*-
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_capsicum.h"
41 #include "opt_compat.h"
42 #include "opt_kdtrace.h"
43 #include "opt_ktrace.h"
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/bio.h>
48 #include <sys/buf.h>
49 #include <sys/capability.h>
50 #include <sys/disk.h>
51 #include <sys/sysent.h>
52 #include <sys/malloc.h>
53 #include <sys/mount.h>
54 #include <sys/mutex.h>
55 #include <sys/sysproto.h>
56 #include <sys/namei.h>
57 #include <sys/filedesc.h>
58 #include <sys/kernel.h>
59 #include <sys/fcntl.h>
60 #include <sys/file.h>
61 #include <sys/filio.h>
62 #include <sys/limits.h>
63 #include <sys/linker.h>
64 #include <sys/sdt.h>
65 #include <sys/stat.h>
66 #include <sys/sx.h>
67 #include <sys/unistd.h>
68 #include <sys/vnode.h>
69 #include <sys/priv.h>
70 #include <sys/proc.h>
71 #include <sys/dirent.h>
72 #include <sys/jail.h>
73 #include <sys/syscallsubr.h>
74 #include <sys/sysctl.h>
75 #ifdef KTRACE
76 #include <sys/ktrace.h>
77 #endif
78 
79 #include <machine/stdarg.h>
80 
81 #include <security/audit/audit.h>
82 #include <security/mac/mac_framework.h>
83 
84 #include <vm/vm.h>
85 #include <vm/vm_object.h>
86 #include <vm/vm_page.h>
87 #include <vm/uma.h>
88 
89 #include <ufs/ufs/quota.h>
90 
/* Malloc type under which posix_fadvise(2) information is allocated. */
static MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");

/*
 * Statically defined tracing: the "vfs" provider and its two stat probes,
 * "mode" and "reg".  Each probe is declared with two arguments, a "char *"
 * (argument 0) and an "int" (argument 1).
 */
SDT_PROVIDER_DEFINE(vfs);
SDT_PROBE_DEFINE(vfs, , stat, mode, mode);
SDT_PROBE_ARGTYPE(vfs, , stat, mode, 0, "char *");
SDT_PROBE_ARGTYPE(vfs, , stat, mode, 1, "int");
SDT_PROBE_DEFINE(vfs, , stat, reg, reg);
SDT_PROBE_ARGTYPE(vfs, , stat, reg, 0, "char *");
SDT_PROBE_ARGTYPE(vfs, , stat, reg, 1, "int");
100 
/*
 * Forward declarations of file-local helpers.  chroot_refuse_vdir_fds() is
 * used by change_root(); the remaining helpers are defined elsewhere in this
 * file.
 */
static int chroot_refuse_vdir_fds(struct filedesc *fdp);
static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
static int setfflags(struct thread *td, struct vnode *, int);
static int setutimes(struct thread *td, struct vnode *,
    const struct timespec *, int, int);
static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
    struct thread *td);
108 
/*
 * The module initialization routine for POSIX asynchronous I/O will
 * set this to the version of AIO that it implements.  (Zero means
 * that it is not implemented.)  This value is used here by pathconf()
 * and in kern_descrip.c by fpathconf().
 */
int async_io_version;

#ifdef DEBUG
/*
 * Debug knob exported read-write as the debug.syncprt sysctl; only built
 * when DEBUG is defined.
 */
static int syncprt = 0;
SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
#endif
121 
122 /*
123  * Sync each mounted filesystem.
124  */
125 #ifndef _SYS_SYSPROTO_H_
126 struct sync_args {
127 	int     dummy;
128 };
129 #endif
130 /* ARGSUSED */
131 int
132 sys_sync(td, uap)
133 	struct thread *td;
134 	struct sync_args *uap;
135 {
136 	struct mount *mp, *nmp;
137 	int vfslocked;
138 
139 	mtx_lock(&mountlist_mtx);
140 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
141 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
142 			nmp = TAILQ_NEXT(mp, mnt_list);
143 			continue;
144 		}
145 		vfslocked = VFS_LOCK_GIANT(mp);
146 		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
147 		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
148 			MNT_ILOCK(mp);
149 			mp->mnt_noasync++;
150 			mp->mnt_kern_flag &= ~MNTK_ASYNC;
151 			MNT_IUNLOCK(mp);
152 			vfs_msync(mp, MNT_NOWAIT);
153 			VFS_SYNC(mp, MNT_NOWAIT);
154 			MNT_ILOCK(mp);
155 			mp->mnt_noasync--;
156 			if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
157 			    mp->mnt_noasync == 0)
158 				mp->mnt_kern_flag |= MNTK_ASYNC;
159 			MNT_IUNLOCK(mp);
160 			vn_finished_write(mp);
161 		}
162 		VFS_UNLOCK_GIANT(vfslocked);
163 		mtx_lock(&mountlist_mtx);
164 		nmp = TAILQ_NEXT(mp, mnt_list);
165 		vfs_unbusy(mp);
166 	}
167 	mtx_unlock(&mountlist_mtx);
168 	return (0);
169 }
170 
171 /*
172  * Change filesystem quotas.
173  */
174 #ifndef _SYS_SYSPROTO_H_
175 struct quotactl_args {
176 	char *path;
177 	int cmd;
178 	int uid;
179 	caddr_t arg;
180 };
181 #endif
182 int
183 sys_quotactl(td, uap)
184 	struct thread *td;
185 	register struct quotactl_args /* {
186 		char *path;
187 		int cmd;
188 		int uid;
189 		caddr_t arg;
190 	} */ *uap;
191 {
192 	struct mount *mp;
193 	int vfslocked;
194 	int error;
195 	struct nameidata nd;
196 
197 	AUDIT_ARG_CMD(uap->cmd);
198 	AUDIT_ARG_UID(uap->uid);
199 	if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
200 		return (EPERM);
201 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
202 	   UIO_USERSPACE, uap->path, td);
203 	if ((error = namei(&nd)) != 0)
204 		return (error);
205 	vfslocked = NDHASGIANT(&nd);
206 	NDFREE(&nd, NDF_ONLY_PNBUF);
207 	mp = nd.ni_vp->v_mount;
208 	vfs_ref(mp);
209 	vput(nd.ni_vp);
210 	error = vfs_busy(mp, 0);
211 	vfs_rel(mp);
212 	if (error) {
213 		VFS_UNLOCK_GIANT(vfslocked);
214 		return (error);
215 	}
216 	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
217 
218 	/*
219 	 * Since quota on operation typically needs to open quota
220 	 * file, the Q_QUOTAON handler needs to unbusy the mount point
221 	 * before calling into namei.  Otherwise, unmount might be
222 	 * started between two vfs_busy() invocations (first is our,
223 	 * second is from mount point cross-walk code in lookup()),
224 	 * causing deadlock.
225 	 *
226 	 * Require that Q_QUOTAON handles the vfs_busy() reference on
227 	 * its own, always returning with ubusied mount point.
228 	 */
229 	if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON)
230 		vfs_unbusy(mp);
231 	VFS_UNLOCK_GIANT(vfslocked);
232 	return (error);
233 }
234 
235 /*
236  * Used by statfs conversion routines to scale the block size up if
237  * necessary so that all of the block counts are <= 'max_size'.  Note
238  * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
239  * value of 'n'.
240  */
241 void
242 statfs_scale_blocks(struct statfs *sf, long max_size)
243 {
244 	uint64_t count;
245 	int shift;
246 
247 	KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
248 
249 	/*
250 	 * Attempt to scale the block counts to give a more accurate
251 	 * overview to userland of the ratio of free space to used
252 	 * space.  To do this, find the largest block count and compute
253 	 * a divisor that lets it fit into a signed integer <= max_size.
254 	 */
255 	if (sf->f_bavail < 0)
256 		count = -sf->f_bavail;
257 	else
258 		count = sf->f_bavail;
259 	count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
260 	if (count <= max_size)
261 		return;
262 
263 	count >>= flsl(max_size);
264 	shift = 0;
265 	while (count > 0) {
266 		shift++;
267 		count >>=1;
268 	}
269 
270 	sf->f_bsize <<= shift;
271 	sf->f_blocks >>= shift;
272 	sf->f_bfree >>= shift;
273 	sf->f_bavail >>= shift;
274 }
275 
276 /*
277  * Get filesystem statistics.
278  */
279 #ifndef _SYS_SYSPROTO_H_
280 struct statfs_args {
281 	char *path;
282 	struct statfs *buf;
283 };
284 #endif
285 int
286 sys_statfs(td, uap)
287 	struct thread *td;
288 	register struct statfs_args /* {
289 		char *path;
290 		struct statfs *buf;
291 	} */ *uap;
292 {
293 	struct statfs sf;
294 	int error;
295 
296 	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
297 	if (error == 0)
298 		error = copyout(&sf, uap->buf, sizeof(sf));
299 	return (error);
300 }
301 
302 int
303 kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
304     struct statfs *buf)
305 {
306 	struct mount *mp;
307 	struct statfs *sp, sb;
308 	int vfslocked;
309 	int error;
310 	struct nameidata nd;
311 
312 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
313 	    AUDITVNODE1, pathseg, path, td);
314 	error = namei(&nd);
315 	if (error)
316 		return (error);
317 	vfslocked = NDHASGIANT(&nd);
318 	mp = nd.ni_vp->v_mount;
319 	vfs_ref(mp);
320 	NDFREE(&nd, NDF_ONLY_PNBUF);
321 	vput(nd.ni_vp);
322 	error = vfs_busy(mp, 0);
323 	vfs_rel(mp);
324 	if (error) {
325 		VFS_UNLOCK_GIANT(vfslocked);
326 		return (error);
327 	}
328 #ifdef MAC
329 	error = mac_mount_check_stat(td->td_ucred, mp);
330 	if (error)
331 		goto out;
332 #endif
333 	/*
334 	 * Set these in case the underlying filesystem fails to do so.
335 	 */
336 	sp = &mp->mnt_stat;
337 	sp->f_version = STATFS_VERSION;
338 	sp->f_namemax = NAME_MAX;
339 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
340 	error = VFS_STATFS(mp, sp);
341 	if (error)
342 		goto out;
343 	if (priv_check(td, PRIV_VFS_GENERATION)) {
344 		bcopy(sp, &sb, sizeof(sb));
345 		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
346 		prison_enforce_statfs(td->td_ucred, mp, &sb);
347 		sp = &sb;
348 	}
349 	*buf = *sp;
350 out:
351 	vfs_unbusy(mp);
352 	VFS_UNLOCK_GIANT(vfslocked);
353 	return (error);
354 }
355 
356 /*
357  * Get filesystem statistics.
358  */
359 #ifndef _SYS_SYSPROTO_H_
360 struct fstatfs_args {
361 	int fd;
362 	struct statfs *buf;
363 };
364 #endif
365 int
366 sys_fstatfs(td, uap)
367 	struct thread *td;
368 	register struct fstatfs_args /* {
369 		int fd;
370 		struct statfs *buf;
371 	} */ *uap;
372 {
373 	struct statfs sf;
374 	int error;
375 
376 	error = kern_fstatfs(td, uap->fd, &sf);
377 	if (error == 0)
378 		error = copyout(&sf, uap->buf, sizeof(sf));
379 	return (error);
380 }
381 
382 int
383 kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
384 {
385 	struct file *fp;
386 	struct mount *mp;
387 	struct statfs *sp, sb;
388 	int vfslocked;
389 	struct vnode *vp;
390 	int error;
391 
392 	AUDIT_ARG_FD(fd);
393 	error = getvnode(td->td_proc->p_fd, fd, CAP_FSTATFS, &fp);
394 	if (error)
395 		return (error);
396 	vp = fp->f_vnode;
397 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
398 	vn_lock(vp, LK_SHARED | LK_RETRY);
399 #ifdef AUDIT
400 	AUDIT_ARG_VNODE1(vp);
401 #endif
402 	mp = vp->v_mount;
403 	if (mp)
404 		vfs_ref(mp);
405 	VOP_UNLOCK(vp, 0);
406 	fdrop(fp, td);
407 	if (mp == NULL) {
408 		error = EBADF;
409 		goto out;
410 	}
411 	error = vfs_busy(mp, 0);
412 	vfs_rel(mp);
413 	if (error) {
414 		VFS_UNLOCK_GIANT(vfslocked);
415 		return (error);
416 	}
417 #ifdef MAC
418 	error = mac_mount_check_stat(td->td_ucred, mp);
419 	if (error)
420 		goto out;
421 #endif
422 	/*
423 	 * Set these in case the underlying filesystem fails to do so.
424 	 */
425 	sp = &mp->mnt_stat;
426 	sp->f_version = STATFS_VERSION;
427 	sp->f_namemax = NAME_MAX;
428 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
429 	error = VFS_STATFS(mp, sp);
430 	if (error)
431 		goto out;
432 	if (priv_check(td, PRIV_VFS_GENERATION)) {
433 		bcopy(sp, &sb, sizeof(sb));
434 		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
435 		prison_enforce_statfs(td->td_ucred, mp, &sb);
436 		sp = &sb;
437 	}
438 	*buf = *sp;
439 out:
440 	if (mp)
441 		vfs_unbusy(mp);
442 	VFS_UNLOCK_GIANT(vfslocked);
443 	return (error);
444 }
445 
446 /*
447  * Get statistics on all filesystems.
448  */
449 #ifndef _SYS_SYSPROTO_H_
450 struct getfsstat_args {
451 	struct statfs *buf;
452 	long bufsize;
453 	int flags;
454 };
455 #endif
456 int
457 sys_getfsstat(td, uap)
458 	struct thread *td;
459 	register struct getfsstat_args /* {
460 		struct statfs *buf;
461 		long bufsize;
462 		int flags;
463 	} */ *uap;
464 {
465 
466 	return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE,
467 	    uap->flags));
468 }
469 
/*
 * Collect statfs records for the mounted filesystems the caller may see.
 *
 *	td	calling thread; its credential is used for the prison, MAC
 *		and privilege checks, and td->td_retval[0] receives the
 *		resulting count.
 *	buf	for UIO_USERSPACE, '*buf' is the user buffer to fill; for
 *		UIO_SYSSPACE, '*buf' is set to a buffer allocated here.
 *	bufsize	buffer size in bytes; zero means no records are stored and
 *		only the count is produced.
 *	bufseg	UIO_USERSPACE or UIO_SYSSPACE.
 *	flags	MNT_WAIT, MNT_NOWAIT or MNT_LAZY (see the refresh rule below).
 *
 * Returns 0, or the copyout() error if storing a record in user space fails.
 *
 * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
 * 	The caller is responsible for freeing memory which will be allocated
 *	in '*buf'.
 */
int
kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
    enum uio_seg bufseg, int flags)
{
	struct mount *mp, *nmp;
	struct statfs *sfsp, *sp, sb;
	size_t count, maxcount;
	int vfslocked;
	int error;

	/* Number of whole records that fit in the supplied buffer size. */
	maxcount = bufsize / sizeof(struct statfs);
	if (bufsize == 0)
		sfsp = NULL;
	else if (bufseg == UIO_USERSPACE)
		sfsp = *buf;
	else /* if (bufseg == UIO_SYSSPACE) */ {
		/*
		 * Size the kernel buffer from the current length of the
		 * mount list, never exceeding what the caller asked for.
		 */
		count = 0;
		mtx_lock(&mountlist_mtx);
		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
			count++;
		}
		mtx_unlock(&mountlist_mtx);
		if (maxcount > count)
			maxcount = count;
		sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP,
		    M_WAITOK);
	}
	count = 0;
	mtx_lock(&mountlist_mtx);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		/* Skip mounts hidden from the caller's prison. */
		if (prison_canseemount(td->td_ucred, mp) != 0) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
#ifdef MAC
		/* Skip mounts the MAC policy does not let the caller stat. */
		if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
#endif
		/* Skip mounts that cannot be busied without waiting. */
		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		vfslocked = VFS_LOCK_GIANT(mp);
		/* Store a record only while there is room left for one. */
		if (sfsp && count < maxcount) {
			sp = &mp->mnt_stat;
			/*
			 * Set these in case the underlying filesystem
			 * fails to do so.
			 */
			sp->f_version = STATFS_VERSION;
			sp->f_namemax = NAME_MAX;
			sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
			/*
			 * If MNT_NOWAIT or MNT_LAZY is specified, do not
			 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
			 * overrides MNT_WAIT.
			 *
			 * A mount whose refresh fails is left out of both
			 * the output and the count; the error itself is
			 * not reported to the caller.
			 */
			if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
			    (flags & MNT_WAIT)) &&
			    (error = VFS_STATFS(mp, sp))) {
				VFS_UNLOCK_GIANT(vfslocked);
				mtx_lock(&mountlist_mtx);
				nmp = TAILQ_NEXT(mp, mnt_list);
				vfs_unbusy(mp);
				continue;
			}
			/*
			 * Callers lacking PRIV_VFS_GENERATION get a private
			 * copy with the filesystem id zeroed and
			 * prison_enforce_statfs() applied.
			 */
			if (priv_check(td, PRIV_VFS_GENERATION)) {
				bcopy(sp, &sb, sizeof(sb));
				sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
				prison_enforce_statfs(td->td_ucred, mp, &sb);
				sp = &sb;
			}
			if (bufseg == UIO_SYSSPACE)
				bcopy(sp, sfsp, sizeof(*sp));
			else /* if (bufseg == UIO_USERSPACE) */ {
				error = copyout(sp, sfsp, sizeof(*sp));
				if (error) {
					/*
					 * Bail out; mountlist_mtx is not
					 * retaken on this path.
					 */
					vfs_unbusy(mp);
					VFS_UNLOCK_GIANT(vfslocked);
					return (error);
				}
			}
			sfsp++;
		}
		VFS_UNLOCK_GIANT(vfslocked);
		/* Counted even when the record did not fit in the buffer. */
		count++;
		/*
		 * Retake the list lock and fetch the successor before the
		 * busy reference on this mount is dropped.
		 */
		mtx_lock(&mountlist_mtx);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	mtx_unlock(&mountlist_mtx);
	/*
	 * With a buffer, report the number of records stored (at most
	 * maxcount); without one, report how many mounts were counted.
	 */
	if (sfsp && count > maxcount)
		td->td_retval[0] = maxcount;
	else
		td->td_retval[0] = count;
	return (0);
}
574 
575 #ifdef COMPAT_FREEBSD4
576 /*
577  * Get old format filesystem statistics.
578  */
579 static void cvtstatfs(struct statfs *, struct ostatfs *);
580 
581 #ifndef _SYS_SYSPROTO_H_
582 struct freebsd4_statfs_args {
583 	char *path;
584 	struct ostatfs *buf;
585 };
586 #endif
587 int
588 freebsd4_statfs(td, uap)
589 	struct thread *td;
590 	struct freebsd4_statfs_args /* {
591 		char *path;
592 		struct ostatfs *buf;
593 	} */ *uap;
594 {
595 	struct ostatfs osb;
596 	struct statfs sf;
597 	int error;
598 
599 	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
600 	if (error)
601 		return (error);
602 	cvtstatfs(&sf, &osb);
603 	return (copyout(&osb, uap->buf, sizeof(osb)));
604 }
605 
606 /*
607  * Get filesystem statistics.
608  */
609 #ifndef _SYS_SYSPROTO_H_
610 struct freebsd4_fstatfs_args {
611 	int fd;
612 	struct ostatfs *buf;
613 };
614 #endif
615 int
616 freebsd4_fstatfs(td, uap)
617 	struct thread *td;
618 	struct freebsd4_fstatfs_args /* {
619 		int fd;
620 		struct ostatfs *buf;
621 	} */ *uap;
622 {
623 	struct ostatfs osb;
624 	struct statfs sf;
625 	int error;
626 
627 	error = kern_fstatfs(td, uap->fd, &sf);
628 	if (error)
629 		return (error);
630 	cvtstatfs(&sf, &osb);
631 	return (copyout(&osb, uap->buf, sizeof(osb)));
632 }
633 
634 /*
635  * Get statistics on all filesystems.
636  */
637 #ifndef _SYS_SYSPROTO_H_
638 struct freebsd4_getfsstat_args {
639 	struct ostatfs *buf;
640 	long bufsize;
641 	int flags;
642 };
643 #endif
644 int
645 freebsd4_getfsstat(td, uap)
646 	struct thread *td;
647 	register struct freebsd4_getfsstat_args /* {
648 		struct ostatfs *buf;
649 		long bufsize;
650 		int flags;
651 	} */ *uap;
652 {
653 	struct statfs *buf, *sp;
654 	struct ostatfs osb;
655 	size_t count, size;
656 	int error;
657 
658 	count = uap->bufsize / sizeof(struct ostatfs);
659 	size = count * sizeof(struct statfs);
660 	error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags);
661 	if (size > 0) {
662 		count = td->td_retval[0];
663 		sp = buf;
664 		while (count > 0 && error == 0) {
665 			cvtstatfs(sp, &osb);
666 			error = copyout(&osb, uap->buf, sizeof(osb));
667 			sp++;
668 			uap->buf++;
669 			count--;
670 		}
671 		free(buf, M_TEMP);
672 	}
673 	return (error);
674 }
675 
676 /*
677  * Implement fstatfs() for (NFS) file handles.
678  */
679 #ifndef _SYS_SYSPROTO_H_
680 struct freebsd4_fhstatfs_args {
681 	struct fhandle *u_fhp;
682 	struct ostatfs *buf;
683 };
684 #endif
685 int
686 freebsd4_fhstatfs(td, uap)
687 	struct thread *td;
688 	struct freebsd4_fhstatfs_args /* {
689 		struct fhandle *u_fhp;
690 		struct ostatfs *buf;
691 	} */ *uap;
692 {
693 	struct ostatfs osb;
694 	struct statfs sf;
695 	fhandle_t fh;
696 	int error;
697 
698 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
699 	if (error)
700 		return (error);
701 	error = kern_fhstatfs(td, fh, &sf);
702 	if (error)
703 		return (error);
704 	cvtstatfs(&sf, &osb);
705 	return (copyout(&osb, uap->buf, sizeof(osb)));
706 }
707 
708 /*
709  * Convert a new format statfs structure to an old format statfs structure.
710  */
711 static void
712 cvtstatfs(nsp, osp)
713 	struct statfs *nsp;
714 	struct ostatfs *osp;
715 {
716 
717 	statfs_scale_blocks(nsp, LONG_MAX);
718 	bzero(osp, sizeof(*osp));
719 	osp->f_bsize = nsp->f_bsize;
720 	osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
721 	osp->f_blocks = nsp->f_blocks;
722 	osp->f_bfree = nsp->f_bfree;
723 	osp->f_bavail = nsp->f_bavail;
724 	osp->f_files = MIN(nsp->f_files, LONG_MAX);
725 	osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
726 	osp->f_owner = nsp->f_owner;
727 	osp->f_type = nsp->f_type;
728 	osp->f_flags = nsp->f_flags;
729 	osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
730 	osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
731 	osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
732 	osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
733 	strlcpy(osp->f_fstypename, nsp->f_fstypename,
734 	    MIN(MFSNAMELEN, OMFSNAMELEN));
735 	strlcpy(osp->f_mntonname, nsp->f_mntonname,
736 	    MIN(MNAMELEN, OMNAMELEN));
737 	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
738 	    MIN(MNAMELEN, OMNAMELEN));
739 	osp->f_fsid = nsp->f_fsid;
740 }
741 #endif /* COMPAT_FREEBSD4 */
742 
743 /*
744  * Change current working directory to a given file descriptor.
745  */
746 #ifndef _SYS_SYSPROTO_H_
747 struct fchdir_args {
748 	int	fd;
749 };
750 #endif
751 int
752 sys_fchdir(td, uap)
753 	struct thread *td;
754 	struct fchdir_args /* {
755 		int fd;
756 	} */ *uap;
757 {
758 	register struct filedesc *fdp = td->td_proc->p_fd;
759 	struct vnode *vp, *tdp, *vpold;
760 	struct mount *mp;
761 	struct file *fp;
762 	int vfslocked;
763 	int error;
764 
765 	AUDIT_ARG_FD(uap->fd);
766 	if ((error = getvnode(fdp, uap->fd, CAP_FCHDIR, &fp)) != 0)
767 		return (error);
768 	vp = fp->f_vnode;
769 	VREF(vp);
770 	fdrop(fp, td);
771 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
772 	vn_lock(vp, LK_SHARED | LK_RETRY);
773 	AUDIT_ARG_VNODE1(vp);
774 	error = change_dir(vp, td);
775 	while (!error && (mp = vp->v_mountedhere) != NULL) {
776 		int tvfslocked;
777 		if (vfs_busy(mp, 0))
778 			continue;
779 		tvfslocked = VFS_LOCK_GIANT(mp);
780 		error = VFS_ROOT(mp, LK_SHARED, &tdp);
781 		vfs_unbusy(mp);
782 		if (error) {
783 			VFS_UNLOCK_GIANT(tvfslocked);
784 			break;
785 		}
786 		vput(vp);
787 		VFS_UNLOCK_GIANT(vfslocked);
788 		vp = tdp;
789 		vfslocked = tvfslocked;
790 	}
791 	if (error) {
792 		vput(vp);
793 		VFS_UNLOCK_GIANT(vfslocked);
794 		return (error);
795 	}
796 	VOP_UNLOCK(vp, 0);
797 	VFS_UNLOCK_GIANT(vfslocked);
798 	FILEDESC_XLOCK(fdp);
799 	vpold = fdp->fd_cdir;
800 	fdp->fd_cdir = vp;
801 	FILEDESC_XUNLOCK(fdp);
802 	vfslocked = VFS_LOCK_GIANT(vpold->v_mount);
803 	vrele(vpold);
804 	VFS_UNLOCK_GIANT(vfslocked);
805 	return (0);
806 }
807 
808 /*
809  * Change current working directory (``.'').
810  */
811 #ifndef _SYS_SYSPROTO_H_
812 struct chdir_args {
813 	char	*path;
814 };
815 #endif
816 int
817 sys_chdir(td, uap)
818 	struct thread *td;
819 	struct chdir_args /* {
820 		char *path;
821 	} */ *uap;
822 {
823 
824 	return (kern_chdir(td, uap->path, UIO_USERSPACE));
825 }
826 
827 int
828 kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
829 {
830 	register struct filedesc *fdp = td->td_proc->p_fd;
831 	int error;
832 	struct nameidata nd;
833 	struct vnode *vp;
834 	int vfslocked;
835 
836 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1 |
837 	    MPSAFE, pathseg, path, td);
838 	if ((error = namei(&nd)) != 0)
839 		return (error);
840 	vfslocked = NDHASGIANT(&nd);
841 	if ((error = change_dir(nd.ni_vp, td)) != 0) {
842 		vput(nd.ni_vp);
843 		VFS_UNLOCK_GIANT(vfslocked);
844 		NDFREE(&nd, NDF_ONLY_PNBUF);
845 		return (error);
846 	}
847 	VOP_UNLOCK(nd.ni_vp, 0);
848 	VFS_UNLOCK_GIANT(vfslocked);
849 	NDFREE(&nd, NDF_ONLY_PNBUF);
850 	FILEDESC_XLOCK(fdp);
851 	vp = fdp->fd_cdir;
852 	fdp->fd_cdir = nd.ni_vp;
853 	FILEDESC_XUNLOCK(fdp);
854 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
855 	vrele(vp);
856 	VFS_UNLOCK_GIANT(vfslocked);
857 	return (0);
858 }
859 
860 /*
861  * Helper function for raised chroot(2) security function:  Refuse if
862  * any filedescriptors are open directories.
863  */
864 static int
865 chroot_refuse_vdir_fds(fdp)
866 	struct filedesc *fdp;
867 {
868 	struct vnode *vp;
869 	struct file *fp;
870 	int fd;
871 
872 	FILEDESC_LOCK_ASSERT(fdp);
873 
874 	for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
875 		fp = fget_locked(fdp, fd);
876 		if (fp == NULL)
877 			continue;
878 		if (fp->f_type == DTYPE_VNODE) {
879 			vp = fp->f_vnode;
880 			if (vp->v_type == VDIR)
881 				return (EPERM);
882 		}
883 	}
884 	return (0);
885 }
886 
/*
 * This sysctl (kern.chroot_allow_open_directories) determines if we will
 * allow a process to chroot(2) if it has a directory open:
 *	0: disallowed for all processes.
 *	1: allowed for processes that were not already chroot(2)'ed.
 *	2: allowed for all processes.
 * The initial value is 1.  The policy is applied in change_root().
 */

static int chroot_allow_open_directories = 1;

SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
     &chroot_allow_open_directories, 0,
     "Allow a process to chroot(2) if it has a directory open");
900 
901 /*
902  * Change notion of root (``/'') directory.
903  */
904 #ifndef _SYS_SYSPROTO_H_
905 struct chroot_args {
906 	char	*path;
907 };
908 #endif
909 int
910 sys_chroot(td, uap)
911 	struct thread *td;
912 	struct chroot_args /* {
913 		char *path;
914 	} */ *uap;
915 {
916 	int error;
917 	struct nameidata nd;
918 	int vfslocked;
919 
920 	error = priv_check(td, PRIV_VFS_CHROOT);
921 	if (error)
922 		return (error);
923 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
924 	    AUDITVNODE1, UIO_USERSPACE, uap->path, td);
925 	error = namei(&nd);
926 	if (error)
927 		goto error;
928 	vfslocked = NDHASGIANT(&nd);
929 	if ((error = change_dir(nd.ni_vp, td)) != 0)
930 		goto e_vunlock;
931 #ifdef MAC
932 	if ((error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp)))
933 		goto e_vunlock;
934 #endif
935 	VOP_UNLOCK(nd.ni_vp, 0);
936 	error = change_root(nd.ni_vp, td);
937 	vrele(nd.ni_vp);
938 	VFS_UNLOCK_GIANT(vfslocked);
939 	NDFREE(&nd, NDF_ONLY_PNBUF);
940 	return (error);
941 e_vunlock:
942 	vput(nd.ni_vp);
943 	VFS_UNLOCK_GIANT(vfslocked);
944 error:
945 	NDFREE(&nd, NDF_ONLY_PNBUF);
946 	return (error);
947 }
948 
949 /*
950  * Common routine for chroot and chdir.  Callers must provide a locked vnode
951  * instance.
952  */
953 int
954 change_dir(vp, td)
955 	struct vnode *vp;
956 	struct thread *td;
957 {
958 	int error;
959 
960 	ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
961 	if (vp->v_type != VDIR)
962 		return (ENOTDIR);
963 #ifdef MAC
964 	error = mac_vnode_check_chdir(td->td_ucred, vp);
965 	if (error)
966 		return (error);
967 #endif
968 	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
969 	return (error);
970 }
971 
972 /*
973  * Common routine for kern_chroot() and jail_attach().  The caller is
974  * responsible for invoking priv_check() and mac_vnode_check_chroot() to
975  * authorize this operation.
976  */
977 int
978 change_root(vp, td)
979 	struct vnode *vp;
980 	struct thread *td;
981 {
982 	struct filedesc *fdp;
983 	struct vnode *oldvp;
984 	int vfslocked;
985 	int error;
986 
987 	VFS_ASSERT_GIANT(vp->v_mount);
988 	fdp = td->td_proc->p_fd;
989 	FILEDESC_XLOCK(fdp);
990 	if (chroot_allow_open_directories == 0 ||
991 	    (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
992 		error = chroot_refuse_vdir_fds(fdp);
993 		if (error) {
994 			FILEDESC_XUNLOCK(fdp);
995 			return (error);
996 		}
997 	}
998 	oldvp = fdp->fd_rdir;
999 	fdp->fd_rdir = vp;
1000 	VREF(fdp->fd_rdir);
1001 	if (!fdp->fd_jdir) {
1002 		fdp->fd_jdir = vp;
1003 		VREF(fdp->fd_jdir);
1004 	}
1005 	FILEDESC_XUNLOCK(fdp);
1006 	vfslocked = VFS_LOCK_GIANT(oldvp->v_mount);
1007 	vrele(oldvp);
1008 	VFS_UNLOCK_GIANT(vfslocked);
1009 	return (0);
1010 }
1011 
1012 static __inline cap_rights_t
1013 flags_to_rights(int flags)
1014 {
1015 	cap_rights_t rights = 0;
1016 
1017 	switch ((flags & O_ACCMODE)) {
1018 	case O_RDONLY:
1019 		rights |= CAP_READ;
1020 		break;
1021 
1022 	case O_RDWR:
1023 		rights |= CAP_READ;
1024 		/* fall through */
1025 
1026 	case O_WRONLY:
1027 		rights |= CAP_WRITE;
1028 		break;
1029 
1030 	case O_EXEC:
1031 		rights |= CAP_FEXECVE;
1032 		break;
1033 	}
1034 
1035 	if (flags & O_CREAT)
1036 		rights |= CAP_CREATE;
1037 
1038 	if (flags & O_TRUNC)
1039 		rights |= CAP_FTRUNCATE;
1040 
1041 	if ((flags & O_EXLOCK) || (flags & O_SHLOCK))
1042 		rights |= CAP_FLOCK;
1043 
1044 	return (rights);
1045 }
1046 
1047 /*
1048  * Check permissions, allocate an open file structure, and call the device
1049  * open routine if any.
1050  */
1051 #ifndef _SYS_SYSPROTO_H_
1052 struct open_args {
1053 	char	*path;
1054 	int	flags;
1055 	int	mode;
1056 };
1057 #endif
1058 int
1059 sys_open(td, uap)
1060 	struct thread *td;
1061 	register struct open_args /* {
1062 		char *path;
1063 		int flags;
1064 		int mode;
1065 	} */ *uap;
1066 {
1067 
1068 	return (kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode));
1069 }
1070 
1071 #ifndef _SYS_SYSPROTO_H_
1072 struct openat_args {
1073 	int	fd;
1074 	char	*path;
1075 	int	flag;
1076 	int	mode;
1077 };
1078 #endif
1079 int
1080 sys_openat(struct thread *td, struct openat_args *uap)
1081 {
1082 
1083 	return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
1084 	    uap->mode));
1085 }
1086 
1087 int
1088 kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
1089     int mode)
1090 {
1091 
1092 	return (kern_openat(td, AT_FDCWD, path, pathseg, flags, mode));
1093 }
1094 
1095 int
1096 kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1097     int flags, int mode)
1098 {
1099 	struct proc *p = td->td_proc;
1100 	struct filedesc *fdp = p->p_fd;
1101 	struct file *fp;
1102 	struct vnode *vp;
1103 	int cmode;
1104 	struct file *nfp;
1105 	int type, indx = -1, error, error_open;
1106 	struct flock lf;
1107 	struct nameidata nd;
1108 	int vfslocked;
1109 	cap_rights_t rights_needed = CAP_LOOKUP;
1110 
1111 	AUDIT_ARG_FFLAGS(flags);
1112 	AUDIT_ARG_MODE(mode);
1113 	/* XXX: audit dirfd */
1114 	rights_needed |= flags_to_rights(flags);
1115 	/*
1116 	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
1117 	 * may be specified.
1118 	 */
1119 	if (flags & O_EXEC) {
1120 		if (flags & O_ACCMODE)
1121 			return (EINVAL);
1122 	} else if ((flags & O_ACCMODE) == O_ACCMODE)
1123 		return (EINVAL);
1124 	else
1125 		flags = FFLAGS(flags);
1126 
1127 	/*
1128 	 * allocate the file descriptor, but don't install a descriptor yet
1129 	 */
1130 	error = falloc_noinstall(td, &nfp);
1131 	if (error)
1132 		return (error);
1133 	/* An extra reference on `nfp' has been held for us by falloc_noinstall(). */
1134 	fp = nfp;
1135 	/* Set the flags early so the finit in devfs can pick them up. */
1136 	fp->f_flag = flags & FMASK;
1137 	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
1138 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | MPSAFE, pathseg,
1139 	    path, fd, rights_needed, td);
1140 	td->td_dupfd = -1;		/* XXX check for fdopen */
1141 	error = vn_open(&nd, &flags, cmode, fp);
1142 	if (error) {
1143 		/*
1144 		 * If the vn_open replaced the method vector, something
1145 		 * wonderous happened deep below and we just pass it up
1146 		 * pretending we know what we do.
1147 		 */
1148 		if (error == ENXIO && fp->f_ops != &badfileops)
1149 			goto success;
1150 
1151 		/*
1152 		 * handle special fdopen() case.  bleh.  dupfdopen() is
1153 		 * responsible for dropping the old contents of ofiles[indx]
1154 		 * if it succeeds.
1155 		 *
1156 		 * Don't do this for relative (capability) lookups; we don't
1157 		 * understand exactly what would happen, and we don't think
1158 		 * that it ever should.
1159 		 */
1160 		if ((nd.ni_strictrelative == 0) &&
1161 		    (error == ENODEV || error == ENXIO) &&
1162 		    (td->td_dupfd >= 0)) {
1163 			/* XXX from fdopen */
1164 			error_open = error;
1165 			if ((error = finstall(td, fp, &indx, flags)) != 0)
1166 				goto bad_unlocked;
1167 			if ((error = dupfdopen(td, fdp, indx, td->td_dupfd,
1168 			    flags, error_open)) == 0)
1169 				goto success;
1170 		}
1171 		/*
1172 		 * Clean up the descriptor, but only if another thread hadn't
1173 		 * replaced or closed it.
1174 		 */
1175 		if (indx != -1)
1176 			fdclose(fdp, fp, indx, td);
1177 		fdrop(fp, td);
1178 
1179 		if (error == ERESTART)
1180 			error = EINTR;
1181 		return (error);
1182 	}
1183 	td->td_dupfd = 0;
1184 	vfslocked = NDHASGIANT(&nd);
1185 	NDFREE(&nd, NDF_ONLY_PNBUF);
1186 	vp = nd.ni_vp;
1187 
1188 	/*
1189 	 * Store the vnode, for any f_type. Typically, the vnode use
1190 	 * count is decremented by direct call to vn_closefile() for
1191 	 * files that switched type in the cdevsw fdopen() method.
1192 	 */
1193 	fp->f_vnode = vp;
1194 	/*
1195 	 * If the file wasn't claimed by devfs bind it to the normal
1196 	 * vnode operations here.
1197 	 */
1198 	if (fp->f_ops == &badfileops) {
1199 		KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
1200 		fp->f_seqcount = 1;
1201 		finit(fp, flags & FMASK, DTYPE_VNODE, vp, &vnops);
1202 	}
1203 
1204 	VOP_UNLOCK(vp, 0);
1205 	if (fp->f_type == DTYPE_VNODE && (flags & (O_EXLOCK | O_SHLOCK)) != 0) {
1206 		lf.l_whence = SEEK_SET;
1207 		lf.l_start = 0;
1208 		lf.l_len = 0;
1209 		if (flags & O_EXLOCK)
1210 			lf.l_type = F_WRLCK;
1211 		else
1212 			lf.l_type = F_RDLCK;
1213 		type = F_FLOCK;
1214 		if ((flags & FNONBLOCK) == 0)
1215 			type |= F_WAIT;
1216 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
1217 			    type)) != 0)
1218 			goto bad;
1219 		atomic_set_int(&fp->f_flag, FHASLOCK);
1220 	}
1221 	if (flags & O_TRUNC) {
1222 		error = fo_truncate(fp, 0, td->td_ucred, td);
1223 		if (error)
1224 			goto bad;
1225 	}
1226 	VFS_UNLOCK_GIANT(vfslocked);
1227 success:
1228 	/*
1229 	 * If we haven't already installed the FD (for dupfdopen), do so now.
1230 	 */
1231 	if (indx == -1) {
1232 #ifdef CAPABILITIES
1233 		if (nd.ni_strictrelative == 1) {
1234 			/*
1235 			 * We are doing a strict relative lookup; wrap the
1236 			 * result in a capability.
1237 			 */
1238 			if ((error = kern_capwrap(td, fp, nd.ni_baserights,
1239 			    &indx)) != 0)
1240 				goto bad_unlocked;
1241 		} else
1242 #endif
1243 			if ((error = finstall(td, fp, &indx, flags)) != 0)
1244 				goto bad_unlocked;
1245 
1246 	}
1247 
1248 	/*
1249 	 * Release our private reference, leaving the one associated with
1250 	 * the descriptor table intact.
1251 	 */
1252 	fdrop(fp, td);
1253 	td->td_retval[0] = indx;
1254 	return (0);
1255 bad:
1256 	VFS_UNLOCK_GIANT(vfslocked);
1257 bad_unlocked:
1258 	if (indx != -1)
1259 		fdclose(fdp, fp, indx, td);
1260 	fdrop(fp, td);
1261 	td->td_retval[0] = -1;
1262 	return (error);
1263 }
1264 
1265 #ifdef COMPAT_43
1266 /*
1267  * Create a file.
1268  */
1269 #ifndef _SYS_SYSPROTO_H_
1270 struct ocreat_args {
1271 	char	*path;
1272 	int	mode;
1273 };
1274 #endif
1275 int
1276 ocreat(td, uap)
1277 	struct thread *td;
1278 	register struct ocreat_args /* {
1279 		char *path;
1280 		int mode;
1281 	} */ *uap;
1282 {
1283 
1284 	return (kern_open(td, uap->path, UIO_USERSPACE,
1285 	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
1286 }
1287 #endif /* COMPAT_43 */
1288 
1289 /*
1290  * Create a special file.
1291  */
1292 #ifndef _SYS_SYSPROTO_H_
1293 struct mknod_args {
1294 	char	*path;
1295 	int	mode;
1296 	int	dev;
1297 };
1298 #endif
1299 int
1300 sys_mknod(td, uap)
1301 	struct thread *td;
1302 	register struct mknod_args /* {
1303 		char *path;
1304 		int mode;
1305 		int dev;
1306 	} */ *uap;
1307 {
1308 
1309 	return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev));
1310 }
1311 
1312 #ifndef _SYS_SYSPROTO_H_
1313 struct mknodat_args {
1314 	int	fd;
1315 	char	*path;
1316 	mode_t	mode;
1317 	dev_t	dev;
1318 };
1319 #endif
1320 int
1321 sys_mknodat(struct thread *td, struct mknodat_args *uap)
1322 {
1323 
1324 	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
1325 	    uap->dev));
1326 }
1327 
1328 int
1329 kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode,
1330     int dev)
1331 {
1332 
1333 	return (kern_mknodat(td, AT_FDCWD, path, pathseg, mode, dev));
1334 }
1335 
/*
 * Common backend for mknod(2) and mknodat(2).
 *
 * Creates a character or block device node, a "bad" node (S_IFMT) or a
 * whiteout named by `path'; `fd' is handed to the lookup (kern_mknod()
 * passes AT_FDCWD).  A FIFO request with dev == 0 is delegated to
 * kern_mkfifoat(); any other file type yields EINVAL.
 *
 * Returns 0 on success, EEXIST when the name already exists, or the error
 * reported by the privilege check, the lookup, the MAC check or the VOP.
 */
int
kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
    int mode, int dev)
{
	struct vnode *vp;
	struct mount *mp;
	struct vattr vattr;
	int error;
	int whiteout = 0;
	struct nameidata nd;
	int vfslocked;

	AUDIT_ARG_MODE(mode);
	AUDIT_ARG_DEV(dev);
	/* Each supported node type is gated by its own privilege. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
	case S_IFBLK:
		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
		break;
	case S_IFMT:
		error = priv_check(td, PRIV_VFS_MKNOD_BAD);
		break;
	case S_IFWHT:
		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
		break;
	case S_IFIFO:
		/* A FIFO without a device number is a plain mkfifo. */
		if (dev == 0)
			return (kern_mkfifoat(td, fd, path, pathseg, mode));
		/* FALLTHROUGH */
	default:
		error = EINVAL;
		break;
	}
	if (error)
		return (error);
restart:
	bwillwrite();
	/*
	 * Look up the name for creation; LOCKPARENT asks for the parent
	 * directory in nd.ni_dvp.
	 * NOTE(review): the rights passed here are CAP_MKFIFO even though
	 * this path creates device nodes -- confirm that is intended.
	 */
	NDINIT_ATRIGHTS(&nd, CREATE,
	    LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, pathseg, path, fd,
	    CAP_MKFIFO, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	vfslocked = NDHASGIANT(&nd);
	vp = nd.ni_vp;
	if (vp != NULL) {
		/*
		 * The name already exists: release everything the lookup
		 * returned.  When the entry is the directory itself only
		 * vrele() is used on it, otherwise vput().
		 */
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (vp == nd.ni_dvp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(vp);
		VFS_UNLOCK_GIANT(vfslocked);
		return (EEXIST);
	} else {
		/* Build the attributes of the node to create. */
		VATTR_NULL(&vattr);
		/* Permission bits only, with the process umask applied. */
		vattr.va_mode = (mode & ALLPERMS) &
		    ~td->td_proc->p_fd->fd_cmask;
		vattr.va_rdev = dev;
		/* Reset explicitly: this code runs again after "restart". */
		whiteout = 0;

		switch (mode & S_IFMT) {
		case S_IFMT:	/* used by badsect to flag bad sectors */
			vattr.va_type = VBAD;
			break;
		case S_IFCHR:
			vattr.va_type = VCHR;
			break;
		case S_IFBLK:
			vattr.va_type = VBLK;
			break;
		case S_IFWHT:
			whiteout = 1;
			break;
		default:
			/* Unreachable: filtered by the first switch above. */
			panic("kern_mknod: invalid mode");
		}
	}
	/*
	 * Try to start the write without sleeping.  On failure drop what
	 * the lookup returned, wait with V_XSLEEP (interruptibly, PCATCH)
	 * and redo the whole lookup from "restart".
	 */
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		VFS_UNLOCK_GIANT(vfslocked);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
#ifdef MAC
	/* The MAC create check is skipped for whiteouts. */
	if (error == 0 && !whiteout)
		error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
		    &nd.ni_cnd, &vattr);
#endif
	if (!error) {
		if (whiteout)
			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
		else {
			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
						&nd.ni_cnd, &vattr);
			/* The new vnode is not needed by this function. */
			if (error == 0)
				vput(nd.ni_vp);
		}
	}
	/* Common exit: release the path buffer, the parent and the write. */
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(nd.ni_dvp);
	vn_finished_write(mp);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}
1442 
1443 /*
1444  * Create a named pipe.
1445  */
1446 #ifndef _SYS_SYSPROTO_H_
1447 struct mkfifo_args {
1448 	char	*path;
1449 	int	mode;
1450 };
1451 #endif
1452 int
1453 sys_mkfifo(td, uap)
1454 	struct thread *td;
1455 	register struct mkfifo_args /* {
1456 		char *path;
1457 		int mode;
1458 	} */ *uap;
1459 {
1460 
1461 	return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode));
1462 }
1463 
1464 #ifndef _SYS_SYSPROTO_H_
1465 struct mkfifoat_args {
1466 	int	fd;
1467 	char	*path;
1468 	mode_t	mode;
1469 };
1470 #endif
1471 int
1472 sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
1473 {
1474 
1475 	return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
1476 	    uap->mode));
1477 }
1478 
1479 int
1480 kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode)
1481 {
1482 
1483 	return (kern_mkfifoat(td, AT_FDCWD, path, pathseg, mode));
1484 }
1485 
/*
 * Common backend for mkfifo(2) and mkfifoat(2): create a FIFO named by
 * `path'; `fd' is handed to the lookup (kern_mkfifo() passes AT_FDCWD).
 *
 * Returns 0 on success, EEXIST when the name already exists, or the error
 * reported by the lookup, the MAC check or VOP_MKNOD().
 */
int
kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
    int mode)
{
	struct mount *mp;
	struct vattr vattr;
	int error;
	struct nameidata nd;
	int vfslocked;

	AUDIT_ARG_MODE(mode);
restart:
	bwillwrite();
	/* Look up the name for creation, asking for the parent in ni_dvp. */
	NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
	    pathseg, path, fd, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	vfslocked = NDHASGIANT(&nd);
	if (nd.ni_vp != NULL) {
		/*
		 * The name already exists: release everything the lookup
		 * returned.  When the entry is the directory itself only
		 * vrele() is used on it, otherwise vput().
		 */
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_vp == nd.ni_dvp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(nd.ni_vp);
		VFS_UNLOCK_GIANT(vfslocked);
		return (EEXIST);
	}
	/*
	 * Try to start the write without sleeping.  On failure drop what
	 * the lookup returned, wait with V_XSLEEP (interruptibly, PCATCH)
	 * and redo the whole lookup from "restart".
	 */
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		VFS_UNLOCK_GIANT(vfslocked);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VFIFO;
	/* Permission bits only, with the process umask applied. */
	vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
#ifdef MAC
	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
	    &vattr);
	if (error)
		goto out;
#endif
	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
	/* The new vnode is not needed by this function. */
	if (error == 0)
		vput(nd.ni_vp);
#ifdef MAC
	/* The label only exists when the MAC check above can jump to it. */
out:
#endif
	/*
	 * Common exit.  Note that, unlike the sibling functions in this
	 * file, the path buffer is freed last here.
	 */
	vput(nd.ni_dvp);
	vn_finished_write(mp);
	VFS_UNLOCK_GIANT(vfslocked);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	return (error);
}
1543 
1544 /*
1545  * Make a hard file link.
1546  */
1547 #ifndef _SYS_SYSPROTO_H_
1548 struct link_args {
1549 	char	*path;
1550 	char	*link;
1551 };
1552 #endif
1553 int
1554 sys_link(td, uap)
1555 	struct thread *td;
1556 	register struct link_args /* {
1557 		char *path;
1558 		char *link;
1559 	} */ *uap;
1560 {
1561 
1562 	return (kern_link(td, uap->path, uap->link, UIO_USERSPACE));
1563 }
1564 
1565 #ifndef _SYS_SYSPROTO_H_
1566 struct linkat_args {
1567 	int	fd1;
1568 	char	*path1;
1569 	int	fd2;
1570 	char	*path2;
1571 	int	flag;
1572 };
1573 #endif
1574 int
1575 sys_linkat(struct thread *td, struct linkat_args *uap)
1576 {
1577 	int flag;
1578 
1579 	flag = uap->flag;
1580 	if (flag & ~AT_SYMLINK_FOLLOW)
1581 		return (EINVAL);
1582 
1583 	return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
1584 	    UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW));
1585 }
1586 
/*
 * Tunables read by can_hardlink().  Both start out as 0, in which case
 * can_hardlink() performs no ownership check at all.  They are registered
 * read-write (CTLFLAG_RW) below the _security_bsd sysctl parent.
 *
 * Note that hardlink_check_uid has external linkage while
 * hardlink_check_gid is file-local.
 */
int hardlink_check_uid = 0;
SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
    &hardlink_check_uid, 0,
    "Unprivileged processes cannot create hard links to files owned by other "
    "users");
static int hardlink_check_gid = 0;
SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
    &hardlink_check_gid, 0,
    "Unprivileged processes cannot create hard links to files owned by other "
    "groups");
1597 
1598 static int
1599 can_hardlink(struct vnode *vp, struct ucred *cred)
1600 {
1601 	struct vattr va;
1602 	int error;
1603 
1604 	if (!hardlink_check_uid && !hardlink_check_gid)
1605 		return (0);
1606 
1607 	error = VOP_GETATTR(vp, &va, cred);
1608 	if (error != 0)
1609 		return (error);
1610 
1611 	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
1612 		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1613 		if (error)
1614 			return (error);
1615 	}
1616 
1617 	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
1618 		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1619 		if (error)
1620 			return (error);
1621 	}
1622 
1623 	return (0);
1624 }
1625 
1626 int
1627 kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg)
1628 {
1629 
1630 	return (kern_linkat(td, AT_FDCWD, AT_FDCWD, path,link, segflg, FOLLOW));
1631 }
1632 
/*
 * Common backend for link(2) and linkat(2): create the name `path2'
 * (looked up with `fd2') as a hard link to the file named by `path1'
 * (looked up with `fd1').  `follow' is OR-ed into the flags of the first
 * lookup; the callers in this file pass FOLLOW or NOFOLLOW.
 *
 * Returns 0 on success, EPERM when the source is a directory, EEXIST when
 * the target name already exists, or the error from a lookup,
 * vn_start_write(), can_hardlink(), the MAC check or VOP_LINK().
 */
int
kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
    enum uio_seg segflg, int follow)
{
	struct vnode *vp;
	struct mount *mp;
	struct nameidata nd;
	int vfslocked;
	int lvfslocked;
	int error;

	bwillwrite();
	/* First lookup: the existing file (no LOCKLEAF requested). */
	NDINIT_AT(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, segflg, path1,
	    fd1, td);

	if ((error = namei(&nd)) != 0)
		return (error);
	vfslocked = NDHASGIANT(&nd);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vp = nd.ni_vp;
	if (vp->v_type == VDIR) {
		vrele(vp);
		VFS_UNLOCK_GIANT(vfslocked);
		return (EPERM);		/* POSIX */
	}
	/* Wait (interruptibly) until writes are allowed. */
	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
		vrele(vp);
		VFS_UNLOCK_GIANT(vfslocked);
		return (error);
	}
	/*
	 * Second lookup: the name to create.  `nd' is reused, so from here
	 * on it describes the target; the Giant state of this lookup is
	 * tracked separately in lvfslocked.
	 */
	NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE2,
	    segflg, path2, fd2, td);
	if ((error = namei(&nd)) == 0) {
		lvfslocked = NDHASGIANT(&nd);
		if (nd.ni_vp != NULL) {
			/* Target name exists: release it and its parent. */
			if (nd.ni_dvp == nd.ni_vp)
				vrele(nd.ni_dvp);
			else
				vput(nd.ni_dvp);
			vrele(nd.ni_vp);
			error = EEXIST;
		} else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY))
		    == 0) {
			/*
			 * Check chain with the source vnode locked.  With
			 * MAC compiled in: can_hardlink(), then the MAC
			 * check, then VOP_LINK(), each only if the previous
			 * step returned 0.  Without MAC the first
			 * "if (error == 0)" guards VOP_LINK() directly.
			 */
			error = can_hardlink(vp, td->td_ucred);
			if (error == 0)
#ifdef MAC
				error = mac_vnode_check_link(td->td_ucred,
				    nd.ni_dvp, vp, &nd.ni_cnd);
			if (error == 0)
#endif
				error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
			VOP_UNLOCK(vp, 0);
			vput(nd.ni_dvp);
		}
		NDFREE(&nd, NDF_ONLY_PNBUF);
		VFS_UNLOCK_GIANT(lvfslocked);
	}
	/* Release what was taken for the source file. */
	vrele(vp);
	vn_finished_write(mp);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}
1695 
1696 /*
1697  * Make a symbolic link.
1698  */
1699 #ifndef _SYS_SYSPROTO_H_
1700 struct symlink_args {
1701 	char	*path;
1702 	char	*link;
1703 };
1704 #endif
1705 int
1706 sys_symlink(td, uap)
1707 	struct thread *td;
1708 	register struct symlink_args /* {
1709 		char *path;
1710 		char *link;
1711 	} */ *uap;
1712 {
1713 
1714 	return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE));
1715 }
1716 
1717 #ifndef _SYS_SYSPROTO_H_
1718 struct symlinkat_args {
1719 	char	*path;
1720 	int	fd;
1721 	char	*path2;
1722 };
1723 #endif
1724 int
1725 sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
1726 {
1727 
1728 	return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
1729 	    UIO_USERSPACE));
1730 }
1731 
1732 int
1733 kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg)
1734 {
1735 
1736 	return (kern_symlinkat(td, path, AT_FDCWD, link, segflg));
1737 }
1738 
/*
 * Common backend for symlink(2) and symlinkat(2): create a symbolic link
 * named `path2' (looked up with `fd') whose contents are the string
 * `path1'.  `segflg' tells whether both strings live in kernel space
 * (UIO_SYSSPACE) or have to be fetched from the caller.
 *
 * Returns 0 on success, EEXIST when the name already exists, or the error
 * from copyinstr(), the lookup, the MAC check or VOP_SYMLINK().
 */
int
kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
    enum uio_seg segflg)
{
	struct mount *mp;
	struct vattr vattr;
	char *syspath;
	int error;
	struct nameidata nd;
	int vfslocked;

	/*
	 * Obtain the link contents as a kernel string: use path1 directly
	 * for UIO_SYSSPACE, otherwise copy it into a namei_zone buffer
	 * (released at "out").
	 */
	if (segflg == UIO_SYSSPACE) {
		syspath = path1;
	} else {
		syspath = uma_zalloc(namei_zone, M_WAITOK);
		if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
			goto out;
	}
	AUDIT_ARG_TEXT(syspath);
restart:
	bwillwrite();
	/* Look up the name for creation, asking for the parent in ni_dvp. */
	NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
	    segflg, path2, fd, td);
	if ((error = namei(&nd)) != 0)
		goto out;
	vfslocked = NDHASGIANT(&nd);
	if (nd.ni_vp) {
		/* The name already exists: release what the lookup returned. */
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_vp == nd.ni_dvp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(nd.ni_vp);
		VFS_UNLOCK_GIANT(vfslocked);
		error = EEXIST;
		goto out;
	}
	/*
	 * Try to start the write without sleeping.  On failure drop what
	 * the lookup returned, wait with V_XSLEEP (interruptibly, PCATCH)
	 * and redo the lookup from "restart"; syspath stays valid.
	 */
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		VFS_UNLOCK_GIANT(vfslocked);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
			goto out;
		goto restart;
	}
	VATTR_NULL(&vattr);
	/* ACCESSPERMS with the process umask applied. */
	vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
#ifdef MAC
	/* va_type is only filled in for the benefit of the MAC check. */
	vattr.va_type = VLNK;
	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
	    &vattr);
	if (error)
		goto out2;
#endif
	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
	/* The new vnode is not needed by this function. */
	if (error == 0)
		vput(nd.ni_vp);
#ifdef MAC
out2:
#endif
	/* Exit for paths that still hold the parent and the write. */
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(nd.ni_dvp);
	vn_finished_write(mp);
	VFS_UNLOCK_GIANT(vfslocked);
out:
	/* Exit for every path: free the copied-in string, if there is one. */
	if (segflg != UIO_SYSSPACE)
		uma_zfree(namei_zone, syspath);
	return (error);
}
1808 
1809 /*
1810  * Delete a whiteout from the filesystem.
1811  */
/*
 * undelete system call: remove the whiteout entry named by uap->path
 * (a user-space pathname).
 *
 * Returns 0 on success, EEXIST when the lookup found a real vnode or the
 * name is not flagged ISWHITEOUT, or the error from the lookup,
 * vn_start_write() or VOP_WHITEOUT().
 */
int
sys_undelete(td, uap)
	struct thread *td;
	register struct undelete_args /* {
		char *path;
	} */ *uap;
{
	int error;
	struct mount *mp;
	struct nameidata nd;
	int vfslocked;

restart:
	bwillwrite();
	/* DOWHITEOUT is passed so the lookup can report whiteout entries. */
	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | MPSAFE | AUDITVNODE1,
	    UIO_USERSPACE, uap->path, td);
	error = namei(&nd);
	if (error)
		return (error);
	vfslocked = NDHASGIANT(&nd);

	/* Only a name with no vnode that is marked ISWHITEOUT qualifies. */
	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_vp == nd.ni_dvp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		if (nd.ni_vp)
			vrele(nd.ni_vp);
		VFS_UNLOCK_GIANT(vfslocked);
		return (EEXIST);
	}
	/*
	 * Try to start the write without sleeping.  On failure drop what
	 * the lookup returned, wait with V_XSLEEP (interruptibly, PCATCH)
	 * and redo the lookup from "restart".
	 */
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		VFS_UNLOCK_GIANT(vfslocked);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	/* DELETE asks the filesystem to remove the whiteout. */
	error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(nd.ni_dvp);
	vn_finished_write(mp);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}
1859 
1860 /*
1861  * Delete a name from the filesystem.
1862  */
1863 #ifndef _SYS_SYSPROTO_H_
1864 struct unlink_args {
1865 	char	*path;
1866 };
1867 #endif
1868 int
1869 sys_unlink(td, uap)
1870 	struct thread *td;
1871 	struct unlink_args /* {
1872 		char *path;
1873 	} */ *uap;
1874 {
1875 
1876 	return (kern_unlink(td, uap->path, UIO_USERSPACE));
1877 }
1878 
1879 #ifndef _SYS_SYSPROTO_H_
1880 struct unlinkat_args {
1881 	int	fd;
1882 	char	*path;
1883 	int	flag;
1884 };
1885 #endif
1886 int
1887 sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
1888 {
1889 	int flag = uap->flag;
1890 	int fd = uap->fd;
1891 	char *path = uap->path;
1892 
1893 	if (flag & ~AT_REMOVEDIR)
1894 		return (EINVAL);
1895 
1896 	if (flag & AT_REMOVEDIR)
1897 		return (kern_rmdirat(td, fd, path, UIO_USERSPACE));
1898 	else
1899 		return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0));
1900 }
1901 
1902 int
1903 kern_unlink(struct thread *td, char *path, enum uio_seg pathseg)
1904 {
1905 
1906 	return (kern_unlinkat(td, AT_FDCWD, path, pathseg, 0));
1907 }
1908 
/*
 * Common backend for unlink(2) and unlinkat(2): remove the name `path'
 * (looked up with `fd').
 *
 * `oldinum', when non-zero, is an inode number the victim must still
 * have: if vn_stat() reports a different st_ino the call fails with
 * EIDRM.  When `oldinum' is 0, directories are refused with EPERM.
 *
 * Returns 0 on success or an errno value; an EINVAL from the lookup is
 * reported as EPERM, and the root of a mounted filesystem yields EBUSY.
 */
int
kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
    ino_t oldinum)
{
	struct mount *mp;
	struct vnode *vp;
	int error;
	struct nameidata nd;
	struct stat sb;
	int vfslocked;

restart:
	bwillwrite();
	/* Look up the victim, asking for both it and its parent locked. */
	NDINIT_AT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1,
	    pathseg, path, fd, td);
	if ((error = namei(&nd)) != 0)
		return (error == EINVAL ? EPERM : error);
	vfslocked = NDHASGIANT(&nd);
	vp = nd.ni_vp;
	/*
	 * error is 0 here.  Note that the second condition assigns the
	 * vn_stat() result to error; if vn_stat() fails, control falls to
	 * the final else with that error still set (unless it is replaced
	 * by EBUSY there).
	 */
	if (vp->v_type == VDIR && oldinum == 0) {
		error = EPERM;		/* POSIX */
	} else if (oldinum != 0 &&
		  ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
		  sb.st_ino != oldinum) {
			error = EIDRM;	/* Identifier removed */
	} else {
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 *
		 * XXX: can this only be a VDIR case?
		 */
		if (vp->v_vflag & VV_ROOT)
			error = EBUSY;
	}
	if (error == 0) {
		/*
		 * Try to start the write without sleeping.  On failure
		 * drop both vnodes, wait with V_XSLEEP (interruptibly,
		 * PCATCH) and redo the lookup from "restart".
		 */
		if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
			NDFREE(&nd, NDF_ONLY_PNBUF);
			vput(nd.ni_dvp);
			if (vp == nd.ni_dvp)
				vrele(vp);
			else
				vput(vp);
			VFS_UNLOCK_GIANT(vfslocked);
			if ((error = vn_start_write(NULL, &mp,
			    V_XSLEEP | PCATCH)) != 0)
				return (error);
			goto restart;
		}
#ifdef MAC
		error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
		    &nd.ni_cnd);
		if (error)
			goto out;
#endif
		error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
#ifdef MAC
out:
#endif
		vn_finished_write(mp);
	}
	/*
	 * Common exit.  When the victim is the parent itself, the vput()
	 * of ni_dvp is followed by only a vrele() of vp.
	 */
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(nd.ni_dvp);
	if (vp == nd.ni_dvp)
		vrele(vp);
	else
		vput(vp);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}
1978 
1979 /*
1980  * Reposition read/write file offset.
1981  */
#ifndef _SYS_SYSPROTO_H_
struct lseek_args {
	int	fd;
	int	pad;
	off_t	offset;
	int	whence;
};
#endif
/*
 * lseek system call: compute a new file offset for descriptor uap->fd
 * from uap->offset and uap->whence, store it in the file structure and
 * hand it back through td->td_retval.
 *
 * Returns 0 on success, ESPIPE when the file's ops lack DFLAG_SEEKABLE,
 * EOVERFLOW when the result would not fit in an off_t, EINVAL for an
 * unknown whence or a negative result on anything but a character
 * device, or the error from fget(), VOP_GETATTR() or the
 * FIOSEEKDATA/FIOSEEKHOLE ioctl.
 */
int
sys_lseek(td, uap)
	struct thread *td;
	register struct lseek_args /* {
		int fd;
		int pad;
		off_t offset;
		int whence;
	} */ *uap;
{
	struct ucred *cred = td->td_ucred;
	struct file *fp;
	struct vnode *vp;
	struct vattr vattr;
	off_t offset, size;
	int error, noneg;
	int vfslocked;

	AUDIT_ARG_FD(uap->fd);
	if ((error = fget(td, uap->fd, CAP_SEEK, &fp)) != 0)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) {
		fdrop(fp, td);
		return (ESPIPE);
	}
	/* NOTE(review): assumes seekable files always have f_vnode set. */
	vp = fp->f_vnode;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	/* Negative offsets are only tolerated on character devices. */
	noneg = (vp->v_type != VCHR);
	offset = uap->offset;
	switch (uap->whence) {
	case L_INCR:
		/* Relative to the current offset; check before adding. */
		if (noneg &&
		    (fp->f_offset < 0 ||
		    (offset > 0 && fp->f_offset > OFF_MAX - offset))) {
			error = EOVERFLOW;
			break;
		}
		offset += fp->f_offset;
		break;
	case L_XTND:
		/* Relative to the size reported by VOP_GETATTR(). */
		vn_lock(vp, LK_SHARED | LK_RETRY);
		error = VOP_GETATTR(vp, &vattr, cred);
		VOP_UNLOCK(vp, 0);
		if (error)
			break;

		/*
		 * If the file references a disk device, then fetch
		 * the media size and use that to determine the ending
		 * offset.
		 */
		if (vattr.va_size == 0 && vp->v_type == VCHR &&
		    fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
			vattr.va_size = size;
		if (noneg &&
		    (vattr.va_size > OFF_MAX ||
		    (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
			error = EOVERFLOW;
			break;
		}
		offset += vattr.va_size;
		break;
	case L_SET:
		/* Absolute: uap->offset is used as is. */
		break;
	case SEEK_DATA:
		/* The ioctl receives and updates offset in place. */
		error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
		break;
	case SEEK_HOLE:
		error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0 && noneg && offset < 0)
		error = EINVAL;
	if (error != 0)
		goto drop;
	fp->f_offset = offset;
	VFS_KNOTE_UNLOCKED(vp, 0);
	/* The return value slot is written as a full off_t. */
	*(off_t *)(td->td_retval) = fp->f_offset;
drop:
	fdrop(fp, td);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}
2075 
2076 #if defined(COMPAT_43)
2077 /*
2078  * Reposition read/write file offset.
2079  */
2080 #ifndef _SYS_SYSPROTO_H_
2081 struct olseek_args {
2082 	int	fd;
2083 	long	offset;
2084 	int	whence;
2085 };
2086 #endif
2087 int
2088 olseek(td, uap)
2089 	struct thread *td;
2090 	register struct olseek_args /* {
2091 		int fd;
2092 		long offset;
2093 		int whence;
2094 	} */ *uap;
2095 {
2096 	struct lseek_args /* {
2097 		int fd;
2098 		int pad;
2099 		off_t offset;
2100 		int whence;
2101 	} */ nuap;
2102 
2103 	nuap.fd = uap->fd;
2104 	nuap.offset = uap->offset;
2105 	nuap.whence = uap->whence;
2106 	return (sys_lseek(td, &nuap));
2107 }
2108 #endif /* COMPAT_43 */
2109 
2110 /* Version with the 'pad' argument */
2111 int
2112 freebsd6_lseek(td, uap)
2113 	struct thread *td;
2114 	register struct freebsd6_lseek_args *uap;
2115 {
2116 	struct lseek_args ouap;
2117 
2118 	ouap.fd = uap->fd;
2119 	ouap.offset = uap->offset;
2120 	ouap.whence = uap->whence;
2121 	return (sys_lseek(td, &ouap));
2122 }
2123 
2124 /*
2125  * Check access permissions using passed credentials.
2126  */
2127 static int
2128 vn_access(vp, user_flags, cred, td)
2129 	struct vnode	*vp;
2130 	int		user_flags;
2131 	struct ucred	*cred;
2132 	struct thread	*td;
2133 {
2134 	int error;
2135 	accmode_t accmode;
2136 
2137 	/* Flags == 0 means only check for existence. */
2138 	error = 0;
2139 	if (user_flags) {
2140 		accmode = 0;
2141 		if (user_flags & R_OK)
2142 			accmode |= VREAD;
2143 		if (user_flags & W_OK)
2144 			accmode |= VWRITE;
2145 		if (user_flags & X_OK)
2146 			accmode |= VEXEC;
2147 #ifdef MAC
2148 		error = mac_vnode_check_access(cred, vp, accmode);
2149 		if (error)
2150 			return (error);
2151 #endif
2152 		if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
2153 			error = VOP_ACCESS(vp, accmode, cred, td);
2154 	}
2155 	return (error);
2156 }
2157 
2158 /*
2159  * Check access permissions using "real" credentials.
2160  */
2161 #ifndef _SYS_SYSPROTO_H_
2162 struct access_args {
2163 	char	*path;
2164 	int	amode;
2165 };
2166 #endif
2167 int
2168 sys_access(td, uap)
2169 	struct thread *td;
2170 	register struct access_args /* {
2171 		char *path;
2172 		int amode;
2173 	} */ *uap;
2174 {
2175 
2176 	return (kern_access(td, uap->path, UIO_USERSPACE, uap->amode));
2177 }
2178 
2179 #ifndef _SYS_SYSPROTO_H_
2180 struct faccessat_args {
2181 	int	dirfd;
2182 	char	*path;
2183 	int	amode;
2184 	int	flag;
2185 }
2186 #endif
2187 int
2188 sys_faccessat(struct thread *td, struct faccessat_args *uap)
2189 {
2190 
2191 	if (uap->flag & ~AT_EACCESS)
2192 		return (EINVAL);
2193 	return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
2194 	    uap->amode));
2195 }
2196 
2197 int
2198 kern_access(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2199 {
2200 
2201 	return (kern_accessat(td, AT_FDCWD, path, pathseg, 0, amode));
2202 }
2203 
2204 int
2205 kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2206     int flag, int amode)
2207 {
2208 	struct ucred *cred, *tmpcred;
2209 	struct vnode *vp;
2210 	struct nameidata nd;
2211 	int vfslocked;
2212 	int error;
2213 
2214 	/*
2215 	 * Create and modify a temporary credential instead of one that
2216 	 * is potentially shared.
2217 	 */
2218 	if (!(flag & AT_EACCESS)) {
2219 		cred = td->td_ucred;
2220 		tmpcred = crdup(cred);
2221 		tmpcred->cr_uid = cred->cr_ruid;
2222 		tmpcred->cr_groups[0] = cred->cr_rgid;
2223 		td->td_ucred = tmpcred;
2224 	} else
2225 		cred = tmpcred = td->td_ucred;
2226 	AUDIT_ARG_VALUE(amode);
2227 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
2228 	    AUDITVNODE1, pathseg, path, fd, CAP_FSTAT, td);
2229 	if ((error = namei(&nd)) != 0)
2230 		goto out1;
2231 	vfslocked = NDHASGIANT(&nd);
2232 	vp = nd.ni_vp;
2233 
2234 	error = vn_access(vp, amode, tmpcred, td);
2235 	NDFREE(&nd, NDF_ONLY_PNBUF);
2236 	vput(vp);
2237 	VFS_UNLOCK_GIANT(vfslocked);
2238 out1:
2239 	if (!(flag & AT_EACCESS)) {
2240 		td->td_ucred = cred;
2241 		crfree(tmpcred);
2242 	}
2243 	return (error);
2244 }
2245 
2246 /*
2247  * Check access permissions using "effective" credentials.
2248  */
2249 #ifndef _SYS_SYSPROTO_H_
2250 struct eaccess_args {
2251 	char	*path;
2252 	int	amode;
2253 };
2254 #endif
2255 int
2256 sys_eaccess(td, uap)
2257 	struct thread *td;
2258 	register struct eaccess_args /* {
2259 		char *path;
2260 		int amode;
2261 	} */ *uap;
2262 {
2263 
2264 	return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->amode));
2265 }
2266 
2267 int
2268 kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2269 {
2270 
2271 	return (kern_accessat(td, AT_FDCWD, path, pathseg, AT_EACCESS, amode));
2272 }
2273 
2274 #if defined(COMPAT_43)
2275 /*
2276  * Get file status; this version follows links.
2277  */
2278 #ifndef _SYS_SYSPROTO_H_
2279 struct ostat_args {
2280 	char	*path;
2281 	struct ostat *ub;
2282 };
2283 #endif
2284 int
2285 ostat(td, uap)
2286 	struct thread *td;
2287 	register struct ostat_args /* {
2288 		char *path;
2289 		struct ostat *ub;
2290 	} */ *uap;
2291 {
2292 	struct stat sb;
2293 	struct ostat osb;
2294 	int error;
2295 
2296 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2297 	if (error)
2298 		return (error);
2299 	cvtstat(&sb, &osb);
2300 	error = copyout(&osb, uap->ub, sizeof (osb));
2301 	return (error);
2302 }
2303 
2304 /*
2305  * Get file status; this version does not follow links.
2306  */
2307 #ifndef _SYS_SYSPROTO_H_
2308 struct olstat_args {
2309 	char	*path;
2310 	struct ostat *ub;
2311 };
2312 #endif
2313 int
2314 olstat(td, uap)
2315 	struct thread *td;
2316 	register struct olstat_args /* {
2317 		char *path;
2318 		struct ostat *ub;
2319 	} */ *uap;
2320 {
2321 	struct stat sb;
2322 	struct ostat osb;
2323 	int error;
2324 
2325 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2326 	if (error)
2327 		return (error);
2328 	cvtstat(&sb, &osb);
2329 	error = copyout(&osb, uap->ub, sizeof (osb));
2330 	return (error);
2331 }
2332 
2333 /*
2334  * Convert from an old to a new stat structure.
2335  */
2336 void
2337 cvtstat(st, ost)
2338 	struct stat *st;
2339 	struct ostat *ost;
2340 {
2341 
2342 	ost->st_dev = st->st_dev;
2343 	ost->st_ino = st->st_ino;
2344 	ost->st_mode = st->st_mode;
2345 	ost->st_nlink = st->st_nlink;
2346 	ost->st_uid = st->st_uid;
2347 	ost->st_gid = st->st_gid;
2348 	ost->st_rdev = st->st_rdev;
2349 	if (st->st_size < (quad_t)1 << 32)
2350 		ost->st_size = st->st_size;
2351 	else
2352 		ost->st_size = -2;
2353 	ost->st_atim = st->st_atim;
2354 	ost->st_mtim = st->st_mtim;
2355 	ost->st_ctim = st->st_ctim;
2356 	ost->st_blksize = st->st_blksize;
2357 	ost->st_blocks = st->st_blocks;
2358 	ost->st_flags = st->st_flags;
2359 	ost->st_gen = st->st_gen;
2360 }
2361 #endif /* COMPAT_43 */
2362 
2363 /*
2364  * Get file status; this version follows links.
2365  */
2366 #ifndef _SYS_SYSPROTO_H_
2367 struct stat_args {
2368 	char	*path;
2369 	struct stat *ub;
2370 };
2371 #endif
2372 int
2373 sys_stat(td, uap)
2374 	struct thread *td;
2375 	register struct stat_args /* {
2376 		char *path;
2377 		struct stat *ub;
2378 	} */ *uap;
2379 {
2380 	struct stat sb;
2381 	int error;
2382 
2383 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2384 	if (error == 0)
2385 		error = copyout(&sb, uap->ub, sizeof (sb));
2386 	return (error);
2387 }
2388 
2389 #ifndef _SYS_SYSPROTO_H_
2390 struct fstatat_args {
2391 	int	fd;
2392 	char	*path;
2393 	struct stat	*buf;
2394 	int	flag;
2395 }
2396 #endif
2397 int
2398 sys_fstatat(struct thread *td, struct fstatat_args *uap)
2399 {
2400 	struct stat sb;
2401 	int error;
2402 
2403 	error = kern_statat(td, uap->flag, uap->fd, uap->path,
2404 	    UIO_USERSPACE, &sb);
2405 	if (error == 0)
2406 		error = copyout(&sb, uap->buf, sizeof (sb));
2407 	return (error);
2408 }
2409 
2410 int
2411 kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2412 {
2413 
2414 	return (kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp));
2415 }
2416 
2417 int
2418 kern_statat(struct thread *td, int flag, int fd, char *path,
2419     enum uio_seg pathseg, struct stat *sbp)
2420 {
2421 
2422 	return (kern_statat_vnhook(td, flag, fd, path, pathseg, sbp, NULL));
2423 }
2424 
2425 int
2426 kern_statat_vnhook(struct thread *td, int flag, int fd, char *path,
2427     enum uio_seg pathseg, struct stat *sbp,
2428     void (*hook)(struct vnode *vp, struct stat *sbp))
2429 {
2430 	struct nameidata nd;
2431 	struct stat sb;
2432 	int error, vfslocked;
2433 
2434 	if (flag & ~AT_SYMLINK_NOFOLLOW)
2435 		return (EINVAL);
2436 
2437 	NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
2438 	    FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1 | MPSAFE, pathseg,
2439 	    path, fd, CAP_FSTAT, td);
2440 
2441 	if ((error = namei(&nd)) != 0)
2442 		return (error);
2443 	vfslocked = NDHASGIANT(&nd);
2444 	error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
2445 	if (!error) {
2446 		SDT_PROBE(vfs, , stat, mode, path, sb.st_mode, 0, 0, 0);
2447 		if (S_ISREG(sb.st_mode))
2448 			SDT_PROBE(vfs, , stat, reg, path, pathseg, 0, 0, 0);
2449 		if (__predict_false(hook != NULL))
2450 			hook(nd.ni_vp, &sb);
2451 	}
2452 	NDFREE(&nd, NDF_ONLY_PNBUF);
2453 	vput(nd.ni_vp);
2454 	VFS_UNLOCK_GIANT(vfslocked);
2455 	if (error)
2456 		return (error);
2457 	*sbp = sb;
2458 #ifdef KTRACE
2459 	if (KTRPOINT(td, KTR_STRUCT))
2460 		ktrstat(&sb);
2461 #endif
2462 	return (0);
2463 }
2464 
2465 /*
2466  * Get file status; this version does not follow links.
2467  */
2468 #ifndef _SYS_SYSPROTO_H_
2469 struct lstat_args {
2470 	char	*path;
2471 	struct stat *ub;
2472 };
2473 #endif
2474 int
2475 sys_lstat(td, uap)
2476 	struct thread *td;
2477 	register struct lstat_args /* {
2478 		char *path;
2479 		struct stat *ub;
2480 	} */ *uap;
2481 {
2482 	struct stat sb;
2483 	int error;
2484 
2485 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2486 	if (error == 0)
2487 		error = copyout(&sb, uap->ub, sizeof (sb));
2488 	return (error);
2489 }
2490 
2491 int
2492 kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2493 {
2494 
2495 	return (kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg,
2496 	    sbp));
2497 }
2498 
2499 /*
2500  * Implementation of the NetBSD [l]stat() functions.
2501  */
2502 void
2503 cvtnstat(sb, nsb)
2504 	struct stat *sb;
2505 	struct nstat *nsb;
2506 {
2507 	bzero(nsb, sizeof *nsb);
2508 	nsb->st_dev = sb->st_dev;
2509 	nsb->st_ino = sb->st_ino;
2510 	nsb->st_mode = sb->st_mode;
2511 	nsb->st_nlink = sb->st_nlink;
2512 	nsb->st_uid = sb->st_uid;
2513 	nsb->st_gid = sb->st_gid;
2514 	nsb->st_rdev = sb->st_rdev;
2515 	nsb->st_atim = sb->st_atim;
2516 	nsb->st_mtim = sb->st_mtim;
2517 	nsb->st_ctim = sb->st_ctim;
2518 	nsb->st_size = sb->st_size;
2519 	nsb->st_blocks = sb->st_blocks;
2520 	nsb->st_blksize = sb->st_blksize;
2521 	nsb->st_flags = sb->st_flags;
2522 	nsb->st_gen = sb->st_gen;
2523 	nsb->st_birthtim = sb->st_birthtim;
2524 }
2525 
2526 #ifndef _SYS_SYSPROTO_H_
2527 struct nstat_args {
2528 	char	*path;
2529 	struct nstat *ub;
2530 };
2531 #endif
2532 int
2533 sys_nstat(td, uap)
2534 	struct thread *td;
2535 	register struct nstat_args /* {
2536 		char *path;
2537 		struct nstat *ub;
2538 	} */ *uap;
2539 {
2540 	struct stat sb;
2541 	struct nstat nsb;
2542 	int error;
2543 
2544 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2545 	if (error)
2546 		return (error);
2547 	cvtnstat(&sb, &nsb);
2548 	error = copyout(&nsb, uap->ub, sizeof (nsb));
2549 	return (error);
2550 }
2551 
2552 /*
2553  * NetBSD lstat.  Get file status; this version does not follow links.
2554  */
2555 #ifndef _SYS_SYSPROTO_H_
2556 struct lstat_args {
2557 	char	*path;
2558 	struct stat *ub;
2559 };
2560 #endif
2561 int
2562 sys_nlstat(td, uap)
2563 	struct thread *td;
2564 	register struct nlstat_args /* {
2565 		char *path;
2566 		struct nstat *ub;
2567 	} */ *uap;
2568 {
2569 	struct stat sb;
2570 	struct nstat nsb;
2571 	int error;
2572 
2573 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2574 	if (error)
2575 		return (error);
2576 	cvtnstat(&sb, &nsb);
2577 	error = copyout(&nsb, uap->ub, sizeof (nsb));
2578 	return (error);
2579 }
2580 
2581 /*
2582  * Get configurable pathname variables.
2583  */
2584 #ifndef _SYS_SYSPROTO_H_
2585 struct pathconf_args {
2586 	char	*path;
2587 	int	name;
2588 };
2589 #endif
2590 int
2591 sys_pathconf(td, uap)
2592 	struct thread *td;
2593 	register struct pathconf_args /* {
2594 		char *path;
2595 		int name;
2596 	} */ *uap;
2597 {
2598 
2599 	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW));
2600 }
2601 
2602 #ifndef _SYS_SYSPROTO_H_
2603 struct lpathconf_args {
2604 	char	*path;
2605 	int	name;
2606 };
2607 #endif
2608 int
2609 sys_lpathconf(td, uap)
2610 	struct thread *td;
2611 	register struct lpathconf_args /* {
2612 		char *path;
2613 		int name;
2614 	} */ *uap;
2615 {
2616 
2617 	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, NOFOLLOW));
2618 }
2619 
2620 int
2621 kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
2622     u_long flags)
2623 {
2624 	struct nameidata nd;
2625 	int error, vfslocked;
2626 
2627 	NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1 |
2628 	    flags, pathseg, path, td);
2629 	if ((error = namei(&nd)) != 0)
2630 		return (error);
2631 	vfslocked = NDHASGIANT(&nd);
2632 	NDFREE(&nd, NDF_ONLY_PNBUF);
2633 
2634 	/* If asynchronous I/O is available, it works for all files. */
2635 	if (name == _PC_ASYNC_IO)
2636 		td->td_retval[0] = async_io_version;
2637 	else
2638 		error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
2639 	vput(nd.ni_vp);
2640 	VFS_UNLOCK_GIANT(vfslocked);
2641 	return (error);
2642 }
2643 
2644 /*
2645  * Return target name of a symbolic link.
2646  */
2647 #ifndef _SYS_SYSPROTO_H_
2648 struct readlink_args {
2649 	char	*path;
2650 	char	*buf;
2651 	size_t	count;
2652 };
2653 #endif
2654 int
2655 sys_readlink(td, uap)
2656 	struct thread *td;
2657 	register struct readlink_args /* {
2658 		char *path;
2659 		char *buf;
2660 		size_t count;
2661 	} */ *uap;
2662 {
2663 
2664 	return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf,
2665 	    UIO_USERSPACE, uap->count));
2666 }
2667 #ifndef _SYS_SYSPROTO_H_
2668 struct readlinkat_args {
2669 	int	fd;
2670 	char	*path;
2671 	char	*buf;
2672 	size_t	bufsize;
2673 };
2674 #endif
2675 int
2676 sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
2677 {
2678 
2679 	return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
2680 	    uap->buf, UIO_USERSPACE, uap->bufsize));
2681 }
2682 
2683 int
2684 kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf,
2685     enum uio_seg bufseg, size_t count)
2686 {
2687 
2688 	return (kern_readlinkat(td, AT_FDCWD, path, pathseg, buf, bufseg,
2689 	    count));
2690 }
2691 
2692 int
2693 kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2694     char *buf, enum uio_seg bufseg, size_t count)
2695 {
2696 	struct vnode *vp;
2697 	struct iovec aiov;
2698 	struct uio auio;
2699 	int error;
2700 	struct nameidata nd;
2701 	int vfslocked;
2702 
2703 	if (count > INT_MAX)
2704 		return (EINVAL);
2705 
2706 	NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
2707 	    AUDITVNODE1, pathseg, path, fd, td);
2708 
2709 	if ((error = namei(&nd)) != 0)
2710 		return (error);
2711 	NDFREE(&nd, NDF_ONLY_PNBUF);
2712 	vfslocked = NDHASGIANT(&nd);
2713 	vp = nd.ni_vp;
2714 #ifdef MAC
2715 	error = mac_vnode_check_readlink(td->td_ucred, vp);
2716 	if (error) {
2717 		vput(vp);
2718 		VFS_UNLOCK_GIANT(vfslocked);
2719 		return (error);
2720 	}
2721 #endif
2722 	if (vp->v_type != VLNK)
2723 		error = EINVAL;
2724 	else {
2725 		aiov.iov_base = buf;
2726 		aiov.iov_len = count;
2727 		auio.uio_iov = &aiov;
2728 		auio.uio_iovcnt = 1;
2729 		auio.uio_offset = 0;
2730 		auio.uio_rw = UIO_READ;
2731 		auio.uio_segflg = bufseg;
2732 		auio.uio_td = td;
2733 		auio.uio_resid = count;
2734 		error = VOP_READLINK(vp, &auio, td->td_ucred);
2735 	}
2736 	vput(vp);
2737 	VFS_UNLOCK_GIANT(vfslocked);
2738 	td->td_retval[0] = count - auio.uio_resid;
2739 	return (error);
2740 }
2741 
2742 /*
2743  * Common implementation code for chflags() and fchflags().
2744  */
2745 static int
2746 setfflags(td, vp, flags)
2747 	struct thread *td;
2748 	struct vnode *vp;
2749 	int flags;
2750 {
2751 	int error;
2752 	struct mount *mp;
2753 	struct vattr vattr;
2754 
2755 	/*
2756 	 * Prevent non-root users from setting flags on devices.  When
2757 	 * a device is reused, users can retain ownership of the device
2758 	 * if they are allowed to set flags and programs assume that
2759 	 * chown can't fail when done as root.
2760 	 */
2761 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
2762 		error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
2763 		if (error)
2764 			return (error);
2765 	}
2766 
2767 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2768 		return (error);
2769 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2770 	VATTR_NULL(&vattr);
2771 	vattr.va_flags = flags;
2772 #ifdef MAC
2773 	error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
2774 	if (error == 0)
2775 #endif
2776 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2777 	VOP_UNLOCK(vp, 0);
2778 	vn_finished_write(mp);
2779 	return (error);
2780 }
2781 
2782 /*
2783  * Change flags of a file given a path name.
2784  */
2785 #ifndef _SYS_SYSPROTO_H_
2786 struct chflags_args {
2787 	char	*path;
2788 	int	flags;
2789 };
2790 #endif
2791 int
2792 sys_chflags(td, uap)
2793 	struct thread *td;
2794 	register struct chflags_args /* {
2795 		char *path;
2796 		int flags;
2797 	} */ *uap;
2798 {
2799 	int error;
2800 	struct nameidata nd;
2801 	int vfslocked;
2802 
2803 	AUDIT_ARG_FFLAGS(uap->flags);
2804 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
2805 	    uap->path, td);
2806 	if ((error = namei(&nd)) != 0)
2807 		return (error);
2808 	NDFREE(&nd, NDF_ONLY_PNBUF);
2809 	vfslocked = NDHASGIANT(&nd);
2810 	error = setfflags(td, nd.ni_vp, uap->flags);
2811 	vrele(nd.ni_vp);
2812 	VFS_UNLOCK_GIANT(vfslocked);
2813 	return (error);
2814 }
2815 
2816 /*
2817  * Same as chflags() but doesn't follow symlinks.
2818  */
2819 int
2820 sys_lchflags(td, uap)
2821 	struct thread *td;
2822 	register struct lchflags_args /* {
2823 		char *path;
2824 		int flags;
2825 	} */ *uap;
2826 {
2827 	int error;
2828 	struct nameidata nd;
2829 	int vfslocked;
2830 
2831 	AUDIT_ARG_FFLAGS(uap->flags);
2832 	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
2833 	    uap->path, td);
2834 	if ((error = namei(&nd)) != 0)
2835 		return (error);
2836 	vfslocked = NDHASGIANT(&nd);
2837 	NDFREE(&nd, NDF_ONLY_PNBUF);
2838 	error = setfflags(td, nd.ni_vp, uap->flags);
2839 	vrele(nd.ni_vp);
2840 	VFS_UNLOCK_GIANT(vfslocked);
2841 	return (error);
2842 }
2843 
2844 /*
2845  * Change flags of a file given a file descriptor.
2846  */
2847 #ifndef _SYS_SYSPROTO_H_
2848 struct fchflags_args {
2849 	int	fd;
2850 	int	flags;
2851 };
2852 #endif
2853 int
2854 sys_fchflags(td, uap)
2855 	struct thread *td;
2856 	register struct fchflags_args /* {
2857 		int fd;
2858 		int flags;
2859 	} */ *uap;
2860 {
2861 	struct file *fp;
2862 	int vfslocked;
2863 	int error;
2864 
2865 	AUDIT_ARG_FD(uap->fd);
2866 	AUDIT_ARG_FFLAGS(uap->flags);
2867 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_FCHFLAGS,
2868 	    &fp)) != 0)
2869 		return (error);
2870 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
2871 #ifdef AUDIT
2872 	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2873 	AUDIT_ARG_VNODE1(fp->f_vnode);
2874 	VOP_UNLOCK(fp->f_vnode, 0);
2875 #endif
2876 	error = setfflags(td, fp->f_vnode, uap->flags);
2877 	VFS_UNLOCK_GIANT(vfslocked);
2878 	fdrop(fp, td);
2879 	return (error);
2880 }
2881 
2882 /*
2883  * Common implementation code for chmod(), lchmod() and fchmod().
2884  */
2885 int
2886 setfmode(td, cred, vp, mode)
2887 	struct thread *td;
2888 	struct ucred *cred;
2889 	struct vnode *vp;
2890 	int mode;
2891 {
2892 	int error;
2893 	struct mount *mp;
2894 	struct vattr vattr;
2895 
2896 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2897 		return (error);
2898 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2899 	VATTR_NULL(&vattr);
2900 	vattr.va_mode = mode & ALLPERMS;
2901 #ifdef MAC
2902 	error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
2903 	if (error == 0)
2904 #endif
2905 		error = VOP_SETATTR(vp, &vattr, cred);
2906 	VOP_UNLOCK(vp, 0);
2907 	vn_finished_write(mp);
2908 	return (error);
2909 }
2910 
2911 /*
2912  * Change mode of a file given path name.
2913  */
2914 #ifndef _SYS_SYSPROTO_H_
2915 struct chmod_args {
2916 	char	*path;
2917 	int	mode;
2918 };
2919 #endif
2920 int
2921 sys_chmod(td, uap)
2922 	struct thread *td;
2923 	register struct chmod_args /* {
2924 		char *path;
2925 		int mode;
2926 	} */ *uap;
2927 {
2928 
2929 	return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode));
2930 }
2931 
2932 #ifndef _SYS_SYSPROTO_H_
2933 struct fchmodat_args {
2934 	int	dirfd;
2935 	char	*path;
2936 	mode_t	mode;
2937 	int	flag;
2938 }
2939 #endif
2940 int
2941 sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
2942 {
2943 	int flag = uap->flag;
2944 	int fd = uap->fd;
2945 	char *path = uap->path;
2946 	mode_t mode = uap->mode;
2947 
2948 	if (flag & ~AT_SYMLINK_NOFOLLOW)
2949 		return (EINVAL);
2950 
2951 	return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
2952 }
2953 
2954 int
2955 kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode)
2956 {
2957 
2958 	return (kern_fchmodat(td, AT_FDCWD, path, pathseg, mode, 0));
2959 }
2960 
2961 /*
2962  * Change mode of a file given path name (don't follow links.)
2963  */
2964 #ifndef _SYS_SYSPROTO_H_
2965 struct lchmod_args {
2966 	char	*path;
2967 	int	mode;
2968 };
2969 #endif
2970 int
2971 sys_lchmod(td, uap)
2972 	struct thread *td;
2973 	register struct lchmod_args /* {
2974 		char *path;
2975 		int mode;
2976 	} */ *uap;
2977 {
2978 
2979 	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2980 	    uap->mode, AT_SYMLINK_NOFOLLOW));
2981 }
2982 
2983 
2984 int
2985 kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2986     mode_t mode, int flag)
2987 {
2988 	int error;
2989 	struct nameidata nd;
2990 	int vfslocked;
2991 	int follow;
2992 
2993 	AUDIT_ARG_MODE(mode);
2994 	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2995 	NDINIT_ATRIGHTS(&nd, LOOKUP,  follow | MPSAFE | AUDITVNODE1, pathseg,
2996 	    path, fd, CAP_FCHMOD, td);
2997 	if ((error = namei(&nd)) != 0)
2998 		return (error);
2999 	vfslocked = NDHASGIANT(&nd);
3000 	NDFREE(&nd, NDF_ONLY_PNBUF);
3001 	error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
3002 	vrele(nd.ni_vp);
3003 	VFS_UNLOCK_GIANT(vfslocked);
3004 	return (error);
3005 }
3006 
3007 /*
3008  * Change mode of a file given a file descriptor.
3009  */
3010 #ifndef _SYS_SYSPROTO_H_
3011 struct fchmod_args {
3012 	int	fd;
3013 	int	mode;
3014 };
3015 #endif
3016 int
3017 sys_fchmod(struct thread *td, struct fchmod_args *uap)
3018 {
3019 	struct file *fp;
3020 	int error;
3021 
3022 	AUDIT_ARG_FD(uap->fd);
3023 	AUDIT_ARG_MODE(uap->mode);
3024 
3025 	error = fget(td, uap->fd, CAP_FCHMOD, &fp);
3026 	if (error != 0)
3027 		return (error);
3028 	error = fo_chmod(fp, uap->mode, td->td_ucred, td);
3029 	fdrop(fp, td);
3030 	return (error);
3031 }
3032 
3033 /*
3034  * Common implementation for chown(), lchown(), and fchown()
3035  */
3036 int
3037 setfown(td, cred, vp, uid, gid)
3038 	struct thread *td;
3039 	struct ucred *cred;
3040 	struct vnode *vp;
3041 	uid_t uid;
3042 	gid_t gid;
3043 {
3044 	int error;
3045 	struct mount *mp;
3046 	struct vattr vattr;
3047 
3048 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3049 		return (error);
3050 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3051 	VATTR_NULL(&vattr);
3052 	vattr.va_uid = uid;
3053 	vattr.va_gid = gid;
3054 #ifdef MAC
3055 	error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
3056 	    vattr.va_gid);
3057 	if (error == 0)
3058 #endif
3059 		error = VOP_SETATTR(vp, &vattr, cred);
3060 	VOP_UNLOCK(vp, 0);
3061 	vn_finished_write(mp);
3062 	return (error);
3063 }
3064 
3065 /*
3066  * Set ownership given a path name.
3067  */
3068 #ifndef _SYS_SYSPROTO_H_
3069 struct chown_args {
3070 	char	*path;
3071 	int	uid;
3072 	int	gid;
3073 };
3074 #endif
3075 int
3076 sys_chown(td, uap)
3077 	struct thread *td;
3078 	register struct chown_args /* {
3079 		char *path;
3080 		int uid;
3081 		int gid;
3082 	} */ *uap;
3083 {
3084 
3085 	return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
3086 }
3087 
3088 #ifndef _SYS_SYSPROTO_H_
3089 struct fchownat_args {
3090 	int fd;
3091 	const char * path;
3092 	uid_t uid;
3093 	gid_t gid;
3094 	int flag;
3095 };
3096 #endif
3097 int
3098 sys_fchownat(struct thread *td, struct fchownat_args *uap)
3099 {
3100 	int flag;
3101 
3102 	flag = uap->flag;
3103 	if (flag & ~AT_SYMLINK_NOFOLLOW)
3104 		return (EINVAL);
3105 
3106 	return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
3107 	    uap->gid, uap->flag));
3108 }
3109 
3110 int
3111 kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
3112     int gid)
3113 {
3114 
3115 	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid, 0));
3116 }
3117 
3118 int
3119 kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3120     int uid, int gid, int flag)
3121 {
3122 	struct nameidata nd;
3123 	int error, vfslocked, follow;
3124 
3125 	AUDIT_ARG_OWNER(uid, gid);
3126 	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
3127 	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, pathseg,
3128 	    path, fd, CAP_FCHOWN, td);
3129 
3130 	if ((error = namei(&nd)) != 0)
3131 		return (error);
3132 	vfslocked = NDHASGIANT(&nd);
3133 	NDFREE(&nd, NDF_ONLY_PNBUF);
3134 	error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
3135 	vrele(nd.ni_vp);
3136 	VFS_UNLOCK_GIANT(vfslocked);
3137 	return (error);
3138 }
3139 
3140 /*
3141  * Set ownership given a path name, do not cross symlinks.
3142  */
3143 #ifndef _SYS_SYSPROTO_H_
3144 struct lchown_args {
3145 	char	*path;
3146 	int	uid;
3147 	int	gid;
3148 };
3149 #endif
3150 int
3151 sys_lchown(td, uap)
3152 	struct thread *td;
3153 	register struct lchown_args /* {
3154 		char *path;
3155 		int uid;
3156 		int gid;
3157 	} */ *uap;
3158 {
3159 
3160 	return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
3161 }
3162 
3163 int
3164 kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
3165     int gid)
3166 {
3167 
3168 	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid,
3169 	    AT_SYMLINK_NOFOLLOW));
3170 }
3171 
3172 /*
3173  * Set ownership given a file descriptor.
3174  */
3175 #ifndef _SYS_SYSPROTO_H_
3176 struct fchown_args {
3177 	int	fd;
3178 	int	uid;
3179 	int	gid;
3180 };
3181 #endif
3182 int
3183 sys_fchown(td, uap)
3184 	struct thread *td;
3185 	register struct fchown_args /* {
3186 		int fd;
3187 		int uid;
3188 		int gid;
3189 	} */ *uap;
3190 {
3191 	struct file *fp;
3192 	int error;
3193 
3194 	AUDIT_ARG_FD(uap->fd);
3195 	AUDIT_ARG_OWNER(uap->uid, uap->gid);
3196 	error = fget(td, uap->fd, CAP_FCHOWN, &fp);
3197 	if (error != 0)
3198 		return (error);
3199 	error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
3200 	fdrop(fp, td);
3201 	return (error);
3202 }
3203 
3204 /*
3205  * Common implementation code for utimes(), lutimes(), and futimes().
3206  */
3207 static int
3208 getutimes(usrtvp, tvpseg, tsp)
3209 	const struct timeval *usrtvp;
3210 	enum uio_seg tvpseg;
3211 	struct timespec *tsp;
3212 {
3213 	struct timeval tv[2];
3214 	const struct timeval *tvp;
3215 	int error;
3216 
3217 	if (usrtvp == NULL) {
3218 		vfs_timestamp(&tsp[0]);
3219 		tsp[1] = tsp[0];
3220 	} else {
3221 		if (tvpseg == UIO_SYSSPACE) {
3222 			tvp = usrtvp;
3223 		} else {
3224 			if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
3225 				return (error);
3226 			tvp = tv;
3227 		}
3228 
3229 		if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
3230 		    tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
3231 			return (EINVAL);
3232 		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3233 		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3234 	}
3235 	return (0);
3236 }
3237 
/*
 * Common implementation code for utimes(), lutimes(), and futimes():
 * apply the access, modification and (optionally) birth times in ts[]
 * to a vnode.
 */
3241 static int
3242 setutimes(td, vp, ts, numtimes, nullflag)
3243 	struct thread *td;
3244 	struct vnode *vp;
3245 	const struct timespec *ts;
3246 	int numtimes;
3247 	int nullflag;
3248 {
3249 	int error, setbirthtime;
3250 	struct mount *mp;
3251 	struct vattr vattr;
3252 
3253 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3254 		return (error);
3255 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3256 	setbirthtime = 0;
3257 	if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
3258 	    timespeccmp(&ts[1], &vattr.va_birthtime, < ))
3259 		setbirthtime = 1;
3260 	VATTR_NULL(&vattr);
3261 	vattr.va_atime = ts[0];
3262 	vattr.va_mtime = ts[1];
3263 	if (setbirthtime)
3264 		vattr.va_birthtime = ts[1];
3265 	if (numtimes > 2)
3266 		vattr.va_birthtime = ts[2];
3267 	if (nullflag)
3268 		vattr.va_vaflags |= VA_UTIMES_NULL;
3269 #ifdef MAC
3270 	error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
3271 	    vattr.va_mtime);
3272 #endif
3273 	if (error == 0)
3274 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3275 	VOP_UNLOCK(vp, 0);
3276 	vn_finished_write(mp);
3277 	return (error);
3278 }
3279 
3280 /*
3281  * Set the access and modification times of a file.
3282  */
3283 #ifndef _SYS_SYSPROTO_H_
3284 struct utimes_args {
3285 	char	*path;
3286 	struct	timeval *tptr;
3287 };
3288 #endif
3289 int
3290 sys_utimes(td, uap)
3291 	struct thread *td;
3292 	register struct utimes_args /* {
3293 		char *path;
3294 		struct timeval *tptr;
3295 	} */ *uap;
3296 {
3297 
3298 	return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3299 	    UIO_USERSPACE));
3300 }
3301 
3302 #ifndef _SYS_SYSPROTO_H_
3303 struct futimesat_args {
3304 	int fd;
3305 	const char * path;
3306 	const struct timeval * times;
3307 };
3308 #endif
3309 int
3310 sys_futimesat(struct thread *td, struct futimesat_args *uap)
3311 {
3312 
3313 	return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
3314 	    uap->times, UIO_USERSPACE));
3315 }
3316 
3317 int
3318 kern_utimes(struct thread *td, char *path, enum uio_seg pathseg,
3319     struct timeval *tptr, enum uio_seg tptrseg)
3320 {
3321 
3322 	return (kern_utimesat(td, AT_FDCWD, path, pathseg, tptr, tptrseg));
3323 }
3324 
3325 int
3326 kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3327     struct timeval *tptr, enum uio_seg tptrseg)
3328 {
3329 	struct nameidata nd;
3330 	struct timespec ts[2];
3331 	int error, vfslocked;
3332 
3333 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3334 		return (error);
3335 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg,
3336 	    path, fd, CAP_FUTIMES, td);
3337 
3338 	if ((error = namei(&nd)) != 0)
3339 		return (error);
3340 	vfslocked = NDHASGIANT(&nd);
3341 	NDFREE(&nd, NDF_ONLY_PNBUF);
3342 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3343 	vrele(nd.ni_vp);
3344 	VFS_UNLOCK_GIANT(vfslocked);
3345 	return (error);
3346 }
3347 
/*
 * Set the access and modification times of a file; this version does
 * not follow links.
 */
3351 #ifndef _SYS_SYSPROTO_H_
3352 struct lutimes_args {
3353 	char	*path;
3354 	struct	timeval *tptr;
3355 };
3356 #endif
3357 int
3358 sys_lutimes(td, uap)
3359 	struct thread *td;
3360 	register struct lutimes_args /* {
3361 		char *path;
3362 		struct timeval *tptr;
3363 	} */ *uap;
3364 {
3365 
3366 	return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3367 	    UIO_USERSPACE));
3368 }
3369 
3370 int
3371 kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
3372     struct timeval *tptr, enum uio_seg tptrseg)
3373 {
3374 	struct timespec ts[2];
3375 	int error;
3376 	struct nameidata nd;
3377 	int vfslocked;
3378 
3379 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3380 		return (error);
3381 	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
3382 	if ((error = namei(&nd)) != 0)
3383 		return (error);
3384 	vfslocked = NDHASGIANT(&nd);
3385 	NDFREE(&nd, NDF_ONLY_PNBUF);
3386 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3387 	vrele(nd.ni_vp);
3388 	VFS_UNLOCK_GIANT(vfslocked);
3389 	return (error);
3390 }
3391 
/*
 * Set the access and modification times of a file given a file
 * descriptor.
 */
3395 #ifndef _SYS_SYSPROTO_H_
3396 struct futimes_args {
3397 	int	fd;
3398 	struct	timeval *tptr;
3399 };
3400 #endif
3401 int
3402 sys_futimes(td, uap)
3403 	struct thread *td;
3404 	register struct futimes_args /* {
3405 		int  fd;
3406 		struct timeval *tptr;
3407 	} */ *uap;
3408 {
3409 
3410 	return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
3411 }
3412 
3413 int
3414 kern_futimes(struct thread *td, int fd, struct timeval *tptr,
3415     enum uio_seg tptrseg)
3416 {
3417 	struct timespec ts[2];
3418 	struct file *fp;
3419 	int vfslocked;
3420 	int error;
3421 
3422 	AUDIT_ARG_FD(fd);
3423 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3424 		return (error);
3425 	if ((error = getvnode(td->td_proc->p_fd, fd, CAP_FUTIMES, &fp))
3426 	    != 0)
3427 		return (error);
3428 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
3429 #ifdef AUDIT
3430 	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3431 	AUDIT_ARG_VNODE1(fp->f_vnode);
3432 	VOP_UNLOCK(fp->f_vnode, 0);
3433 #endif
3434 	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
3435 	VFS_UNLOCK_GIANT(vfslocked);
3436 	fdrop(fp, td);
3437 	return (error);
3438 }
3439 
3440 /*
3441  * Truncate a file given its path name.
3442  */
3443 #ifndef _SYS_SYSPROTO_H_
3444 struct truncate_args {
3445 	char	*path;
3446 	int	pad;
3447 	off_t	length;
3448 };
3449 #endif
3450 int
3451 sys_truncate(td, uap)
3452 	struct thread *td;
3453 	register struct truncate_args /* {
3454 		char *path;
3455 		int pad;
3456 		off_t length;
3457 	} */ *uap;
3458 {
3459 
3460 	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3461 }
3462 
3463 int
3464 kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
3465 {
3466 	struct mount *mp;
3467 	struct vnode *vp;
3468 	struct vattr vattr;
3469 	int error;
3470 	struct nameidata nd;
3471 	int vfslocked;
3472 
3473 	if (length < 0)
3474 		return(EINVAL);
3475 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
3476 	if ((error = namei(&nd)) != 0)
3477 		return (error);
3478 	vfslocked = NDHASGIANT(&nd);
3479 	vp = nd.ni_vp;
3480 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
3481 		vrele(vp);
3482 		VFS_UNLOCK_GIANT(vfslocked);
3483 		return (error);
3484 	}
3485 	NDFREE(&nd, NDF_ONLY_PNBUF);
3486 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3487 	if (vp->v_type == VDIR)
3488 		error = EISDIR;
3489 #ifdef MAC
3490 	else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
3491 	}
3492 #endif
3493 	else if ((error = vn_writechk(vp)) == 0 &&
3494 	    (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
3495 		VATTR_NULL(&vattr);
3496 		vattr.va_size = length;
3497 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3498 	}
3499 	vput(vp);
3500 	vn_finished_write(mp);
3501 	VFS_UNLOCK_GIANT(vfslocked);
3502 	return (error);
3503 }
3504 
#if defined(COMPAT_43)
/*
 * Truncate a file given its path name.
 */
#ifndef _SYS_SYSPROTO_H_
struct otruncate_args {
	char	*path;
	long	length;
};
#endif
/*
 * Repackages the old-style arguments (long length) into a
 * truncate_args and calls sys_truncate().
 */
int
otruncate(struct thread *td, struct otruncate_args *uap)
{
	struct truncate_args nuap;

	nuap.length = uap->length;
	nuap.path = uap->path;
	return (sys_truncate(td, &nuap));
}
#endif /* COMPAT_43 */
3534 
3535 /* Versions with the pad argument */
3536 int
3537 freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
3538 {
3539 	struct truncate_args ouap;
3540 
3541 	ouap.path = uap->path;
3542 	ouap.length = uap->length;
3543 	return (sys_truncate(td, &ouap));
3544 }
3545 
3546 int
3547 freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
3548 {
3549 	struct ftruncate_args ouap;
3550 
3551 	ouap.fd = uap->fd;
3552 	ouap.length = uap->length;
3553 	return (sys_ftruncate(td, &ouap));
3554 }
3555 
3556 /*
3557  * Sync an open file.
3558  */
3559 #ifndef _SYS_SYSPROTO_H_
3560 struct fsync_args {
3561 	int	fd;
3562 };
3563 #endif
3564 int
3565 sys_fsync(td, uap)
3566 	struct thread *td;
3567 	struct fsync_args /* {
3568 		int fd;
3569 	} */ *uap;
3570 {
3571 	struct vnode *vp;
3572 	struct mount *mp;
3573 	struct file *fp;
3574 	int vfslocked;
3575 	int error, lock_flags;
3576 
3577 	AUDIT_ARG_FD(uap->fd);
3578 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_FSYNC,
3579 	    &fp)) != 0)
3580 		return (error);
3581 	vp = fp->f_vnode;
3582 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
3583 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3584 		goto drop;
3585 	if (MNT_SHARED_WRITES(mp) ||
3586 	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
3587 		lock_flags = LK_SHARED;
3588 	} else {
3589 		lock_flags = LK_EXCLUSIVE;
3590 	}
3591 	vn_lock(vp, lock_flags | LK_RETRY);
3592 	AUDIT_ARG_VNODE1(vp);
3593 	if (vp->v_object != NULL) {
3594 		VM_OBJECT_LOCK(vp->v_object);
3595 		vm_object_page_clean(vp->v_object, 0, 0, 0);
3596 		VM_OBJECT_UNLOCK(vp->v_object);
3597 	}
3598 	error = VOP_FSYNC(vp, MNT_WAIT, td);
3599 
3600 	VOP_UNLOCK(vp, 0);
3601 	vn_finished_write(mp);
3602 drop:
3603 	VFS_UNLOCK_GIANT(vfslocked);
3604 	fdrop(fp, td);
3605 	return (error);
3606 }
3607 
3608 /*
3609  * Rename files.  Source and destination must either both be directories, or
3610  * both not be directories.  If target is a directory, it must be empty.
3611  */
3612 #ifndef _SYS_SYSPROTO_H_
3613 struct rename_args {
3614 	char	*from;
3615 	char	*to;
3616 };
3617 #endif
3618 int
3619 sys_rename(td, uap)
3620 	struct thread *td;
3621 	register struct rename_args /* {
3622 		char *from;
3623 		char *to;
3624 	} */ *uap;
3625 {
3626 
3627 	return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE));
3628 }
3629 
3630 #ifndef _SYS_SYSPROTO_H_
3631 struct renameat_args {
3632 	int	oldfd;
3633 	char	*old;
3634 	int	newfd;
3635 	char	*new;
3636 };
3637 #endif
3638 int
3639 sys_renameat(struct thread *td, struct renameat_args *uap)
3640 {
3641 
3642 	return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
3643 	    UIO_USERSPACE));
3644 }
3645 
3646 int
3647 kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg)
3648 {
3649 
3650 	return (kern_renameat(td, AT_FDCWD, from, AT_FDCWD, to, pathseg));
3651 }
3652 
/*
 * Rename the object named by "old" (looked up relative to oldfd) to
 * "new" (looked up relative to newfd).
 *
 * The source is looked up first, then the target.  A handful of
 * type/identity checks are made here (directory vs. non-directory
 * mismatch, source being the target's parent directory, source and
 * target being the same vnode) before the operation is handed to
 * VOP_RENAME().
 *
 * Internally error == -1 is used as a sentinel for "source and target
 * are the same vnode, nothing to do"; it is translated to 0 just before
 * returning.
 *
 * Returns 0 on success or an errno value.
 */
int
kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
    enum uio_seg pathseg)
{
	struct mount *mp = NULL;
	struct vnode *tvp, *fvp, *tdvp;
	struct nameidata fromnd, tond;
	int tvfslocked;
	int fvfslocked;
	int error;

	bwillwrite();
	/*
	 * With MAC the source lookup additionally locks the parent and the
	 * leaf (LOCKPARENT | LOCKLEAF); both are unlocked again right after
	 * the MAC check below.  Without MAC only WANTPARENT is requested.
	 */
#ifdef MAC
	NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
	    MPSAFE | AUDITVNODE1, pathseg, old, oldfd, CAP_DELETE, td);
#else
	NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | MPSAFE |
	    AUDITVNODE1, pathseg, old, oldfd, CAP_DELETE, td);
#endif

	if ((error = namei(&fromnd)) != 0)
		return (error);
	fvfslocked = NDHASGIANT(&fromnd);
	/* Target lookup has not happened yet; nothing to unlock for it. */
	tvfslocked = 0;
#ifdef MAC
	error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
	    fromnd.ni_vp, &fromnd.ni_cnd);
	VOP_UNLOCK(fromnd.ni_dvp, 0);
	/* Avoid unlocking twice when parent and leaf are the same vnode. */
	if (fromnd.ni_dvp != fromnd.ni_vp)
		VOP_UNLOCK(fromnd.ni_vp, 0);
#endif
	fvp = fromnd.ni_vp;
	/* error is 0 here unless the MAC check above failed. */
	if (error == 0)
		error = vn_start_write(fvp, &mp, V_WAIT | PCATCH);
	if (error != 0) {
		/* Only the source lookup state needs to be undone. */
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
		goto out1;
	}
	NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
	    SAVESTART | MPSAFE | AUDITVNODE2, pathseg, new, newfd, CAP_CREATE,
	    td);
	/* Tell the target lookup that a directory is being renamed. */
	if (fromnd.ni_vp->v_type == VDIR)
		tond.ni_cnd.cn_flags |= WILLBEDIR;
	if ((error = namei(&tond)) != 0) {
		/* Translate error code for rename("dir1", "dir2/."). */
		if (error == EISDIR && fvp->v_type == VDIR)
			error = EINVAL;
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
		vn_finished_write(mp);
		goto out1;
	}
	tvfslocked = NDHASGIANT(&tond);
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;
	/* If the target exists, source and target must agree on being a directory. */
	if (tvp != NULL) {
		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
			error = ENOTDIR;
			goto out;
		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
			error = EISDIR;
			goto out;
		}
	}
	/* The source may not be the directory that will contain the target. */
	if (fvp == tdvp) {
		error = EINVAL;
		goto out;
	}
	/*
	 * If the source is the same as the destination (that is, if they
	 * are links to the same vnode), then there is nothing to do.
	 */
	if (fvp == tvp)
		error = -1;
#ifdef MAC
	else
		error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
		    tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
#endif
out:
	if (!error) {
		/*
		 * NOTE(review): no vput()/vrele() of the four vnodes follows
		 * on this path, unlike the else branch; presumably
		 * VOP_RENAME() releases them itself -- confirm against
		 * VOP_RENAME(9).
		 */
		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
				   tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		NDFREE(&tond, NDF_ONLY_PNBUF);
	} else {
		/* Error or "nothing to do": undo both lookups by hand. */
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		NDFREE(&tond, NDF_ONLY_PNBUF);
		if (tvp)
			vput(tvp);
		/* When tdvp == tvp the vput() above already unlocked it. */
		if (tdvp == tvp)
			vrele(tdvp);
		else
			vput(tdvp);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
	}
	vrele(tond.ni_startdir);
	vn_finished_write(mp);
out1:
	if (fromnd.ni_startdir)
		vrele(fromnd.ni_startdir);
	VFS_UNLOCK_GIANT(fvfslocked);
	VFS_UNLOCK_GIANT(tvfslocked);
	/* -1 is the internal "same vnode" sentinel, reported as success. */
	if (error == -1)
		return (0);
	return (error);
}
3764 
3765 /*
3766  * Make a directory file.
3767  */
3768 #ifndef _SYS_SYSPROTO_H_
3769 struct mkdir_args {
3770 	char	*path;
3771 	int	mode;
3772 };
3773 #endif
3774 int
3775 sys_mkdir(td, uap)
3776 	struct thread *td;
3777 	register struct mkdir_args /* {
3778 		char *path;
3779 		int mode;
3780 	} */ *uap;
3781 {
3782 
3783 	return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode));
3784 }
3785 
3786 #ifndef _SYS_SYSPROTO_H_
3787 struct mkdirat_args {
3788 	int	fd;
3789 	char	*path;
3790 	mode_t	mode;
3791 };
3792 #endif
3793 int
3794 sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
3795 {
3796 
3797 	return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
3798 }
3799 
3800 int
3801 kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode)
3802 {
3803 
3804 	return (kern_mkdirat(td, AT_FDCWD, path, segflg, mode));
3805 }
3806 
/*
 * Create the directory named by `path', looked up relative to descriptor
 * `fd'.
 *
 * The requested mode is limited to ACCESSPERMS and filtered through the
 * process's fd_cmask.  Returns EEXIST when the lookup finds an existing
 * vnode, otherwise 0 or an errno value from the lookup, vn_start_write(),
 * the MAC check or VOP_MKDIR().
 */
int
kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
    int mode)
{
	struct mount *mp;
	struct vnode *vp;
	struct vattr vattr;
	int error;
	struct nameidata nd;
	int vfslocked;

	AUDIT_ARG_MODE(mode);
restart:
	bwillwrite();
	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE |
	    AUDITVNODE1, segflg, path, fd, CAP_MKDIR, td);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	if ((error = namei(&nd)) != 0)
		return (error);
	vfslocked = NDHASGIANT(&nd);
	vp = nd.ni_vp;
	/* A non-NULL leaf means the name already exists. */
	if (vp != NULL) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		/*
		 * XXX namei called with LOCKPARENT but not LOCKLEAF has
		 * the strange behaviour of leaving the vnode unlocked
		 * if the target is the same vnode as the parent.
		 */
		if (vp == nd.ni_dvp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(vp);
		VFS_UNLOCK_GIANT(vfslocked);
		return (EEXIST);
	}
	/*
	 * Try to start the write without waiting (V_NOWAIT).  If that fails,
	 * release the lookup state, call vn_start_write() again with
	 * V_XSLEEP | PCATCH, and redo the lookup from scratch.
	 */
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		VFS_UNLOCK_GIANT(vfslocked);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VDIR;
	vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
#ifdef MAC
	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
	    &vattr);
	if (error)
		goto out;
#endif
	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
#ifdef MAC
out:
#endif
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(nd.ni_dvp);
	/* nd.ni_vp is only released when VOP_MKDIR() succeeded. */
	if (!error)
		vput(nd.ni_vp);
	vn_finished_write(mp);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}
3872 
3873 /*
3874  * Remove a directory file.
3875  */
3876 #ifndef _SYS_SYSPROTO_H_
3877 struct rmdir_args {
3878 	char	*path;
3879 };
3880 #endif
3881 int
3882 sys_rmdir(td, uap)
3883 	struct thread *td;
3884 	struct rmdir_args /* {
3885 		char *path;
3886 	} */ *uap;
3887 {
3888 
3889 	return (kern_rmdir(td, uap->path, UIO_USERSPACE));
3890 }
3891 
3892 int
3893 kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg)
3894 {
3895 
3896 	return (kern_rmdirat(td, AT_FDCWD, path, pathseg));
3897 }
3898 
/*
 * Remove the directory named by `path', looked up relative to descriptor
 * `fd'.
 *
 * Returns ENOTDIR when the target is not a directory, EINVAL when the
 * target is the same vnode as its parent (rmdir "."), EBUSY when the target
 * is flagged VV_ROOT, and otherwise 0 or an errno value from the lookup,
 * the MAC check, vn_start_write() or VOP_RMDIR().
 */
int
kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
{
	struct mount *mp;
	struct vnode *vp;
	int error;
	struct nameidata nd;
	int vfslocked;

restart:
	bwillwrite();
	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE |
	    AUDITVNODE1, pathseg, path, fd, CAP_RMDIR, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	vfslocked = NDHASGIANT(&nd);
	vp = nd.ni_vp;
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}
	/*
	 * No rmdir "." please.
	 */
	if (nd.ni_dvp == vp) {
		error = EINVAL;
		goto out;
	}
	/*
	 * The root of a mounted filesystem cannot be deleted.
	 */
	if (vp->v_vflag & VV_ROOT) {
		error = EBUSY;
		goto out;
	}
#ifdef MAC
	error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
	    &nd.ni_cnd);
	if (error)
		goto out;
#endif
	/*
	 * Try to start the write without waiting (V_NOWAIT).  If that fails,
	 * release the lookup state, call vn_start_write() again with
	 * V_XSLEEP | PCATCH, and redo the lookup from scratch.
	 */
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(vp);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		VFS_UNLOCK_GIANT(vfslocked);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
	vn_finished_write(mp);
out:
	/* Common exit: release the leaf, then the parent if it is distinct. */
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(vp);
	if (nd.ni_dvp == vp)
		vrele(nd.ni_dvp);
	else
		vput(nd.ni_dvp);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}
3964 
3965 #ifdef COMPAT_43
3966 /*
3967  * Read a block of directory entries in a filesystem independent format.
3968  */
3969 #ifndef _SYS_SYSPROTO_H_
3970 struct ogetdirentries_args {
3971 	int	fd;
3972 	char	*buf;
3973 	u_int	count;
3974 	long	*basep;
3975 };
3976 #endif
3977 int
3978 ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
3979 {
3980 	long loff;
3981 	int error;
3982 
3983 	error = kern_ogetdirentries(td, uap, &loff);
3984 	if (error == 0)
3985 		error = copyout(&loff, uap->basep, sizeof(long));
3986 	return (error);
3987 }
3988 
/*
 * Read directory entries from descriptor uap->fd into the user buffer
 * uap->buf, rewriting d_type/d_namlen of each entry for the old dirent
 * layout.
 *
 * On success td->td_retval[0] holds the number of bytes transferred and
 * *ploff the file offset the read started from.  Returns EINVAL for a count
 * above 64KB or a non-directory vnode, EBADF when the file is not open for
 * reading, EIO when an entry with d_reclen == 0 is met, or another errno
 * value from the calls made here.
 */
int
kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
    long *ploff)
{
	struct vnode *vp;
	struct file *fp;
	struct uio auio, kuio;
	struct iovec aiov, kiov;
	struct dirent *dp, *edp;
	caddr_t dirbuf;
	int error, eofflag, readcnt, vfslocked;
	long loff;

	/* XXX arbitrary sanity limit on `count'. */
	if (uap->count > 64 * 1024)
		return (EINVAL);
	if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_READ,
	    &fp)) != 0)
		return (error);
	if ((fp->f_flag & FREAD) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	vp = fp->f_vnode;
unionread:
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	if (vp->v_type != VDIR) {
		VFS_UNLOCK_GIANT(vfslocked);
		fdrop(fp, td);
		return (EINVAL);
	}
	/* auio describes the caller's user-space buffer. */
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->count;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
	auio.uio_resid = uap->count;
	vn_lock(vp, LK_SHARED | LK_RETRY);
	/* loff remembers the offset the read starts from. */
	loff = auio.uio_offset = fp->f_offset;
#ifdef MAC
	error = mac_vnode_check_readdir(td->td_ucred, vp);
	if (error) {
		VOP_UNLOCK(vp, 0);
		VFS_UNLOCK_GIANT(vfslocked);
		fdrop(fp, td);
		return (error);
	}
#endif
	/*
	 * On builds that are not little-endian, a mount with
	 * mnt_maxsymlinklen <= 0 is read straight into the user buffer with
	 * no conversion; everything else takes the conversion block below.
	 */
#	if (BYTE_ORDER != LITTLE_ENDIAN)
		if (vp->v_mount->mnt_maxsymlinklen <= 0) {
			error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
			    NULL, NULL);
			fp->f_offset = auio.uio_offset;
		} else
#	endif
	{
		/*
		 * Read into a temporary kernel buffer (kuio is a copy of
		 * auio retargeted at dirbuf), convert each entry in place,
		 * then copy the result to the user buffer with uiomove().
		 */
		kuio = auio;
		kuio.uio_iov = &kiov;
		kuio.uio_segflg = UIO_SYSSPACE;
		kiov.iov_len = uap->count;
		dirbuf = malloc(uap->count, M_TEMP, M_WAITOK);
		kiov.iov_base = dirbuf;
		error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
			    NULL, NULL);
		fp->f_offset = kuio.uio_offset;
		if (error == 0) {
			readcnt = uap->count - kuio.uio_resid;
			edp = (struct dirent *)&dirbuf[readcnt];
			for (dp = (struct dirent *)dirbuf; dp < edp; ) {
#				if (BYTE_ORDER == LITTLE_ENDIAN)
					/*
					 * The expected low byte of
					 * dp->d_namlen is our dp->d_type.
					 * The high MBZ byte of dp->d_namlen
					 * is our dp->d_namlen.
					 */
					dp->d_type = dp->d_namlen;
					dp->d_namlen = 0;
#				else
					/*
					 * The dp->d_type is the high byte
					 * of the expected dp->d_namlen,
					 * so must be zero'ed.
					 */
					dp->d_type = 0;
#				endif
				/* A zero record length would never advance. */
				if (dp->d_reclen > 0) {
					dp = (struct dirent *)
					    ((char *)dp + dp->d_reclen);
				} else {
					error = EIO;
					break;
				}
			}
			/* Copy out only if the walk reached the end. */
			if (dp >= edp)
				error = uiomove(dirbuf, readcnt, &auio);
		}
		free(dirbuf, M_TEMP);
	}
	if (error) {
		VOP_UNLOCK(vp, 0);
		VFS_UNLOCK_GIANT(vfslocked);
		fdrop(fp, td);
		return (error);
	}
	/*
	 * Nothing was transferred from a VV_ROOT vnode of a MNT_UNION mount:
	 * point the file at the covered vnode, reset the offset and retry.
	 */
	if (uap->count == auio.uio_resid &&
	    (vp->v_vflag & VV_ROOT) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		struct vnode *tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		VREF(vp);
		fp->f_vnode = vp;
		fp->f_data = vp;
		fp->f_offset = 0;
		vput(tvp);
		VFS_UNLOCK_GIANT(vfslocked);
		goto unionread;
	}
	VOP_UNLOCK(vp, 0);
	VFS_UNLOCK_GIANT(vfslocked);
	fdrop(fp, td);
	td->td_retval[0] = uap->count - auio.uio_resid;
	if (error == 0)
		*ploff = loff;
	return (error);
}
4117 #endif /* COMPAT_43 */
4118 
4119 /*
4120  * Read a block of directory entries in a filesystem independent format.
4121  */
4122 #ifndef _SYS_SYSPROTO_H_
4123 struct getdirentries_args {
4124 	int	fd;
4125 	char	*buf;
4126 	u_int	count;
4127 	long	*basep;
4128 };
4129 #endif
4130 int
4131 sys_getdirentries(td, uap)
4132 	struct thread *td;
4133 	register struct getdirentries_args /* {
4134 		int fd;
4135 		char *buf;
4136 		u_int count;
4137 		long *basep;
4138 	} */ *uap;
4139 {
4140 	long base;
4141 	int error;
4142 
4143 	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base);
4144 	if (error)
4145 		return (error);
4146 	if (uap->basep != NULL)
4147 		error = copyout(&base, uap->basep, sizeof(long));
4148 	return (error);
4149 }
4150 
/*
 * Read directory entries from descriptor `fd' into the user buffer `buf' of
 * `count' bytes.
 *
 * On success td->td_retval[0] holds the number of bytes transferred and
 * *basep the file offset the read started from.  Returns EINVAL for a count
 * above INT_MAX or a non-directory vnode, EBADF when the file is not open
 * for reading, or the error from getvnode(), the MAC check or
 * VOP_READDIR().
 */
int
kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
    long *basep)
{
	struct vnode *vp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	int vfslocked;
	long loff;
	int error, eofflag;

	AUDIT_ARG_FD(fd);
	if (count > INT_MAX)
		return (EINVAL);
	if ((error = getvnode(td->td_proc->p_fd, fd, CAP_READ | CAP_SEEK,
	    &fp)) != 0)
		return (error);
	if ((fp->f_flag & FREAD) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	vp = fp->f_vnode;
unionread:
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	if (vp->v_type != VDIR) {
		VFS_UNLOCK_GIANT(vfslocked);
		error = EINVAL;
		goto fail;
	}
	/* auio describes the caller's user-space buffer. */
	aiov.iov_base = buf;
	aiov.iov_len = count;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
	auio.uio_resid = count;
	vn_lock(vp, LK_SHARED | LK_RETRY);
	AUDIT_ARG_VNODE1(vp);
	/* loff remembers the offset the read starts from. */
	loff = auio.uio_offset = fp->f_offset;
#ifdef MAC
	error = mac_vnode_check_readdir(td->td_ucred, vp);
	if (error == 0)
#endif
		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
		    NULL);
	/* The file offset is updated from auio even when an error occurred. */
	fp->f_offset = auio.uio_offset;
	if (error) {
		VOP_UNLOCK(vp, 0);
		VFS_UNLOCK_GIANT(vfslocked);
		goto fail;
	}
	/*
	 * Nothing was transferred from a VV_ROOT vnode of a MNT_UNION mount:
	 * point the file at the covered vnode, reset the offset and retry.
	 */
	if (count == auio.uio_resid &&
	    (vp->v_vflag & VV_ROOT) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		struct vnode *tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		VREF(vp);
		fp->f_vnode = vp;
		fp->f_data = vp;
		fp->f_offset = 0;
		vput(tvp);
		VFS_UNLOCK_GIANT(vfslocked);
		goto unionread;
	}
	VOP_UNLOCK(vp, 0);
	VFS_UNLOCK_GIANT(vfslocked);
	*basep = loff;
	td->td_retval[0] = count - auio.uio_resid;
fail:
	/* Reached on success as well; error is 0 in that case. */
	fdrop(fp, td);
	return (error);
}
4225 
4226 #ifndef _SYS_SYSPROTO_H_
4227 struct getdents_args {
4228 	int fd;
4229 	char *buf;
4230 	size_t count;
4231 };
4232 #endif
4233 int
4234 sys_getdents(td, uap)
4235 	struct thread *td;
4236 	register struct getdents_args /* {
4237 		int fd;
4238 		char *buf;
4239 		u_int count;
4240 	} */ *uap;
4241 {
4242 	struct getdirentries_args ap;
4243 	ap.fd = uap->fd;
4244 	ap.buf = uap->buf;
4245 	ap.count = uap->count;
4246 	ap.basep = NULL;
4247 	return (sys_getdirentries(td, &ap));
4248 }
4249 
4250 /*
4251  * Set the mode mask for creation of filesystem nodes.
4252  */
4253 #ifndef _SYS_SYSPROTO_H_
4254 struct umask_args {
4255 	int	newmask;
4256 };
4257 #endif
4258 int
4259 sys_umask(td, uap)
4260 	struct thread *td;
4261 	struct umask_args /* {
4262 		int newmask;
4263 	} */ *uap;
4264 {
4265 	register struct filedesc *fdp;
4266 
4267 	FILEDESC_XLOCK(td->td_proc->p_fd);
4268 	fdp = td->td_proc->p_fd;
4269 	td->td_retval[0] = fdp->fd_cmask;
4270 	fdp->fd_cmask = uap->newmask & ALLPERMS;
4271 	FILEDESC_XUNLOCK(td->td_proc->p_fd);
4272 	return (0);
4273 }
4274 
4275 /*
4276  * Void all references to file by ripping underlying filesystem away from
4277  * vnode.
4278  */
4279 #ifndef _SYS_SYSPROTO_H_
4280 struct revoke_args {
4281 	char	*path;
4282 };
4283 #endif
4284 int
4285 sys_revoke(td, uap)
4286 	struct thread *td;
4287 	register struct revoke_args /* {
4288 		char *path;
4289 	} */ *uap;
4290 {
4291 	struct vnode *vp;
4292 	struct vattr vattr;
4293 	int error;
4294 	struct nameidata nd;
4295 	int vfslocked;
4296 
4297 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
4298 	    UIO_USERSPACE, uap->path, td);
4299 	if ((error = namei(&nd)) != 0)
4300 		return (error);
4301 	vfslocked = NDHASGIANT(&nd);
4302 	vp = nd.ni_vp;
4303 	NDFREE(&nd, NDF_ONLY_PNBUF);
4304 	if (vp->v_type != VCHR || vp->v_rdev == NULL) {
4305 		error = EINVAL;
4306 		goto out;
4307 	}
4308 #ifdef MAC
4309 	error = mac_vnode_check_revoke(td->td_ucred, vp);
4310 	if (error)
4311 		goto out;
4312 #endif
4313 	error = VOP_GETATTR(vp, &vattr, td->td_ucred);
4314 	if (error)
4315 		goto out;
4316 	if (td->td_ucred->cr_uid != vattr.va_uid) {
4317 		error = priv_check(td, PRIV_VFS_ADMIN);
4318 		if (error)
4319 			goto out;
4320 	}
4321 	if (vcount(vp) > 1)
4322 		VOP_REVOKE(vp, REVOKEALL);
4323 out:
4324 	vput(vp);
4325 	VFS_UNLOCK_GIANT(vfslocked);
4326 	return (error);
4327 }
4328 
4329 /*
4330  * Convert a user file descriptor to a kernel file entry and check that, if it
4331  * is a capability, the correct rights are present. A reference on the file
4332  * entry is held upon returning.
4333  */
4334 int
4335 getvnode(struct filedesc *fdp, int fd, cap_rights_t rights,
4336     struct file **fpp)
4337 {
4338 	struct file *fp;
4339 #ifdef CAPABILITIES
4340 	struct file *fp_fromcap;
4341 #endif
4342 	int error;
4343 
4344 	error = 0;
4345 	fp = NULL;
4346 	if ((fdp == NULL) || (fp = fget_unlocked(fdp, fd)) == NULL)
4347 		return (EBADF);
4348 #ifdef CAPABILITIES
4349 	/*
4350 	 * If the file descriptor is for a capability, test rights and use the
4351 	 * file descriptor referenced by the capability.
4352 	 */
4353 	error = cap_funwrap(fp, rights, &fp_fromcap);
4354 	if (error) {
4355 		fdrop(fp, curthread);
4356 		return (error);
4357 	}
4358 	if (fp != fp_fromcap) {
4359 		fhold(fp_fromcap);
4360 		fdrop(fp, curthread);
4361 		fp = fp_fromcap;
4362 	}
4363 #endif /* CAPABILITIES */
4364 
4365 	/*
4366 	 * The file could be not of the vnode type, or it may be not
4367 	 * yet fully initialized, in which case the f_vnode pointer
4368 	 * may be set, but f_ops is still badfileops.  E.g.,
4369 	 * devfs_open() transiently create such situation to
4370 	 * facilitate csw d_fdopen().
4371 	 *
4372 	 * Dupfdopen() handling in kern_openat() installs the
4373 	 * half-baked file into the process descriptor table, allowing
4374 	 * other thread to dereference it. Guard against the race by
4375 	 * checking f_ops.
4376 	 */
4377 	if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
4378 		fdrop(fp, curthread);
4379 		return (EINVAL);
4380 	}
4381 	*fpp = fp;
4382 	return (0);
4383 }
4384 
4385 
4386 /*
4387  * Get an (NFS) file handle.
4388  */
4389 #ifndef _SYS_SYSPROTO_H_
4390 struct lgetfh_args {
4391 	char	*fname;
4392 	fhandle_t *fhp;
4393 };
4394 #endif
4395 int
4396 sys_lgetfh(td, uap)
4397 	struct thread *td;
4398 	register struct lgetfh_args *uap;
4399 {
4400 	struct nameidata nd;
4401 	fhandle_t fh;
4402 	register struct vnode *vp;
4403 	int vfslocked;
4404 	int error;
4405 
4406 	error = priv_check(td, PRIV_VFS_GETFH);
4407 	if (error)
4408 		return (error);
4409 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
4410 	    UIO_USERSPACE, uap->fname, td);
4411 	error = namei(&nd);
4412 	if (error)
4413 		return (error);
4414 	vfslocked = NDHASGIANT(&nd);
4415 	NDFREE(&nd, NDF_ONLY_PNBUF);
4416 	vp = nd.ni_vp;
4417 	bzero(&fh, sizeof(fh));
4418 	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4419 	error = VOP_VPTOFH(vp, &fh.fh_fid);
4420 	vput(vp);
4421 	VFS_UNLOCK_GIANT(vfslocked);
4422 	if (error)
4423 		return (error);
4424 	error = copyout(&fh, uap->fhp, sizeof (fh));
4425 	return (error);
4426 }
4427 
4428 #ifndef _SYS_SYSPROTO_H_
4429 struct getfh_args {
4430 	char	*fname;
4431 	fhandle_t *fhp;
4432 };
4433 #endif
4434 int
4435 sys_getfh(td, uap)
4436 	struct thread *td;
4437 	register struct getfh_args *uap;
4438 {
4439 	struct nameidata nd;
4440 	fhandle_t fh;
4441 	register struct vnode *vp;
4442 	int vfslocked;
4443 	int error;
4444 
4445 	error = priv_check(td, PRIV_VFS_GETFH);
4446 	if (error)
4447 		return (error);
4448 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
4449 	    UIO_USERSPACE, uap->fname, td);
4450 	error = namei(&nd);
4451 	if (error)
4452 		return (error);
4453 	vfslocked = NDHASGIANT(&nd);
4454 	NDFREE(&nd, NDF_ONLY_PNBUF);
4455 	vp = nd.ni_vp;
4456 	bzero(&fh, sizeof(fh));
4457 	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4458 	error = VOP_VPTOFH(vp, &fh.fh_fid);
4459 	vput(vp);
4460 	VFS_UNLOCK_GIANT(vfslocked);
4461 	if (error)
4462 		return (error);
4463 	error = copyout(&fh, uap->fhp, sizeof (fh));
4464 	return (error);
4465 }
4466 
/*
 * syscall for the rpc.lockd to use to translate a NFS file handle into an
 * open descriptor.
 *
 * warning: do not remove the priv_check() call or this becomes one giant
 * security hole.
 *
 * The open flags must include read or write access and must not include
 * O_CREAT (EINVAL otherwise).  ESTALE is returned when no mount matches the
 * handle's fsid.  On success the new descriptor index is returned in
 * td->td_retval[0].
 */
#ifndef _SYS_SYSPROTO_H_
struct fhopen_args {
	const struct fhandle *u_fhp;
	int flags;
};
#endif
int
sys_fhopen(td, uap)
	struct thread *td;
	struct fhopen_args /* {
		const struct fhandle *u_fhp;
		int flags;
	} */ *uap;
{
	struct proc *p = td->td_proc;
	struct mount *mp;
	struct vnode *vp;
	struct fhandle fhp;
	struct vattr vat;
	struct vattr *vap = &vat;
	struct flock lf;
	struct file *fp;
	register struct filedesc *fdp = p->p_fd;
	int fmode, error, type;
	accmode_t accmode;
	struct file *nfp;
	int vfslocked;
	int indx;

	error = priv_check(td, PRIV_VFS_FHOPEN);
	if (error)
		return (error);
	fmode = FFLAGS(uap->flags);
	/* why not allow a non-read/write open for our lockd? */
	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
		return (EINVAL);
	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
	if (error)
		return(error);
	/* find the mount point */
	mp = vfs_busyfs(&fhp.fh_fsid);
	if (mp == NULL)
		return (ESTALE);
	vfslocked = VFS_LOCK_GIANT(mp);
	/* now give me my vnode, it gets returned to me locked */
	error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
	vfs_unbusy(mp);
	if (error)
		goto out;
	/*
	 * from now on we have to make sure not
	 * to forget about the vnode
	 * any error that causes an abort must vput(vp)
	 * just set error = err and 'goto bad;'.
	 */

	/*
	 * from vn_open
	 */
	if (vp->v_type == VLNK) {
		error = EMLINK;
		goto bad;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	if (vp->v_type != VDIR && fmode & O_DIRECTORY) {
		error = ENOTDIR;
		goto bad;
	}
	/* Build the access mode to check from the open flags. */
	accmode = 0;
	if (fmode & (FWRITE | O_TRUNC)) {
		if (vp->v_type == VDIR) {
			error = EISDIR;
			goto bad;
		}
		error = vn_writechk(vp);
		if (error)
			goto bad;
		accmode |= VWRITE;
	}
	if (fmode & FREAD)
		accmode |= VREAD;
	if ((fmode & O_APPEND) && (fmode & FWRITE))
		accmode |= VAPPEND;
#ifdef MAC
	error = mac_vnode_check_open(td->td_ucred, vp, accmode);
	if (error)
		goto bad;
#endif
	if (accmode) {
		error = VOP_ACCESS(vp, accmode, td->td_ucred, td);
		if (error)
			goto bad;
	}
	if (fmode & O_TRUNC) {
		/*
		 * The vnode is unlocked around vn_start_write(); a mount
		 * reference is taken first and dropped once the vnode is
		 * locked again (or on failure).
		 */
		vfs_ref(mp);
		VOP_UNLOCK(vp, 0);				/* XXX */
		if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) {
			/* vp is unlocked here, so vrele() and skip `bad'. */
			vrele(vp);
			vfs_rel(mp);
			goto out;
		}
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
		vfs_rel(mp);
#ifdef MAC
		/*
		 * We don't yet have fp->f_cred, so use td->td_ucred, which
		 * should be right.
		 */
		error = mac_vnode_check_write(td->td_ucred, td->td_ucred, vp);
		if (error == 0) {
#endif
			/* Truncate by setting the size attribute to 0. */
			VATTR_NULL(vap);
			vap->va_size = 0;
			error = VOP_SETATTR(vp, vap, td->td_ucred);
#ifdef MAC
		}
#endif
		vn_finished_write(mp);
		if (error)
			goto bad;
	}
	error = VOP_OPEN(vp, fmode, td->td_ucred, td, NULL);
	if (error)
		goto bad;

	if (fmode & FWRITE)
		vp->v_writecount++;

	/*
	 * end of vn_open code
	 */

	if ((error = falloc(td, &nfp, &indx, fmode)) != 0) {
		/* Undo the write count taken above. */
		if (fmode & FWRITE)
			vp->v_writecount--;
		goto bad;
	}
	/* An extra reference on `nfp' has been held for us by falloc(). */
	fp = nfp;
	nfp->f_vnode = vp;
	finit(nfp, fmode & FMASK, DTYPE_VNODE, vp, &vnops);
	if (fmode & (O_EXLOCK | O_SHLOCK)) {
		/* Whole-file lock: start 0, length 0, from SEEK_SET. */
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		if (fmode & O_EXLOCK)
			lf.l_type = F_WRLCK;
		else
			lf.l_type = F_RDLCK;
		type = F_FLOCK;
		if ((fmode & FNONBLOCK) == 0)
			type |= F_WAIT;
		/* VOP_ADVLOCK() is called with the vnode unlocked. */
		VOP_UNLOCK(vp, 0);
		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
			    type)) != 0) {
			/*
			 * The lock request failed.  Normally close the
			 * descriptor but handle the case where someone might
			 * have dup()d or close()d it when we weren't looking.
			 */
			fdclose(fdp, fp, indx, td);

			/*
			 * release our private reference
			 */
			fdrop(fp, td);
			goto out;
		}
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		atomic_set_int(&fp->f_flag, FHASLOCK);
	}

	/* Success: unlock the vnode and drop the extra falloc() reference. */
	VOP_UNLOCK(vp, 0);
	fdrop(fp, td);
	VFS_UNLOCK_GIANT(vfslocked);
	td->td_retval[0] = indx;
	return (0);

bad:
	/* Error with vp still locked and referenced. */
	vput(vp);
out:
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}
4661 
4662 /*
4663  * Stat an (NFS) file handle.
4664  */
4665 #ifndef _SYS_SYSPROTO_H_
4666 struct fhstat_args {
4667 	struct fhandle *u_fhp;
4668 	struct stat *sb;
4669 };
4670 #endif
4671 int
4672 sys_fhstat(td, uap)
4673 	struct thread *td;
4674 	register struct fhstat_args /* {
4675 		struct fhandle *u_fhp;
4676 		struct stat *sb;
4677 	} */ *uap;
4678 {
4679 	struct stat sb;
4680 	fhandle_t fh;
4681 	struct mount *mp;
4682 	struct vnode *vp;
4683 	int vfslocked;
4684 	int error;
4685 
4686 	error = priv_check(td, PRIV_VFS_FHSTAT);
4687 	if (error)
4688 		return (error);
4689 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4690 	if (error)
4691 		return (error);
4692 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4693 		return (ESTALE);
4694 	vfslocked = VFS_LOCK_GIANT(mp);
4695 	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4696 	vfs_unbusy(mp);
4697 	if (error) {
4698 		VFS_UNLOCK_GIANT(vfslocked);
4699 		return (error);
4700 	}
4701 	error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td);
4702 	vput(vp);
4703 	VFS_UNLOCK_GIANT(vfslocked);
4704 	if (error)
4705 		return (error);
4706 	error = copyout(&sb, uap->sb, sizeof(sb));
4707 	return (error);
4708 }
4709 
4710 /*
4711  * Implement fstatfs() for (NFS) file handles.
4712  */
4713 #ifndef _SYS_SYSPROTO_H_
4714 struct fhstatfs_args {
4715 	struct fhandle *u_fhp;
4716 	struct statfs *buf;
4717 };
4718 #endif
4719 int
4720 sys_fhstatfs(td, uap)
4721 	struct thread *td;
4722 	struct fhstatfs_args /* {
4723 		struct fhandle *u_fhp;
4724 		struct statfs *buf;
4725 	} */ *uap;
4726 {
4727 	struct statfs sf;
4728 	fhandle_t fh;
4729 	int error;
4730 
4731 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4732 	if (error)
4733 		return (error);
4734 	error = kern_fhstatfs(td, fh, &sf);
4735 	if (error)
4736 		return (error);
4737 	return (copyout(&sf, uap->buf, sizeof(sf)));
4738 }
4739 
4740 int
4741 kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
4742 {
4743 	struct statfs *sp;
4744 	struct mount *mp;
4745 	struct vnode *vp;
4746 	int vfslocked;
4747 	int error;
4748 
4749 	error = priv_check(td, PRIV_VFS_FHSTATFS);
4750 	if (error)
4751 		return (error);
4752 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4753 		return (ESTALE);
4754 	vfslocked = VFS_LOCK_GIANT(mp);
4755 	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4756 	if (error) {
4757 		vfs_unbusy(mp);
4758 		VFS_UNLOCK_GIANT(vfslocked);
4759 		return (error);
4760 	}
4761 	vput(vp);
4762 	error = prison_canseemount(td->td_ucred, mp);
4763 	if (error)
4764 		goto out;
4765 #ifdef MAC
4766 	error = mac_mount_check_stat(td->td_ucred, mp);
4767 	if (error)
4768 		goto out;
4769 #endif
4770 	/*
4771 	 * Set these in case the underlying filesystem fails to do so.
4772 	 */
4773 	sp = &mp->mnt_stat;
4774 	sp->f_version = STATFS_VERSION;
4775 	sp->f_namemax = NAME_MAX;
4776 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
4777 	error = VFS_STATFS(mp, sp);
4778 	if (error == 0)
4779 		*buf = *sp;
4780 out:
4781 	vfs_unbusy(mp);
4782 	VFS_UNLOCK_GIANT(vfslocked);
4783 	return (error);
4784 }
4785 
4786 int
4787 kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
4788 {
4789 	struct file *fp;
4790 	struct mount *mp;
4791 	struct vnode *vp;
4792 	off_t olen, ooffset;
4793 	int error, vfslocked;
4794 
4795 	fp = NULL;
4796 	vfslocked = 0;
4797 	error = fget(td, fd, CAP_WRITE, &fp);
4798 	if (error != 0)
4799 		goto out;
4800 
4801 	switch (fp->f_type) {
4802 	case DTYPE_VNODE:
4803 		break;
4804 	case DTYPE_PIPE:
4805 	case DTYPE_FIFO:
4806 		error = ESPIPE;
4807 		goto out;
4808 	default:
4809 		error = ENODEV;
4810 		goto out;
4811 	}
4812 	if ((fp->f_flag & FWRITE) == 0) {
4813 		error = EBADF;
4814 		goto out;
4815 	}
4816 	vp = fp->f_vnode;
4817 	if (vp->v_type != VREG) {
4818 		error = ENODEV;
4819 		goto out;
4820 	}
4821 	if (offset < 0 || len <= 0) {
4822 		error = EINVAL;
4823 		goto out;
4824 	}
4825 	/* Check for wrap. */
4826 	if (offset > OFF_MAX - len) {
4827 		error = EFBIG;
4828 		goto out;
4829 	}
4830 
4831 	/* Allocating blocks may take a long time, so iterate. */
4832 	for (;;) {
4833 		olen = len;
4834 		ooffset = offset;
4835 
4836 		bwillwrite();
4837 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
4838 		mp = NULL;
4839 		error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
4840 		if (error != 0) {
4841 			VFS_UNLOCK_GIANT(vfslocked);
4842 			break;
4843 		}
4844 		error = vn_lock(vp, LK_EXCLUSIVE);
4845 		if (error != 0) {
4846 			vn_finished_write(mp);
4847 			VFS_UNLOCK_GIANT(vfslocked);
4848 			break;
4849 		}
4850 #ifdef MAC
4851 		error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
4852 		if (error == 0)
4853 #endif
4854 			error = VOP_ALLOCATE(vp, &offset, &len);
4855 		VOP_UNLOCK(vp, 0);
4856 		vn_finished_write(mp);
4857 		VFS_UNLOCK_GIANT(vfslocked);
4858 
4859 		if (olen + ooffset != offset + len) {
4860 			panic("offset + len changed from %jx/%jx to %jx/%jx",
4861 			    ooffset, olen, offset, len);
4862 		}
4863 		if (error != 0 || len == 0)
4864 			break;
4865 		KASSERT(olen > len, ("Iteration did not make progress?"));
4866 		maybe_yield();
4867 	}
4868  out:
4869 	if (fp != NULL)
4870 		fdrop(fp, td);
4871 	return (error);
4872 }
4873 
4874 int
4875 sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
4876 {
4877 
4878 	return (kern_posix_fallocate(td, uap->fd, uap->offset, uap->len));
4879 }
4880 
4881 /*
4882  * Unlike madvise(2), we do not make a best effort to remember every
4883  * possible caching hint.  Instead, we remember the last setting with
4884  * the exception that we will allow POSIX_FADV_NORMAL to adjust the
4885  * region of any current setting.
4886  */
4887 int
4888 kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
4889     int advice)
4890 {
4891 	struct fadvise_info *fa, *new;
4892 	struct file *fp;
4893 	struct vnode *vp;
4894 	off_t end;
4895 	int error;
4896 
4897 	if (offset < 0 || len < 0 || offset > OFF_MAX - len)
4898 		return (EINVAL);
4899 	switch (advice) {
4900 	case POSIX_FADV_SEQUENTIAL:
4901 	case POSIX_FADV_RANDOM:
4902 	case POSIX_FADV_NOREUSE:
4903 		new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
4904 		break;
4905 	case POSIX_FADV_NORMAL:
4906 	case POSIX_FADV_WILLNEED:
4907 	case POSIX_FADV_DONTNEED:
4908 		new = NULL;
4909 		break;
4910 	default:
4911 		return (EINVAL);
4912 	}
4913 	/* XXX: CAP_POSIX_FADVISE? */
4914 	error = fget(td, fd, 0, &fp);
4915 	if (error != 0)
4916 		goto out;
4917 
4918 	switch (fp->f_type) {
4919 	case DTYPE_VNODE:
4920 		break;
4921 	case DTYPE_PIPE:
4922 	case DTYPE_FIFO:
4923 		error = ESPIPE;
4924 		goto out;
4925 	default:
4926 		error = ENODEV;
4927 		goto out;
4928 	}
4929 	vp = fp->f_vnode;
4930 	if (vp->v_type != VREG) {
4931 		error = ENODEV;
4932 		goto out;
4933 	}
4934 	if (len == 0)
4935 		end = OFF_MAX;
4936 	else
4937 		end = offset + len - 1;
4938 	switch (advice) {
4939 	case POSIX_FADV_SEQUENTIAL:
4940 	case POSIX_FADV_RANDOM:
4941 	case POSIX_FADV_NOREUSE:
4942 		/*
4943 		 * Try to merge any existing non-standard region with
4944 		 * this new region if possible, otherwise create a new
4945 		 * non-standard region for this request.
4946 		 */
4947 		mtx_pool_lock(mtxpool_sleep, fp);
4948 		fa = fp->f_advice;
4949 		if (fa != NULL && fa->fa_advice == advice &&
4950 		    ((fa->fa_start <= end && fa->fa_end >= offset) ||
4951 		    (end != OFF_MAX && fa->fa_start == end + 1) ||
4952 		    (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
4953 			if (offset < fa->fa_start)
4954 				fa->fa_start = offset;
4955 			if (end > fa->fa_end)
4956 				fa->fa_end = end;
4957 		} else {
4958 			new->fa_advice = advice;
4959 			new->fa_start = offset;
4960 			new->fa_end = end;
4961 			fp->f_advice = new;
4962 			new = fa;
4963 		}
4964 		mtx_pool_unlock(mtxpool_sleep, fp);
4965 		break;
4966 	case POSIX_FADV_NORMAL:
4967 		/*
4968 		 * If a the "normal" region overlaps with an existing
4969 		 * non-standard region, trim or remove the
4970 		 * non-standard region.
4971 		 */
4972 		mtx_pool_lock(mtxpool_sleep, fp);
4973 		fa = fp->f_advice;
4974 		if (fa != NULL) {
4975 			if (offset <= fa->fa_start && end >= fa->fa_end) {
4976 				new = fa;
4977 				fp->f_advice = NULL;
4978 			} else if (offset <= fa->fa_start &&
4979  			    end >= fa->fa_start)
4980 				fa->fa_start = end + 1;
4981 			else if (offset <= fa->fa_end && end >= fa->fa_end)
4982 				fa->fa_end = offset - 1;
4983 			else if (offset >= fa->fa_start && end <= fa->fa_end) {
4984 				/*
4985 				 * If the "normal" region is a middle
4986 				 * portion of the existing
4987 				 * non-standard region, just remove
4988 				 * the whole thing rather than picking
4989 				 * one side or the other to
4990 				 * preserve.
4991 				 */
4992 				new = fa;
4993 				fp->f_advice = NULL;
4994 			}
4995 		}
4996 		mtx_pool_unlock(mtxpool_sleep, fp);
4997 		break;
4998 	case POSIX_FADV_WILLNEED:
4999 	case POSIX_FADV_DONTNEED:
5000 		error = VOP_ADVISE(vp, offset, end, advice);
5001 		break;
5002 	}
5003 out:
5004 	if (fp != NULL)
5005 		fdrop(fp, td);
5006 	free(new, M_FADVISE);
5007 	return (error);
5008 }
5009 
5010 int
5011 sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
5012 {
5013 
5014 	return (kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,
5015 	    uap->advice));
5016 }
5017