xref: /freebsd/sys/kern/vfs_syscalls.c (revision e39e854e27f53a784c3982cbeb68f4ad1cfd9162)
1 /*-
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_capsicum.h"
41 #include "opt_compat.h"
42 #include "opt_kdtrace.h"
43 #include "opt_ktrace.h"
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/bio.h>
48 #include <sys/buf.h>
49 #include <sys/capability.h>
50 #include <sys/disk.h>
51 #include <sys/sysent.h>
52 #include <sys/malloc.h>
53 #include <sys/mount.h>
54 #include <sys/mutex.h>
55 #include <sys/sysproto.h>
56 #include <sys/namei.h>
57 #include <sys/filedesc.h>
58 #include <sys/kernel.h>
59 #include <sys/fcntl.h>
60 #include <sys/file.h>
61 #include <sys/filio.h>
62 #include <sys/limits.h>
63 #include <sys/linker.h>
64 #include <sys/sdt.h>
65 #include <sys/stat.h>
66 #include <sys/sx.h>
67 #include <sys/unistd.h>
68 #include <sys/vnode.h>
69 #include <sys/priv.h>
70 #include <sys/proc.h>
71 #include <sys/dirent.h>
72 #include <sys/jail.h>
73 #include <sys/syscallsubr.h>
74 #include <sys/sysctl.h>
75 #ifdef KTRACE
76 #include <sys/ktrace.h>
77 #endif
78 
79 #include <machine/stdarg.h>
80 
81 #include <security/audit/audit.h>
82 #include <security/mac/mac_framework.h>
83 
84 #include <vm/vm.h>
85 #include <vm/vm_object.h>
86 #include <vm/vm_page.h>
87 #include <vm/uma.h>
88 
89 #include <ufs/ufs/quota.h>
90 
/* Malloc type used to account for posix_fadvise(2) bookkeeping allocations. */
MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");

/*
 * Statically defined tracing probes for the "vfs" provider.  Two probes
 * are declared under the "stat" function name, "mode" and "reg"; each
 * takes a "char *" as argument 0 and an "int" as argument 1.
 */
SDT_PROVIDER_DEFINE(vfs);
SDT_PROBE_DEFINE(vfs, , stat, mode, mode);
SDT_PROBE_ARGTYPE(vfs, , stat, mode, 0, "char *");
SDT_PROBE_ARGTYPE(vfs, , stat, mode, 1, "int");
SDT_PROBE_DEFINE(vfs, , stat, reg, reg);
SDT_PROBE_ARGTYPE(vfs, , stat, reg, 0, "char *");
SDT_PROBE_ARGTYPE(vfs, , stat, reg, 1, "int");

/* Forward declarations of helpers private to this file. */
static int chroot_refuse_vdir_fds(struct filedesc *fdp);
static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
static int setfflags(struct thread *td, struct vnode *, int);
static int setutimes(struct thread *td, struct vnode *,
    const struct timespec *, int, int);
static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
    struct thread *td);

/*
 * The module initialization routine for POSIX asynchronous I/O will
 * set this to the version of AIO that it implements.  (Zero means
 * that it is not implemented.)  This value is used here by pathconf()
 * and in kern_descrip.c by fpathconf().
 */
int async_io_version;

#ifdef DEBUG
/*
 * Read/write integer exported as debug.syncprt (initially 0, no
 * description string).  NOTE(review): no consumer is visible in this
 * part of the file -- confirm it is still used.
 */
static int syncprt = 0;
SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
#endif
121 
122 /*
123  * Sync each mounted filesystem.
124  */
125 #ifndef _SYS_SYSPROTO_H_
126 struct sync_args {
127 	int     dummy;
128 };
129 #endif
130 /* ARGSUSED */
131 int
132 sys_sync(td, uap)
133 	struct thread *td;
134 	struct sync_args *uap;
135 {
136 	struct mount *mp, *nmp;
137 	int save, vfslocked;
138 
139 	mtx_lock(&mountlist_mtx);
140 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
141 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
142 			nmp = TAILQ_NEXT(mp, mnt_list);
143 			continue;
144 		}
145 		vfslocked = VFS_LOCK_GIANT(mp);
146 		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
147 		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
148 			save = curthread_pflags_set(TDP_SYNCIO);
149 			vfs_msync(mp, MNT_NOWAIT);
150 			VFS_SYNC(mp, MNT_NOWAIT);
151 			curthread_pflags_restore(save);
152 			vn_finished_write(mp);
153 		}
154 		VFS_UNLOCK_GIANT(vfslocked);
155 		mtx_lock(&mountlist_mtx);
156 		nmp = TAILQ_NEXT(mp, mnt_list);
157 		vfs_unbusy(mp);
158 	}
159 	mtx_unlock(&mountlist_mtx);
160 	return (0);
161 }
162 
163 /*
164  * Change filesystem quotas.
165  */
166 #ifndef _SYS_SYSPROTO_H_
167 struct quotactl_args {
168 	char *path;
169 	int cmd;
170 	int uid;
171 	caddr_t arg;
172 };
173 #endif
174 int
175 sys_quotactl(td, uap)
176 	struct thread *td;
177 	register struct quotactl_args /* {
178 		char *path;
179 		int cmd;
180 		int uid;
181 		caddr_t arg;
182 	} */ *uap;
183 {
184 	struct mount *mp;
185 	int vfslocked;
186 	int error;
187 	struct nameidata nd;
188 
189 	AUDIT_ARG_CMD(uap->cmd);
190 	AUDIT_ARG_UID(uap->uid);
191 	if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
192 		return (EPERM);
193 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
194 	   UIO_USERSPACE, uap->path, td);
195 	if ((error = namei(&nd)) != 0)
196 		return (error);
197 	vfslocked = NDHASGIANT(&nd);
198 	NDFREE(&nd, NDF_ONLY_PNBUF);
199 	mp = nd.ni_vp->v_mount;
200 	vfs_ref(mp);
201 	vput(nd.ni_vp);
202 	error = vfs_busy(mp, 0);
203 	vfs_rel(mp);
204 	if (error) {
205 		VFS_UNLOCK_GIANT(vfslocked);
206 		return (error);
207 	}
208 	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
209 
210 	/*
211 	 * Since quota on operation typically needs to open quota
212 	 * file, the Q_QUOTAON handler needs to unbusy the mount point
213 	 * before calling into namei.  Otherwise, unmount might be
214 	 * started between two vfs_busy() invocations (first is our,
215 	 * second is from mount point cross-walk code in lookup()),
216 	 * causing deadlock.
217 	 *
218 	 * Require that Q_QUOTAON handles the vfs_busy() reference on
219 	 * its own, always returning with ubusied mount point.
220 	 */
221 	if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON)
222 		vfs_unbusy(mp);
223 	VFS_UNLOCK_GIANT(vfslocked);
224 	return (error);
225 }
226 
227 /*
228  * Used by statfs conversion routines to scale the block size up if
229  * necessary so that all of the block counts are <= 'max_size'.  Note
230  * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
231  * value of 'n'.
232  */
233 void
234 statfs_scale_blocks(struct statfs *sf, long max_size)
235 {
236 	uint64_t count;
237 	int shift;
238 
239 	KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
240 
241 	/*
242 	 * Attempt to scale the block counts to give a more accurate
243 	 * overview to userland of the ratio of free space to used
244 	 * space.  To do this, find the largest block count and compute
245 	 * a divisor that lets it fit into a signed integer <= max_size.
246 	 */
247 	if (sf->f_bavail < 0)
248 		count = -sf->f_bavail;
249 	else
250 		count = sf->f_bavail;
251 	count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
252 	if (count <= max_size)
253 		return;
254 
255 	count >>= flsl(max_size);
256 	shift = 0;
257 	while (count > 0) {
258 		shift++;
259 		count >>=1;
260 	}
261 
262 	sf->f_bsize <<= shift;
263 	sf->f_blocks >>= shift;
264 	sf->f_bfree >>= shift;
265 	sf->f_bavail >>= shift;
266 }
267 
268 /*
269  * Get filesystem statistics.
270  */
271 #ifndef _SYS_SYSPROTO_H_
272 struct statfs_args {
273 	char *path;
274 	struct statfs *buf;
275 };
276 #endif
277 int
278 sys_statfs(td, uap)
279 	struct thread *td;
280 	register struct statfs_args /* {
281 		char *path;
282 		struct statfs *buf;
283 	} */ *uap;
284 {
285 	struct statfs sf;
286 	int error;
287 
288 	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
289 	if (error == 0)
290 		error = copyout(&sf, uap->buf, sizeof(sf));
291 	return (error);
292 }
293 
294 int
295 kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
296     struct statfs *buf)
297 {
298 	struct mount *mp;
299 	struct statfs *sp, sb;
300 	int vfslocked;
301 	int error;
302 	struct nameidata nd;
303 
304 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
305 	    AUDITVNODE1, pathseg, path, td);
306 	error = namei(&nd);
307 	if (error)
308 		return (error);
309 	vfslocked = NDHASGIANT(&nd);
310 	mp = nd.ni_vp->v_mount;
311 	vfs_ref(mp);
312 	NDFREE(&nd, NDF_ONLY_PNBUF);
313 	vput(nd.ni_vp);
314 	error = vfs_busy(mp, 0);
315 	vfs_rel(mp);
316 	if (error) {
317 		VFS_UNLOCK_GIANT(vfslocked);
318 		return (error);
319 	}
320 #ifdef MAC
321 	error = mac_mount_check_stat(td->td_ucred, mp);
322 	if (error)
323 		goto out;
324 #endif
325 	/*
326 	 * Set these in case the underlying filesystem fails to do so.
327 	 */
328 	sp = &mp->mnt_stat;
329 	sp->f_version = STATFS_VERSION;
330 	sp->f_namemax = NAME_MAX;
331 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
332 	error = VFS_STATFS(mp, sp);
333 	if (error)
334 		goto out;
335 	if (priv_check(td, PRIV_VFS_GENERATION)) {
336 		bcopy(sp, &sb, sizeof(sb));
337 		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
338 		prison_enforce_statfs(td->td_ucred, mp, &sb);
339 		sp = &sb;
340 	}
341 	*buf = *sp;
342 out:
343 	vfs_unbusy(mp);
344 	VFS_UNLOCK_GIANT(vfslocked);
345 	return (error);
346 }
347 
348 /*
349  * Get filesystem statistics.
350  */
351 #ifndef _SYS_SYSPROTO_H_
352 struct fstatfs_args {
353 	int fd;
354 	struct statfs *buf;
355 };
356 #endif
357 int
358 sys_fstatfs(td, uap)
359 	struct thread *td;
360 	register struct fstatfs_args /* {
361 		int fd;
362 		struct statfs *buf;
363 	} */ *uap;
364 {
365 	struct statfs sf;
366 	int error;
367 
368 	error = kern_fstatfs(td, uap->fd, &sf);
369 	if (error == 0)
370 		error = copyout(&sf, uap->buf, sizeof(sf));
371 	return (error);
372 }
373 
374 int
375 kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
376 {
377 	struct file *fp;
378 	struct mount *mp;
379 	struct statfs *sp, sb;
380 	int vfslocked;
381 	struct vnode *vp;
382 	int error;
383 
384 	AUDIT_ARG_FD(fd);
385 	error = getvnode(td->td_proc->p_fd, fd, CAP_FSTATFS, &fp);
386 	if (error)
387 		return (error);
388 	vp = fp->f_vnode;
389 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
390 	vn_lock(vp, LK_SHARED | LK_RETRY);
391 #ifdef AUDIT
392 	AUDIT_ARG_VNODE1(vp);
393 #endif
394 	mp = vp->v_mount;
395 	if (mp)
396 		vfs_ref(mp);
397 	VOP_UNLOCK(vp, 0);
398 	fdrop(fp, td);
399 	if (mp == NULL) {
400 		error = EBADF;
401 		goto out;
402 	}
403 	error = vfs_busy(mp, 0);
404 	vfs_rel(mp);
405 	if (error) {
406 		VFS_UNLOCK_GIANT(vfslocked);
407 		return (error);
408 	}
409 #ifdef MAC
410 	error = mac_mount_check_stat(td->td_ucred, mp);
411 	if (error)
412 		goto out;
413 #endif
414 	/*
415 	 * Set these in case the underlying filesystem fails to do so.
416 	 */
417 	sp = &mp->mnt_stat;
418 	sp->f_version = STATFS_VERSION;
419 	sp->f_namemax = NAME_MAX;
420 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
421 	error = VFS_STATFS(mp, sp);
422 	if (error)
423 		goto out;
424 	if (priv_check(td, PRIV_VFS_GENERATION)) {
425 		bcopy(sp, &sb, sizeof(sb));
426 		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
427 		prison_enforce_statfs(td->td_ucred, mp, &sb);
428 		sp = &sb;
429 	}
430 	*buf = *sp;
431 out:
432 	if (mp)
433 		vfs_unbusy(mp);
434 	VFS_UNLOCK_GIANT(vfslocked);
435 	return (error);
436 }
437 
438 /*
439  * Get statistics on all filesystems.
440  */
441 #ifndef _SYS_SYSPROTO_H_
442 struct getfsstat_args {
443 	struct statfs *buf;
444 	long bufsize;
445 	int flags;
446 };
447 #endif
448 int
449 sys_getfsstat(td, uap)
450 	struct thread *td;
451 	register struct getfsstat_args /* {
452 		struct statfs *buf;
453 		long bufsize;
454 		int flags;
455 	} */ *uap;
456 {
457 
458 	return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE,
459 	    uap->flags));
460 }
461 
462 /*
463  * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
464  * 	The caller is responsible for freeing memory which will be allocated
465  *	in '*buf'.
466  */
467 int
468 kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
469     enum uio_seg bufseg, int flags)
470 {
471 	struct mount *mp, *nmp;
472 	struct statfs *sfsp, *sp, sb;
473 	size_t count, maxcount;
474 	int vfslocked;
475 	int error;
476 
477 	maxcount = bufsize / sizeof(struct statfs);
478 	if (bufsize == 0)
479 		sfsp = NULL;
480 	else if (bufseg == UIO_USERSPACE)
481 		sfsp = *buf;
482 	else /* if (bufseg == UIO_SYSSPACE) */ {
483 		count = 0;
484 		mtx_lock(&mountlist_mtx);
485 		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
486 			count++;
487 		}
488 		mtx_unlock(&mountlist_mtx);
489 		if (maxcount > count)
490 			maxcount = count;
491 		sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP,
492 		    M_WAITOK);
493 	}
494 	count = 0;
495 	mtx_lock(&mountlist_mtx);
496 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
497 		if (prison_canseemount(td->td_ucred, mp) != 0) {
498 			nmp = TAILQ_NEXT(mp, mnt_list);
499 			continue;
500 		}
501 #ifdef MAC
502 		if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
503 			nmp = TAILQ_NEXT(mp, mnt_list);
504 			continue;
505 		}
506 #endif
507 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
508 			nmp = TAILQ_NEXT(mp, mnt_list);
509 			continue;
510 		}
511 		vfslocked = VFS_LOCK_GIANT(mp);
512 		if (sfsp && count < maxcount) {
513 			sp = &mp->mnt_stat;
514 			/*
515 			 * Set these in case the underlying filesystem
516 			 * fails to do so.
517 			 */
518 			sp->f_version = STATFS_VERSION;
519 			sp->f_namemax = NAME_MAX;
520 			sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
521 			/*
522 			 * If MNT_NOWAIT or MNT_LAZY is specified, do not
523 			 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
524 			 * overrides MNT_WAIT.
525 			 */
526 			if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
527 			    (flags & MNT_WAIT)) &&
528 			    (error = VFS_STATFS(mp, sp))) {
529 				VFS_UNLOCK_GIANT(vfslocked);
530 				mtx_lock(&mountlist_mtx);
531 				nmp = TAILQ_NEXT(mp, mnt_list);
532 				vfs_unbusy(mp);
533 				continue;
534 			}
535 			if (priv_check(td, PRIV_VFS_GENERATION)) {
536 				bcopy(sp, &sb, sizeof(sb));
537 				sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
538 				prison_enforce_statfs(td->td_ucred, mp, &sb);
539 				sp = &sb;
540 			}
541 			if (bufseg == UIO_SYSSPACE)
542 				bcopy(sp, sfsp, sizeof(*sp));
543 			else /* if (bufseg == UIO_USERSPACE) */ {
544 				error = copyout(sp, sfsp, sizeof(*sp));
545 				if (error) {
546 					vfs_unbusy(mp);
547 					VFS_UNLOCK_GIANT(vfslocked);
548 					return (error);
549 				}
550 			}
551 			sfsp++;
552 		}
553 		VFS_UNLOCK_GIANT(vfslocked);
554 		count++;
555 		mtx_lock(&mountlist_mtx);
556 		nmp = TAILQ_NEXT(mp, mnt_list);
557 		vfs_unbusy(mp);
558 	}
559 	mtx_unlock(&mountlist_mtx);
560 	if (sfsp && count > maxcount)
561 		td->td_retval[0] = maxcount;
562 	else
563 		td->td_retval[0] = count;
564 	return (0);
565 }
566 
567 #ifdef COMPAT_FREEBSD4
568 /*
569  * Get old format filesystem statistics.
570  */
571 static void cvtstatfs(struct statfs *, struct ostatfs *);
572 
573 #ifndef _SYS_SYSPROTO_H_
574 struct freebsd4_statfs_args {
575 	char *path;
576 	struct ostatfs *buf;
577 };
578 #endif
579 int
580 freebsd4_statfs(td, uap)
581 	struct thread *td;
582 	struct freebsd4_statfs_args /* {
583 		char *path;
584 		struct ostatfs *buf;
585 	} */ *uap;
586 {
587 	struct ostatfs osb;
588 	struct statfs sf;
589 	int error;
590 
591 	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
592 	if (error)
593 		return (error);
594 	cvtstatfs(&sf, &osb);
595 	return (copyout(&osb, uap->buf, sizeof(osb)));
596 }
597 
598 /*
599  * Get filesystem statistics.
600  */
601 #ifndef _SYS_SYSPROTO_H_
602 struct freebsd4_fstatfs_args {
603 	int fd;
604 	struct ostatfs *buf;
605 };
606 #endif
607 int
608 freebsd4_fstatfs(td, uap)
609 	struct thread *td;
610 	struct freebsd4_fstatfs_args /* {
611 		int fd;
612 		struct ostatfs *buf;
613 	} */ *uap;
614 {
615 	struct ostatfs osb;
616 	struct statfs sf;
617 	int error;
618 
619 	error = kern_fstatfs(td, uap->fd, &sf);
620 	if (error)
621 		return (error);
622 	cvtstatfs(&sf, &osb);
623 	return (copyout(&osb, uap->buf, sizeof(osb)));
624 }
625 
626 /*
627  * Get statistics on all filesystems.
628  */
629 #ifndef _SYS_SYSPROTO_H_
630 struct freebsd4_getfsstat_args {
631 	struct ostatfs *buf;
632 	long bufsize;
633 	int flags;
634 };
635 #endif
636 int
637 freebsd4_getfsstat(td, uap)
638 	struct thread *td;
639 	register struct freebsd4_getfsstat_args /* {
640 		struct ostatfs *buf;
641 		long bufsize;
642 		int flags;
643 	} */ *uap;
644 {
645 	struct statfs *buf, *sp;
646 	struct ostatfs osb;
647 	size_t count, size;
648 	int error;
649 
650 	count = uap->bufsize / sizeof(struct ostatfs);
651 	size = count * sizeof(struct statfs);
652 	error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags);
653 	if (size > 0) {
654 		count = td->td_retval[0];
655 		sp = buf;
656 		while (count > 0 && error == 0) {
657 			cvtstatfs(sp, &osb);
658 			error = copyout(&osb, uap->buf, sizeof(osb));
659 			sp++;
660 			uap->buf++;
661 			count--;
662 		}
663 		free(buf, M_TEMP);
664 	}
665 	return (error);
666 }
667 
668 /*
669  * Implement fstatfs() for (NFS) file handles.
670  */
671 #ifndef _SYS_SYSPROTO_H_
672 struct freebsd4_fhstatfs_args {
673 	struct fhandle *u_fhp;
674 	struct ostatfs *buf;
675 };
676 #endif
677 int
678 freebsd4_fhstatfs(td, uap)
679 	struct thread *td;
680 	struct freebsd4_fhstatfs_args /* {
681 		struct fhandle *u_fhp;
682 		struct ostatfs *buf;
683 	} */ *uap;
684 {
685 	struct ostatfs osb;
686 	struct statfs sf;
687 	fhandle_t fh;
688 	int error;
689 
690 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
691 	if (error)
692 		return (error);
693 	error = kern_fhstatfs(td, fh, &sf);
694 	if (error)
695 		return (error);
696 	cvtstatfs(&sf, &osb);
697 	return (copyout(&osb, uap->buf, sizeof(osb)));
698 }
699 
700 /*
701  * Convert a new format statfs structure to an old format statfs structure.
702  */
703 static void
704 cvtstatfs(nsp, osp)
705 	struct statfs *nsp;
706 	struct ostatfs *osp;
707 {
708 
709 	statfs_scale_blocks(nsp, LONG_MAX);
710 	bzero(osp, sizeof(*osp));
711 	osp->f_bsize = nsp->f_bsize;
712 	osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
713 	osp->f_blocks = nsp->f_blocks;
714 	osp->f_bfree = nsp->f_bfree;
715 	osp->f_bavail = nsp->f_bavail;
716 	osp->f_files = MIN(nsp->f_files, LONG_MAX);
717 	osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
718 	osp->f_owner = nsp->f_owner;
719 	osp->f_type = nsp->f_type;
720 	osp->f_flags = nsp->f_flags;
721 	osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
722 	osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
723 	osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
724 	osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
725 	strlcpy(osp->f_fstypename, nsp->f_fstypename,
726 	    MIN(MFSNAMELEN, OMFSNAMELEN));
727 	strlcpy(osp->f_mntonname, nsp->f_mntonname,
728 	    MIN(MNAMELEN, OMNAMELEN));
729 	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
730 	    MIN(MNAMELEN, OMNAMELEN));
731 	osp->f_fsid = nsp->f_fsid;
732 }
733 #endif /* COMPAT_FREEBSD4 */
734 
735 /*
736  * Change current working directory to a given file descriptor.
737  */
738 #ifndef _SYS_SYSPROTO_H_
739 struct fchdir_args {
740 	int	fd;
741 };
742 #endif
743 int
744 sys_fchdir(td, uap)
745 	struct thread *td;
746 	struct fchdir_args /* {
747 		int fd;
748 	} */ *uap;
749 {
750 	register struct filedesc *fdp = td->td_proc->p_fd;
751 	struct vnode *vp, *tdp, *vpold;
752 	struct mount *mp;
753 	struct file *fp;
754 	int vfslocked;
755 	int error;
756 
757 	AUDIT_ARG_FD(uap->fd);
758 	if ((error = getvnode(fdp, uap->fd, CAP_FCHDIR, &fp)) != 0)
759 		return (error);
760 	vp = fp->f_vnode;
761 	VREF(vp);
762 	fdrop(fp, td);
763 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
764 	vn_lock(vp, LK_SHARED | LK_RETRY);
765 	AUDIT_ARG_VNODE1(vp);
766 	error = change_dir(vp, td);
767 	while (!error && (mp = vp->v_mountedhere) != NULL) {
768 		int tvfslocked;
769 		if (vfs_busy(mp, 0))
770 			continue;
771 		tvfslocked = VFS_LOCK_GIANT(mp);
772 		error = VFS_ROOT(mp, LK_SHARED, &tdp);
773 		vfs_unbusy(mp);
774 		if (error) {
775 			VFS_UNLOCK_GIANT(tvfslocked);
776 			break;
777 		}
778 		vput(vp);
779 		VFS_UNLOCK_GIANT(vfslocked);
780 		vp = tdp;
781 		vfslocked = tvfslocked;
782 	}
783 	if (error) {
784 		vput(vp);
785 		VFS_UNLOCK_GIANT(vfslocked);
786 		return (error);
787 	}
788 	VOP_UNLOCK(vp, 0);
789 	VFS_UNLOCK_GIANT(vfslocked);
790 	FILEDESC_XLOCK(fdp);
791 	vpold = fdp->fd_cdir;
792 	fdp->fd_cdir = vp;
793 	FILEDESC_XUNLOCK(fdp);
794 	vfslocked = VFS_LOCK_GIANT(vpold->v_mount);
795 	vrele(vpold);
796 	VFS_UNLOCK_GIANT(vfslocked);
797 	return (0);
798 }
799 
800 /*
801  * Change current working directory (``.'').
802  */
803 #ifndef _SYS_SYSPROTO_H_
804 struct chdir_args {
805 	char	*path;
806 };
807 #endif
808 int
809 sys_chdir(td, uap)
810 	struct thread *td;
811 	struct chdir_args /* {
812 		char *path;
813 	} */ *uap;
814 {
815 
816 	return (kern_chdir(td, uap->path, UIO_USERSPACE));
817 }
818 
819 int
820 kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
821 {
822 	register struct filedesc *fdp = td->td_proc->p_fd;
823 	int error;
824 	struct nameidata nd;
825 	struct vnode *vp;
826 	int vfslocked;
827 
828 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1 |
829 	    MPSAFE, pathseg, path, td);
830 	if ((error = namei(&nd)) != 0)
831 		return (error);
832 	vfslocked = NDHASGIANT(&nd);
833 	if ((error = change_dir(nd.ni_vp, td)) != 0) {
834 		vput(nd.ni_vp);
835 		VFS_UNLOCK_GIANT(vfslocked);
836 		NDFREE(&nd, NDF_ONLY_PNBUF);
837 		return (error);
838 	}
839 	VOP_UNLOCK(nd.ni_vp, 0);
840 	VFS_UNLOCK_GIANT(vfslocked);
841 	NDFREE(&nd, NDF_ONLY_PNBUF);
842 	FILEDESC_XLOCK(fdp);
843 	vp = fdp->fd_cdir;
844 	fdp->fd_cdir = nd.ni_vp;
845 	FILEDESC_XUNLOCK(fdp);
846 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
847 	vrele(vp);
848 	VFS_UNLOCK_GIANT(vfslocked);
849 	return (0);
850 }
851 
852 /*
853  * Helper function for raised chroot(2) security function:  Refuse if
854  * any filedescriptors are open directories.
855  */
856 static int
857 chroot_refuse_vdir_fds(fdp)
858 	struct filedesc *fdp;
859 {
860 	struct vnode *vp;
861 	struct file *fp;
862 	int fd;
863 
864 	FILEDESC_LOCK_ASSERT(fdp);
865 
866 	for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
867 		fp = fget_locked(fdp, fd);
868 		if (fp == NULL)
869 			continue;
870 		if (fp->f_type == DTYPE_VNODE) {
871 			vp = fp->f_vnode;
872 			if (vp->v_type == VDIR)
873 				return (EPERM);
874 		}
875 	}
876 	return (0);
877 }
878 
/*
 * This sysctl determines if we will allow a process to chroot(2) if it
 * has a directory open:
 *	0: disallowed for all processes.
 *	1: allowed for processes that were not already chroot(2)'ed.
 *	2: allowed for all processes.
 *
 * The value is consulted by change_root(), which treats a process as
 * already chroot(2)'ed when its fd_rdir differs from rootvnode.  The
 * default is 1.
 */

static int chroot_allow_open_directories = 1;

/* Exported read/write as kern.chroot_allow_open_directories. */
SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
     &chroot_allow_open_directories, 0,
     "Allow a process to chroot(2) if it has a directory open");
892 
893 /*
894  * Change notion of root (``/'') directory.
895  */
896 #ifndef _SYS_SYSPROTO_H_
897 struct chroot_args {
898 	char	*path;
899 };
900 #endif
901 int
902 sys_chroot(td, uap)
903 	struct thread *td;
904 	struct chroot_args /* {
905 		char *path;
906 	} */ *uap;
907 {
908 	int error;
909 	struct nameidata nd;
910 	int vfslocked;
911 
912 	error = priv_check(td, PRIV_VFS_CHROOT);
913 	if (error)
914 		return (error);
915 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
916 	    AUDITVNODE1, UIO_USERSPACE, uap->path, td);
917 	error = namei(&nd);
918 	if (error)
919 		goto error;
920 	vfslocked = NDHASGIANT(&nd);
921 	if ((error = change_dir(nd.ni_vp, td)) != 0)
922 		goto e_vunlock;
923 #ifdef MAC
924 	if ((error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp)))
925 		goto e_vunlock;
926 #endif
927 	VOP_UNLOCK(nd.ni_vp, 0);
928 	error = change_root(nd.ni_vp, td);
929 	vrele(nd.ni_vp);
930 	VFS_UNLOCK_GIANT(vfslocked);
931 	NDFREE(&nd, NDF_ONLY_PNBUF);
932 	return (error);
933 e_vunlock:
934 	vput(nd.ni_vp);
935 	VFS_UNLOCK_GIANT(vfslocked);
936 error:
937 	NDFREE(&nd, NDF_ONLY_PNBUF);
938 	return (error);
939 }
940 
941 /*
942  * Common routine for chroot and chdir.  Callers must provide a locked vnode
943  * instance.
944  */
945 int
946 change_dir(vp, td)
947 	struct vnode *vp;
948 	struct thread *td;
949 {
950 	int error;
951 
952 	ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
953 	if (vp->v_type != VDIR)
954 		return (ENOTDIR);
955 #ifdef MAC
956 	error = mac_vnode_check_chdir(td->td_ucred, vp);
957 	if (error)
958 		return (error);
959 #endif
960 	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
961 	return (error);
962 }
963 
964 /*
965  * Common routine for kern_chroot() and jail_attach().  The caller is
966  * responsible for invoking priv_check() and mac_vnode_check_chroot() to
967  * authorize this operation.
968  */
969 int
970 change_root(vp, td)
971 	struct vnode *vp;
972 	struct thread *td;
973 {
974 	struct filedesc *fdp;
975 	struct vnode *oldvp;
976 	int vfslocked;
977 	int error;
978 
979 	VFS_ASSERT_GIANT(vp->v_mount);
980 	fdp = td->td_proc->p_fd;
981 	FILEDESC_XLOCK(fdp);
982 	if (chroot_allow_open_directories == 0 ||
983 	    (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
984 		error = chroot_refuse_vdir_fds(fdp);
985 		if (error) {
986 			FILEDESC_XUNLOCK(fdp);
987 			return (error);
988 		}
989 	}
990 	oldvp = fdp->fd_rdir;
991 	fdp->fd_rdir = vp;
992 	VREF(fdp->fd_rdir);
993 	if (!fdp->fd_jdir) {
994 		fdp->fd_jdir = vp;
995 		VREF(fdp->fd_jdir);
996 	}
997 	FILEDESC_XUNLOCK(fdp);
998 	vfslocked = VFS_LOCK_GIANT(oldvp->v_mount);
999 	vrele(oldvp);
1000 	VFS_UNLOCK_GIANT(vfslocked);
1001 	return (0);
1002 }
1003 
1004 static __inline cap_rights_t
1005 flags_to_rights(int flags)
1006 {
1007 	cap_rights_t rights = 0;
1008 
1009 	switch ((flags & O_ACCMODE)) {
1010 	case O_RDONLY:
1011 		rights |= CAP_READ;
1012 		break;
1013 
1014 	case O_RDWR:
1015 		rights |= CAP_READ;
1016 		/* fall through */
1017 
1018 	case O_WRONLY:
1019 		rights |= CAP_WRITE;
1020 		break;
1021 
1022 	case O_EXEC:
1023 		rights |= CAP_FEXECVE;
1024 		break;
1025 	}
1026 
1027 	if (flags & O_CREAT)
1028 		rights |= CAP_CREATE;
1029 
1030 	if (flags & O_TRUNC)
1031 		rights |= CAP_FTRUNCATE;
1032 
1033 	if ((flags & O_EXLOCK) || (flags & O_SHLOCK))
1034 		rights |= CAP_FLOCK;
1035 
1036 	return (rights);
1037 }
1038 
1039 /*
1040  * Check permissions, allocate an open file structure, and call the device
1041  * open routine if any.
1042  */
1043 #ifndef _SYS_SYSPROTO_H_
1044 struct open_args {
1045 	char	*path;
1046 	int	flags;
1047 	int	mode;
1048 };
1049 #endif
1050 int
1051 sys_open(td, uap)
1052 	struct thread *td;
1053 	register struct open_args /* {
1054 		char *path;
1055 		int flags;
1056 		int mode;
1057 	} */ *uap;
1058 {
1059 
1060 	return (kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode));
1061 }
1062 
1063 #ifndef _SYS_SYSPROTO_H_
1064 struct openat_args {
1065 	int	fd;
1066 	char	*path;
1067 	int	flag;
1068 	int	mode;
1069 };
1070 #endif
1071 int
1072 sys_openat(struct thread *td, struct openat_args *uap)
1073 {
1074 
1075 	return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
1076 	    uap->mode));
1077 }
1078 
1079 int
1080 kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
1081     int mode)
1082 {
1083 
1084 	return (kern_openat(td, AT_FDCWD, path, pathseg, flags, mode));
1085 }
1086 
1087 int
1088 kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1089     int flags, int mode)
1090 {
1091 	struct proc *p = td->td_proc;
1092 	struct filedesc *fdp = p->p_fd;
1093 	struct file *fp;
1094 	struct vnode *vp;
1095 	int cmode;
1096 	struct file *nfp;
1097 	int type, indx = -1, error, error_open;
1098 	struct flock lf;
1099 	struct nameidata nd;
1100 	int vfslocked;
1101 	cap_rights_t rights_needed = CAP_LOOKUP;
1102 
1103 	AUDIT_ARG_FFLAGS(flags);
1104 	AUDIT_ARG_MODE(mode);
1105 	/* XXX: audit dirfd */
1106 	rights_needed |= flags_to_rights(flags);
1107 	/*
1108 	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
1109 	 * may be specified.
1110 	 */
1111 	if (flags & O_EXEC) {
1112 		if (flags & O_ACCMODE)
1113 			return (EINVAL);
1114 	} else if ((flags & O_ACCMODE) == O_ACCMODE)
1115 		return (EINVAL);
1116 	else
1117 		flags = FFLAGS(flags);
1118 
1119 	/*
1120 	 * allocate the file descriptor, but don't install a descriptor yet
1121 	 */
1122 	error = falloc_noinstall(td, &nfp);
1123 	if (error)
1124 		return (error);
1125 	/* An extra reference on `nfp' has been held for us by falloc_noinstall(). */
1126 	fp = nfp;
1127 	/* Set the flags early so the finit in devfs can pick them up. */
1128 	fp->f_flag = flags & FMASK;
1129 	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
1130 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | MPSAFE, pathseg,
1131 	    path, fd, rights_needed, td);
1132 	td->td_dupfd = -1;		/* XXX check for fdopen */
1133 	error = vn_open(&nd, &flags, cmode, fp);
1134 	if (error) {
1135 		/*
1136 		 * If the vn_open replaced the method vector, something
1137 		 * wonderous happened deep below and we just pass it up
1138 		 * pretending we know what we do.
1139 		 */
1140 		if (error == ENXIO && fp->f_ops != &badfileops)
1141 			goto success;
1142 
1143 		/*
1144 		 * handle special fdopen() case.  bleh.  dupfdopen() is
1145 		 * responsible for dropping the old contents of ofiles[indx]
1146 		 * if it succeeds.
1147 		 *
1148 		 * Don't do this for relative (capability) lookups; we don't
1149 		 * understand exactly what would happen, and we don't think
1150 		 * that it ever should.
1151 		 */
1152 		if ((nd.ni_strictrelative == 0) &&
1153 		    (error == ENODEV || error == ENXIO) &&
1154 		    (td->td_dupfd >= 0)) {
1155 			/* XXX from fdopen */
1156 			error_open = error;
1157 			if ((error = finstall(td, fp, &indx, flags)) != 0)
1158 				goto bad_unlocked;
1159 			if ((error = dupfdopen(td, fdp, indx, td->td_dupfd,
1160 			    flags, error_open)) == 0)
1161 				goto success;
1162 		}
1163 		/*
1164 		 * Clean up the descriptor, but only if another thread hadn't
1165 		 * replaced or closed it.
1166 		 */
1167 		if (indx != -1)
1168 			fdclose(fdp, fp, indx, td);
1169 		fdrop(fp, td);
1170 
1171 		if (error == ERESTART)
1172 			error = EINTR;
1173 		return (error);
1174 	}
1175 	td->td_dupfd = 0;
1176 	vfslocked = NDHASGIANT(&nd);
1177 	NDFREE(&nd, NDF_ONLY_PNBUF);
1178 	vp = nd.ni_vp;
1179 
1180 	/*
1181 	 * Store the vnode, for any f_type. Typically, the vnode use
1182 	 * count is decremented by direct call to vn_closefile() for
1183 	 * files that switched type in the cdevsw fdopen() method.
1184 	 */
1185 	fp->f_vnode = vp;
1186 	/*
1187 	 * If the file wasn't claimed by devfs bind it to the normal
1188 	 * vnode operations here.
1189 	 */
1190 	if (fp->f_ops == &badfileops) {
1191 		KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
1192 		fp->f_seqcount = 1;
1193 		finit(fp, flags & FMASK, DTYPE_VNODE, vp, &vnops);
1194 	}
1195 
1196 	VOP_UNLOCK(vp, 0);
1197 	if (fp->f_type == DTYPE_VNODE && (flags & (O_EXLOCK | O_SHLOCK)) != 0) {
1198 		lf.l_whence = SEEK_SET;
1199 		lf.l_start = 0;
1200 		lf.l_len = 0;
1201 		if (flags & O_EXLOCK)
1202 			lf.l_type = F_WRLCK;
1203 		else
1204 			lf.l_type = F_RDLCK;
1205 		type = F_FLOCK;
1206 		if ((flags & FNONBLOCK) == 0)
1207 			type |= F_WAIT;
1208 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
1209 			    type)) != 0)
1210 			goto bad;
1211 		atomic_set_int(&fp->f_flag, FHASLOCK);
1212 	}
1213 	if (flags & O_TRUNC) {
1214 		error = fo_truncate(fp, 0, td->td_ucred, td);
1215 		if (error)
1216 			goto bad;
1217 	}
1218 	VFS_UNLOCK_GIANT(vfslocked);
1219 success:
1220 	/*
1221 	 * If we haven't already installed the FD (for dupfdopen), do so now.
1222 	 */
1223 	if (indx == -1) {
1224 #ifdef CAPABILITIES
1225 		if (nd.ni_strictrelative == 1) {
1226 			/*
1227 			 * We are doing a strict relative lookup; wrap the
1228 			 * result in a capability.
1229 			 */
1230 			if ((error = kern_capwrap(td, fp, nd.ni_baserights,
1231 			    &indx)) != 0)
1232 				goto bad_unlocked;
1233 		} else
1234 #endif
1235 			if ((error = finstall(td, fp, &indx, flags)) != 0)
1236 				goto bad_unlocked;
1237 
1238 	}
1239 
1240 	/*
1241 	 * Release our private reference, leaving the one associated with
1242 	 * the descriptor table intact.
1243 	 */
1244 	fdrop(fp, td);
1245 	td->td_retval[0] = indx;
1246 	return (0);
1247 bad:
1248 	VFS_UNLOCK_GIANT(vfslocked);
1249 bad_unlocked:
1250 	if (indx != -1)
1251 		fdclose(fdp, fp, indx, td);
1252 	fdrop(fp, td);
1253 	td->td_retval[0] = -1;
1254 	return (error);
1255 }
1256 
1257 #ifdef COMPAT_43
1258 /*
1259  * Create a file.
1260  */
1261 #ifndef _SYS_SYSPROTO_H_
1262 struct ocreat_args {
1263 	char	*path;
1264 	int	mode;
1265 };
1266 #endif
1267 int
1268 ocreat(td, uap)
1269 	struct thread *td;
1270 	register struct ocreat_args /* {
1271 		char *path;
1272 		int mode;
1273 	} */ *uap;
1274 {
1275 
1276 	return (kern_open(td, uap->path, UIO_USERSPACE,
1277 	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
1278 }
1279 #endif /* COMPAT_43 */
1280 
1281 /*
1282  * Create a special file.
1283  */
1284 #ifndef _SYS_SYSPROTO_H_
1285 struct mknod_args {
1286 	char	*path;
1287 	int	mode;
1288 	int	dev;
1289 };
1290 #endif
1291 int
1292 sys_mknod(td, uap)
1293 	struct thread *td;
1294 	register struct mknod_args /* {
1295 		char *path;
1296 		int mode;
1297 		int dev;
1298 	} */ *uap;
1299 {
1300 
1301 	return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev));
1302 }
1303 
1304 #ifndef _SYS_SYSPROTO_H_
1305 struct mknodat_args {
1306 	int	fd;
1307 	char	*path;
1308 	mode_t	mode;
1309 	dev_t	dev;
1310 };
1311 #endif
1312 int
1313 sys_mknodat(struct thread *td, struct mknodat_args *uap)
1314 {
1315 
1316 	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
1317 	    uap->dev));
1318 }
1319 
1320 int
1321 kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode,
1322     int dev)
1323 {
1324 
1325 	return (kern_mknodat(td, AT_FDCWD, path, pathseg, mode, dev));
1326 }
1327 
1328 int
1329 kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1330     int mode, int dev)
1331 {
1332 	struct vnode *vp;
1333 	struct mount *mp;
1334 	struct vattr vattr;
1335 	int error;
1336 	int whiteout = 0;
1337 	struct nameidata nd;
1338 	int vfslocked;
1339 
1340 	AUDIT_ARG_MODE(mode);
1341 	AUDIT_ARG_DEV(dev);
1342 	switch (mode & S_IFMT) {
1343 	case S_IFCHR:
1344 	case S_IFBLK:
1345 		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
1346 		break;
1347 	case S_IFMT:
1348 		error = priv_check(td, PRIV_VFS_MKNOD_BAD);
1349 		break;
1350 	case S_IFWHT:
1351 		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
1352 		break;
1353 	case S_IFIFO:
1354 		if (dev == 0)
1355 			return (kern_mkfifoat(td, fd, path, pathseg, mode));
1356 		/* FALLTHROUGH */
1357 	default:
1358 		error = EINVAL;
1359 		break;
1360 	}
1361 	if (error)
1362 		return (error);
1363 restart:
1364 	bwillwrite();
1365 	NDINIT_ATRIGHTS(&nd, CREATE,
1366 	    LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, pathseg, path, fd,
1367 	    CAP_MKFIFO, td);
1368 	if ((error = namei(&nd)) != 0)
1369 		return (error);
1370 	vfslocked = NDHASGIANT(&nd);
1371 	vp = nd.ni_vp;
1372 	if (vp != NULL) {
1373 		NDFREE(&nd, NDF_ONLY_PNBUF);
1374 		if (vp == nd.ni_dvp)
1375 			vrele(nd.ni_dvp);
1376 		else
1377 			vput(nd.ni_dvp);
1378 		vrele(vp);
1379 		VFS_UNLOCK_GIANT(vfslocked);
1380 		return (EEXIST);
1381 	} else {
1382 		VATTR_NULL(&vattr);
1383 		vattr.va_mode = (mode & ALLPERMS) &
1384 		    ~td->td_proc->p_fd->fd_cmask;
1385 		vattr.va_rdev = dev;
1386 		whiteout = 0;
1387 
1388 		switch (mode & S_IFMT) {
1389 		case S_IFMT:	/* used by badsect to flag bad sectors */
1390 			vattr.va_type = VBAD;
1391 			break;
1392 		case S_IFCHR:
1393 			vattr.va_type = VCHR;
1394 			break;
1395 		case S_IFBLK:
1396 			vattr.va_type = VBLK;
1397 			break;
1398 		case S_IFWHT:
1399 			whiteout = 1;
1400 			break;
1401 		default:
1402 			panic("kern_mknod: invalid mode");
1403 		}
1404 	}
1405 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1406 		NDFREE(&nd, NDF_ONLY_PNBUF);
1407 		vput(nd.ni_dvp);
1408 		VFS_UNLOCK_GIANT(vfslocked);
1409 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1410 			return (error);
1411 		goto restart;
1412 	}
1413 #ifdef MAC
1414 	if (error == 0 && !whiteout)
1415 		error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
1416 		    &nd.ni_cnd, &vattr);
1417 #endif
1418 	if (!error) {
1419 		if (whiteout)
1420 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
1421 		else {
1422 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
1423 						&nd.ni_cnd, &vattr);
1424 			if (error == 0)
1425 				vput(nd.ni_vp);
1426 		}
1427 	}
1428 	NDFREE(&nd, NDF_ONLY_PNBUF);
1429 	vput(nd.ni_dvp);
1430 	vn_finished_write(mp);
1431 	VFS_UNLOCK_GIANT(vfslocked);
1432 	return (error);
1433 }
1434 
1435 /*
1436  * Create a named pipe.
1437  */
1438 #ifndef _SYS_SYSPROTO_H_
1439 struct mkfifo_args {
1440 	char	*path;
1441 	int	mode;
1442 };
1443 #endif
1444 int
1445 sys_mkfifo(td, uap)
1446 	struct thread *td;
1447 	register struct mkfifo_args /* {
1448 		char *path;
1449 		int mode;
1450 	} */ *uap;
1451 {
1452 
1453 	return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode));
1454 }
1455 
1456 #ifndef _SYS_SYSPROTO_H_
1457 struct mkfifoat_args {
1458 	int	fd;
1459 	char	*path;
1460 	mode_t	mode;
1461 };
1462 #endif
1463 int
1464 sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
1465 {
1466 
1467 	return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
1468 	    uap->mode));
1469 }
1470 
1471 int
1472 kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode)
1473 {
1474 
1475 	return (kern_mkfifoat(td, AT_FDCWD, path, pathseg, mode));
1476 }
1477 
1478 int
1479 kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1480     int mode)
1481 {
1482 	struct mount *mp;
1483 	struct vattr vattr;
1484 	int error;
1485 	struct nameidata nd;
1486 	int vfslocked;
1487 
1488 	AUDIT_ARG_MODE(mode);
1489 restart:
1490 	bwillwrite();
1491 	NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
1492 	    pathseg, path, fd, td);
1493 	if ((error = namei(&nd)) != 0)
1494 		return (error);
1495 	vfslocked = NDHASGIANT(&nd);
1496 	if (nd.ni_vp != NULL) {
1497 		NDFREE(&nd, NDF_ONLY_PNBUF);
1498 		if (nd.ni_vp == nd.ni_dvp)
1499 			vrele(nd.ni_dvp);
1500 		else
1501 			vput(nd.ni_dvp);
1502 		vrele(nd.ni_vp);
1503 		VFS_UNLOCK_GIANT(vfslocked);
1504 		return (EEXIST);
1505 	}
1506 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1507 		NDFREE(&nd, NDF_ONLY_PNBUF);
1508 		vput(nd.ni_dvp);
1509 		VFS_UNLOCK_GIANT(vfslocked);
1510 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1511 			return (error);
1512 		goto restart;
1513 	}
1514 	VATTR_NULL(&vattr);
1515 	vattr.va_type = VFIFO;
1516 	vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
1517 #ifdef MAC
1518 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1519 	    &vattr);
1520 	if (error)
1521 		goto out;
1522 #endif
1523 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1524 	if (error == 0)
1525 		vput(nd.ni_vp);
1526 #ifdef MAC
1527 out:
1528 #endif
1529 	vput(nd.ni_dvp);
1530 	vn_finished_write(mp);
1531 	VFS_UNLOCK_GIANT(vfslocked);
1532 	NDFREE(&nd, NDF_ONLY_PNBUF);
1533 	return (error);
1534 }
1535 
1536 /*
1537  * Make a hard file link.
1538  */
1539 #ifndef _SYS_SYSPROTO_H_
1540 struct link_args {
1541 	char	*path;
1542 	char	*link;
1543 };
1544 #endif
1545 int
1546 sys_link(td, uap)
1547 	struct thread *td;
1548 	register struct link_args /* {
1549 		char *path;
1550 		char *link;
1551 	} */ *uap;
1552 {
1553 
1554 	return (kern_link(td, uap->path, uap->link, UIO_USERSPACE));
1555 }
1556 
1557 #ifndef _SYS_SYSPROTO_H_
1558 struct linkat_args {
1559 	int	fd1;
1560 	char	*path1;
1561 	int	fd2;
1562 	char	*path2;
1563 	int	flag;
1564 };
1565 #endif
1566 int
1567 sys_linkat(struct thread *td, struct linkat_args *uap)
1568 {
1569 	int flag;
1570 
1571 	flag = uap->flag;
1572 	if (flag & ~AT_SYMLINK_FOLLOW)
1573 		return (EINVAL);
1574 
1575 	return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
1576 	    UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW));
1577 }
1578 
1579 int hardlink_check_uid = 0;
1580 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
1581     &hardlink_check_uid, 0,
1582     "Unprivileged processes cannot create hard links to files owned by other "
1583     "users");
1584 static int hardlink_check_gid = 0;
1585 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
1586     &hardlink_check_gid, 0,
1587     "Unprivileged processes cannot create hard links to files owned by other "
1588     "groups");
1589 
1590 static int
1591 can_hardlink(struct vnode *vp, struct ucred *cred)
1592 {
1593 	struct vattr va;
1594 	int error;
1595 
1596 	if (!hardlink_check_uid && !hardlink_check_gid)
1597 		return (0);
1598 
1599 	error = VOP_GETATTR(vp, &va, cred);
1600 	if (error != 0)
1601 		return (error);
1602 
1603 	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
1604 		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1605 		if (error)
1606 			return (error);
1607 	}
1608 
1609 	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
1610 		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1611 		if (error)
1612 			return (error);
1613 	}
1614 
1615 	return (0);
1616 }
1617 
1618 int
1619 kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg)
1620 {
1621 
1622 	return (kern_linkat(td, AT_FDCWD, AT_FDCWD, path,link, segflg, FOLLOW));
1623 }
1624 
1625 int
1626 kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
1627     enum uio_seg segflg, int follow)
1628 {
1629 	struct vnode *vp;
1630 	struct mount *mp;
1631 	struct nameidata nd;
1632 	int vfslocked;
1633 	int lvfslocked;
1634 	int error;
1635 
1636 	bwillwrite();
1637 	NDINIT_AT(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, segflg, path1,
1638 	    fd1, td);
1639 
1640 	if ((error = namei(&nd)) != 0)
1641 		return (error);
1642 	vfslocked = NDHASGIANT(&nd);
1643 	NDFREE(&nd, NDF_ONLY_PNBUF);
1644 	vp = nd.ni_vp;
1645 	if (vp->v_type == VDIR) {
1646 		vrele(vp);
1647 		VFS_UNLOCK_GIANT(vfslocked);
1648 		return (EPERM);		/* POSIX */
1649 	}
1650 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
1651 		vrele(vp);
1652 		VFS_UNLOCK_GIANT(vfslocked);
1653 		return (error);
1654 	}
1655 	NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE2,
1656 	    segflg, path2, fd2, td);
1657 	if ((error = namei(&nd)) == 0) {
1658 		lvfslocked = NDHASGIANT(&nd);
1659 		if (nd.ni_vp != NULL) {
1660 			if (nd.ni_dvp == nd.ni_vp)
1661 				vrele(nd.ni_dvp);
1662 			else
1663 				vput(nd.ni_dvp);
1664 			vrele(nd.ni_vp);
1665 			error = EEXIST;
1666 		} else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY))
1667 		    == 0) {
1668 			error = can_hardlink(vp, td->td_ucred);
1669 			if (error == 0)
1670 #ifdef MAC
1671 				error = mac_vnode_check_link(td->td_ucred,
1672 				    nd.ni_dvp, vp, &nd.ni_cnd);
1673 			if (error == 0)
1674 #endif
1675 				error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
1676 			VOP_UNLOCK(vp, 0);
1677 			vput(nd.ni_dvp);
1678 		}
1679 		NDFREE(&nd, NDF_ONLY_PNBUF);
1680 		VFS_UNLOCK_GIANT(lvfslocked);
1681 	}
1682 	vrele(vp);
1683 	vn_finished_write(mp);
1684 	VFS_UNLOCK_GIANT(vfslocked);
1685 	return (error);
1686 }
1687 
1688 /*
1689  * Make a symbolic link.
1690  */
1691 #ifndef _SYS_SYSPROTO_H_
1692 struct symlink_args {
1693 	char	*path;
1694 	char	*link;
1695 };
1696 #endif
1697 int
1698 sys_symlink(td, uap)
1699 	struct thread *td;
1700 	register struct symlink_args /* {
1701 		char *path;
1702 		char *link;
1703 	} */ *uap;
1704 {
1705 
1706 	return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE));
1707 }
1708 
1709 #ifndef _SYS_SYSPROTO_H_
1710 struct symlinkat_args {
1711 	char	*path;
1712 	int	fd;
1713 	char	*path2;
1714 };
1715 #endif
1716 int
1717 sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
1718 {
1719 
1720 	return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
1721 	    UIO_USERSPACE));
1722 }
1723 
1724 int
1725 kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg)
1726 {
1727 
1728 	return (kern_symlinkat(td, path, AT_FDCWD, link, segflg));
1729 }
1730 
1731 int
1732 kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
1733     enum uio_seg segflg)
1734 {
1735 	struct mount *mp;
1736 	struct vattr vattr;
1737 	char *syspath;
1738 	int error;
1739 	struct nameidata nd;
1740 	int vfslocked;
1741 
1742 	if (segflg == UIO_SYSSPACE) {
1743 		syspath = path1;
1744 	} else {
1745 		syspath = uma_zalloc(namei_zone, M_WAITOK);
1746 		if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
1747 			goto out;
1748 	}
1749 	AUDIT_ARG_TEXT(syspath);
1750 restart:
1751 	bwillwrite();
1752 	NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
1753 	    segflg, path2, fd, td);
1754 	if ((error = namei(&nd)) != 0)
1755 		goto out;
1756 	vfslocked = NDHASGIANT(&nd);
1757 	if (nd.ni_vp) {
1758 		NDFREE(&nd, NDF_ONLY_PNBUF);
1759 		if (nd.ni_vp == nd.ni_dvp)
1760 			vrele(nd.ni_dvp);
1761 		else
1762 			vput(nd.ni_dvp);
1763 		vrele(nd.ni_vp);
1764 		VFS_UNLOCK_GIANT(vfslocked);
1765 		error = EEXIST;
1766 		goto out;
1767 	}
1768 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1769 		NDFREE(&nd, NDF_ONLY_PNBUF);
1770 		vput(nd.ni_dvp);
1771 		VFS_UNLOCK_GIANT(vfslocked);
1772 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1773 			goto out;
1774 		goto restart;
1775 	}
1776 	VATTR_NULL(&vattr);
1777 	vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
1778 #ifdef MAC
1779 	vattr.va_type = VLNK;
1780 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1781 	    &vattr);
1782 	if (error)
1783 		goto out2;
1784 #endif
1785 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
1786 	if (error == 0)
1787 		vput(nd.ni_vp);
1788 #ifdef MAC
1789 out2:
1790 #endif
1791 	NDFREE(&nd, NDF_ONLY_PNBUF);
1792 	vput(nd.ni_dvp);
1793 	vn_finished_write(mp);
1794 	VFS_UNLOCK_GIANT(vfslocked);
1795 out:
1796 	if (segflg != UIO_SYSSPACE)
1797 		uma_zfree(namei_zone, syspath);
1798 	return (error);
1799 }
1800 
1801 /*
1802  * Delete a whiteout from the filesystem.
1803  */
1804 int
1805 sys_undelete(td, uap)
1806 	struct thread *td;
1807 	register struct undelete_args /* {
1808 		char *path;
1809 	} */ *uap;
1810 {
1811 	int error;
1812 	struct mount *mp;
1813 	struct nameidata nd;
1814 	int vfslocked;
1815 
1816 restart:
1817 	bwillwrite();
1818 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | MPSAFE | AUDITVNODE1,
1819 	    UIO_USERSPACE, uap->path, td);
1820 	error = namei(&nd);
1821 	if (error)
1822 		return (error);
1823 	vfslocked = NDHASGIANT(&nd);
1824 
1825 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
1826 		NDFREE(&nd, NDF_ONLY_PNBUF);
1827 		if (nd.ni_vp == nd.ni_dvp)
1828 			vrele(nd.ni_dvp);
1829 		else
1830 			vput(nd.ni_dvp);
1831 		if (nd.ni_vp)
1832 			vrele(nd.ni_vp);
1833 		VFS_UNLOCK_GIANT(vfslocked);
1834 		return (EEXIST);
1835 	}
1836 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1837 		NDFREE(&nd, NDF_ONLY_PNBUF);
1838 		vput(nd.ni_dvp);
1839 		VFS_UNLOCK_GIANT(vfslocked);
1840 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1841 			return (error);
1842 		goto restart;
1843 	}
1844 	error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
1845 	NDFREE(&nd, NDF_ONLY_PNBUF);
1846 	vput(nd.ni_dvp);
1847 	vn_finished_write(mp);
1848 	VFS_UNLOCK_GIANT(vfslocked);
1849 	return (error);
1850 }
1851 
1852 /*
1853  * Delete a name from the filesystem.
1854  */
1855 #ifndef _SYS_SYSPROTO_H_
1856 struct unlink_args {
1857 	char	*path;
1858 };
1859 #endif
1860 int
1861 sys_unlink(td, uap)
1862 	struct thread *td;
1863 	struct unlink_args /* {
1864 		char *path;
1865 	} */ *uap;
1866 {
1867 
1868 	return (kern_unlink(td, uap->path, UIO_USERSPACE));
1869 }
1870 
1871 #ifndef _SYS_SYSPROTO_H_
1872 struct unlinkat_args {
1873 	int	fd;
1874 	char	*path;
1875 	int	flag;
1876 };
1877 #endif
1878 int
1879 sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
1880 {
1881 	int flag = uap->flag;
1882 	int fd = uap->fd;
1883 	char *path = uap->path;
1884 
1885 	if (flag & ~AT_REMOVEDIR)
1886 		return (EINVAL);
1887 
1888 	if (flag & AT_REMOVEDIR)
1889 		return (kern_rmdirat(td, fd, path, UIO_USERSPACE));
1890 	else
1891 		return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0));
1892 }
1893 
1894 int
1895 kern_unlink(struct thread *td, char *path, enum uio_seg pathseg)
1896 {
1897 
1898 	return (kern_unlinkat(td, AT_FDCWD, path, pathseg, 0));
1899 }
1900 
1901 int
1902 kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1903     ino_t oldinum)
1904 {
1905 	struct mount *mp;
1906 	struct vnode *vp;
1907 	int error;
1908 	struct nameidata nd;
1909 	struct stat sb;
1910 	int vfslocked;
1911 
1912 restart:
1913 	bwillwrite();
1914 	NDINIT_AT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1,
1915 	    pathseg, path, fd, td);
1916 	if ((error = namei(&nd)) != 0)
1917 		return (error == EINVAL ? EPERM : error);
1918 	vfslocked = NDHASGIANT(&nd);
1919 	vp = nd.ni_vp;
1920 	if (vp->v_type == VDIR && oldinum == 0) {
1921 		error = EPERM;		/* POSIX */
1922 	} else if (oldinum != 0 &&
1923 		  ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
1924 		  sb.st_ino != oldinum) {
1925 			error = EIDRM;	/* Identifier removed */
1926 	} else {
1927 		/*
1928 		 * The root of a mounted filesystem cannot be deleted.
1929 		 *
1930 		 * XXX: can this only be a VDIR case?
1931 		 */
1932 		if (vp->v_vflag & VV_ROOT)
1933 			error = EBUSY;
1934 	}
1935 	if (error == 0) {
1936 		if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1937 			NDFREE(&nd, NDF_ONLY_PNBUF);
1938 			vput(nd.ni_dvp);
1939 			if (vp == nd.ni_dvp)
1940 				vrele(vp);
1941 			else
1942 				vput(vp);
1943 			VFS_UNLOCK_GIANT(vfslocked);
1944 			if ((error = vn_start_write(NULL, &mp,
1945 			    V_XSLEEP | PCATCH)) != 0)
1946 				return (error);
1947 			goto restart;
1948 		}
1949 #ifdef MAC
1950 		error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
1951 		    &nd.ni_cnd);
1952 		if (error)
1953 			goto out;
1954 #endif
1955 		error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
1956 #ifdef MAC
1957 out:
1958 #endif
1959 		vn_finished_write(mp);
1960 	}
1961 	NDFREE(&nd, NDF_ONLY_PNBUF);
1962 	vput(nd.ni_dvp);
1963 	if (vp == nd.ni_dvp)
1964 		vrele(vp);
1965 	else
1966 		vput(vp);
1967 	VFS_UNLOCK_GIANT(vfslocked);
1968 	return (error);
1969 }
1970 
1971 /*
1972  * Reposition read/write file offset.
1973  */
1974 #ifndef _SYS_SYSPROTO_H_
1975 struct lseek_args {
1976 	int	fd;
1977 	int	pad;
1978 	off_t	offset;
1979 	int	whence;
1980 };
1981 #endif
1982 int
1983 sys_lseek(td, uap)
1984 	struct thread *td;
1985 	register struct lseek_args /* {
1986 		int fd;
1987 		int pad;
1988 		off_t offset;
1989 		int whence;
1990 	} */ *uap;
1991 {
1992 	struct ucred *cred = td->td_ucred;
1993 	struct file *fp;
1994 	struct vnode *vp;
1995 	struct vattr vattr;
1996 	off_t offset, size;
1997 	int error, noneg;
1998 	int vfslocked;
1999 
2000 	AUDIT_ARG_FD(uap->fd);
2001 	if ((error = fget(td, uap->fd, CAP_SEEK, &fp)) != 0)
2002 		return (error);
2003 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) {
2004 		fdrop(fp, td);
2005 		return (ESPIPE);
2006 	}
2007 	vp = fp->f_vnode;
2008 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
2009 	noneg = (vp->v_type != VCHR);
2010 	offset = uap->offset;
2011 	switch (uap->whence) {
2012 	case L_INCR:
2013 		if (noneg &&
2014 		    (fp->f_offset < 0 ||
2015 		    (offset > 0 && fp->f_offset > OFF_MAX - offset))) {
2016 			error = EOVERFLOW;
2017 			break;
2018 		}
2019 		offset += fp->f_offset;
2020 		break;
2021 	case L_XTND:
2022 		vn_lock(vp, LK_SHARED | LK_RETRY);
2023 		error = VOP_GETATTR(vp, &vattr, cred);
2024 		VOP_UNLOCK(vp, 0);
2025 		if (error)
2026 			break;
2027 
2028 		/*
2029 		 * If the file references a disk device, then fetch
2030 		 * the media size and use that to determine the ending
2031 		 * offset.
2032 		 */
2033 		if (vattr.va_size == 0 && vp->v_type == VCHR &&
2034 		    fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
2035 			vattr.va_size = size;
2036 		if (noneg &&
2037 		    (vattr.va_size > OFF_MAX ||
2038 		    (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
2039 			error = EOVERFLOW;
2040 			break;
2041 		}
2042 		offset += vattr.va_size;
2043 		break;
2044 	case L_SET:
2045 		break;
2046 	case SEEK_DATA:
2047 		error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
2048 		break;
2049 	case SEEK_HOLE:
2050 		error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
2051 		break;
2052 	default:
2053 		error = EINVAL;
2054 	}
2055 	if (error == 0 && noneg && offset < 0)
2056 		error = EINVAL;
2057 	if (error != 0)
2058 		goto drop;
2059 	fp->f_offset = offset;
2060 	VFS_KNOTE_UNLOCKED(vp, 0);
2061 	*(off_t *)(td->td_retval) = fp->f_offset;
2062 drop:
2063 	fdrop(fp, td);
2064 	VFS_UNLOCK_GIANT(vfslocked);
2065 	return (error);
2066 }
2067 
2068 #if defined(COMPAT_43)
2069 /*
2070  * Reposition read/write file offset.
2071  */
2072 #ifndef _SYS_SYSPROTO_H_
2073 struct olseek_args {
2074 	int	fd;
2075 	long	offset;
2076 	int	whence;
2077 };
2078 #endif
2079 int
2080 olseek(td, uap)
2081 	struct thread *td;
2082 	register struct olseek_args /* {
2083 		int fd;
2084 		long offset;
2085 		int whence;
2086 	} */ *uap;
2087 {
2088 	struct lseek_args /* {
2089 		int fd;
2090 		int pad;
2091 		off_t offset;
2092 		int whence;
2093 	} */ nuap;
2094 
2095 	nuap.fd = uap->fd;
2096 	nuap.offset = uap->offset;
2097 	nuap.whence = uap->whence;
2098 	return (sys_lseek(td, &nuap));
2099 }
2100 #endif /* COMPAT_43 */
2101 
2102 /* Version with the 'pad' argument */
2103 int
2104 freebsd6_lseek(td, uap)
2105 	struct thread *td;
2106 	register struct freebsd6_lseek_args *uap;
2107 {
2108 	struct lseek_args ouap;
2109 
2110 	ouap.fd = uap->fd;
2111 	ouap.offset = uap->offset;
2112 	ouap.whence = uap->whence;
2113 	return (sys_lseek(td, &ouap));
2114 }
2115 
2116 /*
2117  * Check access permissions using passed credentials.
2118  */
2119 static int
2120 vn_access(vp, user_flags, cred, td)
2121 	struct vnode	*vp;
2122 	int		user_flags;
2123 	struct ucred	*cred;
2124 	struct thread	*td;
2125 {
2126 	int error;
2127 	accmode_t accmode;
2128 
2129 	/* Flags == 0 means only check for existence. */
2130 	error = 0;
2131 	if (user_flags) {
2132 		accmode = 0;
2133 		if (user_flags & R_OK)
2134 			accmode |= VREAD;
2135 		if (user_flags & W_OK)
2136 			accmode |= VWRITE;
2137 		if (user_flags & X_OK)
2138 			accmode |= VEXEC;
2139 #ifdef MAC
2140 		error = mac_vnode_check_access(cred, vp, accmode);
2141 		if (error)
2142 			return (error);
2143 #endif
2144 		if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
2145 			error = VOP_ACCESS(vp, accmode, cred, td);
2146 	}
2147 	return (error);
2148 }
2149 
2150 /*
2151  * Check access permissions using "real" credentials.
2152  */
2153 #ifndef _SYS_SYSPROTO_H_
2154 struct access_args {
2155 	char	*path;
2156 	int	amode;
2157 };
2158 #endif
2159 int
2160 sys_access(td, uap)
2161 	struct thread *td;
2162 	register struct access_args /* {
2163 		char *path;
2164 		int amode;
2165 	} */ *uap;
2166 {
2167 
2168 	return (kern_access(td, uap->path, UIO_USERSPACE, uap->amode));
2169 }
2170 
2171 #ifndef _SYS_SYSPROTO_H_
2172 struct faccessat_args {
2173 	int	dirfd;
2174 	char	*path;
2175 	int	amode;
2176 	int	flag;
2177 }
2178 #endif
2179 int
2180 sys_faccessat(struct thread *td, struct faccessat_args *uap)
2181 {
2182 
2183 	if (uap->flag & ~AT_EACCESS)
2184 		return (EINVAL);
2185 	return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
2186 	    uap->amode));
2187 }
2188 
2189 int
2190 kern_access(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2191 {
2192 
2193 	return (kern_accessat(td, AT_FDCWD, path, pathseg, 0, amode));
2194 }
2195 
2196 int
2197 kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2198     int flag, int amode)
2199 {
2200 	struct ucred *cred, *tmpcred;
2201 	struct vnode *vp;
2202 	struct nameidata nd;
2203 	int vfslocked;
2204 	int error;
2205 
2206 	/*
2207 	 * Create and modify a temporary credential instead of one that
2208 	 * is potentially shared.
2209 	 */
2210 	if (!(flag & AT_EACCESS)) {
2211 		cred = td->td_ucred;
2212 		tmpcred = crdup(cred);
2213 		tmpcred->cr_uid = cred->cr_ruid;
2214 		tmpcred->cr_groups[0] = cred->cr_rgid;
2215 		td->td_ucred = tmpcred;
2216 	} else
2217 		cred = tmpcred = td->td_ucred;
2218 	AUDIT_ARG_VALUE(amode);
2219 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
2220 	    AUDITVNODE1, pathseg, path, fd, CAP_FSTAT, td);
2221 	if ((error = namei(&nd)) != 0)
2222 		goto out1;
2223 	vfslocked = NDHASGIANT(&nd);
2224 	vp = nd.ni_vp;
2225 
2226 	error = vn_access(vp, amode, tmpcred, td);
2227 	NDFREE(&nd, NDF_ONLY_PNBUF);
2228 	vput(vp);
2229 	VFS_UNLOCK_GIANT(vfslocked);
2230 out1:
2231 	if (!(flag & AT_EACCESS)) {
2232 		td->td_ucred = cred;
2233 		crfree(tmpcred);
2234 	}
2235 	return (error);
2236 }
2237 
2238 /*
2239  * Check access permissions using "effective" credentials.
2240  */
2241 #ifndef _SYS_SYSPROTO_H_
2242 struct eaccess_args {
2243 	char	*path;
2244 	int	amode;
2245 };
2246 #endif
2247 int
2248 sys_eaccess(td, uap)
2249 	struct thread *td;
2250 	register struct eaccess_args /* {
2251 		char *path;
2252 		int amode;
2253 	} */ *uap;
2254 {
2255 
2256 	return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->amode));
2257 }
2258 
2259 int
2260 kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2261 {
2262 
2263 	return (kern_accessat(td, AT_FDCWD, path, pathseg, AT_EACCESS, amode));
2264 }
2265 
2266 #if defined(COMPAT_43)
2267 /*
2268  * Get file status; this version follows links.
2269  */
2270 #ifndef _SYS_SYSPROTO_H_
2271 struct ostat_args {
2272 	char	*path;
2273 	struct ostat *ub;
2274 };
2275 #endif
2276 int
2277 ostat(td, uap)
2278 	struct thread *td;
2279 	register struct ostat_args /* {
2280 		char *path;
2281 		struct ostat *ub;
2282 	} */ *uap;
2283 {
2284 	struct stat sb;
2285 	struct ostat osb;
2286 	int error;
2287 
2288 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2289 	if (error)
2290 		return (error);
2291 	cvtstat(&sb, &osb);
2292 	error = copyout(&osb, uap->ub, sizeof (osb));
2293 	return (error);
2294 }
2295 
2296 /*
2297  * Get file status; this version does not follow links.
2298  */
2299 #ifndef _SYS_SYSPROTO_H_
2300 struct olstat_args {
2301 	char	*path;
2302 	struct ostat *ub;
2303 };
2304 #endif
2305 int
2306 olstat(td, uap)
2307 	struct thread *td;
2308 	register struct olstat_args /* {
2309 		char *path;
2310 		struct ostat *ub;
2311 	} */ *uap;
2312 {
2313 	struct stat sb;
2314 	struct ostat osb;
2315 	int error;
2316 
2317 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2318 	if (error)
2319 		return (error);
2320 	cvtstat(&sb, &osb);
2321 	error = copyout(&osb, uap->ub, sizeof (osb));
2322 	return (error);
2323 }
2324 
2325 /*
2326  * Convert from an old to a new stat structure.
2327  */
2328 void
2329 cvtstat(st, ost)
2330 	struct stat *st;
2331 	struct ostat *ost;
2332 {
2333 
2334 	ost->st_dev = st->st_dev;
2335 	ost->st_ino = st->st_ino;
2336 	ost->st_mode = st->st_mode;
2337 	ost->st_nlink = st->st_nlink;
2338 	ost->st_uid = st->st_uid;
2339 	ost->st_gid = st->st_gid;
2340 	ost->st_rdev = st->st_rdev;
2341 	if (st->st_size < (quad_t)1 << 32)
2342 		ost->st_size = st->st_size;
2343 	else
2344 		ost->st_size = -2;
2345 	ost->st_atim = st->st_atim;
2346 	ost->st_mtim = st->st_mtim;
2347 	ost->st_ctim = st->st_ctim;
2348 	ost->st_blksize = st->st_blksize;
2349 	ost->st_blocks = st->st_blocks;
2350 	ost->st_flags = st->st_flags;
2351 	ost->st_gen = st->st_gen;
2352 }
2353 #endif /* COMPAT_43 */
2354 
2355 /*
2356  * Get file status; this version follows links.
2357  */
2358 #ifndef _SYS_SYSPROTO_H_
2359 struct stat_args {
2360 	char	*path;
2361 	struct stat *ub;
2362 };
2363 #endif
2364 int
2365 sys_stat(td, uap)
2366 	struct thread *td;
2367 	register struct stat_args /* {
2368 		char *path;
2369 		struct stat *ub;
2370 	} */ *uap;
2371 {
2372 	struct stat sb;
2373 	int error;
2374 
2375 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2376 	if (error == 0)
2377 		error = copyout(&sb, uap->ub, sizeof (sb));
2378 	return (error);
2379 }
2380 
2381 #ifndef _SYS_SYSPROTO_H_
2382 struct fstatat_args {
2383 	int	fd;
2384 	char	*path;
2385 	struct stat	*buf;
2386 	int	flag;
2387 }
2388 #endif
2389 int
2390 sys_fstatat(struct thread *td, struct fstatat_args *uap)
2391 {
2392 	struct stat sb;
2393 	int error;
2394 
2395 	error = kern_statat(td, uap->flag, uap->fd, uap->path,
2396 	    UIO_USERSPACE, &sb);
2397 	if (error == 0)
2398 		error = copyout(&sb, uap->buf, sizeof (sb));
2399 	return (error);
2400 }
2401 
2402 int
2403 kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2404 {
2405 
2406 	return (kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp));
2407 }
2408 
2409 int
2410 kern_statat(struct thread *td, int flag, int fd, char *path,
2411     enum uio_seg pathseg, struct stat *sbp)
2412 {
2413 
2414 	return (kern_statat_vnhook(td, flag, fd, path, pathseg, sbp, NULL));
2415 }
2416 
2417 int
2418 kern_statat_vnhook(struct thread *td, int flag, int fd, char *path,
2419     enum uio_seg pathseg, struct stat *sbp,
2420     void (*hook)(struct vnode *vp, struct stat *sbp))
2421 {
2422 	struct nameidata nd;
2423 	struct stat sb;
2424 	int error, vfslocked;
2425 
2426 	if (flag & ~AT_SYMLINK_NOFOLLOW)
2427 		return (EINVAL);
2428 
2429 	NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
2430 	    FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1 | MPSAFE, pathseg,
2431 	    path, fd, CAP_FSTAT, td);
2432 
2433 	if ((error = namei(&nd)) != 0)
2434 		return (error);
2435 	vfslocked = NDHASGIANT(&nd);
2436 	error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
2437 	if (!error) {
2438 		SDT_PROBE(vfs, , stat, mode, path, sb.st_mode, 0, 0, 0);
2439 		if (S_ISREG(sb.st_mode))
2440 			SDT_PROBE(vfs, , stat, reg, path, pathseg, 0, 0, 0);
2441 		if (__predict_false(hook != NULL))
2442 			hook(nd.ni_vp, &sb);
2443 	}
2444 	NDFREE(&nd, NDF_ONLY_PNBUF);
2445 	vput(nd.ni_vp);
2446 	VFS_UNLOCK_GIANT(vfslocked);
2447 	if (error)
2448 		return (error);
2449 	*sbp = sb;
2450 #ifdef KTRACE
2451 	if (KTRPOINT(td, KTR_STRUCT))
2452 		ktrstat(&sb);
2453 #endif
2454 	return (0);
2455 }
2456 
2457 /*
2458  * Get file status; this version does not follow links.
2459  */
2460 #ifndef _SYS_SYSPROTO_H_
2461 struct lstat_args {
2462 	char	*path;
2463 	struct stat *ub;
2464 };
2465 #endif
2466 int
2467 sys_lstat(td, uap)
2468 	struct thread *td;
2469 	register struct lstat_args /* {
2470 		char *path;
2471 		struct stat *ub;
2472 	} */ *uap;
2473 {
2474 	struct stat sb;
2475 	int error;
2476 
2477 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2478 	if (error == 0)
2479 		error = copyout(&sb, uap->ub, sizeof (sb));
2480 	return (error);
2481 }
2482 
2483 int
2484 kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2485 {
2486 
2487 	return (kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg,
2488 	    sbp));
2489 }
2490 
2491 /*
2492  * Implementation of the NetBSD [l]stat() functions.
2493  */
2494 void
2495 cvtnstat(sb, nsb)
2496 	struct stat *sb;
2497 	struct nstat *nsb;
2498 {
2499 	bzero(nsb, sizeof *nsb);
2500 	nsb->st_dev = sb->st_dev;
2501 	nsb->st_ino = sb->st_ino;
2502 	nsb->st_mode = sb->st_mode;
2503 	nsb->st_nlink = sb->st_nlink;
2504 	nsb->st_uid = sb->st_uid;
2505 	nsb->st_gid = sb->st_gid;
2506 	nsb->st_rdev = sb->st_rdev;
2507 	nsb->st_atim = sb->st_atim;
2508 	nsb->st_mtim = sb->st_mtim;
2509 	nsb->st_ctim = sb->st_ctim;
2510 	nsb->st_size = sb->st_size;
2511 	nsb->st_blocks = sb->st_blocks;
2512 	nsb->st_blksize = sb->st_blksize;
2513 	nsb->st_flags = sb->st_flags;
2514 	nsb->st_gen = sb->st_gen;
2515 	nsb->st_birthtim = sb->st_birthtim;
2516 }
2517 
2518 #ifndef _SYS_SYSPROTO_H_
2519 struct nstat_args {
2520 	char	*path;
2521 	struct nstat *ub;
2522 };
2523 #endif
2524 int
2525 sys_nstat(td, uap)
2526 	struct thread *td;
2527 	register struct nstat_args /* {
2528 		char *path;
2529 		struct nstat *ub;
2530 	} */ *uap;
2531 {
2532 	struct stat sb;
2533 	struct nstat nsb;
2534 	int error;
2535 
2536 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2537 	if (error)
2538 		return (error);
2539 	cvtnstat(&sb, &nsb);
2540 	error = copyout(&nsb, uap->ub, sizeof (nsb));
2541 	return (error);
2542 }
2543 
2544 /*
2545  * NetBSD lstat.  Get file status; this version does not follow links.
2546  */
2547 #ifndef _SYS_SYSPROTO_H_
2548 struct lstat_args {
2549 	char	*path;
2550 	struct stat *ub;
2551 };
2552 #endif
2553 int
2554 sys_nlstat(td, uap)
2555 	struct thread *td;
2556 	register struct nlstat_args /* {
2557 		char *path;
2558 		struct nstat *ub;
2559 	} */ *uap;
2560 {
2561 	struct stat sb;
2562 	struct nstat nsb;
2563 	int error;
2564 
2565 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2566 	if (error)
2567 		return (error);
2568 	cvtnstat(&sb, &nsb);
2569 	error = copyout(&nsb, uap->ub, sizeof (nsb));
2570 	return (error);
2571 }
2572 
2573 /*
2574  * Get configurable pathname variables.
2575  */
2576 #ifndef _SYS_SYSPROTO_H_
2577 struct pathconf_args {
2578 	char	*path;
2579 	int	name;
2580 };
2581 #endif
2582 int
2583 sys_pathconf(td, uap)
2584 	struct thread *td;
2585 	register struct pathconf_args /* {
2586 		char *path;
2587 		int name;
2588 	} */ *uap;
2589 {
2590 
2591 	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW));
2592 }
2593 
2594 #ifndef _SYS_SYSPROTO_H_
2595 struct lpathconf_args {
2596 	char	*path;
2597 	int	name;
2598 };
2599 #endif
2600 int
2601 sys_lpathconf(td, uap)
2602 	struct thread *td;
2603 	register struct lpathconf_args /* {
2604 		char *path;
2605 		int name;
2606 	} */ *uap;
2607 {
2608 
2609 	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, NOFOLLOW));
2610 }
2611 
2612 int
2613 kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
2614     u_long flags)
2615 {
2616 	struct nameidata nd;
2617 	int error, vfslocked;
2618 
2619 	NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1 |
2620 	    flags, pathseg, path, td);
2621 	if ((error = namei(&nd)) != 0)
2622 		return (error);
2623 	vfslocked = NDHASGIANT(&nd);
2624 	NDFREE(&nd, NDF_ONLY_PNBUF);
2625 
2626 	/* If asynchronous I/O is available, it works for all files. */
2627 	if (name == _PC_ASYNC_IO)
2628 		td->td_retval[0] = async_io_version;
2629 	else
2630 		error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
2631 	vput(nd.ni_vp);
2632 	VFS_UNLOCK_GIANT(vfslocked);
2633 	return (error);
2634 }
2635 
2636 /*
2637  * Return target name of a symbolic link.
2638  */
2639 #ifndef _SYS_SYSPROTO_H_
2640 struct readlink_args {
2641 	char	*path;
2642 	char	*buf;
2643 	size_t	count;
2644 };
2645 #endif
2646 int
2647 sys_readlink(td, uap)
2648 	struct thread *td;
2649 	register struct readlink_args /* {
2650 		char *path;
2651 		char *buf;
2652 		size_t count;
2653 	} */ *uap;
2654 {
2655 
2656 	return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf,
2657 	    UIO_USERSPACE, uap->count));
2658 }
2659 #ifndef _SYS_SYSPROTO_H_
2660 struct readlinkat_args {
2661 	int	fd;
2662 	char	*path;
2663 	char	*buf;
2664 	size_t	bufsize;
2665 };
2666 #endif
2667 int
2668 sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
2669 {
2670 
2671 	return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
2672 	    uap->buf, UIO_USERSPACE, uap->bufsize));
2673 }
2674 
2675 int
2676 kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf,
2677     enum uio_seg bufseg, size_t count)
2678 {
2679 
2680 	return (kern_readlinkat(td, AT_FDCWD, path, pathseg, buf, bufseg,
2681 	    count));
2682 }
2683 
2684 int
2685 kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2686     char *buf, enum uio_seg bufseg, size_t count)
2687 {
2688 	struct vnode *vp;
2689 	struct iovec aiov;
2690 	struct uio auio;
2691 	int error;
2692 	struct nameidata nd;
2693 	int vfslocked;
2694 
2695 	if (count > IOSIZE_MAX)
2696 		return (EINVAL);
2697 
2698 	NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
2699 	    AUDITVNODE1, pathseg, path, fd, td);
2700 
2701 	if ((error = namei(&nd)) != 0)
2702 		return (error);
2703 	NDFREE(&nd, NDF_ONLY_PNBUF);
2704 	vfslocked = NDHASGIANT(&nd);
2705 	vp = nd.ni_vp;
2706 #ifdef MAC
2707 	error = mac_vnode_check_readlink(td->td_ucred, vp);
2708 	if (error) {
2709 		vput(vp);
2710 		VFS_UNLOCK_GIANT(vfslocked);
2711 		return (error);
2712 	}
2713 #endif
2714 	if (vp->v_type != VLNK)
2715 		error = EINVAL;
2716 	else {
2717 		aiov.iov_base = buf;
2718 		aiov.iov_len = count;
2719 		auio.uio_iov = &aiov;
2720 		auio.uio_iovcnt = 1;
2721 		auio.uio_offset = 0;
2722 		auio.uio_rw = UIO_READ;
2723 		auio.uio_segflg = bufseg;
2724 		auio.uio_td = td;
2725 		auio.uio_resid = count;
2726 		error = VOP_READLINK(vp, &auio, td->td_ucred);
2727 	}
2728 	vput(vp);
2729 	VFS_UNLOCK_GIANT(vfslocked);
2730 	td->td_retval[0] = count - auio.uio_resid;
2731 	return (error);
2732 }
2733 
2734 /*
2735  * Common implementation code for chflags() and fchflags().
2736  */
2737 static int
2738 setfflags(td, vp, flags)
2739 	struct thread *td;
2740 	struct vnode *vp;
2741 	int flags;
2742 {
2743 	int error;
2744 	struct mount *mp;
2745 	struct vattr vattr;
2746 
2747 	/* We can't support the value matching VNOVAL. */
2748 	if (flags == VNOVAL)
2749 		return (EOPNOTSUPP);
2750 
2751 	/*
2752 	 * Prevent non-root users from setting flags on devices.  When
2753 	 * a device is reused, users can retain ownership of the device
2754 	 * if they are allowed to set flags and programs assume that
2755 	 * chown can't fail when done as root.
2756 	 */
2757 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
2758 		error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
2759 		if (error)
2760 			return (error);
2761 	}
2762 
2763 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2764 		return (error);
2765 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2766 	VATTR_NULL(&vattr);
2767 	vattr.va_flags = flags;
2768 #ifdef MAC
2769 	error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
2770 	if (error == 0)
2771 #endif
2772 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2773 	VOP_UNLOCK(vp, 0);
2774 	vn_finished_write(mp);
2775 	return (error);
2776 }
2777 
2778 /*
2779  * Change flags of a file given a path name.
2780  */
2781 #ifndef _SYS_SYSPROTO_H_
2782 struct chflags_args {
2783 	char	*path;
2784 	int	flags;
2785 };
2786 #endif
2787 int
2788 sys_chflags(td, uap)
2789 	struct thread *td;
2790 	register struct chflags_args /* {
2791 		char *path;
2792 		int flags;
2793 	} */ *uap;
2794 {
2795 	int error;
2796 	struct nameidata nd;
2797 	int vfslocked;
2798 
2799 	AUDIT_ARG_FFLAGS(uap->flags);
2800 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
2801 	    uap->path, td);
2802 	if ((error = namei(&nd)) != 0)
2803 		return (error);
2804 	NDFREE(&nd, NDF_ONLY_PNBUF);
2805 	vfslocked = NDHASGIANT(&nd);
2806 	error = setfflags(td, nd.ni_vp, uap->flags);
2807 	vrele(nd.ni_vp);
2808 	VFS_UNLOCK_GIANT(vfslocked);
2809 	return (error);
2810 }
2811 
2812 /*
2813  * Same as chflags() but doesn't follow symlinks.
2814  */
2815 int
2816 sys_lchflags(td, uap)
2817 	struct thread *td;
2818 	register struct lchflags_args /* {
2819 		char *path;
2820 		int flags;
2821 	} */ *uap;
2822 {
2823 	int error;
2824 	struct nameidata nd;
2825 	int vfslocked;
2826 
2827 	AUDIT_ARG_FFLAGS(uap->flags);
2828 	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
2829 	    uap->path, td);
2830 	if ((error = namei(&nd)) != 0)
2831 		return (error);
2832 	vfslocked = NDHASGIANT(&nd);
2833 	NDFREE(&nd, NDF_ONLY_PNBUF);
2834 	error = setfflags(td, nd.ni_vp, uap->flags);
2835 	vrele(nd.ni_vp);
2836 	VFS_UNLOCK_GIANT(vfslocked);
2837 	return (error);
2838 }
2839 
2840 /*
2841  * Change flags of a file given a file descriptor.
2842  */
2843 #ifndef _SYS_SYSPROTO_H_
2844 struct fchflags_args {
2845 	int	fd;
2846 	int	flags;
2847 };
2848 #endif
2849 int
2850 sys_fchflags(td, uap)
2851 	struct thread *td;
2852 	register struct fchflags_args /* {
2853 		int fd;
2854 		int flags;
2855 	} */ *uap;
2856 {
2857 	struct file *fp;
2858 	int vfslocked;
2859 	int error;
2860 
2861 	AUDIT_ARG_FD(uap->fd);
2862 	AUDIT_ARG_FFLAGS(uap->flags);
2863 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_FCHFLAGS,
2864 	    &fp)) != 0)
2865 		return (error);
2866 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
2867 #ifdef AUDIT
2868 	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2869 	AUDIT_ARG_VNODE1(fp->f_vnode);
2870 	VOP_UNLOCK(fp->f_vnode, 0);
2871 #endif
2872 	error = setfflags(td, fp->f_vnode, uap->flags);
2873 	VFS_UNLOCK_GIANT(vfslocked);
2874 	fdrop(fp, td);
2875 	return (error);
2876 }
2877 
2878 /*
2879  * Common implementation code for chmod(), lchmod() and fchmod().
2880  */
2881 int
2882 setfmode(td, cred, vp, mode)
2883 	struct thread *td;
2884 	struct ucred *cred;
2885 	struct vnode *vp;
2886 	int mode;
2887 {
2888 	int error;
2889 	struct mount *mp;
2890 	struct vattr vattr;
2891 
2892 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2893 		return (error);
2894 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2895 	VATTR_NULL(&vattr);
2896 	vattr.va_mode = mode & ALLPERMS;
2897 #ifdef MAC
2898 	error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
2899 	if (error == 0)
2900 #endif
2901 		error = VOP_SETATTR(vp, &vattr, cred);
2902 	VOP_UNLOCK(vp, 0);
2903 	vn_finished_write(mp);
2904 	return (error);
2905 }
2906 
2907 /*
2908  * Change mode of a file given path name.
2909  */
2910 #ifndef _SYS_SYSPROTO_H_
2911 struct chmod_args {
2912 	char	*path;
2913 	int	mode;
2914 };
2915 #endif
2916 int
2917 sys_chmod(td, uap)
2918 	struct thread *td;
2919 	register struct chmod_args /* {
2920 		char *path;
2921 		int mode;
2922 	} */ *uap;
2923 {
2924 
2925 	return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode));
2926 }
2927 
2928 #ifndef _SYS_SYSPROTO_H_
2929 struct fchmodat_args {
2930 	int	dirfd;
2931 	char	*path;
2932 	mode_t	mode;
2933 	int	flag;
2934 }
2935 #endif
2936 int
2937 sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
2938 {
2939 	int flag = uap->flag;
2940 	int fd = uap->fd;
2941 	char *path = uap->path;
2942 	mode_t mode = uap->mode;
2943 
2944 	if (flag & ~AT_SYMLINK_NOFOLLOW)
2945 		return (EINVAL);
2946 
2947 	return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
2948 }
2949 
2950 int
2951 kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode)
2952 {
2953 
2954 	return (kern_fchmodat(td, AT_FDCWD, path, pathseg, mode, 0));
2955 }
2956 
2957 /*
2958  * Change mode of a file given path name (don't follow links.)
2959  */
2960 #ifndef _SYS_SYSPROTO_H_
2961 struct lchmod_args {
2962 	char	*path;
2963 	int	mode;
2964 };
2965 #endif
2966 int
2967 sys_lchmod(td, uap)
2968 	struct thread *td;
2969 	register struct lchmod_args /* {
2970 		char *path;
2971 		int mode;
2972 	} */ *uap;
2973 {
2974 
2975 	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2976 	    uap->mode, AT_SYMLINK_NOFOLLOW));
2977 }
2978 
2979 
2980 int
2981 kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2982     mode_t mode, int flag)
2983 {
2984 	int error;
2985 	struct nameidata nd;
2986 	int vfslocked;
2987 	int follow;
2988 
2989 	AUDIT_ARG_MODE(mode);
2990 	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2991 	NDINIT_ATRIGHTS(&nd, LOOKUP,  follow | MPSAFE | AUDITVNODE1, pathseg,
2992 	    path, fd, CAP_FCHMOD, td);
2993 	if ((error = namei(&nd)) != 0)
2994 		return (error);
2995 	vfslocked = NDHASGIANT(&nd);
2996 	NDFREE(&nd, NDF_ONLY_PNBUF);
2997 	error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
2998 	vrele(nd.ni_vp);
2999 	VFS_UNLOCK_GIANT(vfslocked);
3000 	return (error);
3001 }
3002 
3003 /*
3004  * Change mode of a file given a file descriptor.
3005  */
3006 #ifndef _SYS_SYSPROTO_H_
3007 struct fchmod_args {
3008 	int	fd;
3009 	int	mode;
3010 };
3011 #endif
3012 int
3013 sys_fchmod(struct thread *td, struct fchmod_args *uap)
3014 {
3015 	struct file *fp;
3016 	int error;
3017 
3018 	AUDIT_ARG_FD(uap->fd);
3019 	AUDIT_ARG_MODE(uap->mode);
3020 
3021 	error = fget(td, uap->fd, CAP_FCHMOD, &fp);
3022 	if (error != 0)
3023 		return (error);
3024 	error = fo_chmod(fp, uap->mode, td->td_ucred, td);
3025 	fdrop(fp, td);
3026 	return (error);
3027 }
3028 
3029 /*
3030  * Common implementation for chown(), lchown(), and fchown()
3031  */
3032 int
3033 setfown(td, cred, vp, uid, gid)
3034 	struct thread *td;
3035 	struct ucred *cred;
3036 	struct vnode *vp;
3037 	uid_t uid;
3038 	gid_t gid;
3039 {
3040 	int error;
3041 	struct mount *mp;
3042 	struct vattr vattr;
3043 
3044 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3045 		return (error);
3046 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3047 	VATTR_NULL(&vattr);
3048 	vattr.va_uid = uid;
3049 	vattr.va_gid = gid;
3050 #ifdef MAC
3051 	error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
3052 	    vattr.va_gid);
3053 	if (error == 0)
3054 #endif
3055 		error = VOP_SETATTR(vp, &vattr, cred);
3056 	VOP_UNLOCK(vp, 0);
3057 	vn_finished_write(mp);
3058 	return (error);
3059 }
3060 
3061 /*
3062  * Set ownership given a path name.
3063  */
3064 #ifndef _SYS_SYSPROTO_H_
3065 struct chown_args {
3066 	char	*path;
3067 	int	uid;
3068 	int	gid;
3069 };
3070 #endif
3071 int
3072 sys_chown(td, uap)
3073 	struct thread *td;
3074 	register struct chown_args /* {
3075 		char *path;
3076 		int uid;
3077 		int gid;
3078 	} */ *uap;
3079 {
3080 
3081 	return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
3082 }
3083 
3084 #ifndef _SYS_SYSPROTO_H_
3085 struct fchownat_args {
3086 	int fd;
3087 	const char * path;
3088 	uid_t uid;
3089 	gid_t gid;
3090 	int flag;
3091 };
3092 #endif
3093 int
3094 sys_fchownat(struct thread *td, struct fchownat_args *uap)
3095 {
3096 	int flag;
3097 
3098 	flag = uap->flag;
3099 	if (flag & ~AT_SYMLINK_NOFOLLOW)
3100 		return (EINVAL);
3101 
3102 	return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
3103 	    uap->gid, uap->flag));
3104 }
3105 
3106 int
3107 kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
3108     int gid)
3109 {
3110 
3111 	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid, 0));
3112 }
3113 
3114 int
3115 kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3116     int uid, int gid, int flag)
3117 {
3118 	struct nameidata nd;
3119 	int error, vfslocked, follow;
3120 
3121 	AUDIT_ARG_OWNER(uid, gid);
3122 	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
3123 	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, pathseg,
3124 	    path, fd, CAP_FCHOWN, td);
3125 
3126 	if ((error = namei(&nd)) != 0)
3127 		return (error);
3128 	vfslocked = NDHASGIANT(&nd);
3129 	NDFREE(&nd, NDF_ONLY_PNBUF);
3130 	error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
3131 	vrele(nd.ni_vp);
3132 	VFS_UNLOCK_GIANT(vfslocked);
3133 	return (error);
3134 }
3135 
3136 /*
3137  * Set ownership given a path name, do not cross symlinks.
3138  */
3139 #ifndef _SYS_SYSPROTO_H_
3140 struct lchown_args {
3141 	char	*path;
3142 	int	uid;
3143 	int	gid;
3144 };
3145 #endif
3146 int
3147 sys_lchown(td, uap)
3148 	struct thread *td;
3149 	register struct lchown_args /* {
3150 		char *path;
3151 		int uid;
3152 		int gid;
3153 	} */ *uap;
3154 {
3155 
3156 	return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
3157 }
3158 
3159 int
3160 kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
3161     int gid)
3162 {
3163 
3164 	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid,
3165 	    AT_SYMLINK_NOFOLLOW));
3166 }
3167 
3168 /*
3169  * Set ownership given a file descriptor.
3170  */
3171 #ifndef _SYS_SYSPROTO_H_
3172 struct fchown_args {
3173 	int	fd;
3174 	int	uid;
3175 	int	gid;
3176 };
3177 #endif
3178 int
3179 sys_fchown(td, uap)
3180 	struct thread *td;
3181 	register struct fchown_args /* {
3182 		int fd;
3183 		int uid;
3184 		int gid;
3185 	} */ *uap;
3186 {
3187 	struct file *fp;
3188 	int error;
3189 
3190 	AUDIT_ARG_FD(uap->fd);
3191 	AUDIT_ARG_OWNER(uap->uid, uap->gid);
3192 	error = fget(td, uap->fd, CAP_FCHOWN, &fp);
3193 	if (error != 0)
3194 		return (error);
3195 	error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
3196 	fdrop(fp, td);
3197 	return (error);
3198 }
3199 
3200 /*
3201  * Common implementation code for utimes(), lutimes(), and futimes().
3202  */
3203 static int
3204 getutimes(usrtvp, tvpseg, tsp)
3205 	const struct timeval *usrtvp;
3206 	enum uio_seg tvpseg;
3207 	struct timespec *tsp;
3208 {
3209 	struct timeval tv[2];
3210 	const struct timeval *tvp;
3211 	int error;
3212 
3213 	if (usrtvp == NULL) {
3214 		vfs_timestamp(&tsp[0]);
3215 		tsp[1] = tsp[0];
3216 	} else {
3217 		if (tvpseg == UIO_SYSSPACE) {
3218 			tvp = usrtvp;
3219 		} else {
3220 			if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
3221 				return (error);
3222 			tvp = tv;
3223 		}
3224 
3225 		if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
3226 		    tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
3227 			return (EINVAL);
3228 		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3229 		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3230 	}
3231 	return (0);
3232 }
3233 
3234 /*
3235  * Common implementation code for utimes(), lutimes(), and futimes().
3236  */
3237 static int
3238 setutimes(td, vp, ts, numtimes, nullflag)
3239 	struct thread *td;
3240 	struct vnode *vp;
3241 	const struct timespec *ts;
3242 	int numtimes;
3243 	int nullflag;
3244 {
3245 	int error, setbirthtime;
3246 	struct mount *mp;
3247 	struct vattr vattr;
3248 
3249 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3250 		return (error);
3251 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3252 	setbirthtime = 0;
3253 	if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
3254 	    timespeccmp(&ts[1], &vattr.va_birthtime, < ))
3255 		setbirthtime = 1;
3256 	VATTR_NULL(&vattr);
3257 	vattr.va_atime = ts[0];
3258 	vattr.va_mtime = ts[1];
3259 	if (setbirthtime)
3260 		vattr.va_birthtime = ts[1];
3261 	if (numtimes > 2)
3262 		vattr.va_birthtime = ts[2];
3263 	if (nullflag)
3264 		vattr.va_vaflags |= VA_UTIMES_NULL;
3265 #ifdef MAC
3266 	error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
3267 	    vattr.va_mtime);
3268 #endif
3269 	if (error == 0)
3270 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3271 	VOP_UNLOCK(vp, 0);
3272 	vn_finished_write(mp);
3273 	return (error);
3274 }
3275 
3276 /*
3277  * Set the access and modification times of a file.
3278  */
3279 #ifndef _SYS_SYSPROTO_H_
3280 struct utimes_args {
3281 	char	*path;
3282 	struct	timeval *tptr;
3283 };
3284 #endif
3285 int
3286 sys_utimes(td, uap)
3287 	struct thread *td;
3288 	register struct utimes_args /* {
3289 		char *path;
3290 		struct timeval *tptr;
3291 	} */ *uap;
3292 {
3293 
3294 	return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3295 	    UIO_USERSPACE));
3296 }
3297 
3298 #ifndef _SYS_SYSPROTO_H_
3299 struct futimesat_args {
3300 	int fd;
3301 	const char * path;
3302 	const struct timeval * times;
3303 };
3304 #endif
3305 int
3306 sys_futimesat(struct thread *td, struct futimesat_args *uap)
3307 {
3308 
3309 	return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
3310 	    uap->times, UIO_USERSPACE));
3311 }
3312 
3313 int
3314 kern_utimes(struct thread *td, char *path, enum uio_seg pathseg,
3315     struct timeval *tptr, enum uio_seg tptrseg)
3316 {
3317 
3318 	return (kern_utimesat(td, AT_FDCWD, path, pathseg, tptr, tptrseg));
3319 }
3320 
3321 int
3322 kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3323     struct timeval *tptr, enum uio_seg tptrseg)
3324 {
3325 	struct nameidata nd;
3326 	struct timespec ts[2];
3327 	int error, vfslocked;
3328 
3329 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3330 		return (error);
3331 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg,
3332 	    path, fd, CAP_FUTIMES, td);
3333 
3334 	if ((error = namei(&nd)) != 0)
3335 		return (error);
3336 	vfslocked = NDHASGIANT(&nd);
3337 	NDFREE(&nd, NDF_ONLY_PNBUF);
3338 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3339 	vrele(nd.ni_vp);
3340 	VFS_UNLOCK_GIANT(vfslocked);
3341 	return (error);
3342 }
3343 
3344 /*
3345  * Set the access and modification times of a file.
3346  */
3347 #ifndef _SYS_SYSPROTO_H_
3348 struct lutimes_args {
3349 	char	*path;
3350 	struct	timeval *tptr;
3351 };
3352 #endif
3353 int
3354 sys_lutimes(td, uap)
3355 	struct thread *td;
3356 	register struct lutimes_args /* {
3357 		char *path;
3358 		struct timeval *tptr;
3359 	} */ *uap;
3360 {
3361 
3362 	return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3363 	    UIO_USERSPACE));
3364 }
3365 
3366 int
3367 kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
3368     struct timeval *tptr, enum uio_seg tptrseg)
3369 {
3370 	struct timespec ts[2];
3371 	int error;
3372 	struct nameidata nd;
3373 	int vfslocked;
3374 
3375 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3376 		return (error);
3377 	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
3378 	if ((error = namei(&nd)) != 0)
3379 		return (error);
3380 	vfslocked = NDHASGIANT(&nd);
3381 	NDFREE(&nd, NDF_ONLY_PNBUF);
3382 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3383 	vrele(nd.ni_vp);
3384 	VFS_UNLOCK_GIANT(vfslocked);
3385 	return (error);
3386 }
3387 
3388 /*
3389  * Set the access and modification times of a file.
3390  */
3391 #ifndef _SYS_SYSPROTO_H_
3392 struct futimes_args {
3393 	int	fd;
3394 	struct	timeval *tptr;
3395 };
3396 #endif
3397 int
3398 sys_futimes(td, uap)
3399 	struct thread *td;
3400 	register struct futimes_args /* {
3401 		int  fd;
3402 		struct timeval *tptr;
3403 	} */ *uap;
3404 {
3405 
3406 	return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
3407 }
3408 
3409 int
3410 kern_futimes(struct thread *td, int fd, struct timeval *tptr,
3411     enum uio_seg tptrseg)
3412 {
3413 	struct timespec ts[2];
3414 	struct file *fp;
3415 	int vfslocked;
3416 	int error;
3417 
3418 	AUDIT_ARG_FD(fd);
3419 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3420 		return (error);
3421 	if ((error = getvnode(td->td_proc->p_fd, fd, CAP_FUTIMES, &fp))
3422 	    != 0)
3423 		return (error);
3424 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
3425 #ifdef AUDIT
3426 	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3427 	AUDIT_ARG_VNODE1(fp->f_vnode);
3428 	VOP_UNLOCK(fp->f_vnode, 0);
3429 #endif
3430 	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
3431 	VFS_UNLOCK_GIANT(vfslocked);
3432 	fdrop(fp, td);
3433 	return (error);
3434 }
3435 
3436 /*
3437  * Truncate a file given its path name.
3438  */
3439 #ifndef _SYS_SYSPROTO_H_
3440 struct truncate_args {
3441 	char	*path;
3442 	int	pad;
3443 	off_t	length;
3444 };
3445 #endif
3446 int
3447 sys_truncate(td, uap)
3448 	struct thread *td;
3449 	register struct truncate_args /* {
3450 		char *path;
3451 		int pad;
3452 		off_t length;
3453 	} */ *uap;
3454 {
3455 
3456 	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3457 }
3458 
3459 int
3460 kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
3461 {
3462 	struct mount *mp;
3463 	struct vnode *vp;
3464 	struct vattr vattr;
3465 	int error;
3466 	struct nameidata nd;
3467 	int vfslocked;
3468 
3469 	if (length < 0)
3470 		return(EINVAL);
3471 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
3472 	if ((error = namei(&nd)) != 0)
3473 		return (error);
3474 	vfslocked = NDHASGIANT(&nd);
3475 	vp = nd.ni_vp;
3476 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
3477 		vrele(vp);
3478 		VFS_UNLOCK_GIANT(vfslocked);
3479 		return (error);
3480 	}
3481 	NDFREE(&nd, NDF_ONLY_PNBUF);
3482 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3483 	if (vp->v_type == VDIR)
3484 		error = EISDIR;
3485 #ifdef MAC
3486 	else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
3487 	}
3488 #endif
3489 	else if ((error = vn_writechk(vp)) == 0 &&
3490 	    (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
3491 		VATTR_NULL(&vattr);
3492 		vattr.va_size = length;
3493 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3494 	}
3495 	vput(vp);
3496 	vn_finished_write(mp);
3497 	VFS_UNLOCK_GIANT(vfslocked);
3498 	return (error);
3499 }
3500 
#if defined(COMPAT_43)
/*
 * Compatibility truncate taking a `long' length: repackage the arguments
 * as a struct truncate_args and hand them to sys_truncate().
 */
#ifndef _SYS_SYSPROTO_H_
struct otruncate_args {
	char	*path;
	long	length;
};
#endif
int
otruncate(struct thread *td, struct otruncate_args *uap)
{
	struct truncate_args targs;
	int error;

	targs.path = uap->path;
	targs.length = uap->length;
	error = sys_truncate(td, &targs);
	return (error);
}
#endif /* COMPAT_43 */
3530 
3531 /* Versions with the pad argument */
3532 int
3533 freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
3534 {
3535 	struct truncate_args ouap;
3536 
3537 	ouap.path = uap->path;
3538 	ouap.length = uap->length;
3539 	return (sys_truncate(td, &ouap));
3540 }
3541 
3542 int
3543 freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
3544 {
3545 	struct ftruncate_args ouap;
3546 
3547 	ouap.fd = uap->fd;
3548 	ouap.length = uap->length;
3549 	return (sys_ftruncate(td, &ouap));
3550 }
3551 
3552 /*
3553  * Sync an open file.
3554  */
3555 #ifndef _SYS_SYSPROTO_H_
3556 struct fsync_args {
3557 	int	fd;
3558 };
3559 #endif
3560 int
3561 sys_fsync(td, uap)
3562 	struct thread *td;
3563 	struct fsync_args /* {
3564 		int fd;
3565 	} */ *uap;
3566 {
3567 	struct vnode *vp;
3568 	struct mount *mp;
3569 	struct file *fp;
3570 	int vfslocked;
3571 	int error, lock_flags;
3572 
3573 	AUDIT_ARG_FD(uap->fd);
3574 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_FSYNC,
3575 	    &fp)) != 0)
3576 		return (error);
3577 	vp = fp->f_vnode;
3578 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
3579 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3580 		goto drop;
3581 	if (MNT_SHARED_WRITES(mp) ||
3582 	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
3583 		lock_flags = LK_SHARED;
3584 	} else {
3585 		lock_flags = LK_EXCLUSIVE;
3586 	}
3587 	vn_lock(vp, lock_flags | LK_RETRY);
3588 	AUDIT_ARG_VNODE1(vp);
3589 	if (vp->v_object != NULL) {
3590 		VM_OBJECT_LOCK(vp->v_object);
3591 		vm_object_page_clean(vp->v_object, 0, 0, 0);
3592 		VM_OBJECT_UNLOCK(vp->v_object);
3593 	}
3594 	error = VOP_FSYNC(vp, MNT_WAIT, td);
3595 
3596 	VOP_UNLOCK(vp, 0);
3597 	vn_finished_write(mp);
3598 drop:
3599 	VFS_UNLOCK_GIANT(vfslocked);
3600 	fdrop(fp, td);
3601 	return (error);
3602 }
3603 
3604 /*
3605  * Rename files.  Source and destination must either both be directories, or
3606  * both not be directories.  If target is a directory, it must be empty.
3607  */
3608 #ifndef _SYS_SYSPROTO_H_
3609 struct rename_args {
3610 	char	*from;
3611 	char	*to;
3612 };
3613 #endif
3614 int
3615 sys_rename(td, uap)
3616 	struct thread *td;
3617 	register struct rename_args /* {
3618 		char *from;
3619 		char *to;
3620 	} */ *uap;
3621 {
3622 
3623 	return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE));
3624 }
3625 
3626 #ifndef _SYS_SYSPROTO_H_
3627 struct renameat_args {
3628 	int	oldfd;
3629 	char	*old;
3630 	int	newfd;
3631 	char	*new;
3632 };
3633 #endif
3634 int
3635 sys_renameat(struct thread *td, struct renameat_args *uap)
3636 {
3637 
3638 	return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
3639 	    UIO_USERSPACE));
3640 }
3641 
3642 int
3643 kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg)
3644 {
3645 
3646 	return (kern_renameat(td, AT_FDCWD, from, AT_FDCWD, to, pathseg));
3647 }
3648 
/*
 * Rename the object named by `old' (looked up relative to `oldfd') to `new'
 * (looked up relative to `newfd').
 *
 * Returns 0 on success or an errno value.  Internally, error == -1 is a
 * sentinel meaning "source and target are the same vnode": VOP_RENAME() is
 * skipped, the references are released, and 0 is returned to the caller.
 */
int
kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
    enum uio_seg pathseg)
{
	struct mount *mp = NULL;
	struct vnode *tvp, *fvp, *tdvp;
	struct nameidata fromnd, tond;
	int tvfslocked;
	int fvfslocked;
	int error;

	bwillwrite();
	/*
	 * With MAC the source lookup returns parent and leaf locked
	 * (presumably so the rename_from check below sees locked vnodes);
	 * without MAC only references are requested.
	 */
#ifdef MAC
	NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
	    MPSAFE | AUDITVNODE1, pathseg, old, oldfd, CAP_DELETE, td);
#else
	NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | MPSAFE |
	    AUDITVNODE1, pathseg, old, oldfd, CAP_DELETE, td);
#endif

	if ((error = namei(&fromnd)) != 0)
		return (error);
	fvfslocked = NDHASGIANT(&fromnd);
	/* Initialized so that the unlock at out1 is valid on early exits. */
	tvfslocked = 0;
#ifdef MAC
	error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
	    fromnd.ni_vp, &fromnd.ni_cnd);
	/* Drop the locks taken by the lookup; the references are kept. */
	VOP_UNLOCK(fromnd.ni_dvp, 0);
	if (fromnd.ni_dvp != fromnd.ni_vp)
		VOP_UNLOCK(fromnd.ni_vp, 0);
#endif
	fvp = fromnd.ni_vp;
	if (error == 0)
		error = vn_start_write(fvp, &mp, V_WAIT | PCATCH);
	if (error != 0) {
		/* Either the MAC check or vn_start_write() failed. */
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
		goto out1;
	}
	NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
	    SAVESTART | MPSAFE | AUDITVNODE2, pathseg, new, newfd, CAP_CREATE,
	    td);
	if (fromnd.ni_vp->v_type == VDIR)
		tond.ni_cnd.cn_flags |= WILLBEDIR;
	if ((error = namei(&tond)) != 0) {
		/* Translate error code for rename("dir1", "dir2/."). */
		if (error == EISDIR && fvp->v_type == VDIR)
			error = EINVAL;
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
		vn_finished_write(mp);
		goto out1;
	}
	tvfslocked = NDHASGIANT(&tond);
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;
	/* An existing target must be of the same kind as the source. */
	if (tvp != NULL) {
		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
			error = ENOTDIR;
			goto out;
		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
			error = EISDIR;
			goto out;
		}
	}
	/* The source may not be the directory that will hold the target. */
	if (fvp == tdvp) {
		error = EINVAL;
		goto out;
	}
	/*
	 * If the source is the same as the destination (that is, if they
	 * are links to the same vnode), then there is nothing to do.
	 */
	if (fvp == tvp)
		error = -1;
#ifdef MAC
	else
		error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
		    tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
#endif
out:
	if (!error) {
		/*
		 * No vput()/vrele() of the four vnodes follows VOP_RENAME()
		 * here; presumably the VOP releases them itself (confirm
		 * against VOP_RENAME(9)).
		 */
		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
				   tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		NDFREE(&tond, NDF_ONLY_PNBUF);
	} else {
		/* Rename not attempted: release everything by hand. */
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		NDFREE(&tond, NDF_ONLY_PNBUF);
		if (tvp)
			vput(tvp);
		/* When tdvp == tvp the vput() above already unlocked it. */
		if (tdvp == tvp)
			vrele(tdvp);
		else
			vput(tdvp);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
	}
	vrele(tond.ni_startdir);
	vn_finished_write(mp);
out1:
	if (fromnd.ni_startdir)
		vrele(fromnd.ni_startdir);
	VFS_UNLOCK_GIANT(fvfslocked);
	VFS_UNLOCK_GIANT(tvfslocked);
	/* Map the "same vnode" sentinel to success. */
	if (error == -1)
		return (0);
	return (error);
}
3760 
3761 /*
3762  * Make a directory file.
3763  */
3764 #ifndef _SYS_SYSPROTO_H_
3765 struct mkdir_args {
3766 	char	*path;
3767 	int	mode;
3768 };
3769 #endif
3770 int
3771 sys_mkdir(td, uap)
3772 	struct thread *td;
3773 	register struct mkdir_args /* {
3774 		char *path;
3775 		int mode;
3776 	} */ *uap;
3777 {
3778 
3779 	return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode));
3780 }
3781 
3782 #ifndef _SYS_SYSPROTO_H_
3783 struct mkdirat_args {
3784 	int	fd;
3785 	char	*path;
3786 	mode_t	mode;
3787 };
3788 #endif
3789 int
3790 sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
3791 {
3792 
3793 	return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
3794 }
3795 
3796 int
3797 kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode)
3798 {
3799 
3800 	return (kern_mkdirat(td, AT_FDCWD, path, segflg, mode));
3801 }
3802 
3803 int
3804 kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
3805     int mode)
3806 {
3807 	struct mount *mp;
3808 	struct vnode *vp;
3809 	struct vattr vattr;
3810 	int error;
3811 	struct nameidata nd;
3812 	int vfslocked;
3813 
3814 	AUDIT_ARG_MODE(mode);
3815 restart:
3816 	bwillwrite();
3817 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE |
3818 	    AUDITVNODE1, segflg, path, fd, CAP_MKDIR, td);
3819 	nd.ni_cnd.cn_flags |= WILLBEDIR;
3820 	if ((error = namei(&nd)) != 0)
3821 		return (error);
3822 	vfslocked = NDHASGIANT(&nd);
3823 	vp = nd.ni_vp;
3824 	if (vp != NULL) {
3825 		NDFREE(&nd, NDF_ONLY_PNBUF);
3826 		/*
3827 		 * XXX namei called with LOCKPARENT but not LOCKLEAF has
3828 		 * the strange behaviour of leaving the vnode unlocked
3829 		 * if the target is the same vnode as the parent.
3830 		 */
3831 		if (vp == nd.ni_dvp)
3832 			vrele(nd.ni_dvp);
3833 		else
3834 			vput(nd.ni_dvp);
3835 		vrele(vp);
3836 		VFS_UNLOCK_GIANT(vfslocked);
3837 		return (EEXIST);
3838 	}
3839 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3840 		NDFREE(&nd, NDF_ONLY_PNBUF);
3841 		vput(nd.ni_dvp);
3842 		VFS_UNLOCK_GIANT(vfslocked);
3843 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3844 			return (error);
3845 		goto restart;
3846 	}
3847 	VATTR_NULL(&vattr);
3848 	vattr.va_type = VDIR;
3849 	vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
3850 #ifdef MAC
3851 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
3852 	    &vattr);
3853 	if (error)
3854 		goto out;
3855 #endif
3856 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
3857 #ifdef MAC
3858 out:
3859 #endif
3860 	NDFREE(&nd, NDF_ONLY_PNBUF);
3861 	vput(nd.ni_dvp);
3862 	if (!error)
3863 		vput(nd.ni_vp);
3864 	vn_finished_write(mp);
3865 	VFS_UNLOCK_GIANT(vfslocked);
3866 	return (error);
3867 }
3868 
3869 /*
3870  * Remove a directory file.
3871  */
3872 #ifndef _SYS_SYSPROTO_H_
3873 struct rmdir_args {
3874 	char	*path;
3875 };
3876 #endif
3877 int
3878 sys_rmdir(td, uap)
3879 	struct thread *td;
3880 	struct rmdir_args /* {
3881 		char *path;
3882 	} */ *uap;
3883 {
3884 
3885 	return (kern_rmdir(td, uap->path, UIO_USERSPACE));
3886 }
3887 
3888 int
3889 kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg)
3890 {
3891 
3892 	return (kern_rmdirat(td, AT_FDCWD, path, pathseg));
3893 }
3894 
3895 int
3896 kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
3897 {
3898 	struct mount *mp;
3899 	struct vnode *vp;
3900 	int error;
3901 	struct nameidata nd;
3902 	int vfslocked;
3903 
3904 restart:
3905 	bwillwrite();
3906 	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE |
3907 	    AUDITVNODE1, pathseg, path, fd, CAP_RMDIR, td);
3908 	if ((error = namei(&nd)) != 0)
3909 		return (error);
3910 	vfslocked = NDHASGIANT(&nd);
3911 	vp = nd.ni_vp;
3912 	if (vp->v_type != VDIR) {
3913 		error = ENOTDIR;
3914 		goto out;
3915 	}
3916 	/*
3917 	 * No rmdir "." please.
3918 	 */
3919 	if (nd.ni_dvp == vp) {
3920 		error = EINVAL;
3921 		goto out;
3922 	}
3923 	/*
3924 	 * The root of a mounted filesystem cannot be deleted.
3925 	 */
3926 	if (vp->v_vflag & VV_ROOT) {
3927 		error = EBUSY;
3928 		goto out;
3929 	}
3930 #ifdef MAC
3931 	error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
3932 	    &nd.ni_cnd);
3933 	if (error)
3934 		goto out;
3935 #endif
3936 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3937 		NDFREE(&nd, NDF_ONLY_PNBUF);
3938 		vput(vp);
3939 		if (nd.ni_dvp == vp)
3940 			vrele(nd.ni_dvp);
3941 		else
3942 			vput(nd.ni_dvp);
3943 		VFS_UNLOCK_GIANT(vfslocked);
3944 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3945 			return (error);
3946 		goto restart;
3947 	}
3948 	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
3949 	vn_finished_write(mp);
3950 out:
3951 	NDFREE(&nd, NDF_ONLY_PNBUF);
3952 	vput(vp);
3953 	if (nd.ni_dvp == vp)
3954 		vrele(nd.ni_dvp);
3955 	else
3956 		vput(nd.ni_dvp);
3957 	VFS_UNLOCK_GIANT(vfslocked);
3958 	return (error);
3959 }
3960 
3961 #ifdef COMPAT_43
3962 /*
3963  * Read a block of directory entries in a filesystem independent format.
3964  */
3965 #ifndef _SYS_SYSPROTO_H_
3966 struct ogetdirentries_args {
3967 	int	fd;
3968 	char	*buf;
3969 	u_int	count;
3970 	long	*basep;
3971 };
3972 #endif
3973 int
3974 ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
3975 {
3976 	long loff;
3977 	int error;
3978 
3979 	error = kern_ogetdirentries(td, uap, &loff);
3980 	if (error == 0)
3981 		error = copyout(&loff, uap->basep, sizeof(long));
3982 	return (error);
3983 }
3984 
3985 int
3986 kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
3987     long *ploff)
3988 {
3989 	struct vnode *vp;
3990 	struct file *fp;
3991 	struct uio auio, kuio;
3992 	struct iovec aiov, kiov;
3993 	struct dirent *dp, *edp;
3994 	caddr_t dirbuf;
3995 	int error, eofflag, readcnt, vfslocked;
3996 	long loff;
3997 
3998 	/* XXX arbitrary sanity limit on `count'. */
3999 	if (uap->count > 64 * 1024)
4000 		return (EINVAL);
4001 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_READ,
4002 	    &fp)) != 0)
4003 		return (error);
4004 	if ((fp->f_flag & FREAD) == 0) {
4005 		fdrop(fp, td);
4006 		return (EBADF);
4007 	}
4008 	vp = fp->f_vnode;
4009 unionread:
4010 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
4011 	if (vp->v_type != VDIR) {
4012 		VFS_UNLOCK_GIANT(vfslocked);
4013 		fdrop(fp, td);
4014 		return (EINVAL);
4015 	}
4016 	aiov.iov_base = uap->buf;
4017 	aiov.iov_len = uap->count;
4018 	auio.uio_iov = &aiov;
4019 	auio.uio_iovcnt = 1;
4020 	auio.uio_rw = UIO_READ;
4021 	auio.uio_segflg = UIO_USERSPACE;
4022 	auio.uio_td = td;
4023 	auio.uio_resid = uap->count;
4024 	vn_lock(vp, LK_SHARED | LK_RETRY);
4025 	loff = auio.uio_offset = fp->f_offset;
4026 #ifdef MAC
4027 	error = mac_vnode_check_readdir(td->td_ucred, vp);
4028 	if (error) {
4029 		VOP_UNLOCK(vp, 0);
4030 		VFS_UNLOCK_GIANT(vfslocked);
4031 		fdrop(fp, td);
4032 		return (error);
4033 	}
4034 #endif
4035 #	if (BYTE_ORDER != LITTLE_ENDIAN)
4036 		if (vp->v_mount->mnt_maxsymlinklen <= 0) {
4037 			error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
4038 			    NULL, NULL);
4039 			fp->f_offset = auio.uio_offset;
4040 		} else
4041 #	endif
4042 	{
4043 		kuio = auio;
4044 		kuio.uio_iov = &kiov;
4045 		kuio.uio_segflg = UIO_SYSSPACE;
4046 		kiov.iov_len = uap->count;
4047 		dirbuf = malloc(uap->count, M_TEMP, M_WAITOK);
4048 		kiov.iov_base = dirbuf;
4049 		error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
4050 			    NULL, NULL);
4051 		fp->f_offset = kuio.uio_offset;
4052 		if (error == 0) {
4053 			readcnt = uap->count - kuio.uio_resid;
4054 			edp = (struct dirent *)&dirbuf[readcnt];
4055 			for (dp = (struct dirent *)dirbuf; dp < edp; ) {
4056 #				if (BYTE_ORDER == LITTLE_ENDIAN)
4057 					/*
4058 					 * The expected low byte of
4059 					 * dp->d_namlen is our dp->d_type.
4060 					 * The high MBZ byte of dp->d_namlen
4061 					 * is our dp->d_namlen.
4062 					 */
4063 					dp->d_type = dp->d_namlen;
4064 					dp->d_namlen = 0;
4065 #				else
4066 					/*
4067 					 * The dp->d_type is the high byte
4068 					 * of the expected dp->d_namlen,
4069 					 * so must be zero'ed.
4070 					 */
4071 					dp->d_type = 0;
4072 #				endif
4073 				if (dp->d_reclen > 0) {
4074 					dp = (struct dirent *)
4075 					    ((char *)dp + dp->d_reclen);
4076 				} else {
4077 					error = EIO;
4078 					break;
4079 				}
4080 			}
4081 			if (dp >= edp)
4082 				error = uiomove(dirbuf, readcnt, &auio);
4083 		}
4084 		free(dirbuf, M_TEMP);
4085 	}
4086 	if (error) {
4087 		VOP_UNLOCK(vp, 0);
4088 		VFS_UNLOCK_GIANT(vfslocked);
4089 		fdrop(fp, td);
4090 		return (error);
4091 	}
4092 	if (uap->count == auio.uio_resid &&
4093 	    (vp->v_vflag & VV_ROOT) &&
4094 	    (vp->v_mount->mnt_flag & MNT_UNION)) {
4095 		struct vnode *tvp = vp;
4096 		vp = vp->v_mount->mnt_vnodecovered;
4097 		VREF(vp);
4098 		fp->f_vnode = vp;
4099 		fp->f_data = vp;
4100 		fp->f_offset = 0;
4101 		vput(tvp);
4102 		VFS_UNLOCK_GIANT(vfslocked);
4103 		goto unionread;
4104 	}
4105 	VOP_UNLOCK(vp, 0);
4106 	VFS_UNLOCK_GIANT(vfslocked);
4107 	fdrop(fp, td);
4108 	td->td_retval[0] = uap->count - auio.uio_resid;
4109 	if (error == 0)
4110 		*ploff = loff;
4111 	return (error);
4112 }
4113 #endif /* COMPAT_43 */
4114 
4115 /*
4116  * Read a block of directory entries in a filesystem independent format.
4117  */
4118 #ifndef _SYS_SYSPROTO_H_
4119 struct getdirentries_args {
4120 	int	fd;
4121 	char	*buf;
4122 	u_int	count;
4123 	long	*basep;
4124 };
4125 #endif
4126 int
4127 sys_getdirentries(td, uap)
4128 	struct thread *td;
4129 	register struct getdirentries_args /* {
4130 		int fd;
4131 		char *buf;
4132 		u_int count;
4133 		long *basep;
4134 	} */ *uap;
4135 {
4136 	long base;
4137 	int error;
4138 
4139 	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base);
4140 	if (error)
4141 		return (error);
4142 	if (uap->basep != NULL)
4143 		error = copyout(&base, uap->basep, sizeof(long));
4144 	return (error);
4145 }
4146 
4147 int
4148 kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
4149     long *basep)
4150 {
4151 	struct vnode *vp;
4152 	struct file *fp;
4153 	struct uio auio;
4154 	struct iovec aiov;
4155 	int vfslocked;
4156 	long loff;
4157 	int error, eofflag;
4158 
4159 	AUDIT_ARG_FD(fd);
4160 	if (count > IOSIZE_MAX)
4161 		return (EINVAL);
4162 	auio.uio_resid = count;
4163 	if ((error = getvnode(td->td_proc->p_fd, fd, CAP_READ | CAP_SEEK,
4164 	    &fp)) != 0)
4165 		return (error);
4166 	if ((fp->f_flag & FREAD) == 0) {
4167 		fdrop(fp, td);
4168 		return (EBADF);
4169 	}
4170 	vp = fp->f_vnode;
4171 unionread:
4172 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
4173 	if (vp->v_type != VDIR) {
4174 		VFS_UNLOCK_GIANT(vfslocked);
4175 		error = EINVAL;
4176 		goto fail;
4177 	}
4178 	aiov.iov_base = buf;
4179 	aiov.iov_len = count;
4180 	auio.uio_iov = &aiov;
4181 	auio.uio_iovcnt = 1;
4182 	auio.uio_rw = UIO_READ;
4183 	auio.uio_segflg = UIO_USERSPACE;
4184 	auio.uio_td = td;
4185 	vn_lock(vp, LK_SHARED | LK_RETRY);
4186 	AUDIT_ARG_VNODE1(vp);
4187 	loff = auio.uio_offset = fp->f_offset;
4188 #ifdef MAC
4189 	error = mac_vnode_check_readdir(td->td_ucred, vp);
4190 	if (error == 0)
4191 #endif
4192 		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
4193 		    NULL);
4194 	fp->f_offset = auio.uio_offset;
4195 	if (error) {
4196 		VOP_UNLOCK(vp, 0);
4197 		VFS_UNLOCK_GIANT(vfslocked);
4198 		goto fail;
4199 	}
4200 	if (count == auio.uio_resid &&
4201 	    (vp->v_vflag & VV_ROOT) &&
4202 	    (vp->v_mount->mnt_flag & MNT_UNION)) {
4203 		struct vnode *tvp = vp;
4204 		vp = vp->v_mount->mnt_vnodecovered;
4205 		VREF(vp);
4206 		fp->f_vnode = vp;
4207 		fp->f_data = vp;
4208 		fp->f_offset = 0;
4209 		vput(tvp);
4210 		VFS_UNLOCK_GIANT(vfslocked);
4211 		goto unionread;
4212 	}
4213 	VOP_UNLOCK(vp, 0);
4214 	VFS_UNLOCK_GIANT(vfslocked);
4215 	*basep = loff;
4216 	td->td_retval[0] = count - auio.uio_resid;
4217 fail:
4218 	fdrop(fp, td);
4219 	return (error);
4220 }
4221 
4222 #ifndef _SYS_SYSPROTO_H_
4223 struct getdents_args {
4224 	int fd;
4225 	char *buf;
4226 	size_t count;
4227 };
4228 #endif
4229 int
4230 sys_getdents(td, uap)
4231 	struct thread *td;
4232 	register struct getdents_args /* {
4233 		int fd;
4234 		char *buf;
4235 		u_int count;
4236 	} */ *uap;
4237 {
4238 	struct getdirentries_args ap;
4239 	ap.fd = uap->fd;
4240 	ap.buf = uap->buf;
4241 	ap.count = uap->count;
4242 	ap.basep = NULL;
4243 	return (sys_getdirentries(td, &ap));
4244 }
4245 
4246 /*
4247  * Set the mode mask for creation of filesystem nodes.
4248  */
4249 #ifndef _SYS_SYSPROTO_H_
4250 struct umask_args {
4251 	int	newmask;
4252 };
4253 #endif
4254 int
4255 sys_umask(td, uap)
4256 	struct thread *td;
4257 	struct umask_args /* {
4258 		int newmask;
4259 	} */ *uap;
4260 {
4261 	register struct filedesc *fdp;
4262 
4263 	FILEDESC_XLOCK(td->td_proc->p_fd);
4264 	fdp = td->td_proc->p_fd;
4265 	td->td_retval[0] = fdp->fd_cmask;
4266 	fdp->fd_cmask = uap->newmask & ALLPERMS;
4267 	FILEDESC_XUNLOCK(td->td_proc->p_fd);
4268 	return (0);
4269 }
4270 
4271 /*
4272  * Void all references to file by ripping underlying filesystem away from
4273  * vnode.
4274  */
4275 #ifndef _SYS_SYSPROTO_H_
4276 struct revoke_args {
4277 	char	*path;
4278 };
4279 #endif
4280 int
4281 sys_revoke(td, uap)
4282 	struct thread *td;
4283 	register struct revoke_args /* {
4284 		char *path;
4285 	} */ *uap;
4286 {
4287 	struct vnode *vp;
4288 	struct vattr vattr;
4289 	int error;
4290 	struct nameidata nd;
4291 	int vfslocked;
4292 
4293 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
4294 	    UIO_USERSPACE, uap->path, td);
4295 	if ((error = namei(&nd)) != 0)
4296 		return (error);
4297 	vfslocked = NDHASGIANT(&nd);
4298 	vp = nd.ni_vp;
4299 	NDFREE(&nd, NDF_ONLY_PNBUF);
4300 	if (vp->v_type != VCHR || vp->v_rdev == NULL) {
4301 		error = EINVAL;
4302 		goto out;
4303 	}
4304 #ifdef MAC
4305 	error = mac_vnode_check_revoke(td->td_ucred, vp);
4306 	if (error)
4307 		goto out;
4308 #endif
4309 	error = VOP_GETATTR(vp, &vattr, td->td_ucred);
4310 	if (error)
4311 		goto out;
4312 	if (td->td_ucred->cr_uid != vattr.va_uid) {
4313 		error = priv_check(td, PRIV_VFS_ADMIN);
4314 		if (error)
4315 			goto out;
4316 	}
4317 	if (vcount(vp) > 1)
4318 		VOP_REVOKE(vp, REVOKEALL);
4319 out:
4320 	vput(vp);
4321 	VFS_UNLOCK_GIANT(vfslocked);
4322 	return (error);
4323 }
4324 
4325 /*
4326  * Convert a user file descriptor to a kernel file entry and check that, if it
4327  * is a capability, the correct rights are present. A reference on the file
4328  * entry is held upon returning.
4329  */
4330 int
4331 getvnode(struct filedesc *fdp, int fd, cap_rights_t rights,
4332     struct file **fpp)
4333 {
4334 	struct file *fp;
4335 #ifdef CAPABILITIES
4336 	struct file *fp_fromcap;
4337 #endif
4338 	int error;
4339 
4340 	error = 0;
4341 	fp = NULL;
4342 	if ((fdp == NULL) || (fp = fget_unlocked(fdp, fd)) == NULL)
4343 		return (EBADF);
4344 #ifdef CAPABILITIES
4345 	/*
4346 	 * If the file descriptor is for a capability, test rights and use the
4347 	 * file descriptor referenced by the capability.
4348 	 */
4349 	error = cap_funwrap(fp, rights, &fp_fromcap);
4350 	if (error) {
4351 		fdrop(fp, curthread);
4352 		return (error);
4353 	}
4354 	if (fp != fp_fromcap) {
4355 		fhold(fp_fromcap);
4356 		fdrop(fp, curthread);
4357 		fp = fp_fromcap;
4358 	}
4359 #endif /* CAPABILITIES */
4360 
4361 	/*
4362 	 * The file could be not of the vnode type, or it may be not
4363 	 * yet fully initialized, in which case the f_vnode pointer
4364 	 * may be set, but f_ops is still badfileops.  E.g.,
4365 	 * devfs_open() transiently create such situation to
4366 	 * facilitate csw d_fdopen().
4367 	 *
4368 	 * Dupfdopen() handling in kern_openat() installs the
4369 	 * half-baked file into the process descriptor table, allowing
4370 	 * other thread to dereference it. Guard against the race by
4371 	 * checking f_ops.
4372 	 */
4373 	if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
4374 		fdrop(fp, curthread);
4375 		return (EINVAL);
4376 	}
4377 	*fpp = fp;
4378 	return (0);
4379 }
4380 
4381 
4382 /*
4383  * Get an (NFS) file handle.
4384  */
4385 #ifndef _SYS_SYSPROTO_H_
4386 struct lgetfh_args {
4387 	char	*fname;
4388 	fhandle_t *fhp;
4389 };
4390 #endif
4391 int
4392 sys_lgetfh(td, uap)
4393 	struct thread *td;
4394 	register struct lgetfh_args *uap;
4395 {
4396 	struct nameidata nd;
4397 	fhandle_t fh;
4398 	register struct vnode *vp;
4399 	int vfslocked;
4400 	int error;
4401 
4402 	error = priv_check(td, PRIV_VFS_GETFH);
4403 	if (error)
4404 		return (error);
4405 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
4406 	    UIO_USERSPACE, uap->fname, td);
4407 	error = namei(&nd);
4408 	if (error)
4409 		return (error);
4410 	vfslocked = NDHASGIANT(&nd);
4411 	NDFREE(&nd, NDF_ONLY_PNBUF);
4412 	vp = nd.ni_vp;
4413 	bzero(&fh, sizeof(fh));
4414 	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4415 	error = VOP_VPTOFH(vp, &fh.fh_fid);
4416 	vput(vp);
4417 	VFS_UNLOCK_GIANT(vfslocked);
4418 	if (error)
4419 		return (error);
4420 	error = copyout(&fh, uap->fhp, sizeof (fh));
4421 	return (error);
4422 }
4423 
4424 #ifndef _SYS_SYSPROTO_H_
4425 struct getfh_args {
4426 	char	*fname;
4427 	fhandle_t *fhp;
4428 };
4429 #endif
4430 int
4431 sys_getfh(td, uap)
4432 	struct thread *td;
4433 	register struct getfh_args *uap;
4434 {
4435 	struct nameidata nd;
4436 	fhandle_t fh;
4437 	register struct vnode *vp;
4438 	int vfslocked;
4439 	int error;
4440 
4441 	error = priv_check(td, PRIV_VFS_GETFH);
4442 	if (error)
4443 		return (error);
4444 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
4445 	    UIO_USERSPACE, uap->fname, td);
4446 	error = namei(&nd);
4447 	if (error)
4448 		return (error);
4449 	vfslocked = NDHASGIANT(&nd);
4450 	NDFREE(&nd, NDF_ONLY_PNBUF);
4451 	vp = nd.ni_vp;
4452 	bzero(&fh, sizeof(fh));
4453 	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4454 	error = VOP_VPTOFH(vp, &fh.fh_fid);
4455 	vput(vp);
4456 	VFS_UNLOCK_GIANT(vfslocked);
4457 	if (error)
4458 		return (error);
4459 	error = copyout(&fh, uap->fhp, sizeof (fh));
4460 	return (error);
4461 }
4462 
4463 /*
4464  * syscall for the rpc.lockd to use to translate a NFS file handle into an
4465  * open descriptor.
4466  *
4467  * warning: do not remove the priv_check() call or this becomes one giant
4468  * security hole.
4469  */
4470 #ifndef _SYS_SYSPROTO_H_
4471 struct fhopen_args {
4472 	const struct fhandle *u_fhp;
4473 	int flags;
4474 };
4475 #endif
4476 int
4477 sys_fhopen(td, uap)
4478 	struct thread *td;
4479 	struct fhopen_args /* {
4480 		const struct fhandle *u_fhp;
4481 		int flags;
4482 	} */ *uap;
4483 {
4484 	struct proc *p = td->td_proc;
4485 	struct mount *mp;
4486 	struct vnode *vp;
4487 	struct fhandle fhp;
4488 	struct vattr vat;
4489 	struct vattr *vap = &vat;
4490 	struct flock lf;
4491 	struct file *fp;
4492 	register struct filedesc *fdp = p->p_fd;
4493 	int fmode, error, type;
4494 	accmode_t accmode;
4495 	struct file *nfp;
4496 	int vfslocked;
4497 	int indx;
4498 
4499 	error = priv_check(td, PRIV_VFS_FHOPEN);
4500 	if (error)
4501 		return (error);
4502 	fmode = FFLAGS(uap->flags);
4503 	/* why not allow a non-read/write open for our lockd? */
4504 	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
4505 		return (EINVAL);
4506 	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
4507 	if (error)
4508 		return(error);
4509 	/* find the mount point */
4510 	mp = vfs_busyfs(&fhp.fh_fsid);
4511 	if (mp == NULL)
4512 		return (ESTALE);
4513 	vfslocked = VFS_LOCK_GIANT(mp);
4514 	/* now give me my vnode, it gets returned to me locked */
4515 	error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
4516 	vfs_unbusy(mp);
4517 	if (error)
4518 		goto out;
4519 	/*
4520 	 * from now on we have to make sure not
4521 	 * to forget about the vnode
4522 	 * any error that causes an abort must vput(vp)
4523 	 * just set error = err and 'goto bad;'.
4524 	 */
4525 
4526 	/*
4527 	 * from vn_open
4528 	 */
4529 	if (vp->v_type == VLNK) {
4530 		error = EMLINK;
4531 		goto bad;
4532 	}
4533 	if (vp->v_type == VSOCK) {
4534 		error = EOPNOTSUPP;
4535 		goto bad;
4536 	}
4537 	if (vp->v_type != VDIR && fmode & O_DIRECTORY) {
4538 		error = ENOTDIR;
4539 		goto bad;
4540 	}
4541 	accmode = 0;
4542 	if (fmode & (FWRITE | O_TRUNC)) {
4543 		if (vp->v_type == VDIR) {
4544 			error = EISDIR;
4545 			goto bad;
4546 		}
4547 		error = vn_writechk(vp);
4548 		if (error)
4549 			goto bad;
4550 		accmode |= VWRITE;
4551 	}
4552 	if (fmode & FREAD)
4553 		accmode |= VREAD;
4554 	if ((fmode & O_APPEND) && (fmode & FWRITE))
4555 		accmode |= VAPPEND;
4556 #ifdef MAC
4557 	error = mac_vnode_check_open(td->td_ucred, vp, accmode);
4558 	if (error)
4559 		goto bad;
4560 #endif
4561 	if (accmode) {
4562 		error = VOP_ACCESS(vp, accmode, td->td_ucred, td);
4563 		if (error)
4564 			goto bad;
4565 	}
4566 	if (fmode & O_TRUNC) {
4567 		vfs_ref(mp);
4568 		VOP_UNLOCK(vp, 0);				/* XXX */
4569 		if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) {
4570 			vrele(vp);
4571 			vfs_rel(mp);
4572 			goto out;
4573 		}
4574 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
4575 		vfs_rel(mp);
4576 #ifdef MAC
4577 		/*
4578 		 * We don't yet have fp->f_cred, so use td->td_ucred, which
4579 		 * should be right.
4580 		 */
4581 		error = mac_vnode_check_write(td->td_ucred, td->td_ucred, vp);
4582 		if (error == 0) {
4583 #endif
4584 			VATTR_NULL(vap);
4585 			vap->va_size = 0;
4586 			error = VOP_SETATTR(vp, vap, td->td_ucred);
4587 #ifdef MAC
4588 		}
4589 #endif
4590 		vn_finished_write(mp);
4591 		if (error)
4592 			goto bad;
4593 	}
4594 	error = VOP_OPEN(vp, fmode, td->td_ucred, td, NULL);
4595 	if (error)
4596 		goto bad;
4597 
4598 	if (fmode & FWRITE) {
4599 		vp->v_writecount++;
4600 		CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
4601 		    __func__, vp, vp->v_writecount);
4602 	}
4603 
4604 	/*
4605 	 * end of vn_open code
4606 	 */
4607 
4608 	if ((error = falloc(td, &nfp, &indx, fmode)) != 0) {
4609 		if (fmode & FWRITE) {
4610 			vp->v_writecount--;
4611 			CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
4612 			    __func__, vp, vp->v_writecount);
4613 		}
4614 		goto bad;
4615 	}
4616 	/* An extra reference on `nfp' has been held for us by falloc(). */
4617 	fp = nfp;
4618 	nfp->f_vnode = vp;
4619 	finit(nfp, fmode & FMASK, DTYPE_VNODE, vp, &vnops);
4620 	if (fmode & (O_EXLOCK | O_SHLOCK)) {
4621 		lf.l_whence = SEEK_SET;
4622 		lf.l_start = 0;
4623 		lf.l_len = 0;
4624 		if (fmode & O_EXLOCK)
4625 			lf.l_type = F_WRLCK;
4626 		else
4627 			lf.l_type = F_RDLCK;
4628 		type = F_FLOCK;
4629 		if ((fmode & FNONBLOCK) == 0)
4630 			type |= F_WAIT;
4631 		VOP_UNLOCK(vp, 0);
4632 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
4633 			    type)) != 0) {
4634 			/*
4635 			 * The lock request failed.  Normally close the
4636 			 * descriptor but handle the case where someone might
4637 			 * have dup()d or close()d it when we weren't looking.
4638 			 */
4639 			fdclose(fdp, fp, indx, td);
4640 
4641 			/*
4642 			 * release our private reference
4643 			 */
4644 			fdrop(fp, td);
4645 			goto out;
4646 		}
4647 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4648 		atomic_set_int(&fp->f_flag, FHASLOCK);
4649 	}
4650 
4651 	VOP_UNLOCK(vp, 0);
4652 	fdrop(fp, td);
4653 	VFS_UNLOCK_GIANT(vfslocked);
4654 	td->td_retval[0] = indx;
4655 	return (0);
4656 
4657 bad:
4658 	vput(vp);
4659 out:
4660 	VFS_UNLOCK_GIANT(vfslocked);
4661 	return (error);
4662 }
4663 
4664 /*
4665  * Stat an (NFS) file handle.
4666  */
4667 #ifndef _SYS_SYSPROTO_H_
4668 struct fhstat_args {
4669 	struct fhandle *u_fhp;
4670 	struct stat *sb;
4671 };
4672 #endif
4673 int
4674 sys_fhstat(td, uap)
4675 	struct thread *td;
4676 	register struct fhstat_args /* {
4677 		struct fhandle *u_fhp;
4678 		struct stat *sb;
4679 	} */ *uap;
4680 {
4681 	struct stat sb;
4682 	fhandle_t fh;
4683 	struct mount *mp;
4684 	struct vnode *vp;
4685 	int vfslocked;
4686 	int error;
4687 
4688 	error = priv_check(td, PRIV_VFS_FHSTAT);
4689 	if (error)
4690 		return (error);
4691 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4692 	if (error)
4693 		return (error);
4694 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4695 		return (ESTALE);
4696 	vfslocked = VFS_LOCK_GIANT(mp);
4697 	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4698 	vfs_unbusy(mp);
4699 	if (error) {
4700 		VFS_UNLOCK_GIANT(vfslocked);
4701 		return (error);
4702 	}
4703 	error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td);
4704 	vput(vp);
4705 	VFS_UNLOCK_GIANT(vfslocked);
4706 	if (error)
4707 		return (error);
4708 	error = copyout(&sb, uap->sb, sizeof(sb));
4709 	return (error);
4710 }
4711 
4712 /*
4713  * Implement fstatfs() for (NFS) file handles.
4714  */
4715 #ifndef _SYS_SYSPROTO_H_
4716 struct fhstatfs_args {
4717 	struct fhandle *u_fhp;
4718 	struct statfs *buf;
4719 };
4720 #endif
4721 int
4722 sys_fhstatfs(td, uap)
4723 	struct thread *td;
4724 	struct fhstatfs_args /* {
4725 		struct fhandle *u_fhp;
4726 		struct statfs *buf;
4727 	} */ *uap;
4728 {
4729 	struct statfs sf;
4730 	fhandle_t fh;
4731 	int error;
4732 
4733 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4734 	if (error)
4735 		return (error);
4736 	error = kern_fhstatfs(td, fh, &sf);
4737 	if (error)
4738 		return (error);
4739 	return (copyout(&sf, uap->buf, sizeof(sf)));
4740 }
4741 
4742 int
4743 kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
4744 {
4745 	struct statfs *sp;
4746 	struct mount *mp;
4747 	struct vnode *vp;
4748 	int vfslocked;
4749 	int error;
4750 
4751 	error = priv_check(td, PRIV_VFS_FHSTATFS);
4752 	if (error)
4753 		return (error);
4754 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4755 		return (ESTALE);
4756 	vfslocked = VFS_LOCK_GIANT(mp);
4757 	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4758 	if (error) {
4759 		vfs_unbusy(mp);
4760 		VFS_UNLOCK_GIANT(vfslocked);
4761 		return (error);
4762 	}
4763 	vput(vp);
4764 	error = prison_canseemount(td->td_ucred, mp);
4765 	if (error)
4766 		goto out;
4767 #ifdef MAC
4768 	error = mac_mount_check_stat(td->td_ucred, mp);
4769 	if (error)
4770 		goto out;
4771 #endif
4772 	/*
4773 	 * Set these in case the underlying filesystem fails to do so.
4774 	 */
4775 	sp = &mp->mnt_stat;
4776 	sp->f_version = STATFS_VERSION;
4777 	sp->f_namemax = NAME_MAX;
4778 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
4779 	error = VFS_STATFS(mp, sp);
4780 	if (error == 0)
4781 		*buf = *sp;
4782 out:
4783 	vfs_unbusy(mp);
4784 	VFS_UNLOCK_GIANT(vfslocked);
4785 	return (error);
4786 }
4787 
4788 int
4789 kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
4790 {
4791 	struct file *fp;
4792 	struct mount *mp;
4793 	struct vnode *vp;
4794 	off_t olen, ooffset;
4795 	int error, vfslocked;
4796 
4797 	fp = NULL;
4798 	vfslocked = 0;
4799 	error = fget(td, fd, CAP_WRITE, &fp);
4800 	if (error != 0)
4801 		goto out;
4802 
4803 	switch (fp->f_type) {
4804 	case DTYPE_VNODE:
4805 		break;
4806 	case DTYPE_PIPE:
4807 	case DTYPE_FIFO:
4808 		error = ESPIPE;
4809 		goto out;
4810 	default:
4811 		error = ENODEV;
4812 		goto out;
4813 	}
4814 	if ((fp->f_flag & FWRITE) == 0) {
4815 		error = EBADF;
4816 		goto out;
4817 	}
4818 	vp = fp->f_vnode;
4819 	if (vp->v_type != VREG) {
4820 		error = ENODEV;
4821 		goto out;
4822 	}
4823 	if (offset < 0 || len <= 0) {
4824 		error = EINVAL;
4825 		goto out;
4826 	}
4827 	/* Check for wrap. */
4828 	if (offset > OFF_MAX - len) {
4829 		error = EFBIG;
4830 		goto out;
4831 	}
4832 
4833 	/* Allocating blocks may take a long time, so iterate. */
4834 	for (;;) {
4835 		olen = len;
4836 		ooffset = offset;
4837 
4838 		bwillwrite();
4839 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
4840 		mp = NULL;
4841 		error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
4842 		if (error != 0) {
4843 			VFS_UNLOCK_GIANT(vfslocked);
4844 			break;
4845 		}
4846 		error = vn_lock(vp, LK_EXCLUSIVE);
4847 		if (error != 0) {
4848 			vn_finished_write(mp);
4849 			VFS_UNLOCK_GIANT(vfslocked);
4850 			break;
4851 		}
4852 #ifdef MAC
4853 		error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
4854 		if (error == 0)
4855 #endif
4856 			error = VOP_ALLOCATE(vp, &offset, &len);
4857 		VOP_UNLOCK(vp, 0);
4858 		vn_finished_write(mp);
4859 		VFS_UNLOCK_GIANT(vfslocked);
4860 
4861 		if (olen + ooffset != offset + len) {
4862 			panic("offset + len changed from %jx/%jx to %jx/%jx",
4863 			    ooffset, olen, offset, len);
4864 		}
4865 		if (error != 0 || len == 0)
4866 			break;
4867 		KASSERT(olen > len, ("Iteration did not make progress?"));
4868 		maybe_yield();
4869 	}
4870  out:
4871 	if (fp != NULL)
4872 		fdrop(fp, td);
4873 	return (error);
4874 }
4875 
4876 int
4877 sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
4878 {
4879 
4880 	return (kern_posix_fallocate(td, uap->fd, uap->offset, uap->len));
4881 }
4882 
4883 /*
4884  * Unlike madvise(2), we do not make a best effort to remember every
4885  * possible caching hint.  Instead, we remember the last setting with
4886  * the exception that we will allow POSIX_FADV_NORMAL to adjust the
4887  * region of any current setting.
4888  */
4889 int
4890 kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
4891     int advice)
4892 {
4893 	struct fadvise_info *fa, *new;
4894 	struct file *fp;
4895 	struct vnode *vp;
4896 	off_t end;
4897 	int error;
4898 
4899 	if (offset < 0 || len < 0 || offset > OFF_MAX - len)
4900 		return (EINVAL);
4901 	switch (advice) {
4902 	case POSIX_FADV_SEQUENTIAL:
4903 	case POSIX_FADV_RANDOM:
4904 	case POSIX_FADV_NOREUSE:
4905 		new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
4906 		break;
4907 	case POSIX_FADV_NORMAL:
4908 	case POSIX_FADV_WILLNEED:
4909 	case POSIX_FADV_DONTNEED:
4910 		new = NULL;
4911 		break;
4912 	default:
4913 		return (EINVAL);
4914 	}
4915 	/* XXX: CAP_POSIX_FADVISE? */
4916 	error = fget(td, fd, 0, &fp);
4917 	if (error != 0)
4918 		goto out;
4919 
4920 	switch (fp->f_type) {
4921 	case DTYPE_VNODE:
4922 		break;
4923 	case DTYPE_PIPE:
4924 	case DTYPE_FIFO:
4925 		error = ESPIPE;
4926 		goto out;
4927 	default:
4928 		error = ENODEV;
4929 		goto out;
4930 	}
4931 	vp = fp->f_vnode;
4932 	if (vp->v_type != VREG) {
4933 		error = ENODEV;
4934 		goto out;
4935 	}
4936 	if (len == 0)
4937 		end = OFF_MAX;
4938 	else
4939 		end = offset + len - 1;
4940 	switch (advice) {
4941 	case POSIX_FADV_SEQUENTIAL:
4942 	case POSIX_FADV_RANDOM:
4943 	case POSIX_FADV_NOREUSE:
4944 		/*
4945 		 * Try to merge any existing non-standard region with
4946 		 * this new region if possible, otherwise create a new
4947 		 * non-standard region for this request.
4948 		 */
4949 		mtx_pool_lock(mtxpool_sleep, fp);
4950 		fa = fp->f_advice;
4951 		if (fa != NULL && fa->fa_advice == advice &&
4952 		    ((fa->fa_start <= end && fa->fa_end >= offset) ||
4953 		    (end != OFF_MAX && fa->fa_start == end + 1) ||
4954 		    (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
4955 			if (offset < fa->fa_start)
4956 				fa->fa_start = offset;
4957 			if (end > fa->fa_end)
4958 				fa->fa_end = end;
4959 		} else {
4960 			new->fa_advice = advice;
4961 			new->fa_start = offset;
4962 			new->fa_end = end;
4963 			fp->f_advice = new;
4964 			new = fa;
4965 		}
4966 		mtx_pool_unlock(mtxpool_sleep, fp);
4967 		break;
4968 	case POSIX_FADV_NORMAL:
4969 		/*
4970 		 * If a the "normal" region overlaps with an existing
4971 		 * non-standard region, trim or remove the
4972 		 * non-standard region.
4973 		 */
4974 		mtx_pool_lock(mtxpool_sleep, fp);
4975 		fa = fp->f_advice;
4976 		if (fa != NULL) {
4977 			if (offset <= fa->fa_start && end >= fa->fa_end) {
4978 				new = fa;
4979 				fp->f_advice = NULL;
4980 			} else if (offset <= fa->fa_start &&
4981  			    end >= fa->fa_start)
4982 				fa->fa_start = end + 1;
4983 			else if (offset <= fa->fa_end && end >= fa->fa_end)
4984 				fa->fa_end = offset - 1;
4985 			else if (offset >= fa->fa_start && end <= fa->fa_end) {
4986 				/*
4987 				 * If the "normal" region is a middle
4988 				 * portion of the existing
4989 				 * non-standard region, just remove
4990 				 * the whole thing rather than picking
4991 				 * one side or the other to
4992 				 * preserve.
4993 				 */
4994 				new = fa;
4995 				fp->f_advice = NULL;
4996 			}
4997 		}
4998 		mtx_pool_unlock(mtxpool_sleep, fp);
4999 		break;
5000 	case POSIX_FADV_WILLNEED:
5001 	case POSIX_FADV_DONTNEED:
5002 		error = VOP_ADVISE(vp, offset, end, advice);
5003 		break;
5004 	}
5005 out:
5006 	if (fp != NULL)
5007 		fdrop(fp, td);
5008 	free(new, M_FADVISE);
5009 	return (error);
5010 }
5011 
5012 int
5013 sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
5014 {
5015 
5016 	return (kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,
5017 	    uap->advice));
5018 }
5019