xref: /freebsd/sys/kern/vfs_vnops.c (revision 87569f75a91f298c52a71823c04d41cf53c88889)
/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kdb.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/filio.h>
#include <sys/sx.h>
#include <sys/ttycom.h>
#include <sys/conf.h>
#include <sys/syslog.h>
#include <sys/unistd.h>

static fo_rdwr_t	vn_read;
static fo_rdwr_t	vn_write;
static fo_ioctl_t	vn_ioctl;
static fo_poll_t	vn_poll;
static fo_kqfilter_t	vn_kqfilter;
static fo_stat_t	vn_statfile;
static fo_close_t	vn_closefile;

struct fileops vnops = {
	.fo_read = vn_read,
	.fo_write = vn_write,
	.fo_ioctl = vn_ioctl,
	.fo_poll = vn_poll,
	.fo_kqfilter = vn_kqfilter,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};

int
vn_open(ndp, flagp, cmode, fdidx)
	struct nameidata *ndp;
	int *flagp, cmode, fdidx;
{
	struct thread *td = ndp->ni_cnd.cn_thread;

	return (vn_open_cred(ndp, flagp, cmode, td->td_ucred, fdidx));
}
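
/*
 * Usage sketch (illustrative, not part of this file; the path and
 * locals are hypothetical).  In-kernel callers set up the nameidata
 * themselves, free the pathname buffer on success, and get the vnode
 * back locked and referenced; an fdidx of -1 marks an in-kernel open:
 *
 *	struct nameidata nd;
 *	int flags = FREAD;
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/some/path", td);
 *	if ((error = vn_open(&nd, &flags, 0, -1)) == 0) {
 *		NDFREE(&nd, NDF_ONLY_PNBUF);
 *		... use nd.ni_vp ...
 *		VOP_UNLOCK(nd.ni_vp, 0, td);
 *		(void) vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
 *	}
 */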

/*
 * Common code for vnode open operations.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * Note that this does NOT free nameidata for the successful case,
 * due to the NDINIT being done elsewhere.
 */
int
vn_open_cred(ndp, flagp, cmode, cred, fdidx)
	struct nameidata *ndp;
	int *flagp, cmode;
	struct ucred *cred;
	int fdidx;
{
	struct vnode *vp;
	struct mount *mp;
	struct thread *td = ndp->ni_cnd.cn_thread;
	struct vattr vat;
	struct vattr *vap = &vat;
	int mode, fmode, error;
	int vfslocked;

restart:
	vfslocked = 0;
	fmode = *flagp;
	if (fmode & O_CREAT) {
		ndp->ni_cnd.cn_nameiop = CREATE;
		ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF |
		    MPSAFE | AUDITVNODE1;
		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
			ndp->ni_cnd.cn_flags |= FOLLOW;
		bwillwrite();
		if ((error = namei(ndp)) != 0)
			return (error);
		vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
		ndp->ni_cnd.cn_flags &= ~MPSAFE;
		if (ndp->ni_vp == NULL) {
			VATTR_NULL(vap);
			vap->va_type = VREG;
			vap->va_mode = cmode;
			if (fmode & O_EXCL)
				vap->va_vaflags |= VA_EXCLUSIVE;
			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
				NDFREE(ndp, NDF_ONLY_PNBUF);
				vput(ndp->ni_dvp);
				VFS_UNLOCK_GIANT(vfslocked);
				if ((error = vn_start_write(NULL, &mp,
				    V_XSLEEP | PCATCH)) != 0)
					return (error);
				goto restart;
			}
#ifdef MAC
			error = mac_check_vnode_create(cred, ndp->ni_dvp,
			    &ndp->ni_cnd, vap);
			if (error == 0) {
#endif
				VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE);
				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
						   &ndp->ni_cnd, vap);
#ifdef MAC
			}
#endif
			vput(ndp->ni_dvp);
			vn_finished_write(mp);
			if (error) {
				VFS_UNLOCK_GIANT(vfslocked);
				NDFREE(ndp, NDF_ONLY_PNBUF);
				return (error);
			}
			fmode &= ~O_TRUNC;
			vp = ndp->ni_vp;
		} else {
			if (ndp->ni_dvp == ndp->ni_vp)
				vrele(ndp->ni_dvp);
			else
				vput(ndp->ni_dvp);
			ndp->ni_dvp = NULL;
			vp = ndp->ni_vp;
			if (fmode & O_EXCL) {
				error = EEXIST;
				goto bad;
			}
			fmode &= ~O_CREAT;
		}
	} else {
		ndp->ni_cnd.cn_nameiop = LOOKUP;
		ndp->ni_cnd.cn_flags = ISOPEN |
		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
		    LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1;
		if ((error = namei(ndp)) != 0)
			return (error);
		ndp->ni_cnd.cn_flags &= ~MPSAFE;
		vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
		vp = ndp->ni_vp;
	}
	if (vp->v_type == VLNK) {
		error = EMLINK;
		goto bad;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	mode = 0;
	if (fmode & (FWRITE | O_TRUNC)) {
		if (vp->v_type == VDIR) {
			error = EISDIR;
			goto bad;
		}
		mode |= VWRITE;
	}
	if (fmode & FREAD)
		mode |= VREAD;
	if (fmode & O_APPEND)
		mode |= VAPPEND;
#ifdef MAC
	error = mac_check_vnode_open(cred, vp, mode);
	if (error)
		goto bad;
#endif
	if ((fmode & O_CREAT) == 0) {
		if (mode & VWRITE) {
			error = vn_writechk(vp);
			if (error)
				goto bad;
		}
		if (mode) {
			error = VOP_ACCESS(vp, mode, cred, td);
			if (error)
				goto bad;
		}
	}
	if ((error = VOP_OPEN(vp, fmode, cred, td, fdidx)) != 0)
		goto bad;

	if (fmode & FWRITE)
		vp->v_writecount++;
	*flagp = fmode;
	ASSERT_VOP_LOCKED(vp, "vn_open_cred");
	if (fdidx == -1)
		VFS_UNLOCK_GIANT(vfslocked);
	return (0);
bad:
	NDFREE(ndp, NDF_ONLY_PNBUF);
	vput(vp);
	VFS_UNLOCK_GIANT(vfslocked);
	*flagp = fmode;
	ndp->ni_vp = NULL;
	return (error);
}

/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 */
int
vn_writechk(vp)
	register struct vnode *vp;
{

	ASSERT_VOP_LOCKED(vp, "vn_writechk");
	/*
	 * If there's shared text associated with
	 * the vnode, try to free it up once.  If
	 * we fail, we can't allow writing.
	 */
	if (vp->v_vflag & VV_TEXT)
		return (ETXTBSY);

	return (0);
}

/*
 * Vnode close call
 */
int
vn_close(vp, flags, file_cred, td)
	register struct vnode *vp;
	int flags;
	struct ucred *file_cred;
	struct thread *td;
{
	struct mount *mp;
	int error;

	VFS_ASSERT_GIANT(vp->v_mount);

	vn_start_write(vp, &mp, V_WAIT);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if (flags & FWRITE)
		vp->v_writecount--;
	error = VOP_CLOSE(vp, flags, file_cred, td);
	vput(vp);
	vn_finished_write(mp);
	return (error);
}

/*
 * Sequential heuristic - detect sequential operation
 */
static __inline
int
sequential_heuristic(struct uio *uio, struct file *fp)
{

	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
	    uio->uio_offset == fp->f_nextoff) {
		/*
		 * XXX we assume that the filesystem block size is
		 * the default.  Not true, but still gives us a pretty
		 * good indicator of how sequential the read operations
		 * are.
		 */
		fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
		if (fp->f_seqcount > IO_SEQMAX)
			fp->f_seqcount = IO_SEQMAX;
		return (fp->f_seqcount << IO_SEQSHIFT);
	}

	/*
	 * Not sequential, quick draw-down of seqcount
	 */
	if (fp->f_seqcount > 1)
		fp->f_seqcount = 1;
	else
		fp->f_seqcount = 0;
	return (0);
}
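
/*
 * Worked example (assuming the usual BKVASIZE of 16384): a 64KB read
 * that continues at f_nextoff adds (65536 + 16383) / 16384 = 4 to
 * f_seqcount, clamped at IO_SEQMAX.  The value returned,
 * f_seqcount << IO_SEQSHIFT, rides in the upper ioflag bits, where
 * filesystems use it to scale read-ahead.
 */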

/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
    aresid, td)
	enum uio_rw rw;
	struct vnode *vp;
	void *base;
	int len;
	off_t offset;
	enum uio_seg segflg;
	int ioflg;
	struct ucred *active_cred;
	struct ucred *file_cred;
	int *aresid;
	struct thread *td;
{
	struct uio auio;
	struct iovec aiov;
	struct mount *mp;
	struct ucred *cred;
	int error;

	VFS_ASSERT_GIANT(vp->v_mount);

	if ((ioflg & IO_NODELOCKED) == 0) {
		mp = NULL;
		if (rw == UIO_WRITE) {
			if (vp->v_type != VCHR &&
			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
			    != 0)
				return (error);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
		} else {
			/*
			 * XXX This should be LK_SHARED but I don't trust VFS
			 * enough to leave it like that until it has been
			 * reviewed further.
			 */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
		}
	}
	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_segflg = segflg;
	auio.uio_rw = rw;
	auio.uio_td = td;
	error = 0;
#ifdef MAC
	if ((ioflg & IO_NOMACCHECK) == 0) {
		if (rw == UIO_READ)
			error = mac_check_vnode_read(active_cred, file_cred,
			    vp);
		else
			error = mac_check_vnode_write(active_cred, file_cred,
			    vp);
	}
#endif
	if (error == 0) {
		if (file_cred)
			cred = file_cred;
		else
			cred = active_cred;
		if (rw == UIO_READ)
			error = VOP_READ(vp, &auio, ioflg, cred);
		else
			error = VOP_WRITE(vp, &auio, ioflg, cred);
	}
	if (aresid)
		*aresid = auio.uio_resid;
	else if (auio.uio_resid && error == 0)
		error = EIO;
	if ((ioflg & IO_NODELOCKED) == 0) {
		if (rw == UIO_WRITE)
			vn_finished_write(mp);
		VOP_UNLOCK(vp, 0, td);
	}
	return (error);
}
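
/*
 * Usage sketch (hypothetical caller; "hdr" and "resid" are made-up
 * locals): read the first 512 bytes of an already-locked vnode into a
 * kernel buffer.  IO_NODELOCKED says the caller holds the vnode lock;
 * NOCRED for file_cred makes active_cred the credential presented to
 * VOP_READ():
 *
 *	char hdr[512];
 *	int resid;
 *
 *	error = vn_rdwr(UIO_READ, vp, hdr, sizeof(hdr), (off_t)0,
 *	    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
 *	    &resid, td);
 */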

/*
 * Package up an I/O request on a vnode into a uio and do it.  The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we
 * check bwillwrite() before calling vn_rdwr().  We also call uio_yield()
 * to give other processes a chance to lock the vnode (either other processes
 * core'ing the same binary, or unrelated processes scanning the directory).
 */
int
vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
    file_cred, aresid, td)
	enum uio_rw rw;
	struct vnode *vp;
	void *base;
	size_t len;
	off_t offset;
	enum uio_seg segflg;
	int ioflg;
	struct ucred *active_cred;
	struct ucred *file_cred;
	size_t *aresid;
	struct thread *td;
{
	int error = 0;
	int iaresid;

	VFS_ASSERT_GIANT(vp->v_mount);

	do {
		int chunk;

		/*
		 * Force `offset' to a multiple of MAXBSIZE except possibly
		 * for the first chunk, so that filesystems only need to
		 * write full blocks except possibly for the first and last
		 * chunks.
		 */
		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;

		if (chunk > len)
			chunk = len;
		if (rw != UIO_READ && vp->v_type == VREG)
			bwillwrite();
		iaresid = 0;
		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
		    ioflg, active_cred, file_cred, &iaresid, td);
		len -= chunk;	/* aresid calc already includes length */
		if (error)
			break;
		offset += chunk;
		base = (char *)base + chunk;
		uio_yield();
	} while (len);
	if (aresid)
		*aresid = len + iaresid;
	return (error);
}
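
/*
 * Alignment example: with MAXBSIZE at 65536, writing 200000 bytes at
 * offset 1000 yields chunks of 64536 (up to the 65536 boundary), then
 * 65536, 65536, and finally 4392 bytes, so only the first and last
 * chunks may be partial blocks.
 */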

/*
 * File table vnode read routine.
 */
static int
vn_read(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct vnode *vp;
	int error, ioflag;
	int vfslocked;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	vp = fp->f_vnode;
	ioflag = 0;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	VOP_LEASE(vp, td, fp->f_cred, LEASE_READ);
	/*
	 * According to McKusick the vnode lock is protecting f_offset here.
	 * Once this field has its own lock we can acquire it shared.
	 */
	if ((flags & FOF_OFFSET) == 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
		uio->uio_offset = fp->f_offset;
	} else
		vn_lock(vp, LK_SHARED | LK_RETRY, td);

	ioflag |= sequential_heuristic(uio, fp);

#ifdef MAC
	error = mac_check_vnode_read(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
	if ((flags & FOF_OFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0, td);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * File table vnode write routine.
 */
static int
vn_write(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct vnode *vp;
	struct mount *mp;
	int error, ioflag;
	int vfslocked;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	vp = fp->f_vnode;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	if (vp->v_type == VREG)
		bwillwrite();
	ioflag = IO_UNIT;
	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	if ((fp->f_flag & O_FSYNC) ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	mp = NULL;
	if (vp->v_type != VCHR &&
	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
		goto unlock;
	VOP_LEASE(vp, td, fp->f_cred, LEASE_WRITE);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if ((flags & FOF_OFFSET) == 0)
		uio->uio_offset = fp->f_offset;
	ioflag |= sequential_heuristic(uio, fp);
#ifdef MAC
	error = mac_check_vnode_write(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
	if ((flags & FOF_OFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0, td);
	vn_finished_write(mp);
unlock:
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * File table vnode stat routine.
 */
static int
vn_statfile(fp, sb, active_cred, td)
	struct file *fp;
	struct stat *sb;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp = fp->f_vnode;
	int vfslocked;
	int error;

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
	VOP_UNLOCK(vp, 0, td);
	VFS_UNLOCK_GIANT(vfslocked);

	return (error);
}

/*
 * Stat a vnode; implementation for the stat syscall
 */
int
vn_stat(vp, sb, active_cred, file_cred, td)
	struct vnode *vp;
	register struct stat *sb;
	struct ucred *active_cred;
	struct ucred *file_cred;
	struct thread *td;
{
	struct vattr vattr;
	register struct vattr *vap;
	int error;
	u_short mode;

#ifdef MAC
	error = mac_check_vnode_stat(active_cred, file_cred, vp);
	if (error)
		return (error);
#endif

	vap = &vattr;
	error = VOP_GETATTR(vp, vap, active_cred, td);
	if (error)
		return (error);

	/*
	 * Zero the spare stat fields
	 */
	bzero(sb, sizeof *sb);

	/*
	 * Copy from vattr table
	 */
	if (vap->va_fsid != VNOVAL)
		sb->st_dev = vap->va_fsid;
	else
		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
	sb->st_ino = vap->va_fileid;
	mode = vap->va_mode;
	switch (vap->va_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		/*
		 * This is a cosmetic change, symlinks do not have a mode.
		 * Adjust `mode' here rather than sb->st_mode, since
		 * st_mode is not assigned until after the switch and
		 * would otherwise clobber this change.
		 */
		if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
			mode &= ~ACCESSPERMS;	/* 0000 */
		else
			mode |= ACCESSPERMS;	/* 0777 */
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		return (EBADF);
	}
	sb->st_mode = mode;
	sb->st_nlink = vap->va_nlink;
	sb->st_uid = vap->va_uid;
	sb->st_gid = vap->va_gid;
	sb->st_rdev = vap->va_rdev;
	if (vap->va_size > OFF_MAX)
		return (EOVERFLOW);
	sb->st_size = vap->va_size;
	sb->st_atimespec = vap->va_atime;
	sb->st_mtimespec = vap->va_mtime;
	sb->st_ctimespec = vap->va_ctime;
	sb->st_birthtimespec = vap->va_birthtime;

	/*
	 * According to www.opengroup.org, the meaning of st_blksize is
	 *   "a filesystem-specific preferred I/O block size for this
	 *    object.  In some filesystem types, this may vary from file
	 *    to file"
	 * Default to PAGE_SIZE after much discussion.
	 * XXX: min(PAGE_SIZE, vp->v_bufobj.bo_bsize) may be more correct.
	 */

	sb->st_blksize = PAGE_SIZE;

	sb->st_flags = vap->va_flags;
	if (suser(td))
		sb->st_gen = 0;
	else
		sb->st_gen = vap->va_gen;

#if (S_BLKSIZE == 512)
	/* Optimize this case */
	sb->st_blocks = vap->va_bytes >> 9;
#else
	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
#endif
	return (0);
}

/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(fp, com, data, active_cred, td)
	struct file *fp;
	u_long com;
	void *data;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp = fp->f_vnode;
	struct vattr vattr;
	int vfslocked;
	int error;

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	error = ENOTTY;
	switch (vp->v_type) {
	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			error = VOP_GETATTR(vp, &vattr, active_cred, td);
			VOP_UNLOCK(vp, 0, td);
			if (!error)
				*(int *)data = vattr.va_size - fp->f_offset;
		} else if (com == FIONBIO || com == FIOASYNC)	/* XXX */
			error = 0;
		else
			error = VOP_IOCTL(vp, com, data, fp->f_flag,
			    active_cred, td);
		break;

	default:
		break;
	}
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}
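
/*
 * For example, FIONREAD on a regular file reports the bytes between
 * the descriptor's offset and end of file: a 1000-byte file read to
 * offset 400 yields 600.  Userland reaches this path through
 * ioctl(fd, FIONREAD, &nbytes).
 */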

/*
 * File table vnode poll routine.
 */
static int
vn_poll(fp, events, active_cred, td)
	struct file *fp;
	int events;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp;
	int error;

	mtx_lock(&Giant);

	vp = fp->f_vnode;
#ifdef MAC
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	error = mac_check_vnode_poll(active_cred, fp->f_cred, vp);
	VOP_UNLOCK(vp, 0, td);
	if (!error)
#endif
	error = VOP_POLL(vp, events, fp->f_cred, td);
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Check that the vnode is still valid, and if so
 * acquire requested lock.
 */
int
vn_lock(vp, flags, td)
	struct vnode *vp;
	int flags;
	struct thread *td;
{
	int error;

	do {
		if ((flags & LK_INTERLOCK) == 0)
			VI_LOCK(vp);
		if ((flags & LK_NOWAIT || (flags & LK_TYPE_MASK) == 0) &&
		    vp->v_iflag & VI_DOOMED) {
			VI_UNLOCK(vp);
			return (ENOENT);
		}
		/*
		 * Just polling to check validity.
		 */
		if ((flags & LK_TYPE_MASK) == 0) {
			VI_UNLOCK(vp);
			return (0);
		}
		/*
		 * lockmgr drops interlock before it will return for
		 * any reason.  So force the code above to relock it.
		 */
		error = VOP_LOCK(vp, flags | LK_INTERLOCK, td);
		flags &= ~LK_INTERLOCK;
		KASSERT((flags & LK_RETRY) == 0 || error == 0,
		    ("LK_RETRY set with incompatible flags %d\n", flags));
		/*
		 * Callers specify LK_RETRY if they wish to get dead vnodes.
		 * If RETRY is not set, we return ENOENT instead.
		 */
		if (error == 0 && vp->v_iflag & VI_DOOMED &&
		    (flags & LK_RETRY) == 0) {
			VOP_UNLOCK(vp, 0, td);
			error = ENOENT;
			break;
		}
	} while (flags & LK_RETRY && error != 0);
	return (error);
}
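
/*
 * Illustrative calls: most callers pass LK_RETRY because they must
 * obtain the lock even on a doomed (forcibly unmounted) vnode,
 *
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 *
 * while omitting LK_RETRY yields ENOENT for a doomed vnode, and a zero
 * lock type merely polls the vnode for validity.
 */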

/*
 * File table vnode close routine.
 */
static int
vn_closefile(fp, td)
	struct file *fp;
	struct thread *td;
{
	struct vnode *vp;
	struct flock lf;
	int vfslocked;
	int error;

	vp = fp->f_vnode;

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
	}

	fp->f_ops = &badfileops;

	error = vn_close(vp, fp->f_flag, fp->f_cred, td);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * Preparing to start a filesystem write operation. If the operation is
 * permitted, then we bump the count of operations in progress and
 * proceed. If a suspend request is in progress, we wait until the
 * suspension is over, and then proceed.
 */
int
vn_start_write(vp, mpp, flags)
	struct vnode *vp;
	struct mount **mpp;
	int flags;
{
	struct mount *mp;
	int error;

	error = 0;
	/*
	 * If a vnode is provided, get and return the mount point to
	 * which it will write.
	 */
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
			*mpp = NULL;
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	if ((mp = *mpp) == NULL)
		return (0);
	MNT_ILOCK(mp);
	/*
	 * Check on status of suspension.
	 */
	while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
		if (flags & V_NOWAIT) {
			error = EWOULDBLOCK;
			goto unlock;
		}
		error = msleep(&mp->mnt_flag, MNT_MTX(mp),
		    (PUSER - 1) | (flags & PCATCH), "suspfs", 0);
		if (error)
			goto unlock;
	}
	if (flags & V_XSLEEP)
		goto unlock;
	mp->mnt_writeopcount++;
unlock:
	MNT_IUNLOCK(mp);
	return (error);
}
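
/*
 * Usage sketch (hypothetical caller): write-path code brackets its
 * work with vn_start_write()/vn_finished_write() so that it blocks
 * while the filesystem is suspended:
 *
 *	struct mount *mp;
 *
 *	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 *	error = VOP_SETATTR(vp, &vattr, cred, td);
 *	VOP_UNLOCK(vp, 0, td);
 *	vn_finished_write(mp);
 */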

/*
 * Secondary suspension. Used by operations such as vop_inactive
 * routines that are needed by the higher level functions. These
 * are allowed to proceed until all the higher level functions have
 * completed (indicated by mnt_writeopcount dropping to zero). At that
 * time, these operations are halted until the suspension is over.
 */
int
vn_write_suspend_wait(vp, mp, flags)
	struct vnode *vp;
	struct mount *mp;
	int flags;
{
	int error;

	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	/*
	 * If we are not suspended or have not yet reached suspended
	 * mode, then let the operation proceed.
	 */
	if (mp == NULL)
		return (0);
	MNT_ILOCK(mp);
	if ((mp->mnt_kern_flag & MNTK_SUSPENDED) == 0) {
		MNT_IUNLOCK(mp);
		return (0);
	}
	if (flags & V_NOWAIT) {
		MNT_IUNLOCK(mp);
		return (EWOULDBLOCK);
	}
	/*
	 * Wait for the suspension to finish.
	 */
	return (msleep(&mp->mnt_flag, MNT_MTX(mp),
	    (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0));
}

/*
 * Secondary suspension. Used by operations such as vop_inactive
 * routines that are needed by the higher level functions. These
 * are allowed to proceed until all the higher level functions have
 * completed (indicated by mnt_writeopcount dropping to zero). At that
 * time, these operations are halted until the suspension is over.
 */
int
vn_start_secondary_write(vp, mpp, flags)
	struct vnode *vp;
	struct mount **mpp;
	int flags;
{
	struct mount *mp;
	int error;

 retry:
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
			*mpp = NULL;
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	/*
	 * If we are not suspended or have not yet reached suspended
	 * mode, then let the operation proceed.
	 */
	if ((mp = *mpp) == NULL)
		return (0);
	MNT_ILOCK(mp);
	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
		mp->mnt_secondary_writes++;
		mp->mnt_secondary_accwrites++;
		MNT_IUNLOCK(mp);
		return (0);
	}
	if (flags & V_NOWAIT) {
		MNT_IUNLOCK(mp);
		return (EWOULDBLOCK);
	}
	/*
	 * Wait for the suspension to finish.
	 */
	error = msleep(&mp->mnt_flag, MNT_MTX(mp),
		       (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
	if (error == 0)
		goto retry;
	return (error);
}

/*
 * Filesystem write operation has completed. If we are suspending and this
 * operation is the last one, notify the suspender that the suspension is
 * now in effect.
 */
void
vn_finished_write(mp)
	struct mount *mp;
{
	if (mp == NULL)
		return;
	MNT_ILOCK(mp);
	mp->mnt_writeopcount--;
	if (mp->mnt_writeopcount < 0)
		panic("vn_finished_write: neg cnt");
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
	    mp->mnt_writeopcount <= 0)
		wakeup(&mp->mnt_writeopcount);
	MNT_IUNLOCK(mp);
}

/*
 * Filesystem secondary write operation has completed. If we are
 * suspending and this operation is the last one, notify the suspender
 * that the suspension is now in effect.
 */
void
vn_finished_secondary_write(mp)
	struct mount *mp;
{
	if (mp == NULL)
		return;
	MNT_ILOCK(mp);
	mp->mnt_secondary_writes--;
	if (mp->mnt_secondary_writes < 0)
		panic("vn_finished_secondary_write: neg cnt");
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
	    mp->mnt_secondary_writes <= 0)
		wakeup(&mp->mnt_secondary_writes);
	MNT_IUNLOCK(mp);
}

/*
 * Request a filesystem to suspend write operations.
 */
int
vfs_write_suspend(mp)
	struct mount *mp;
{
	struct thread *td = curthread;
	int error;

	error = 0;
	MNT_ILOCK(mp);
	if (mp->mnt_kern_flag & MNTK_SUSPEND)
		goto unlock;
	mp->mnt_kern_flag |= MNTK_SUSPEND;
	if (mp->mnt_writeopcount > 0)
		(void) msleep(&mp->mnt_writeopcount,
		    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
	else
		MNT_IUNLOCK(mp);
	if ((error = VFS_SYNC(mp, MNT_SUSPEND, td)) != 0) {
		vfs_write_resume(mp);
		return (error);
	}
	MNT_ILOCK(mp);
unlock:
	MNT_IUNLOCK(mp);
	return (error);
}

/*
 * Request a filesystem to resume write operations.
 */
void
vfs_write_resume(mp)
	struct mount *mp;
{

	MNT_ILOCK(mp);
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
		mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
				       MNTK_SUSPENDED);
		wakeup(&mp->mnt_writeopcount);
		wakeup(&mp->mnt_flag);
	}
	MNT_IUNLOCK(mp);
}
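
/*
 * Sketch of the intended pairing (callers such as filesystem snapshot
 * code differ in detail):
 *
 *	if ((error = vfs_write_suspend(mp)) != 0)
 *		return (error);
 *	... examine the filesystem while no new writes can start ...
 *	vfs_write_resume(mp);
 */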

/*
 * Implement kqueues for files by translating them into vnode operations.
 */
static int
vn_kqfilter(struct file *fp, struct knote *kn)
{
	int error;

	mtx_lock(&Giant);
	error = VOP_KQFILTER(fp->f_vnode, kn);
	mtx_unlock(&Giant);

	return (error);
}

/*
 * Simplified in-kernel wrapper calls for extended attribute access.
 * Both calls pass in a NULL credential, authorizing as "kernel" access.
 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 */
int
vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, int *buflen, char *buf, struct thread *td)
{
	struct uio	auio;
	struct iovec	iov;
	int	error;

	iov.iov_len = *buflen;
	iov.iov_base = buf;

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_offset = 0;
	auio.uio_resid = *buflen;

	if ((ioflg & IO_NODELOCKED) == 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

	/* authorize attribute retrieval as kernel */
	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
	    td);

	if ((ioflg & IO_NODELOCKED) == 0)
		VOP_UNLOCK(vp, 0, td);

	if (error == 0)
		*buflen = *buflen - auio.uio_resid;

	return (error);
}
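
/*
 * Usage sketch (the attribute name is hypothetical): fetch a
 * system-namespace attribute into a fixed buffer; on success buflen
 * is updated to the number of bytes actually copied out:
 *
 *	char buf[128];
 *	int buflen = sizeof(buf);
 *
 *	error = vn_extattr_get(vp, IO_NODELOCKED,
 *	    EXTATTR_NAMESPACE_SYSTEM, "myattr", &buflen, buf, td);
 */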

/*
 * XXX failure mode if partially written?
 */
int
vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, int buflen, char *buf, struct thread *td)
{
	struct uio	auio;
	struct iovec	iov;
	struct mount	*mp;
	int	error;

	iov.iov_len = buflen;
	iov.iov_base = buf;

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
			return (error);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	}

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

	/* authorize attribute setting as kernel */
	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_finished_write(mp);
		VOP_UNLOCK(vp, 0, td);
	}

	return (error);
}

int
vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, struct thread *td)
{
	struct mount	*mp;
	int	error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
			return (error);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	}

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

	/* authorize attribute removal as kernel */
	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
	if (error == EOPNOTSUPP)
		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
		    NULL, td);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_finished_write(mp);
		VOP_UNLOCK(vp, 0, td);
	}

	return (error);
}