xref: /freebsd/sys/kern/vfs_vnops.c (revision 7afc53b8dfcc7d5897920ce6cc7e842fbb4ab813)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_mac.h"
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/fcntl.h>
45 #include <sys/file.h>
46 #include <sys/kdb.h>
47 #include <sys/stat.h>
48 #include <sys/proc.h>
49 #include <sys/limits.h>
50 #include <sys/lock.h>
51 #include <sys/mac.h>
52 #include <sys/mount.h>
53 #include <sys/mutex.h>
54 #include <sys/namei.h>
55 #include <sys/vnode.h>
56 #include <sys/bio.h>
57 #include <sys/buf.h>
58 #include <sys/filio.h>
59 #include <sys/sx.h>
60 #include <sys/ttycom.h>
61 #include <sys/conf.h>
62 #include <sys/syslog.h>
63 #include <sys/unistd.h>
64 
65 static fo_rdwr_t	vn_read;
66 static fo_rdwr_t	vn_write;
67 static fo_ioctl_t	vn_ioctl;
68 static fo_poll_t	vn_poll;
69 static fo_kqfilter_t	vn_kqfilter;
70 static fo_stat_t	vn_statfile;
71 static fo_close_t	vn_closefile;
72 
73 struct fileops vnops = {
74 	.fo_read = vn_read,
75 	.fo_write = vn_write,
76 	.fo_ioctl = vn_ioctl,
77 	.fo_poll = vn_poll,
78 	.fo_kqfilter = vn_kqfilter,
79 	.fo_stat = vn_statfile,
80 	.fo_close = vn_closefile,
81 	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
82 };
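
/*
 * For example, a read(2) on a vnode-backed file is dispatched through
 * this table by the generic file code, roughly as
 * fp->f_ops->fo_read(fp, uio, active_cred, flags, td), which resolves
 * to vn_read() below.
 */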
83 
84 int
85 vn_open(ndp, flagp, cmode, fdidx)
86 	struct nameidata *ndp;
87 	int *flagp, cmode, fdidx;
88 {
89 	struct thread *td = ndp->ni_cnd.cn_thread;
90 
91 	return (vn_open_cred(ndp, flagp, cmode, td->td_ucred, fdidx));
92 }
93 
94 /*
95  * Common code for vnode open operations.
96  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
97  *
98  * Note that this does NOT free the nameidata in the successful case,
99  * since the NDINIT was done elsewhere.
100  */
101 int
102 vn_open_cred(ndp, flagp, cmode, cred, fdidx)
103 	struct nameidata *ndp;
104 	int *flagp, cmode;
105 	struct ucred *cred;
106 	int fdidx;
107 {
108 	struct vnode *vp;
109 	struct mount *mp;
110 	struct thread *td = ndp->ni_cnd.cn_thread;
111 	struct vattr vat;
112 	struct vattr *vap = &vat;
113 	int mode, fmode, error;
114 	int vfslocked;
115 
116 restart:
117 	vfslocked = 0;
118 	fmode = *flagp;
119 	if (fmode & O_CREAT) {
120 		ndp->ni_cnd.cn_nameiop = CREATE;
121 		ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF | MPSAFE;
122 		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
123 			ndp->ni_cnd.cn_flags |= FOLLOW;
124 		bwillwrite();
125 		if ((error = namei(ndp)) != 0)
126 			return (error);
127 		vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
128 		ndp->ni_cnd.cn_flags &= ~MPSAFE;
129 		if (ndp->ni_vp == NULL) {
130 			VATTR_NULL(vap);
131 			vap->va_type = VREG;
132 			vap->va_mode = cmode;
133 			if (fmode & O_EXCL)
134 				vap->va_vaflags |= VA_EXCLUSIVE;
135 			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
136 				NDFREE(ndp, NDF_ONLY_PNBUF);
137 				vput(ndp->ni_dvp);
138 				VFS_UNLOCK_GIANT(vfslocked);
139 				if ((error = vn_start_write(NULL, &mp,
140 				    V_XSLEEP | PCATCH)) != 0)
141 					return (error);
142 				goto restart;
143 			}
144 #ifdef MAC
145 			error = mac_check_vnode_create(cred, ndp->ni_dvp,
146 			    &ndp->ni_cnd, vap);
147 			if (error == 0) {
148 #endif
149 				VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE);
150 				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
151 						   &ndp->ni_cnd, vap);
152 #ifdef MAC
153 			}
154 #endif
155 			vput(ndp->ni_dvp);
156 			vn_finished_write(mp);
157 			if (error) {
158 				VFS_UNLOCK_GIANT(vfslocked);
159 				NDFREE(ndp, NDF_ONLY_PNBUF);
160 				return (error);
161 			}
162 			ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "create");
163 			ASSERT_VOP_LOCKED(ndp->ni_vp, "create");
164 			fmode &= ~O_TRUNC;
165 			vp = ndp->ni_vp;
166 		} else {
167 			if (ndp->ni_dvp == ndp->ni_vp)
168 				vrele(ndp->ni_dvp);
169 			else
170 				vput(ndp->ni_dvp);
171 			ndp->ni_dvp = NULL;
172 			vp = ndp->ni_vp;
173 			if (fmode & O_EXCL) {
174 				error = EEXIST;
175 				goto bad;
176 			}
177 			fmode &= ~O_CREAT;
178 		}
179 	} else {
180 		ndp->ni_cnd.cn_nameiop = LOOKUP;
181 		ndp->ni_cnd.cn_flags = ISOPEN |
182 		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
183 		    LOCKSHARED | LOCKLEAF | MPSAFE;
184 		if ((error = namei(ndp)) != 0)
185 			return (error);
186 		ndp->ni_cnd.cn_flags &= ~MPSAFE;
187 		vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
188 		vp = ndp->ni_vp;
189 	}
190 	if (vp->v_type == VLNK) {
191 		error = EMLINK;
192 		goto bad;
193 	}
194 	if (vp->v_type == VSOCK) {
195 		error = EOPNOTSUPP;
196 		goto bad;
197 	}
198 	mode = 0;
199 	if (fmode & (FWRITE | O_TRUNC)) {
200 		if (vp->v_type == VDIR) {
201 			error = EISDIR;
202 			goto bad;
203 		}
204 		mode |= VWRITE;
205 	}
206 	if (fmode & FREAD)
207 		mode |= VREAD;
208 	if (fmode & O_APPEND)
209 		mode |= VAPPEND;
210 #ifdef MAC
211 	error = mac_check_vnode_open(cred, vp, mode);
212 	if (error)
213 		goto bad;
214 #endif
215 	if ((fmode & O_CREAT) == 0) {
216 		if (mode & VWRITE) {
217 			error = vn_writechk(vp);
218 			if (error)
219 				goto bad;
220 		}
221 		if (mode) {
222 			error = VOP_ACCESS(vp, mode, cred, td);
223 			if (error)
224 				goto bad;
225 		}
226 	}
227 	if ((error = VOP_OPEN(vp, fmode, cred, td, fdidx)) != 0)
228 		goto bad;
229 
230 	if (fmode & FWRITE)
231 		vp->v_writecount++;
232 	*flagp = fmode;
233 	ASSERT_VOP_LOCKED(vp, "vn_open_cred");
234 	if (fdidx == -1)
235 		VFS_UNLOCK_GIANT(vfslocked);
236 	return (0);
237 bad:
238 	NDFREE(ndp, NDF_ONLY_PNBUF);
239 	vput(vp);
240 	VFS_UNLOCK_GIANT(vfslocked);
241 	*flagp = fmode;
242 	ndp->ni_vp = NULL;
243 	return (error);
244 }
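
/*
 * Illustrative sketch (compiled out): a minimal in-kernel open/close
 * cycle via vn_open().  The NDINIT flags are an assumption for the
 * example; vn_open_cred() rewrites cn_nameiop and cn_flags itself, and
 * Giant handling for non-MPSAFE filesystems is omitted here.
 */
#if 0
static int
example_vn_open(const char *path, struct thread *td)
{
	struct nameidata nd;
	int flags, error;

	flags = FREAD;
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, td);
	if ((error = vn_open(&nd, &flags, 0, -1)) != 0)
		return (error);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	/* nd.ni_vp is returned locked; unlock before vn_close(). */
	VOP_UNLOCK(nd.ni_vp, 0, td);
	error = vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
	return (error);
}
#endif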
245 
246 /*
247  * Check for write permissions on the specified vnode.
248  * Prototype text segments cannot be written.
249  */
250 int
251 vn_writechk(vp)
252 	register struct vnode *vp;
253 {
254 
255 	ASSERT_VOP_LOCKED(vp, "vn_writechk");
256 	/*
257 	 * If there's shared text associated with
258 	 * the vnode (the file is being executed),
259 	 * we can't allow writing to it.
260 	 */
261 	if (vp->v_vflag & VV_TEXT)
262 		return (ETXTBSY);
263 
264 	return (0);
265 }
266 
267 /*
268  * Vnode close call
269  */
270 int
271 vn_close(vp, flags, file_cred, td)
272 	register struct vnode *vp;
273 	int flags;
274 	struct ucred *file_cred;
275 	struct thread *td;
276 {
277 	struct mount *mp;
278 	int error;
279 
280 	VFS_ASSERT_GIANT(vp->v_mount);
281 
282 	vn_start_write(vp, &mp, V_WAIT);
283 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
284 	if (flags & FWRITE)
285 		vp->v_writecount--;
286 	error = VOP_CLOSE(vp, flags, file_cred, td);
287 	vput(vp);
288 	vn_finished_write(mp);
289 	return (error);
290 }
291 
292 /*
293  * Sequential heuristic - detect sequential operation
294  */
295 static __inline
296 int
297 sequential_heuristic(struct uio *uio, struct file *fp)
298 {
299 
300 	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
301 	    uio->uio_offset == fp->f_nextoff) {
302 		/*
303 		 * XXX we assume that the filesystem block size is
304 		 * the default.  Not true, but still gives us a pretty
305 		 * good indicator of how sequential the read operations
306 		 * are.
307 		 */
308 		fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
309 		if (fp->f_seqcount > IO_SEQMAX)
310 			fp->f_seqcount = IO_SEQMAX;
311 		return (fp->f_seqcount << IO_SEQSHIFT);
312 	}
313 
314 	/*
315 	 * Not sequential, quick draw-down of seqcount
316 	 */
317 	if (fp->f_seqcount > 1)
318 		fp->f_seqcount = 1;
319 	else
320 		fp->f_seqcount = 0;
321 	return (0);
322 }
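
/*
 * Worked example, assuming BKVASIZE is 16384: a 65536-byte read that
 * starts at f_nextoff bumps f_seqcount by (65536 + 16383) / 16384 = 4,
 * so a run of such reads yields hints of 4, 8, 12, ... << IO_SEQSHIFT
 * until f_seqcount is capped at IO_SEQMAX.
 */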
323 
324 /*
325  * Package up an I/O request on a vnode into a uio and do it.
326  */
327 int
328 vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
329     aresid, td)
330 	enum uio_rw rw;
331 	struct vnode *vp;
332 	caddr_t base;
333 	int len;
334 	off_t offset;
335 	enum uio_seg segflg;
336 	int ioflg;
337 	struct ucred *active_cred;
338 	struct ucred *file_cred;
339 	int *aresid;
340 	struct thread *td;
341 {
342 	struct uio auio;
343 	struct iovec aiov;
344 	struct mount *mp;
345 	struct ucred *cred;
346 	int error;
347 
348 	VFS_ASSERT_GIANT(vp->v_mount);
349 
350 	if ((ioflg & IO_NODELOCKED) == 0) {
351 		mp = NULL;
352 		if (rw == UIO_WRITE) {
353 			if (vp->v_type != VCHR &&
354 			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
355 			    != 0)
356 				return (error);
357 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
358 		} else {
359 			/*
360 			 * XXX This should be LK_SHARED but I don't trust VFS
361 			 * enough to leave it like that until it has been
362 			 * reviewed further.
363 			 */
364 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
365 		}
366 
367 	}
368 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
369 	auio.uio_iov = &aiov;
370 	auio.uio_iovcnt = 1;
371 	aiov.iov_base = base;
372 	aiov.iov_len = len;
373 	auio.uio_resid = len;
374 	auio.uio_offset = offset;
375 	auio.uio_segflg = segflg;
376 	auio.uio_rw = rw;
377 	auio.uio_td = td;
378 	error = 0;
379 #ifdef MAC
380 	if ((ioflg & IO_NOMACCHECK) == 0) {
381 		if (rw == UIO_READ)
382 			error = mac_check_vnode_read(active_cred, file_cred,
383 			    vp);
384 		else
385 			error = mac_check_vnode_write(active_cred, file_cred,
386 			    vp);
387 	}
388 #endif
389 	if (error == 0) {
390 		if (file_cred)
391 			cred = file_cred;
392 		else
393 			cred = active_cred;
394 		if (rw == UIO_READ)
395 			error = VOP_READ(vp, &auio, ioflg, cred);
396 		else
397 			error = VOP_WRITE(vp, &auio, ioflg, cred);
398 	}
399 	if (aresid)
400 		*aresid = auio.uio_resid;
401 	else
402 		if (auio.uio_resid && error == 0)
403 			error = EIO;
404 	if ((ioflg & IO_NODELOCKED) == 0) {
405 		if (rw == UIO_WRITE)
406 			vn_finished_write(mp);
407 		VOP_UNLOCK(vp, 0, td);
408 	}
409 	return (error);
410 }
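
/*
 * Illustrative sketch (compiled out): a typical kernel-space read via
 * vn_rdwr().  The buffer size and the use of td->td_ucred are
 * assumptions for the example; with ioflg 0 the vnode lock and any
 * vn_start_write() bracketing are handled internally.
 */
#if 0
static int
example_vn_rdwr(struct vnode *vp, struct thread *td)
{
	char buf[512];
	int resid, error;

	error = vn_rdwr(UIO_READ, vp, (caddr_t)buf, sizeof(buf), (off_t)0,
	    UIO_SYSSPACE, 0, td->td_ucred, NOCRED, &resid, td);
	/* on success, sizeof(buf) - resid bytes were actually read */
	return (error);
}
#endif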
411 
412 /*
413  * Package up an I/O request on a vnode into a uio and do it.  The I/O
414  * request is split up into smaller chunks and we try to avoid saturating
415  * the buffer cache while potentially holding a vnode locked, so we
416  * call bwillwrite() before calling vn_rdwr().  We also call uio_yield()
417  * to give other processes a chance to lock the vnode (either other processes
418  * core'ing the same binary, or unrelated processes scanning the directory).
419  */
420 int
421 vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
422     file_cred, aresid, td)
423 	enum uio_rw rw;
424 	struct vnode *vp;
425 	caddr_t base;
426 	size_t len;
427 	off_t offset;
428 	enum uio_seg segflg;
429 	int ioflg;
430 	struct ucred *active_cred;
431 	struct ucred *file_cred;
432 	size_t *aresid;
433 	struct thread *td;
434 {
435 	int error = 0;
436 	int iaresid;
437 
438 	VFS_ASSERT_GIANT(vp->v_mount);
439 
440 	do {
441 		int chunk;
442 
443 		/*
444 		 * Force `offset' to a multiple of MAXBSIZE except possibly
445 		 * for the first chunk, so that filesystems only need to
446 		 * write full blocks except possibly for the first and last
447 		 * chunks.
448 		 */
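		/*
		 * For example, with MAXBSIZE 65536 and offset 70000:
		 * 70000 % 65536 = 4464, so the first chunk is
		 * 65536 - 4464 = 61072 bytes and every later chunk
		 * starts on a MAXBSIZE boundary.
		 */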
449 		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
450 
451 		if (chunk > len)
452 			chunk = len;
453 		if (rw != UIO_READ && vp->v_type == VREG)
454 			bwillwrite();
455 		iaresid = 0;
456 		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
457 		    ioflg, active_cred, file_cred, &iaresid, td);
458 		len -= chunk;	/* aresid calc already includes length */
459 		if (error)
460 			break;
461 		offset += chunk;
462 		base += chunk;
463 		uio_yield();
464 	} while (len);
465 	if (aresid)
466 		*aresid = len + iaresid;
467 	return (error);
468 }
469 
470 /*
471  * File table vnode read routine.
472  */
473 static int
474 vn_read(fp, uio, active_cred, flags, td)
475 	struct file *fp;
476 	struct uio *uio;
477 	struct ucred *active_cred;
478 	struct thread *td;
479 	int flags;
480 {
481 	struct vnode *vp;
482 	int error, ioflag;
483 	int vfslocked;
484 
485 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
486 	    uio->uio_td, td));
487 	vp = fp->f_vnode;
488 	ioflag = 0;
489 	if (fp->f_flag & FNONBLOCK)
490 		ioflag |= IO_NDELAY;
491 	if (fp->f_flag & O_DIRECT)
492 		ioflag |= IO_DIRECT;
493 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
494 	VOP_LEASE(vp, td, fp->f_cred, LEASE_READ);
495 	/*
496 	 * According to McKusick the vn lock is protecting f_offset here.
497 	 * Once this field has its own lock, we can acquire it shared.
498 	 */
499 	if ((flags & FOF_OFFSET) == 0) {
500 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
501 		uio->uio_offset = fp->f_offset;
502 	} else
503 		vn_lock(vp, LK_SHARED | LK_RETRY, td);
504 
505 	ioflag |= sequential_heuristic(uio, fp);
506 
507 #ifdef MAC
508 	error = mac_check_vnode_read(active_cred, fp->f_cred, vp);
509 	if (error == 0)
510 #endif
511 		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
512 	if ((flags & FOF_OFFSET) == 0)
513 		fp->f_offset = uio->uio_offset;
514 	fp->f_nextoff = uio->uio_offset;
515 	VOP_UNLOCK(vp, 0, td);
516 	VFS_UNLOCK_GIANT(vfslocked);
517 	return (error);
518 }
519 
520 /*
521  * File table vnode write routine.
522  */
523 static int
524 vn_write(fp, uio, active_cred, flags, td)
525 	struct file *fp;
526 	struct uio *uio;
527 	struct ucred *active_cred;
528 	struct thread *td;
529 	int flags;
530 {
531 	struct vnode *vp;
532 	struct mount *mp;
533 	int error, ioflag;
534 	int vfslocked;
535 
536 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
537 	    uio->uio_td, td));
538 	vp = fp->f_vnode;
539 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
540 	if (vp->v_type == VREG)
541 		bwillwrite();
542 	ioflag = IO_UNIT;
543 	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
544 		ioflag |= IO_APPEND;
545 	if (fp->f_flag & FNONBLOCK)
546 		ioflag |= IO_NDELAY;
547 	if (fp->f_flag & O_DIRECT)
548 		ioflag |= IO_DIRECT;
549 	if ((fp->f_flag & O_FSYNC) ||
550 	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
551 		ioflag |= IO_SYNC;
552 	mp = NULL;
553 	if (vp->v_type != VCHR &&
554 	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
555 		goto unlock;
556 	VOP_LEASE(vp, td, fp->f_cred, LEASE_WRITE);
557 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
558 	if ((flags & FOF_OFFSET) == 0)
559 		uio->uio_offset = fp->f_offset;
560 	ioflag |= sequential_heuristic(uio, fp);
561 #ifdef MAC
562 	error = mac_check_vnode_write(active_cred, fp->f_cred, vp);
563 	if (error == 0)
564 #endif
565 		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
566 	if ((flags & FOF_OFFSET) == 0)
567 		fp->f_offset = uio->uio_offset;
568 	fp->f_nextoff = uio->uio_offset;
569 	VOP_UNLOCK(vp, 0, td);
570 	vn_finished_write(mp);
571 unlock:
572 	VFS_UNLOCK_GIANT(vfslocked);
573 	return (error);
574 }
575 
576 /*
577  * File table vnode stat routine.
578  */
579 static int
580 vn_statfile(fp, sb, active_cred, td)
581 	struct file *fp;
582 	struct stat *sb;
583 	struct ucred *active_cred;
584 	struct thread *td;
585 {
586 	struct vnode *vp = fp->f_vnode;
587 	int vfslocked;
588 	int error;
589 
590 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
591 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
592 	error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
593 	VOP_UNLOCK(vp, 0, td);
594 	VFS_UNLOCK_GIANT(vfslocked);
595 
596 	return (error);
597 }
598 
599 /*
600  * Stat a vnode; implementation for the stat syscall
601  */
602 int
603 vn_stat(vp, sb, active_cred, file_cred, td)
604 	struct vnode *vp;
605 	register struct stat *sb;
606 	struct ucred *active_cred;
607 	struct ucred *file_cred;
608 	struct thread *td;
609 {
610 	struct vattr vattr;
611 	register struct vattr *vap;
612 	int error;
613 	u_short mode;
614 
615 #ifdef MAC
616 	error = mac_check_vnode_stat(active_cred, file_cred, vp);
617 	if (error)
618 		return (error);
619 #endif
620 
621 	vap = &vattr;
622 	error = VOP_GETATTR(vp, vap, active_cred, td);
623 	if (error)
624 		return (error);
625 
626 	/*
627 	 * Zero the entire structure so the spare stat fields start clean.
628 	 */
629 	bzero(sb, sizeof *sb);
630 
631 	/*
632 	 * Copy from vattr table
633 	 */
634 	if (vap->va_fsid != VNOVAL)
635 		sb->st_dev = vap->va_fsid;
636 	else
637 		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
638 	sb->st_ino = vap->va_fileid;
639 	mode = vap->va_mode;
640 	switch (vap->va_type) {
641 	case VREG:
642 		mode |= S_IFREG;
643 		break;
644 	case VDIR:
645 		mode |= S_IFDIR;
646 		break;
647 	case VBLK:
648 		mode |= S_IFBLK;
649 		break;
650 	case VCHR:
651 		mode |= S_IFCHR;
652 		break;
653 	case VLNK:
654 		mode |= S_IFLNK;
655 		/* This is a cosmetic change, symlinks do not have a mode. */
656 		if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
657 			mode &= ~ACCESSPERMS;	/* 0000 */
658 		else
659 			mode |= ACCESSPERMS;	/* 0777 */
660 		break;
661 	case VSOCK:
662 		mode |= S_IFSOCK;
663 		break;
664 	case VFIFO:
665 		mode |= S_IFIFO;
666 		break;
667 	default:
668 		return (EBADF);
669 	}
670 	sb->st_mode = mode;
671 	sb->st_nlink = vap->va_nlink;
672 	sb->st_uid = vap->va_uid;
673 	sb->st_gid = vap->va_gid;
674 	sb->st_rdev = vap->va_rdev;
675 	if (vap->va_size > OFF_MAX)
676 		return (EOVERFLOW);
677 	sb->st_size = vap->va_size;
678 	sb->st_atimespec = vap->va_atime;
679 	sb->st_mtimespec = vap->va_mtime;
680 	sb->st_ctimespec = vap->va_ctime;
681 	sb->st_birthtimespec = vap->va_birthtime;
682 
683 	/*
684 	 * According to www.opengroup.org, the meaning of st_blksize is
685 	 *   "a filesystem-specific preferred I/O block size for this
686 	 *    object.  In some filesystem types, this may vary from file
687 	 *    to file"
688 	 * Default to PAGE_SIZE after much discussion.
689 	 * XXX: min(PAGE_SIZE, vp->v_bufobj.bo_bsize) may be more correct.
690 	 */
691 
692 	sb->st_blksize = PAGE_SIZE;
693 
694 	sb->st_flags = vap->va_flags;
695 	if (suser(td))
696 		sb->st_gen = 0;
697 	else
698 		sb->st_gen = vap->va_gen;
699 
700 #if (S_BLKSIZE == 512)
701 	/* Optimize this case */
702 	sb->st_blocks = vap->va_bytes >> 9;
703 #else
704 	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
705 #endif
706 	return (0);
707 }
708 
709 /*
710  * File table vnode ioctl routine.
711  */
712 static int
713 vn_ioctl(fp, com, data, active_cred, td)
714 	struct file *fp;
715 	u_long com;
716 	void *data;
717 	struct ucred *active_cred;
718 	struct thread *td;
719 {
720 	struct vnode *vp = fp->f_vnode;
721 	struct vattr vattr;
722 	int vfslocked;
723 	int error;
724 
725 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
726 	error = ENOTTY;
727 	switch (vp->v_type) {
728 	case VREG:
729 	case VDIR:
730 		if (com == FIONREAD) {
731 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
732 			error = VOP_GETATTR(vp, &vattr, active_cred, td);
733 			VOP_UNLOCK(vp, 0, td);
734 			if (!error)
735 				*(int *)data = vattr.va_size - fp->f_offset;
736 		}
737 		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
738 			error = 0;
739 		else
740 			error = VOP_IOCTL(vp, com, data, fp->f_flag,
741 			    active_cred, td);
742 		break;
743 
744 	default:
745 		break;
746 	}
747 	VFS_UNLOCK_GIANT(vfslocked);
748 	return (error);
749 }
750 
751 /*
752  * File table vnode poll routine.
753  */
754 static int
755 vn_poll(fp, events, active_cred, td)
756 	struct file *fp;
757 	int events;
758 	struct ucred *active_cred;
759 	struct thread *td;
760 {
761 	struct vnode *vp;
762 	int error;
763 
764 	mtx_lock(&Giant);
765 
766 	vp = fp->f_vnode;
767 #ifdef MAC
768 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
769 	error = mac_check_vnode_poll(active_cred, fp->f_cred, vp);
770 	VOP_UNLOCK(vp, 0, td);
771 	if (!error)
772 #endif
774 	error = VOP_POLL(vp, events, fp->f_cred, td);
775 	mtx_unlock(&Giant);
776 	return (error);
777 }
778 
779 /*
780  * Check that the vnode is still valid, and if so
781  * acquire requested lock.
782  */
783 int
784 #ifndef	DEBUG_LOCKS
785 vn_lock(vp, flags, td)
786 #else
787 debug_vn_lock(vp, flags, td, filename, line)
788 #endif
789 	struct vnode *vp;
790 	int flags;
791 	struct thread *td;
792 #ifdef	DEBUG_LOCKS
793 	const char *filename;
794 	int line;
795 #endif
796 {
797 	int error;
798 
799 	do {
800 		if ((flags & LK_INTERLOCK) == 0)
801 			VI_LOCK(vp);
802 		if ((flags & LK_NOWAIT || (flags & LK_TYPE_MASK) == 0) &&
803 		    vp->v_iflag & VI_DOOMED) {
804 			VI_UNLOCK(vp);
805 			return (ENOENT);
806 		}
807 		/*
808 		 * Just polling to check validity.
809 		 */
810 		if ((flags & LK_TYPE_MASK) == 0) {
811 			VI_UNLOCK(vp);
812 			return (0);
813 		}
814 #ifdef	DEBUG_LOCKS
815 		vp->filename = filename;
816 		vp->line = line;
817 #endif
818 		/*
819 		 * lockmgr drops interlock before it will return for
820 		 * any reason.  So force the code above to relock it.
821 		 */
822 		error = VOP_LOCK(vp, flags | LK_INTERLOCK, td);
823 		flags &= ~LK_INTERLOCK;
824 		/*
825 		 * Callers specify LK_RETRY if they wish to get dead vnodes.
826 		 * If RETRY is not set, we return ENOENT instead.
827 		 */
828 		if (error == 0 && vp->v_iflag & VI_DOOMED &&
829 		    (flags & LK_RETRY) == 0) {
830 			VOP_UNLOCK(vp, 0, td);
831 			error = ENOENT;
832 			break;
833 		}
834 	} while (flags & LK_RETRY && error != 0);
835 	return (error);
836 }
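
/*
 * Illustrative sketch (compiled out) of the two common idioms: with
 * LK_RETRY the lock is retried until acquired, even if the vnode has
 * been doomed; without it, ENOENT reports a vnode reclaimed while we
 * slept.
 */
#if 0
static int
example_vn_lock(struct vnode *vp, struct thread *td)
{
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);	/* always succeeds */
	VOP_UNLOCK(vp, 0, td);

	if ((error = vn_lock(vp, LK_EXCLUSIVE, td)) != 0)
		return (error);		/* e.g. ENOENT: vnode was doomed */
	VOP_UNLOCK(vp, 0, td);
	return (0);
}
#endif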
837 
838 /*
839  * File table vnode close routine.
840  */
841 static int
842 vn_closefile(fp, td)
843 	struct file *fp;
844 	struct thread *td;
845 {
846 	struct vnode *vp;
847 	struct flock lf;
848 	int vfslocked;
849 	int error;
850 
851 	vp = fp->f_vnode;
852 
853 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
854 	if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
855 		lf.l_whence = SEEK_SET;
856 		lf.l_start = 0;
857 		lf.l_len = 0;
858 		lf.l_type = F_UNLCK;
859 		(void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
860 	}
861 
862 	fp->f_ops = &badfileops;
863 
864 	error = vn_close(vp, fp->f_flag, fp->f_cred, td);
865 	VFS_UNLOCK_GIANT(vfslocked);
866 	return (error);
867 }
868 
869 /*
870  * Prepare to start a filesystem write operation. If the operation is
871  * permitted, then we bump the count of operations in progress and
872  * proceed. If a suspend request is in progress, we wait until the
873  * suspension is over, and then proceed.
874  */
875 int
876 vn_start_write(vp, mpp, flags)
877 	struct vnode *vp;
878 	struct mount **mpp;
879 	int flags;
880 {
881 	struct mount *mp;
882 	int error;
883 
884 	error = 0;
885 	/*
886 	 * If a vnode is provided, get and return the mount point to
887 	 * which it will write.
888 	 */
889 	if (vp != NULL) {
890 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
891 			*mpp = NULL;
892 			if (error != EOPNOTSUPP)
893 				return (error);
894 			return (0);
895 		}
896 	}
897 	if ((mp = *mpp) == NULL)
898 		return (0);
899 	MNT_ILOCK(mp);
900 	/*
901 	 * Check on status of suspension.
902 	 */
903 	while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
904 		if (flags & V_NOWAIT) {
905 			error = EWOULDBLOCK;
906 			goto unlock;
907 		}
908 		error = msleep(&mp->mnt_flag, MNT_MTX(mp),
909 		    (PUSER - 1) | (flags & PCATCH), "suspfs", 0);
910 		if (error)
911 			goto unlock;
912 	}
913 	if (flags & V_XSLEEP)
914 		goto unlock;
915 	mp->mnt_writeopcount++;
916 unlock:
917 	MNT_IUNLOCK(mp);
918 	return (error);
919 }
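
/*
 * Illustrative sketch (compiled out): the canonical bracket around a
 * write-side VOP, following the pattern used by vn_write() above;
 * VOP_LEASE and MAC checks are omitted, and the credential choice is
 * an assumption.
 */
#if 0
static int
example_write_bracket(struct vnode *vp, struct uio *uio, struct thread *td)
{
	struct mount *mp;
	int error;

	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	error = VOP_WRITE(vp, uio, IO_UNIT, td->td_ucred);
	VOP_UNLOCK(vp, 0, td);
	vn_finished_write(mp);
	return (error);
}
#endif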
920 
921 /*
922  * Secondary suspension. Used by operations such as vop_inactive
923  * routines that are needed by the higher level functions. These
924  * are allowed to proceed until all the higher level functions have
925  * completed (indicated by mnt_writeopcount dropping to zero). At that
926  * time, these operations are halted until the suspension is over.
927  */
928 int
929 vn_write_suspend_wait(vp, mp, flags)
930 	struct vnode *vp;
931 	struct mount *mp;
932 	int flags;
933 {
934 	int error;
935 
936 	if (vp != NULL) {
937 		if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
938 			if (error != EOPNOTSUPP)
939 				return (error);
940 			return (0);
941 		}
942 	}
943 	/*
944 	 * If we are not suspended or have not yet reached suspended
945 	 * mode, then let the operation proceed.
946 	 */
947 	if (mp == NULL)
948 		return (0);
949 	MNT_ILOCK(mp);
950 	if ((mp->mnt_kern_flag & MNTK_SUSPENDED) == 0) {
951 		MNT_IUNLOCK(mp);
952 		return (0);
953 	}
954 	if (flags & V_NOWAIT) {
955 		MNT_IUNLOCK(mp);
956 		return (EWOULDBLOCK);
957 	}
958 	/*
959 	 * Wait for the suspension to finish.
960 	 */
961 	return (msleep(&mp->mnt_flag, MNT_MTX(mp),
962 	    (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0));
963 }
964 
965 /*
966  * Filesystem write operation has completed. If we are suspending and this
967  * operation is the last one, notify the suspender that the suspension is
968  * now in effect.
969  */
970 void
971 vn_finished_write(mp)
972 	struct mount *mp;
973 {
974 	if (mp == NULL)
975 		return;
976 	MNT_ILOCK(mp);
977 	mp->mnt_writeopcount--;
978 	if (mp->mnt_writeopcount < 0)
979 		panic("vn_finished_write: neg cnt");
980 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
981 	    mp->mnt_writeopcount <= 0)
982 		wakeup(&mp->mnt_writeopcount);
983 	MNT_IUNLOCK(mp);
984 }
985 
986 /*
987  * Request a filesystem to suspend write operations.
988  */
989 int
990 vfs_write_suspend(mp)
991 	struct mount *mp;
992 {
993 	struct thread *td = curthread;
994 	int error;
995 
996 	error = 0;
997 	MNT_ILOCK(mp);
998 	if (mp->mnt_kern_flag & MNTK_SUSPEND)
999 		goto unlock;
1000 	mp->mnt_kern_flag |= MNTK_SUSPEND;
1001 	if (mp->mnt_writeopcount > 0)
1002 		(void) msleep(&mp->mnt_writeopcount,
1003 		    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
1004 	else
1005 		MNT_IUNLOCK(mp);
1006 	if ((error = VFS_SYNC(mp, MNT_WAIT, td)) != 0) {
1007 		vfs_write_resume(mp);
1008 		return (error);
1009 	}
1010 	MNT_ILOCK(mp);
1011 	mp->mnt_kern_flag |= MNTK_SUSPENDED;
1012 unlock:
1013 	MNT_IUNLOCK(mp);
1014 	return (error);
1015 }
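
/*
 * Illustrative sketch (compiled out): quiescing a filesystem around a
 * consistency-critical operation; the snapshot-style use is an
 * assumption here.
 */
#if 0
static int
example_suspend(struct mount *mp)
{
	int error;

	if ((error = vfs_write_suspend(mp)) != 0)
		return (error);
	/* writers are drained and the filesystem has been synced */
	vfs_write_resume(mp);
	return (0);
}
#endif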
1016 
1017 /*
1018  * Request a filesystem to resume write operations.
1019  */
1020 void
1021 vfs_write_resume(mp)
1022 	struct mount *mp;
1023 {
1024 
1025 	MNT_ILOCK(mp);
1026 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1027 		mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPENDED);
1028 		wakeup(&mp->mnt_writeopcount);
1029 		wakeup(&mp->mnt_flag);
1030 	}
1031 	MNT_IUNLOCK(mp);
1032 }
1033 
1034 /*
1035  * Implement kqueues for files by translating the filter to a vnode operation.
1036  */
1037 static int
1038 vn_kqfilter(struct file *fp, struct knote *kn)
1039 {
1040 	int error;
1041 
1042 	mtx_lock(&Giant);
1043 	error = VOP_KQFILTER(fp->f_vnode, kn);
1044 	mtx_unlock(&Giant);
1045 
1046 	return (error);
1047 }
1048 
1049 /*
1050  * Simplified in-kernel wrapper calls for extended attribute access.
1051  * All of these calls pass in a NULL credential, authorizing as "kernel" access.
1052  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
1053  */
1054 int
1055 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
1056     const char *attrname, int *buflen, char *buf, struct thread *td)
1057 {
1058 	struct uio	auio;
1059 	struct iovec	iov;
1060 	int	error;
1061 
1062 	iov.iov_len = *buflen;
1063 	iov.iov_base = buf;
1064 
1065 	auio.uio_iov = &iov;
1066 	auio.uio_iovcnt = 1;
1067 	auio.uio_rw = UIO_READ;
1068 	auio.uio_segflg = UIO_SYSSPACE;
1069 	auio.uio_td = td;
1070 	auio.uio_offset = 0;
1071 	auio.uio_resid = *buflen;
1072 
1073 	if ((ioflg & IO_NODELOCKED) == 0)
1074 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1075 
1076 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1077 
1078 	/* authorize attribute retrieval as kernel */
1079 	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
1080 	    td);
1081 
1082 	if ((ioflg & IO_NODELOCKED) == 0)
1083 		VOP_UNLOCK(vp, 0, td);
1084 
1085 	if (error == 0) {
1086 		*buflen = *buflen - auio.uio_resid;
1087 	}
1088 
1089 	return (error);
1090 }
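
/*
 * Illustrative sketch (compiled out): fetching a system-namespace
 * attribute into a stack buffer.  The attribute name is hypothetical
 * and EXTATTR_NAMESPACE_SYSTEM comes from <sys/extattr.h>; with ioflg
 * 0 the vnode is locked internally.
 */
#if 0
static int
example_extattr_get(struct vnode *vp, struct thread *td)
{
	char eabuf[128];
	int easize, error;

	easize = sizeof(eabuf);
	error = vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
	    "example", &easize, eabuf, td);
	/* on success, easize holds the number of bytes returned */
	return (error);
}
#endif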
1091 
1092 /*
1093  * XXX failure mode if partially written?
1094  */
1095 int
1096 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
1097     const char *attrname, int buflen, char *buf, struct thread *td)
1098 {
1099 	struct uio	auio;
1100 	struct iovec	iov;
1101 	struct mount	*mp;
1102 	int	error;
1103 
1104 	iov.iov_len = buflen;
1105 	iov.iov_base = buf;
1106 
1107 	auio.uio_iov = &iov;
1108 	auio.uio_iovcnt = 1;
1109 	auio.uio_rw = UIO_WRITE;
1110 	auio.uio_segflg = UIO_SYSSPACE;
1111 	auio.uio_td = td;
1112 	auio.uio_offset = 0;
1113 	auio.uio_resid = buflen;
1114 
1115 	if ((ioflg & IO_NODELOCKED) == 0) {
1116 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
1117 			return (error);
1118 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1119 	}
1120 
1121 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1122 
1123 	/* authorize attribute setting as kernel */
1124 	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
1125 
1126 	if ((ioflg & IO_NODELOCKED) == 0) {
1127 		vn_finished_write(mp);
1128 		VOP_UNLOCK(vp, 0, td);
1129 	}
1130 
1131 	return (error);
1132 }
1133 
1134 int
1135 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
1136     const char *attrname, struct thread *td)
1137 {
1138 	struct mount	*mp;
1139 	int	error;
1140 
1141 	if ((ioflg & IO_NODELOCKED) == 0) {
1142 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
1143 			return (error);
1144 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1145 	}
1146 
1147 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1148 
1149 	/* authorize attribute removal as kernel */
1150 	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
1151 	if (error == EOPNOTSUPP)
1152 		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
1153 		    NULL, td);
1154 
1155 	if ((ioflg & IO_NODELOCKED) == 0) {
1156 		vn_finished_write(mp);
1157 		VOP_UNLOCK(vp, 0, td);
1158 	}
1159 
1160 	return (error);
1161 }
1162