xref: /freebsd/sys/kern/vfs_vnops.c (revision 9162f64b58d01ec01481d60b6cdc06ffd8e8c7fc)
/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kdb.h>
#include <sys/stat.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/filio.h>
#include <sys/sx.h>
#include <sys/ttycom.h>
#include <sys/conf.h>
#include <sys/syslog.h>
#include <sys/unistd.h>

#include <security/mac/mac_framework.h>

static fo_rdwr_t	vn_read;
static fo_rdwr_t	vn_write;
static fo_truncate_t	vn_truncate;
static fo_ioctl_t	vn_ioctl;
static fo_poll_t	vn_poll;
static fo_kqfilter_t	vn_kqfilter;
static fo_stat_t	vn_statfile;
static fo_close_t	vn_closefile;

struct 	fileops vnops = {
	.fo_read = vn_read,
	.fo_write = vn_write,
	.fo_truncate = vn_truncate,
	.fo_ioctl = vn_ioctl,
	.fo_poll = vn_poll,
	.fo_kqfilter = vn_kqfilter,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};

int
vn_open(ndp, flagp, cmode, fp)
	struct nameidata *ndp;
	int *flagp, cmode;
	struct file *fp;
{
	struct thread *td = ndp->ni_cnd.cn_thread;

	return (vn_open_cred(ndp, flagp, cmode, td->td_ucred, fp));
}
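
/*
 * Typical in-kernel use of vn_open() (a sketch, not compiled code;
 * "path" and "td" are assumed to exist in the caller):
 *
 *	int flags, error;
 *	struct nameidata nd;
 *
 *	flags = FREAD;
 *	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, path, td);
 *	if ((error = vn_open(&nd, &flags, 0, NULL)) == 0) {
 *		NDFREE(&nd, NDF_ONLY_PNBUF);
 *		(nd.ni_vp is returned locked; unlock and
 *		vn_close() it when done)
 *	}
 */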

/*
 * Common code for vnode open operations.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * Note that this does NOT free nameidata for the successful case,
 * due to the NDINIT being done elsewhere.
 */
int
vn_open_cred(ndp, flagp, cmode, cred, fp)
	struct nameidata *ndp;
	int *flagp, cmode;
	struct ucred *cred;
	struct file *fp;
{
	struct vnode *vp;
	struct mount *mp;
	struct thread *td = ndp->ni_cnd.cn_thread;
	struct vattr vat;
	struct vattr *vap = &vat;
	int fmode, error;
	accmode_t accmode;
	int vfslocked, mpsafe;

	mpsafe = ndp->ni_cnd.cn_flags & MPSAFE;
restart:
	vfslocked = 0;
	fmode = *flagp;
	if (fmode & O_CREAT) {
		ndp->ni_cnd.cn_nameiop = CREATE;
		ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF |
		    MPSAFE | AUDITVNODE1;
		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
			ndp->ni_cnd.cn_flags |= FOLLOW;
		bwillwrite();
		if ((error = namei(ndp)) != 0)
			return (error);
		vfslocked = NDHASGIANT(ndp);
		if (!mpsafe)
			ndp->ni_cnd.cn_flags &= ~MPSAFE;
		if (ndp->ni_vp == NULL) {
			VATTR_NULL(vap);
			vap->va_type = VREG;
			vap->va_mode = cmode;
			if (fmode & O_EXCL)
				vap->va_vaflags |= VA_EXCLUSIVE;
			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
				NDFREE(ndp, NDF_ONLY_PNBUF);
				vput(ndp->ni_dvp);
				VFS_UNLOCK_GIANT(vfslocked);
				if ((error = vn_start_write(NULL, &mp,
				    V_XSLEEP | PCATCH)) != 0)
					return (error);
				goto restart;
			}
#ifdef MAC
			error = mac_vnode_check_create(cred, ndp->ni_dvp,
			    &ndp->ni_cnd, vap);
			if (error == 0) {
#endif
				VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE);
				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
						   &ndp->ni_cnd, vap);
#ifdef MAC
			}
#endif
			vput(ndp->ni_dvp);
			vn_finished_write(mp);
			if (error) {
				VFS_UNLOCK_GIANT(vfslocked);
				NDFREE(ndp, NDF_ONLY_PNBUF);
				return (error);
			}
			fmode &= ~O_TRUNC;
			vp = ndp->ni_vp;
		} else {
			if (ndp->ni_dvp == ndp->ni_vp)
				vrele(ndp->ni_dvp);
			else
				vput(ndp->ni_dvp);
			ndp->ni_dvp = NULL;
			vp = ndp->ni_vp;
			if (fmode & O_EXCL) {
				error = EEXIST;
				goto bad;
			}
			fmode &= ~O_CREAT;
		}
	} else {
		ndp->ni_cnd.cn_nameiop = LOOKUP;
		ndp->ni_cnd.cn_flags = ISOPEN |
		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
		    LOCKLEAF | MPSAFE | AUDITVNODE1;
		if ((error = namei(ndp)) != 0)
			return (error);
		if (!mpsafe)
			ndp->ni_cnd.cn_flags &= ~MPSAFE;
		vfslocked = NDHASGIANT(ndp);
		vp = ndp->ni_vp;
	}
	if (vp->v_type == VLNK) {
		error = EMLINK;
		goto bad;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	accmode = 0;
	if (fmode & (FWRITE | O_TRUNC)) {
		if (vp->v_type == VDIR) {
			error = EISDIR;
			goto bad;
		}
		accmode |= VWRITE;
	}
	if (fmode & FREAD)
		accmode |= VREAD;
	if (fmode & FEXEC)
		accmode |= VEXEC;
	if (fmode & O_APPEND)
		accmode |= VAPPEND;
#ifdef MAC
	error = mac_vnode_check_open(cred, vp, accmode);
	if (error)
		goto bad;
#endif
	if ((fmode & O_CREAT) == 0) {
		if (accmode & VWRITE) {
			error = vn_writechk(vp);
			if (error)
				goto bad;
		}
		if (accmode) {
			error = VOP_ACCESS(vp, accmode, cred, td);
			if (error)
				goto bad;
		}
	}
	if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
		goto bad;

	if (fmode & FWRITE)
		vp->v_writecount++;
	*flagp = fmode;
	ASSERT_VOP_ELOCKED(vp, "vn_open_cred");
	if (!mpsafe)
		VFS_UNLOCK_GIANT(vfslocked);
	return (0);
bad:
	NDFREE(ndp, NDF_ONLY_PNBUF);
	vput(vp);
	VFS_UNLOCK_GIANT(vfslocked);
	*flagp = fmode;
	ndp->ni_vp = NULL;
	return (error);
}

/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 */
int
vn_writechk(vp)
	register struct vnode *vp;
{

	ASSERT_VOP_LOCKED(vp, "vn_writechk");
	/*
	 * If there's shared text associated with
	 * the vnode, try to free it up once.  If
	 * we fail, we can't allow writing.
	 */
	if (vp->v_vflag & VV_TEXT)
		return (ETXTBSY);

	return (0);
}

/*
 * Vnode close call
 */
int
vn_close(vp, flags, file_cred, td)
	register struct vnode *vp;
	int flags;
	struct ucred *file_cred;
	struct thread *td;
{
	struct mount *mp;
	int error;

	VFS_ASSERT_GIANT(vp->v_mount);

	vn_start_write(vp, &mp, V_WAIT);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (flags & FWRITE) {
		VNASSERT(vp->v_writecount > 0, vp,
		    ("vn_close: negative writecount"));
		vp->v_writecount--;
	}
	error = VOP_CLOSE(vp, flags, file_cred, td);
	vput(vp);
	vn_finished_write(mp);
	return (error);
}

/*
 * Heuristic to detect sequential operation.
 */
static int
sequential_heuristic(struct uio *uio, struct file *fp)
{

	/*
	 * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
	 * that the first I/O is normally considered to be slightly
	 * sequential.  Seeking to offset 0 doesn't change sequentiality
	 * unless previous seeks have reduced f_seqcount to 0, in which
	 * case offset 0 is not special.
	 */
	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
	    uio->uio_offset == fp->f_nextoff) {
		/*
		 * f_seqcount is in units of fixed-size blocks so that it
		 * depends mainly on the amount of sequential I/O and not
		 * much on the number of sequential I/O's.  The fixed size
		 * of 16384 is hard-coded here since it is (not quite) just
		 * a magic size that works well here.  This size is more
		 * closely related to the best I/O size for real disks than
		 * to any block size used by software.
		 */
		fp->f_seqcount += howmany(uio->uio_resid, 16384);
		if (fp->f_seqcount > IO_SEQMAX)
			fp->f_seqcount = IO_SEQMAX;
		return (fp->f_seqcount << IO_SEQSHIFT);
	}

	/* Not sequential.  Quickly draw down sequentiality. */
	if (fp->f_seqcount > 1)
		fp->f_seqcount = 1;
	else
		fp->f_seqcount = 0;
	return (0);
}
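
/*
 * Example of the arithmetic above: a fully sequential 64kB read adds
 * howmany(65536, 16384) == 4 to f_seqcount, which is clamped at
 * IO_SEQMAX; the shifted return value carries the count in the
 * IO_SEQSHIFT read-ahead hint bits of the ioflag that is later passed
 * to VOP_READ() or VOP_WRITE().
 */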

/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
    aresid, td)
	enum uio_rw rw;
	struct vnode *vp;
	void *base;
	int len;
	off_t offset;
	enum uio_seg segflg;
	int ioflg;
	struct ucred *active_cred;
	struct ucred *file_cred;
	int *aresid;
	struct thread *td;
{
	struct uio auio;
	struct iovec aiov;
	struct mount *mp;
	struct ucred *cred;
	int error;

	VFS_ASSERT_GIANT(vp->v_mount);

	if ((ioflg & IO_NODELOCKED) == 0) {
		mp = NULL;
		if (rw == UIO_WRITE) {
			if (vp->v_type != VCHR &&
			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
			    != 0)
				return (error);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		} else {
			/*
			 * XXX This should be LK_SHARED but I don't trust VFS
			 * enough to leave it like that until it has been
			 * reviewed further.
			 */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		}
	}
	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_segflg = segflg;
	auio.uio_rw = rw;
	auio.uio_td = td;
	error = 0;
#ifdef MAC
	if ((ioflg & IO_NOMACCHECK) == 0) {
		if (rw == UIO_READ)
			error = mac_vnode_check_read(active_cred, file_cred,
			    vp);
		else
			error = mac_vnode_check_write(active_cred, file_cred,
			    vp);
	}
#endif
	if (error == 0) {
		if (file_cred)
			cred = file_cred;
		else
			cred = active_cred;
		if (rw == UIO_READ)
			error = VOP_READ(vp, &auio, ioflg, cred);
		else
			error = VOP_WRITE(vp, &auio, ioflg, cred);
	}
	if (aresid)
		*aresid = auio.uio_resid;
	else if (auio.uio_resid && error == 0)
		error = EIO;
	if ((ioflg & IO_NODELOCKED) == 0) {
		if (rw == UIO_WRITE && vp->v_type != VCHR)
			vn_finished_write(mp);
		VOP_UNLOCK(vp, 0);
	}
	return (error);
}
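
/*
 * A typical vn_rdwr() call, for a caller that already holds the vnode
 * lock ("buf" and "resid" are assumed caller locals):
 *
 *	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), 0, UIO_SYSSPACE,
 *	    IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td);
 *
 * On return, resid holds the number of bytes not transferred.
 */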

/*
 * Package up an I/O request on a vnode into a uio and do it.  The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we
 * check bwillwrite() before calling vn_rdwr().  We also call uio_yield()
 * to give other processes a chance to lock the vnode (either other processes
 * core'ing the same binary, or unrelated processes scanning the directory).
 */
int
vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
    file_cred, aresid, td)
	enum uio_rw rw;
	struct vnode *vp;
	void *base;
	size_t len;
	off_t offset;
	enum uio_seg segflg;
	int ioflg;
	struct ucred *active_cred;
	struct ucred *file_cred;
	size_t *aresid;
	struct thread *td;
{
	int error = 0;
	int iaresid;

	VFS_ASSERT_GIANT(vp->v_mount);

	do {
		int chunk;

		/*
		 * Force `offset' to a multiple of MAXBSIZE except possibly
		 * for the first chunk, so that filesystems only need to
		 * write full blocks except possibly for the first and last
		 * chunks.
		 */
		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;

		if (chunk > len)
			chunk = len;
		if (rw != UIO_READ && vp->v_type == VREG)
			bwillwrite();
		iaresid = 0;
		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
		    ioflg, active_cred, file_cred, &iaresid, td);
		len -= chunk;	/* aresid calc already includes length */
		if (error)
			break;
		offset += chunk;
		base = (char *)base + chunk;
		uio_yield();
	} while (len);
	if (aresid)
		*aresid = len + iaresid;
	return (error);
}
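
/*
 * A worked example of the chunking above, assuming MAXBSIZE is 65536:
 * writing 200000 bytes at offset 1000 issues chunks of 64536, 65536,
 * 65536 and 4392 bytes, so every chunk after the first starts on a
 * MAXBSIZE boundary.
 */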

/*
 * File table vnode read routine.
 */
static int
vn_read(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct vnode *vp;
	int error, ioflag;
	struct mtx *mtxp;
	int vfslocked;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	mtxp = NULL;
	vp = fp->f_vnode;
	ioflag = 0;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	VOP_LEASE(vp, td, fp->f_cred, LEASE_READ);
	/*
	 * According to McKusick the vn lock was protecting f_offset here.
	 * It is now protected by the FOFFSET_LOCKED flag.
	 */
	if ((flags & FOF_OFFSET) == 0) {
		mtxp = mtx_pool_find(mtxpool_sleep, fp);
		mtx_lock(mtxp);
		while (fp->f_vnread_flags & FOFFSET_LOCKED) {
			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
			msleep(&fp->f_vnread_flags, mtxp, PUSER - 1,
			    "vnread offlock", 0);
		}
		fp->f_vnread_flags |= FOFFSET_LOCKED;
		mtx_unlock(mtxp);
		vn_lock(vp, LK_SHARED | LK_RETRY);
		uio->uio_offset = fp->f_offset;
	} else
		vn_lock(vp, LK_SHARED | LK_RETRY);

	ioflag |= sequential_heuristic(uio, fp);

#ifdef MAC
	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
	if ((flags & FOF_OFFSET) == 0) {
		fp->f_offset = uio->uio_offset;
		mtx_lock(mtxp);
		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
			wakeup(&fp->f_vnread_flags);
		fp->f_vnread_flags = 0;
		mtx_unlock(mtxp);
	}
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * File table vnode write routine.
 */
static int
vn_write(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct vnode *vp;
	struct mount *mp;
	int error, ioflag;
	int vfslocked;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	vp = fp->f_vnode;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	if (vp->v_type == VREG)
		bwillwrite();
	ioflag = IO_UNIT;
	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	if ((fp->f_flag & O_FSYNC) ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	mp = NULL;
	if (vp->v_type != VCHR &&
	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
		goto unlock;
	VOP_LEASE(vp, td, fp->f_cred, LEASE_WRITE);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if ((flags & FOF_OFFSET) == 0)
		uio->uio_offset = fp->f_offset;
	ioflag |= sequential_heuristic(uio, fp);
#ifdef MAC
	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
	if ((flags & FOF_OFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0);
	if (vp->v_type != VCHR)
		vn_finished_write(mp);
unlock:
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * File table truncate routine.
 */
static int
vn_truncate(fp, length, active_cred, td)
	struct file *fp;
	off_t length;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vattr vattr;
	struct mount *mp;
	struct vnode *vp;
	int vfslocked;
	int error;

	vp = fp->f_vnode;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
	if (error) {
		VFS_UNLOCK_GIANT(vfslocked);
		return (error);
	}
	VOP_LEASE(vp, td, active_cred, LEASE_WRITE);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (vp->v_type == VDIR) {
		error = EISDIR;
		goto out;
	}
#ifdef MAC
	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
	if (error)
		goto out;
#endif
	error = vn_writechk(vp);
	if (error == 0) {
		VATTR_NULL(&vattr);
		vattr.va_size = length;
		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
	}
out:
	VOP_UNLOCK(vp, 0);
	vn_finished_write(mp);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * File table vnode stat routine.
 */
static int
vn_statfile(fp, sb, active_cred, td)
	struct file *fp;
	struct stat *sb;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp = fp->f_vnode;
	int vfslocked;
	int error;

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	vn_lock(vp, LK_SHARED | LK_RETRY);
	error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
	VOP_UNLOCK(vp, 0);
	VFS_UNLOCK_GIANT(vfslocked);

	return (error);
}

/*
 * Stat a vnode; implementation for the stat syscall
 */
int
vn_stat(vp, sb, active_cred, file_cred, td)
	struct vnode *vp;
	register struct stat *sb;
	struct ucred *active_cred;
	struct ucred *file_cred;
	struct thread *td;
{
	struct vattr vattr;
	register struct vattr *vap;
	int error;
	u_short mode;

#ifdef MAC
	error = mac_vnode_check_stat(active_cred, file_cred, vp);
	if (error)
		return (error);
#endif

	vap = &vattr;

	/*
	 * Initialize defaults for new and unusual fields, so that file
	 * systems which don't support these fields don't need to know
	 * about them.
	 */
	vap->va_birthtime.tv_sec = -1;
	vap->va_birthtime.tv_nsec = 0;
	vap->va_fsid = VNOVAL;
	vap->va_rdev = NODEV;

	error = VOP_GETATTR(vp, vap, active_cred);
	if (error)
		return (error);

	/*
	 * Zero the spare stat fields
	 */
	bzero(sb, sizeof *sb);

	/*
	 * Copy from vattr table
	 */
	if (vap->va_fsid != VNOVAL)
		sb->st_dev = vap->va_fsid;
	else
		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
	sb->st_ino = vap->va_fileid;
	mode = vap->va_mode;
	switch (vap->va_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		return (EBADF);
	}
	sb->st_mode = mode;
	sb->st_nlink = vap->va_nlink;
	sb->st_uid = vap->va_uid;
	sb->st_gid = vap->va_gid;
	sb->st_rdev = vap->va_rdev;
	if (vap->va_size > OFF_MAX)
		return (EOVERFLOW);
	sb->st_size = vap->va_size;
	sb->st_atimespec = vap->va_atime;
	sb->st_mtimespec = vap->va_mtime;
	sb->st_ctimespec = vap->va_ctime;
	sb->st_birthtimespec = vap->va_birthtime;

	/*
	 * According to www.opengroup.org, the meaning of st_blksize is
	 *   "a filesystem-specific preferred I/O block size for this
	 *    object.  In some filesystem types, this may vary from file
	 *    to file"
	 * Default to PAGE_SIZE after much discussion.
	 * XXX: min(PAGE_SIZE, vp->v_bufobj.bo_bsize) may be more correct.
	 */

	sb->st_blksize = PAGE_SIZE;

	sb->st_flags = vap->va_flags;
	if (priv_check(td, PRIV_VFS_GENERATION))
		sb->st_gen = 0;
	else
		sb->st_gen = vap->va_gen;

	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
	return (0);
}

/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(fp, com, data, active_cred, td)
	struct file *fp;
	u_long com;
	void *data;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp = fp->f_vnode;
	struct vattr vattr;
	int vfslocked;
	int error;

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	error = ENOTTY;
	switch (vp->v_type) {
	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_GETATTR(vp, &vattr, active_cred);
			VOP_UNLOCK(vp, 0);
			if (!error)
				*(int *)data = vattr.va_size - fp->f_offset;
		} else if (com == FIONBIO || com == FIOASYNC)	/* XXX */
			error = 0;
		else
			error = VOP_IOCTL(vp, com, data, fp->f_flag,
			    active_cred, td);
		break;

	default:
		break;
	}
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * File table vnode poll routine.
 */
static int
vn_poll(fp, events, active_cred, td)
	struct file *fp;
	int events;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp;
	int vfslocked;
	int error;

	vp = fp->f_vnode;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
#ifdef MAC
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
	VOP_UNLOCK(vp, 0);
	if (!error)
#endif
	error = VOP_POLL(vp, events, fp->f_cred, td);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * Acquire the requested lock and then check for validity.  LK_RETRY
 * permits vn_lock to return doomed vnodes.
 */
int
_vn_lock(struct vnode *vp, int flags, char *file, int line)
{
	int error;

	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
	    ("vn_lock called with no locktype."));
	do {
#ifdef DEBUG_VFS_LOCKS
		KASSERT(vp->v_holdcnt != 0,
		    ("vn_lock %p: zero hold count", vp));
#endif
		error = VOP_LOCK1(vp, flags, file, line);
		flags &= ~LK_INTERLOCK;	/* Interlock is always dropped. */
		KASSERT((flags & LK_RETRY) == 0 || error == 0,
		    ("LK_RETRY set with incompatible flags (0x%x) or an error occurred (%d)",
		    flags, error));
		/*
		 * Callers specify LK_RETRY if they wish to get dead vnodes.
		 * If RETRY is not set, we return ENOENT instead.
		 */
		if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0 &&
		    (flags & LK_RETRY) == 0) {
			VOP_UNLOCK(vp, 0);
			error = ENOENT;
			break;
		}
	} while ((flags & LK_RETRY) && error != 0);
	return (error);
}
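
/*
 * Usage note: without LK_RETRY a caller must be prepared for vn_lock()
 * to fail on a doomed (reclaimed) vnode, e.g.
 *
 *	if (vn_lock(vp, LK_SHARED) != 0)
 *		return (ENOENT);
 *
 * whereas with LK_RETRY the call always returns 0 with the (possibly
 * doomed) vnode locked.
 */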

/*
 * File table vnode close routine.
 */
static int
vn_closefile(fp, td)
	struct file *fp;
	struct thread *td;
{
	struct vnode *vp;
	struct flock lf;
	int vfslocked;
	int error;

	vp = fp->f_vnode;

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
	}

	fp->f_ops = &badfileops;

	error = vn_close(vp, fp->f_flag, fp->f_cred, td);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * Prepare to start a filesystem write operation. If the operation is
 * permitted, then we bump the count of operations in progress and
 * proceed. If a suspend request is in progress, we wait until the
 * suspension is over, and then proceed.
 */
int
vn_start_write(vp, mpp, flags)
	struct vnode *vp;
	struct mount **mpp;
	int flags;
{
	struct mount *mp;
	int error;

	error = 0;
	/*
	 * If a vnode is provided, get and return the mount point to
	 * which it will write.
	 */
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
			*mpp = NULL;
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	if ((mp = *mpp) == NULL)
		return (0);

	/*
	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
	 * a vfs_ref().
	 * If a vnode was not provided, we must acquire a refcount on the
	 * provided mountpoint ourselves, in order to emulate a vfs_ref().
	 */
	MNT_ILOCK(mp);
	if (vp == NULL)
		MNT_REF(mp);

	/*
	 * Check on status of suspension.
	 */
	if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
	    mp->mnt_susp_owner != curthread) {
		while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
			if (flags & V_NOWAIT) {
				error = EWOULDBLOCK;
				goto unlock;
			}
			error = msleep(&mp->mnt_flag, MNT_MTX(mp),
			    (PUSER - 1) | (flags & PCATCH), "suspfs", 0);
			if (error)
				goto unlock;
		}
	}
	if (flags & V_XSLEEP)
		goto unlock;
	mp->mnt_writeopcount++;
unlock:
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
	return (error);
}
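
/*
 * The usual pairing with vn_finished_write() looks like this (a
 * sketch, not compiled code):
 *
 *	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	(... modify the vnode ...)
 *	VOP_UNLOCK(vp, 0);
 *	vn_finished_write(mp);
 */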

/*
 * Secondary suspension. Used by operations such as vop_inactive
 * routines that are needed by the higher level functions. These
 * are allowed to proceed until all the higher level functions have
 * completed (indicated by mnt_writeopcount dropping to zero). At that
 * time, these operations are halted until the suspension is over.
 */
int
vn_start_secondary_write(vp, mpp, flags)
	struct vnode *vp;
	struct mount **mpp;
	int flags;
{
	struct mount *mp;
	int error;

 retry:
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
			*mpp = NULL;
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	/*
	 * If we are not suspended or have not yet reached suspended
	 * mode, then let the operation proceed.
	 */
	if ((mp = *mpp) == NULL)
		return (0);

	/*
	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
	 * a vfs_ref().
	 * If a vnode was not provided, we must acquire a refcount on the
	 * provided mountpoint ourselves, in order to emulate a vfs_ref().
	 */
	MNT_ILOCK(mp);
	if (vp == NULL)
		MNT_REF(mp);
	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
		mp->mnt_secondary_writes++;
		mp->mnt_secondary_accwrites++;
		MNT_REL(mp);
		MNT_IUNLOCK(mp);
		return (0);
	}
	if (flags & V_NOWAIT) {
		MNT_REL(mp);
		MNT_IUNLOCK(mp);
		return (EWOULDBLOCK);
	}
	/*
	 * Wait for the suspension to finish.
	 */
	error = msleep(&mp->mnt_flag, MNT_MTX(mp),
		       (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
	vfs_rel(mp);
	if (error == 0)
		goto retry;
	return (error);
}

/*
 * Filesystem write operation has completed. If we are suspending and this
 * operation is the last one, notify the suspender that the suspension is
 * now in effect.
 */
void
vn_finished_write(mp)
	struct mount *mp;
{
	if (mp == NULL)
		return;
	MNT_ILOCK(mp);
	mp->mnt_writeopcount--;
	if (mp->mnt_writeopcount < 0)
		panic("vn_finished_write: neg cnt");
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
	    mp->mnt_writeopcount <= 0)
		wakeup(&mp->mnt_writeopcount);
	MNT_IUNLOCK(mp);
}

/*
 * Filesystem secondary write operation has completed. If we are
 * suspending and this operation is the last one, notify the suspender
 * that the suspension is now in effect.
 */
void
vn_finished_secondary_write(mp)
	struct mount *mp;
{
	if (mp == NULL)
		return;
	MNT_ILOCK(mp);
	mp->mnt_secondary_writes--;
	if (mp->mnt_secondary_writes < 0)
		panic("vn_finished_secondary_write: neg cnt");
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
	    mp->mnt_secondary_writes <= 0)
		wakeup(&mp->mnt_secondary_writes);
	MNT_IUNLOCK(mp);
}

/*
 * Request a filesystem to suspend write operations.
 */
int
vfs_write_suspend(mp)
	struct mount *mp;
{
	struct thread *td = curthread;
	int error;

	MNT_ILOCK(mp);
	if (mp->mnt_susp_owner == curthread) {
		MNT_IUNLOCK(mp);
		return (EALREADY);
	}
	while (mp->mnt_kern_flag & MNTK_SUSPEND)
		msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
	mp->mnt_kern_flag |= MNTK_SUSPEND;
	mp->mnt_susp_owner = curthread;
	if (mp->mnt_writeopcount > 0)
		(void) msleep(&mp->mnt_writeopcount,
		    MNT_MTX(mp), (PUSER - 1) | PDROP, "suspwt", 0);
	else
		MNT_IUNLOCK(mp);
	if ((error = VFS_SYNC(mp, MNT_SUSPEND, td)) != 0)
		vfs_write_resume(mp);
	return (error);
}

/*
 * Request a filesystem to resume write operations.
 */
void
vfs_write_resume(mp)
	struct mount *mp;
{

	MNT_ILOCK(mp);
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
		KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
		mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
				       MNTK_SUSPENDED);
		mp->mnt_susp_owner = NULL;
		wakeup(&mp->mnt_writeopcount);
		wakeup(&mp->mnt_flag);
		curthread->td_pflags &= ~TDP_IGNSUSP;
		MNT_IUNLOCK(mp);
		VFS_SUSP_CLEAN(mp);
	} else
		MNT_IUNLOCK(mp);
}

/*
 * Implement kqueues for files by translating them to vnode operations.
 */
static int
vn_kqfilter(struct file *fp, struct knote *kn)
{
	int vfslocked;
	int error;

	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
	error = VOP_KQFILTER(fp->f_vnode, kn);
	VFS_UNLOCK_GIANT(vfslocked);

	return (error);
}

/*
 * Simplified in-kernel wrapper calls for extended attribute access.
 * Both calls pass in a NULL credential, authorizing as "kernel" access.
 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 */
int
vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, int *buflen, char *buf, struct thread *td)
{
	struct uio	auio;
	struct iovec	iov;
	int	error;

	iov.iov_len = *buflen;
	iov.iov_base = buf;

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_offset = 0;
	auio.uio_resid = *buflen;

	if ((ioflg & IO_NODELOCKED) == 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

	/* authorize attribute retrieval as kernel */
	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
	    td);

	if ((ioflg & IO_NODELOCKED) == 0)
		VOP_UNLOCK(vp, 0);

	if (error == 0)
		*buflen = *buflen - auio.uio_resid;

	return (error);
}
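
/*
 * Example call (a sketch; the attribute name is only illustrative):
 *
 *	int buflen = sizeof(buf);
 *
 *	error = vn_extattr_get(vp, IO_NODELOCKED,
 *	    EXTATTR_NAMESPACE_SYSTEM, "example", &buflen, buf, td);
 *
 * On success, buflen is updated to the number of bytes actually
 * copied out.
 */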

/*
 * XXX failure mode if partially written?
 */
int
vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, int buflen, char *buf, struct thread *td)
{
	struct uio	auio;
	struct iovec	iov;
	struct mount	*mp;
	int	error;

	iov.iov_len = buflen;
	iov.iov_base = buf;

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
			return (error);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

	/* authorize attribute setting as kernel */
	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_finished_write(mp);
		VOP_UNLOCK(vp, 0);
	}

	return (error);
}

int
vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, struct thread *td)
{
	struct mount	*mp;
	int	error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
			return (error);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

	/* authorize attribute removal as kernel */
	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
	if (error == EOPNOTSUPP)
		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
		    NULL, td);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_finished_write(mp);
		VOP_UNLOCK(vp, 0);
	}

	return (error);
}
1289