xref: /freebsd/sys/kern/vfs_vnops.c (revision a9148abd9da5db2f1c682fb17bed791845fc41c9)
/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kdb.h>
#include <sys/stat.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/filio.h>
#include <sys/sx.h>
#include <sys/ttycom.h>
#include <sys/conf.h>
#include <sys/syslog.h>
#include <sys/unistd.h>

#include <security/mac/mac_framework.h>

static fo_rdwr_t	vn_read;
static fo_rdwr_t	vn_write;
static fo_truncate_t	vn_truncate;
static fo_ioctl_t	vn_ioctl;
static fo_poll_t	vn_poll;
static fo_kqfilter_t	vn_kqfilter;
static fo_stat_t	vn_statfile;
static fo_close_t	vn_closefile;

struct fileops vnops = {
	.fo_read = vn_read,
	.fo_write = vn_write,
	.fo_truncate = vn_truncate,
	.fo_ioctl = vn_ioctl,
	.fo_poll = vn_poll,
	.fo_kqfilter = vn_kqfilter,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};
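
/*
 * Example (illustrative sketch, not part of this file's interface): the
 * handlers above are normally reached through the fo_*() wrapper functions
 * in sys/file.h, so a read on a vnode-backed descriptor ends up in
 * vn_read() roughly as:
 *
 *	error = fo_read(fp, &auio, td->td_ucred, 0, td);
 *
 * Here 'auio' and 'td' are assumed locals of the caller.
 */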

int
vn_open(ndp, flagp, cmode, fp)
	struct nameidata *ndp;
	int *flagp, cmode;
	struct file *fp;
{
	struct thread *td = ndp->ni_cnd.cn_thread;

	return (vn_open_cred(ndp, flagp, cmode, td->td_ucred, fp));
}
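
/*
 * Example (hypothetical caller sketch; path and mode are placeholders):
 * a kernel thread that wants to create and open a file for writing would
 * do roughly:
 *
 *	struct nameidata nd;
 *	int flags, error;
 *
 *	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE,
 *	    "/tmp/example", td);
 *	flags = FWRITE | O_CREAT;
 *	error = vn_open(&nd, &flags, 0600, NULL);
 *	if (error == 0) {
 *		NDFREE(&nd, NDF_ONLY_PNBUF);
 *		VOP_UNLOCK(nd.ni_vp, 0);
 *		... I/O via vn_rdwr() ...
 *		error = vn_close(nd.ni_vp, flags, td->td_ucred, td);
 *	}
 *
 * On success the vnode is returned locked (see the ASSERT_VOP_ELOCKED
 * below), so the caller must unlock it before blocking operations.
 */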

/*
 * Common code for vnode open operations.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * Note that this does NOT free nameidata for the successful case,
 * due to the NDINIT being done elsewhere.
 */
int
vn_open_cred(ndp, flagp, cmode, cred, fp)
	struct nameidata *ndp;
	int *flagp, cmode;
	struct ucred *cred;
	struct file *fp;
{
	struct vnode *vp;
	struct mount *mp;
	struct thread *td = ndp->ni_cnd.cn_thread;
	struct vattr vat;
	struct vattr *vap = &vat;
	int fmode, error;
	accmode_t accmode;
	int vfslocked, mpsafe;

	mpsafe = ndp->ni_cnd.cn_flags & MPSAFE;
restart:
	vfslocked = 0;
	fmode = *flagp;
	if (fmode & O_CREAT) {
		ndp->ni_cnd.cn_nameiop = CREATE;
		ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF |
		    MPSAFE | AUDITVNODE1;
		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
			ndp->ni_cnd.cn_flags |= FOLLOW;
		bwillwrite();
		if ((error = namei(ndp)) != 0)
			return (error);
		vfslocked = NDHASGIANT(ndp);
		if (!mpsafe)
			ndp->ni_cnd.cn_flags &= ~MPSAFE;
		if (ndp->ni_vp == NULL) {
			VATTR_NULL(vap);
			vap->va_type = VREG;
			vap->va_mode = cmode;
			if (fmode & O_EXCL)
				vap->va_vaflags |= VA_EXCLUSIVE;
			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
				NDFREE(ndp, NDF_ONLY_PNBUF);
				vput(ndp->ni_dvp);
				VFS_UNLOCK_GIANT(vfslocked);
				if ((error = vn_start_write(NULL, &mp,
				    V_XSLEEP | PCATCH)) != 0)
					return (error);
				goto restart;
			}
#ifdef MAC
			error = mac_vnode_check_create(cred, ndp->ni_dvp,
			    &ndp->ni_cnd, vap);
			if (error == 0) {
#endif
				VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE);
				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
						   &ndp->ni_cnd, vap);
#ifdef MAC
			}
#endif
			vput(ndp->ni_dvp);
			vn_finished_write(mp);
			if (error) {
				VFS_UNLOCK_GIANT(vfslocked);
				NDFREE(ndp, NDF_ONLY_PNBUF);
				return (error);
			}
			fmode &= ~O_TRUNC;
			vp = ndp->ni_vp;
		} else {
			if (ndp->ni_dvp == ndp->ni_vp)
				vrele(ndp->ni_dvp);
			else
				vput(ndp->ni_dvp);
			ndp->ni_dvp = NULL;
			vp = ndp->ni_vp;
			if (fmode & O_EXCL) {
				error = EEXIST;
				goto bad;
			}
			fmode &= ~O_CREAT;
		}
	} else {
		ndp->ni_cnd.cn_nameiop = LOOKUP;
		ndp->ni_cnd.cn_flags = ISOPEN |
		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
		    LOCKLEAF | MPSAFE | AUDITVNODE1;
		if ((error = namei(ndp)) != 0)
			return (error);
		if (!mpsafe)
			ndp->ni_cnd.cn_flags &= ~MPSAFE;
		vfslocked = NDHASGIANT(ndp);
		vp = ndp->ni_vp;
	}
	if (vp->v_type == VLNK) {
		error = EMLINK;
		goto bad;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	accmode = 0;
	if (fmode & (FWRITE | O_TRUNC)) {
		if (vp->v_type == VDIR) {
			error = EISDIR;
			goto bad;
		}
		accmode |= VWRITE;
	}
	if (fmode & FREAD)
		accmode |= VREAD;
	if (fmode & FEXEC)
		accmode |= VEXEC;
	if (fmode & O_APPEND)
		accmode |= VAPPEND;
#ifdef MAC
	error = mac_vnode_check_open(cred, vp, accmode);
	if (error)
		goto bad;
#endif
	if ((fmode & O_CREAT) == 0) {
		if (accmode & VWRITE) {
			error = vn_writechk(vp);
			if (error)
				goto bad;
		}
		if (accmode) {
			error = VOP_ACCESS(vp, accmode, cred, td);
			if (error)
				goto bad;
		}
	}
	if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
		goto bad;

	if (fmode & FWRITE)
		vp->v_writecount++;
	*flagp = fmode;
	ASSERT_VOP_ELOCKED(vp, "vn_open_cred");
	if (!mpsafe)
		VFS_UNLOCK_GIANT(vfslocked);
	return (0);
bad:
	NDFREE(ndp, NDF_ONLY_PNBUF);
	vput(vp);
	VFS_UNLOCK_GIANT(vfslocked);
	*flagp = fmode;
	ndp->ni_vp = NULL;
	return (error);
}

/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 */
int
vn_writechk(vp)
	register struct vnode *vp;
{

	ASSERT_VOP_LOCKED(vp, "vn_writechk");
	/*
	 * If there's shared text associated with
	 * the vnode, try to free it up once.  If
	 * we fail, we can't allow writing.
	 */
	if (vp->v_vflag & VV_TEXT)
		return (ETXTBSY);

	return (0);
}

/*
 * Vnode close call
 */
int
vn_close(vp, flags, file_cred, td)
	register struct vnode *vp;
	int flags;
	struct ucred *file_cred;
	struct thread *td;
{
	struct mount *mp;
	int error;

	VFS_ASSERT_GIANT(vp->v_mount);

	vn_start_write(vp, &mp, V_WAIT);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (flags & FWRITE) {
		VNASSERT(vp->v_writecount > 0, vp,
		    ("vn_close: negative writecount"));
		vp->v_writecount--;
	}
	error = VOP_CLOSE(vp, flags, file_cred, td);
	vput(vp);
	vn_finished_write(mp);
	return (error);
}

/*
 * Heuristic to detect sequential operation.
 */
static int
sequential_heuristic(struct uio *uio, struct file *fp)
{

	/*
	 * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
	 * that the first I/O is normally considered to be slightly
	 * sequential.  Seeking to offset 0 doesn't change sequentiality
	 * unless previous seeks have reduced f_seqcount to 0, in which
	 * case offset 0 is not special.
	 */
	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
	    uio->uio_offset == fp->f_nextoff) {
		/*
		 * f_seqcount is in units of fixed-size blocks so that it
		 * depends mainly on the amount of sequential I/O and not
		 * much on the number of sequential I/O's.  The fixed size
		 * of 16384 is hard-coded here since it is (not quite) just
		 * a magic size that works well here.  This size is more
		 * closely related to the best I/O size for real disks than
		 * to any block size used by software.
		 */
		fp->f_seqcount += howmany(uio->uio_resid, 16384);
		if (fp->f_seqcount > IO_SEQMAX)
			fp->f_seqcount = IO_SEQMAX;
		return (fp->f_seqcount << IO_SEQSHIFT);
	}

	/* Not sequential.  Quickly draw down sequentiality. */
	if (fp->f_seqcount > 1)
		fp->f_seqcount = 1;
	else
		fp->f_seqcount = 0;
	return (0);
}
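
/*
 * Worked example (illustrative, assuming the stock IO_SEQMAX/IO_SEQSHIFT
 * definitions from sys/vnode.h): a sequential 64 KB read adds
 * howmany(65536, 16384) = 4 to f_seqcount.  The clamped count is shifted
 * into the IO_SEQ* bits of ioflag by the return statement above, and
 * filesystems derive their read-ahead window from those bits.
 */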

/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
    aresid, td)
	enum uio_rw rw;
	struct vnode *vp;
	void *base;
	int len;
	off_t offset;
	enum uio_seg segflg;
	int ioflg;
	struct ucred *active_cred;
	struct ucred *file_cred;
	int *aresid;
	struct thread *td;
{
	struct uio auio;
	struct iovec aiov;
	struct mount *mp;
	struct ucred *cred;
	int error;

	VFS_ASSERT_GIANT(vp->v_mount);

	if ((ioflg & IO_NODELOCKED) == 0) {
		mp = NULL;
		if (rw == UIO_WRITE) {
			if (vp->v_type != VCHR &&
			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
			    != 0)
				return (error);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		} else {
			/*
			 * XXX This should be LK_SHARED but I don't trust VFS
			 * enough to leave it like that until it has been
			 * reviewed further.
			 */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		}
	}
	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_segflg = segflg;
	auio.uio_rw = rw;
	auio.uio_td = td;
	error = 0;
#ifdef MAC
	if ((ioflg & IO_NOMACCHECK) == 0) {
		if (rw == UIO_READ)
			error = mac_vnode_check_read(active_cred, file_cred,
			    vp);
		else
			error = mac_vnode_check_write(active_cred, file_cred,
			    vp);
	}
#endif
	if (error == 0) {
		if (file_cred)
			cred = file_cred;
		else
			cred = active_cred;
		if (rw == UIO_READ)
			error = VOP_READ(vp, &auio, ioflg, cred);
		else
			error = VOP_WRITE(vp, &auio, ioflg, cred);
	}
	if (aresid)
		*aresid = auio.uio_resid;
	else
		if (auio.uio_resid && error == 0)
			error = EIO;
	if ((ioflg & IO_NODELOCKED) == 0) {
		if (rw == UIO_WRITE && vp->v_type != VCHR)
			vn_finished_write(mp);
		VOP_UNLOCK(vp, 0);
	}
	return (error);
}
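
/*
 * Example (hypothetical): read the first 512 bytes of an already-resolved,
 * unlocked vnode into a kernel buffer, letting vn_rdwr() do the locking:
 *
 *	char buf[512];
 *	int resid, error;
 *
 *	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), (off_t)0,
 *	    UIO_SYSSPACE, 0, td->td_ucred, NOCRED, &resid, td);
 *
 * On success, sizeof(buf) - resid bytes were actually transferred.
 */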

/*
 * Package up an I/O request on a vnode into a uio and do it.  The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we
 * call bwillwrite() before each chunk of vn_rdwr() work.  We also call
 * uio_yield() to give other processes a chance to lock the vnode (either
 * other processes core'ing the same binary, or unrelated processes
 * scanning the directory).
 */
int
vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
    file_cred, aresid, td)
	enum uio_rw rw;
	struct vnode *vp;
	void *base;
	size_t len;
	off_t offset;
	enum uio_seg segflg;
	int ioflg;
	struct ucred *active_cred;
	struct ucred *file_cred;
	size_t *aresid;
	struct thread *td;
{
	int error = 0;
	int iaresid;

	VFS_ASSERT_GIANT(vp->v_mount);

	do {
		int chunk;

		/*
		 * Force `offset' to a multiple of MAXBSIZE except possibly
		 * for the first chunk, so that filesystems only need to
		 * write full blocks except possibly for the first and last
		 * chunks.
		 */
		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;

		if (chunk > len)
			chunk = len;
		if (rw != UIO_READ && vp->v_type == VREG)
			bwillwrite();
		iaresid = 0;
		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
		    ioflg, active_cred, file_cred, &iaresid, td);
		len -= chunk;	/* aresid calc already includes length */
		if (error)
			break;
		offset += chunk;
		base = (char *)base + chunk;
		uio_yield();
	} while (len);
	if (aresid)
		*aresid = len + iaresid;
	return (error);
}
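
/*
 * Worked example (illustrative, assuming the stock 64 KB MAXBSIZE): for
 * offset 1000 the first chunk is 65536 - 1000 % 65536 = 64536 bytes,
 * which brings the next offset to 65536; every subsequent chunk is then
 * a full, block-aligned 64 KB except possibly the last.
 */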

/*
 * File table vnode read routine.
 */
static int
vn_read(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct vnode *vp;
	int error, ioflag;
	struct mtx *mtxp;
	int vfslocked;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	mtxp = NULL;
	vp = fp->f_vnode;
	ioflag = 0;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	VOP_LEASE(vp, td, fp->f_cred, LEASE_READ);
	/*
	 * According to McKusick the vn lock was protecting f_offset here.
	 * It is now protected by the FOFFSET_LOCKED flag.
	 */
	if ((flags & FOF_OFFSET) == 0) {
		mtxp = mtx_pool_find(mtxpool_sleep, fp);
		mtx_lock(mtxp);
		while (fp->f_vnread_flags & FOFFSET_LOCKED) {
			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
			msleep(&fp->f_vnread_flags, mtxp, PUSER - 1,
			    "vnread offlock", 0);
		}
		fp->f_vnread_flags |= FOFFSET_LOCKED;
		mtx_unlock(mtxp);
		vn_lock(vp, LK_SHARED | LK_RETRY);
		uio->uio_offset = fp->f_offset;
	} else
		vn_lock(vp, LK_SHARED | LK_RETRY);

	ioflag |= sequential_heuristic(uio, fp);

#ifdef MAC
	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
	if ((flags & FOF_OFFSET) == 0) {
		fp->f_offset = uio->uio_offset;
		mtx_lock(mtxp);
		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
			wakeup(&fp->f_vnread_flags);
		fp->f_vnread_flags = 0;
		mtx_unlock(mtxp);
	}
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * File table vnode write routine.
 */
static int
vn_write(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct vnode *vp;
	struct mount *mp;
	int error, ioflag;
	int vfslocked;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	vp = fp->f_vnode;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	if (vp->v_type == VREG)
		bwillwrite();
	ioflag = IO_UNIT;
	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	if ((fp->f_flag & O_FSYNC) ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	mp = NULL;
	if (vp->v_type != VCHR &&
	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
		goto unlock;
	VOP_LEASE(vp, td, fp->f_cred, LEASE_WRITE);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if ((flags & FOF_OFFSET) == 0)
		uio->uio_offset = fp->f_offset;
	ioflag |= sequential_heuristic(uio, fp);
#ifdef MAC
	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
	if ((flags & FOF_OFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0);
	if (vp->v_type != VCHR)
		vn_finished_write(mp);
unlock:
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * File table truncate routine.
 */
static int
vn_truncate(fp, length, active_cred, td)
	struct file *fp;
	off_t length;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vattr vattr;
	struct mount *mp;
	struct vnode *vp;
	int vfslocked;
	int error;

	vp = fp->f_vnode;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
	if (error) {
		VFS_UNLOCK_GIANT(vfslocked);
		return (error);
	}
	VOP_LEASE(vp, td, active_cred, LEASE_WRITE);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (vp->v_type == VDIR) {
		error = EISDIR;
		goto out;
	}
#ifdef MAC
	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
	if (error)
		goto out;
#endif
	error = vn_writechk(vp);
	if (error == 0) {
		VATTR_NULL(&vattr);
		vattr.va_size = length;
		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
	}
out:
	VOP_UNLOCK(vp, 0);
	vn_finished_write(mp);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * File table vnode stat routine.
 */
static int
vn_statfile(fp, sb, active_cred, td)
	struct file *fp;
	struct stat *sb;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp = fp->f_vnode;
	int vfslocked;
	int error;

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
	VOP_UNLOCK(vp, 0);
	VFS_UNLOCK_GIANT(vfslocked);

	return (error);
}

/*
 * Stat a vnode; implementation for the stat syscall
 */
int
vn_stat(vp, sb, active_cred, file_cred, td)
	struct vnode *vp;
	register struct stat *sb;
	struct ucred *active_cred;
	struct ucred *file_cred;
	struct thread *td;
{
	struct vattr vattr;
	register struct vattr *vap;
	int error;
	u_short mode;

#ifdef MAC
	error = mac_vnode_check_stat(active_cred, file_cred, vp);
	if (error)
		return (error);
#endif

	vap = &vattr;

	/*
	 * Initialize defaults for new and unusual fields, so that file
	 * systems which don't support these fields don't need to know
	 * about them.
	 */
	vap->va_birthtime.tv_sec = -1;
	vap->va_birthtime.tv_nsec = 0;
	vap->va_fsid = VNOVAL;
	vap->va_rdev = NODEV;

	error = VOP_GETATTR(vp, vap, active_cred);
	if (error)
		return (error);

	/*
	 * Zero the spare stat fields
	 */
	bzero(sb, sizeof *sb);

	/*
	 * Copy from vattr table
	 */
	if (vap->va_fsid != VNOVAL)
		sb->st_dev = vap->va_fsid;
	else
		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
	sb->st_ino = vap->va_fileid;
	mode = vap->va_mode;
	switch (vap->va_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		return (EBADF);
	}
	sb->st_mode = mode;
	sb->st_nlink = vap->va_nlink;
	sb->st_uid = vap->va_uid;
	sb->st_gid = vap->va_gid;
	sb->st_rdev = vap->va_rdev;
	if (vap->va_size > OFF_MAX)
		return (EOVERFLOW);
	sb->st_size = vap->va_size;
	sb->st_atimespec = vap->va_atime;
	sb->st_mtimespec = vap->va_mtime;
	sb->st_ctimespec = vap->va_ctime;
	sb->st_birthtimespec = vap->va_birthtime;

	/*
	 * According to www.opengroup.org, the meaning of st_blksize is
	 *   "a filesystem-specific preferred I/O block size for this
	 *    object.  In some filesystem types, this may vary from file
	 *    to file"
	 * Default to PAGE_SIZE after much discussion.
	 * XXX: min(PAGE_SIZE, vp->v_bufobj.bo_bsize) may be more correct.
	 */

	sb->st_blksize = PAGE_SIZE;

	sb->st_flags = vap->va_flags;
	if (priv_check(td, PRIV_VFS_GENERATION))
		sb->st_gen = 0;
	else
		sb->st_gen = vap->va_gen;

	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
	return (0);
}
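
/*
 * Worked example (illustrative): for a regular file with va_mode 0644,
 * the switch above yields st_mode = S_IFREG | 0644, i.e. 0100644 octal,
 * which is what userland ultimately sees via stat(2).
 */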

/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(fp, com, data, active_cred, td)
	struct file *fp;
	u_long com;
	void *data;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp = fp->f_vnode;
	struct vattr vattr;
	int vfslocked;
	int error;

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	error = ENOTTY;
	switch (vp->v_type) {
	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_GETATTR(vp, &vattr, active_cred);
			VOP_UNLOCK(vp, 0);
			if (!error)
				*(int *)data = vattr.va_size - fp->f_offset;
		}
		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
			error = 0;
		else
			error = VOP_IOCTL(vp, com, data, fp->f_flag,
			    active_cred, td);
		break;

	default:
		break;
	}
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * File table vnode poll routine.
 */
static int
vn_poll(fp, events, active_cred, td)
	struct file *fp;
	int events;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp;
	int vfslocked;
	int error;

	vp = fp->f_vnode;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
#ifdef MAC
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
	VOP_UNLOCK(vp, 0);
	if (!error)
#endif
		error = VOP_POLL(vp, events, fp->f_cred, td);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * Acquire the requested lock and then check for validity.  LK_RETRY
 * permits vn_lock to return doomed vnodes.
 */
int
_vn_lock(struct vnode *vp, int flags, char *file, int line)
{
	int error;

	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
	    ("vn_lock called with no locktype."));
	do {
#ifdef DEBUG_VFS_LOCKS
		KASSERT(vp->v_holdcnt != 0,
		    ("vn_lock %p: zero hold count", vp));
#endif
		error = VOP_LOCK1(vp, flags, file, line);
		flags &= ~LK_INTERLOCK;	/* Interlock is always dropped. */
		KASSERT((flags & LK_RETRY) == 0 || error == 0,
		    ("LK_RETRY set with incompatible flags %d\n", flags));
		/*
		 * Callers specify LK_RETRY if they wish to get dead vnodes.
		 * If RETRY is not set, we return ENOENT instead.
		 */
		if (error == 0 && vp->v_iflag & VI_DOOMED &&
		    (flags & LK_RETRY) == 0) {
			VOP_UNLOCK(vp, 0);
			error = ENOENT;
			break;
		}
	} while (flags & LK_RETRY && error != 0);
	return (error);
}
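
/*
 * Example (sketch): callers use the vn_lock() macro, which supplies file
 * and line to _vn_lock().  Without LK_RETRY a doomed vnode is refused:
 *
 *	error = vn_lock(vp, LK_SHARED);
 *	if (error != 0)
 *		return (error);		(ENOENT: reclaimed under us)
 *
 * With LK_RETRY the lock always succeeds, and the caller must be prepared
 * to encounter VI_DOOMED vnodes.
 */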

/*
 * File table vnode close routine.
 */
static int
vn_closefile(fp, td)
	struct file *fp;
	struct thread *td;
{
	struct vnode *vp;
	struct flock lf;
	int vfslocked;
	int error;

	vp = fp->f_vnode;

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
	}

	fp->f_ops = &badfileops;

	error = vn_close(vp, fp->f_flag, fp->f_cred, td);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * Prepare to start a filesystem write operation. If the operation is
 * permitted, then we bump the count of operations in progress and
 * proceed. If a suspend request is in progress, we wait until the
 * suspension is over, and then proceed.
 */
int
vn_start_write(vp, mpp, flags)
	struct vnode *vp;
	struct mount **mpp;
	int flags;
{
	struct mount *mp;
	int error;

	error = 0;
	/*
	 * If a vnode is provided, get and return the mount point to
	 * which it will write.
	 */
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
			*mpp = NULL;
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	if ((mp = *mpp) == NULL)
		return (0);

	/*
	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
	 * a vfs_ref().
	 * When no vnode is provided we must acquire a refcount on the
	 * caller-supplied mount point ourselves, to emulate that
	 * vfs_ref().
	 */
	MNT_ILOCK(mp);
	if (vp == NULL)
		MNT_REF(mp);

	/*
	 * Check on status of suspension.
	 */
	if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
	    mp->mnt_susp_owner != curthread) {
		while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
			if (flags & V_NOWAIT) {
				error = EWOULDBLOCK;
				if (vp != NULL)
					*mpp = NULL;
				goto unlock;
			}
			error = msleep(&mp->mnt_flag, MNT_MTX(mp),
			    (PUSER - 1) | (flags & PCATCH), "suspfs", 0);
			if (error) {
				if (vp != NULL)
					*mpp = NULL;
				goto unlock;
			}
		}
	}
	if (flags & V_XSLEEP)
		goto unlock;
	mp->mnt_writeopcount++;
unlock:
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
	return (error);
}
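
/*
 * Example (canonical caller pattern, sketched): multi-step write paths
 * bracket their work with the suspension counter:
 *
 *	struct mount *mp;
 *
 *	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	... modify the file ...
 *	VOP_UNLOCK(vp, 0);
 *	vn_finished_write(mp);
 *
 * This is the same pattern vn_write() and vn_truncate() follow above.
 */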

/*
 * Secondary suspension. Used by operations such as vop_inactive
 * routines that are needed by the higher level functions. These
 * are allowed to proceed until all the higher level functions have
 * completed (indicated by mnt_writeopcount dropping to zero). At that
 * time, these operations are halted until the suspension is over.
 */
int
vn_start_secondary_write(vp, mpp, flags)
	struct vnode *vp;
	struct mount **mpp;
	int flags;
{
	struct mount *mp;
	int error;

 retry:
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
			*mpp = NULL;
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	/*
	 * If we are not suspended or have not yet reached suspended
	 * mode, then let the operation proceed.
	 */
	if ((mp = *mpp) == NULL)
		return (0);

	/*
	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
	 * a vfs_ref().
	 * When no vnode is provided we must acquire a refcount on the
	 * caller-supplied mount point ourselves, to emulate that
	 * vfs_ref().
	 */
	MNT_ILOCK(mp);
	if (vp == NULL)
		MNT_REF(mp);
	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
		mp->mnt_secondary_writes++;
		mp->mnt_secondary_accwrites++;
		MNT_REL(mp);
		MNT_IUNLOCK(mp);
		return (0);
	}
	if (flags & V_NOWAIT) {
		MNT_REL(mp);
		MNT_IUNLOCK(mp);
		if (vp != NULL)
			*mpp = NULL;
		return (EWOULDBLOCK);
	}
	/*
	 * Wait for the suspension to finish.
	 */
	error = msleep(&mp->mnt_flag, MNT_MTX(mp),
		       (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
	vfs_rel(mp);
	if (error == 0)
		goto retry;
	if (vp != NULL)
		*mpp = NULL;
	return (error);
}

/*
 * Filesystem write operation has completed. If we are suspending and this
 * operation is the last one, notify the suspender that the suspension is
 * now in effect.
 */
void
vn_finished_write(mp)
	struct mount *mp;
{
	if (mp == NULL)
		return;
	MNT_ILOCK(mp);
	mp->mnt_writeopcount--;
	if (mp->mnt_writeopcount < 0)
		panic("vn_finished_write: neg cnt");
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
	    mp->mnt_writeopcount <= 0)
		wakeup(&mp->mnt_writeopcount);
	MNT_IUNLOCK(mp);
}

1094  * Filesystem secondary write operation has completed. If we are
1095  * suspending and this operation is the last one, notify the suspender
1096  * that the suspension is now in effect.
1097  */
1098 void
1099 vn_finished_secondary_write(mp)
1100 	struct mount *mp;
1101 {
1102 	if (mp == NULL)
1103 		return;
1104 	MNT_ILOCK(mp);
1105 	mp->mnt_secondary_writes--;
1106 	if (mp->mnt_secondary_writes < 0)
1107 		panic("vn_finished_secondary_write: neg cnt");
1108 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
1109 	    mp->mnt_secondary_writes <= 0)
1110 		wakeup(&mp->mnt_secondary_writes);
1111 	MNT_IUNLOCK(mp);
1112 }
1113 
1114 
1115 
/*
 * Request a filesystem to suspend write operations.
 */
int
vfs_write_suspend(mp)
	struct mount *mp;
{
	struct thread *td = curthread;
	int error;

	MNT_ILOCK(mp);
	if (mp->mnt_susp_owner == curthread) {
		MNT_IUNLOCK(mp);
		return (EALREADY);
	}
	while (mp->mnt_kern_flag & MNTK_SUSPEND)
		msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
	mp->mnt_kern_flag |= MNTK_SUSPEND;
	mp->mnt_susp_owner = curthread;
	if (mp->mnt_writeopcount > 0)
		(void) msleep(&mp->mnt_writeopcount,
		    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
	else
		MNT_IUNLOCK(mp);
	if ((error = VFS_SYNC(mp, MNT_SUSPEND, td)) != 0)
		vfs_write_resume(mp);
	return (error);
}

/*
 * Request a filesystem to resume write operations.
 */
void
vfs_write_resume(mp)
	struct mount *mp;
{

	MNT_ILOCK(mp);
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
		KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
		mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
				       MNTK_SUSPENDED);
		mp->mnt_susp_owner = NULL;
		wakeup(&mp->mnt_writeopcount);
		wakeup(&mp->mnt_flag);
		curthread->td_pflags &= ~TDP_IGNSUSP;
		MNT_IUNLOCK(mp);
		VFS_SUSP_CLEAN(mp);
	} else
		MNT_IUNLOCK(mp);
}

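/*
 * Example (sketch): a consumer that needs a quiescent filesystem, such
 * as the ffs snapshot code, pairs the two calls:
 *
 *	error = vfs_write_suspend(mp);
 *	if (error == 0) {
 *		... no primary writers are active here ...
 *		vfs_write_resume(mp);
 *	}
 */
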
/*
 * Implement kqueues for files by translating them into vnode operations.
 */
static int
vn_kqfilter(struct file *fp, struct knote *kn)
{
	int vfslocked;
	int error;

	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
	error = VOP_KQFILTER(fp->f_vnode, kn);
	VFS_UNLOCK_GIANT(vfslocked);

	return (error);
}

/*
 * Simplified in-kernel wrapper calls for extended attribute access.
 * These calls pass in a NULL credential, authorizing as "kernel" access.
 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 */
int
vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, int *buflen, char *buf, struct thread *td)
{
	struct uio	auio;
	struct iovec	iov;
	int	error;

	iov.iov_len = *buflen;
	iov.iov_base = buf;

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_offset = 0;
	auio.uio_resid = *buflen;

	if ((ioflg & IO_NODELOCKED) == 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

	/* authorize attribute retrieval as kernel */
	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
	    td);

	if ((ioflg & IO_NODELOCKED) == 0)
		VOP_UNLOCK(vp, 0);

	if (error == 0)
		*buflen = *buflen - auio.uio_resid;

	return (error);
}
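
/*
 * Example (hypothetical attribute name): fetch a system-namespace
 * attribute on a vnode that is not yet locked:
 *
 *	char buf[128];
 *	int buflen = sizeof(buf);
 *
 *	error = vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
 *	    "myattr", &buflen, buf, td);
 *
 * On success, buflen is trimmed to the number of bytes actually returned.
 */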

/*
 * XXX failure mode if partially written?
 */
int
vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, int buflen, char *buf, struct thread *td)
{
	struct uio	auio;
	struct iovec	iov;
	struct mount	*mp;
	int	error;

	iov.iov_len = buflen;
	iov.iov_base = buf;

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
			return (error);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

	/* authorize attribute setting as kernel */
	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_finished_write(mp);
		VOP_UNLOCK(vp, 0);
	}

	return (error);
}

int
vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, struct thread *td)
{
	struct mount	*mp;
	int	error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
			return (error);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

	/* authorize attribute removal as kernel */
	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
	if (error == EOPNOTSUPP)
		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
		    NULL, td);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_finished_write(mp);
		VOP_UNLOCK(vp, 0);
	}

	return (error);
}