xref: /freebsd/sys/kern/vfs_vnops.c (revision 7750ad47a9a7dbc83f87158464170c8640723293)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/fcntl.h>
43 #include <sys/file.h>
44 #include <sys/kdb.h>
45 #include <sys/stat.h>
46 #include <sys/priv.h>
47 #include <sys/proc.h>
48 #include <sys/limits.h>
49 #include <sys/lock.h>
50 #include <sys/mount.h>
51 #include <sys/mutex.h>
52 #include <sys/namei.h>
53 #include <sys/vnode.h>
54 #include <sys/bio.h>
55 #include <sys/buf.h>
56 #include <sys/filio.h>
57 #include <sys/resourcevar.h>
58 #include <sys/sx.h>
59 #include <sys/ttycom.h>
60 #include <sys/conf.h>
61 #include <sys/syslog.h>
62 #include <sys/unistd.h>
63 
64 #include <security/audit/audit.h>
65 #include <security/mac/mac_framework.h>
66 
67 #include <vm/vm.h>
68 #include <vm/vm_extern.h>
69 #include <vm/pmap.h>
70 #include <vm/vm_map.h>
71 #include <vm/vm_object.h>
72 #include <vm/vm_page.h>
73 
74 static fo_rdwr_t	vn_read;
75 static fo_rdwr_t	vn_write;
76 static fo_rdwr_t	vn_io_fault;
77 static fo_truncate_t	vn_truncate;
78 static fo_ioctl_t	vn_ioctl;
79 static fo_poll_t	vn_poll;
80 static fo_kqfilter_t	vn_kqfilter;
81 static fo_stat_t	vn_statfile;
82 static fo_close_t	vn_closefile;
83 
84 struct 	fileops vnops = {
85 	.fo_read = vn_io_fault,
86 	.fo_write = vn_io_fault,
87 	.fo_truncate = vn_truncate,
88 	.fo_ioctl = vn_ioctl,
89 	.fo_poll = vn_poll,
90 	.fo_kqfilter = vn_kqfilter,
91 	.fo_stat = vn_statfile,
92 	.fo_close = vn_closefile,
93 	.fo_chmod = vn_chmod,
94 	.fo_chown = vn_chown,
95 	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
96 };
97 
98 int
99 vn_open(ndp, flagp, cmode, fp)
100 	struct nameidata *ndp;
101 	int *flagp, cmode;
102 	struct file *fp;
103 {
104 	struct thread *td = ndp->ni_cnd.cn_thread;
105 
106 	return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
107 }
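
/*
 * Illustrative sketch (not compiled into this file): a typical in-kernel
 * caller opening a path for reading with NDINIT() and vn_open().  The
 * function name example_kern_open() and its arguments are hypothetical.
 */
#if 0
static int
example_kern_open(const char *path, struct thread *td, struct vnode **vpp)
{
	struct nameidata nd;
	int error, flags, vfslocked;

	flags = FREAD;
	NDINIT(&nd, LOOKUP, MPSAFE, UIO_SYSSPACE, path, td);
	error = vn_open(&nd, &flags, 0, NULL);
	if (error != 0)
		return (error);
	vfslocked = NDHASGIANT(&nd);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	/* vn_open() returns the vnode locked; keep the reference, drop the lock. */
	VOP_UNLOCK(nd.ni_vp, 0);
	VFS_UNLOCK_GIANT(vfslocked);
	*vpp = nd.ni_vp;
	return (0);
}
#endif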
108 
109 /*
110  * Common code for vnode open operations.
111  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
112  *
113  * Note that this does NOT free nameidata for the successful case,
114  * due to the NDINIT being done elsewhere.
115  */
116 int
117 vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
118     struct ucred *cred, struct file *fp)
119 {
120 	struct vnode *vp;
121 	struct mount *mp;
122 	struct thread *td = ndp->ni_cnd.cn_thread;
123 	struct vattr vat;
124 	struct vattr *vap = &vat;
125 	int fmode, error;
126 	accmode_t accmode;
127 	int vfslocked, mpsafe;
128 
129 	mpsafe = ndp->ni_cnd.cn_flags & MPSAFE;
130 restart:
131 	vfslocked = 0;
132 	fmode = *flagp;
133 	if (fmode & O_CREAT) {
134 		ndp->ni_cnd.cn_nameiop = CREATE;
135 		ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF |
136 		    MPSAFE;
137 		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
138 			ndp->ni_cnd.cn_flags |= FOLLOW;
139 		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
140 			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
141 		bwillwrite();
142 		if ((error = namei(ndp)) != 0)
143 			return (error);
144 		vfslocked = NDHASGIANT(ndp);
145 		if (!mpsafe)
146 			ndp->ni_cnd.cn_flags &= ~MPSAFE;
147 		if (ndp->ni_vp == NULL) {
148 			VATTR_NULL(vap);
149 			vap->va_type = VREG;
150 			vap->va_mode = cmode;
151 			if (fmode & O_EXCL)
152 				vap->va_vaflags |= VA_EXCLUSIVE;
153 			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
154 				NDFREE(ndp, NDF_ONLY_PNBUF);
155 				vput(ndp->ni_dvp);
156 				VFS_UNLOCK_GIANT(vfslocked);
157 				if ((error = vn_start_write(NULL, &mp,
158 				    V_XSLEEP | PCATCH)) != 0)
159 					return (error);
160 				goto restart;
161 			}
162 #ifdef MAC
163 			error = mac_vnode_check_create(cred, ndp->ni_dvp,
164 			    &ndp->ni_cnd, vap);
165 			if (error == 0)
166 #endif
167 				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
168 						   &ndp->ni_cnd, vap);
169 			vput(ndp->ni_dvp);
170 			vn_finished_write(mp);
171 			if (error) {
172 				VFS_UNLOCK_GIANT(vfslocked);
173 				NDFREE(ndp, NDF_ONLY_PNBUF);
174 				return (error);
175 			}
176 			fmode &= ~O_TRUNC;
177 			vp = ndp->ni_vp;
178 		} else {
179 			if (ndp->ni_dvp == ndp->ni_vp)
180 				vrele(ndp->ni_dvp);
181 			else
182 				vput(ndp->ni_dvp);
183 			ndp->ni_dvp = NULL;
184 			vp = ndp->ni_vp;
185 			if (fmode & O_EXCL) {
186 				error = EEXIST;
187 				goto bad;
188 			}
189 			fmode &= ~O_CREAT;
190 		}
191 	} else {
192 		ndp->ni_cnd.cn_nameiop = LOOKUP;
193 		ndp->ni_cnd.cn_flags = ISOPEN |
194 		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
195 		    LOCKLEAF | MPSAFE;
196 		if (!(fmode & FWRITE))
197 			ndp->ni_cnd.cn_flags |= LOCKSHARED;
198 		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
199 			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
200 		if ((error = namei(ndp)) != 0)
201 			return (error);
202 		if (!mpsafe)
203 			ndp->ni_cnd.cn_flags &= ~MPSAFE;
204 		vfslocked = NDHASGIANT(ndp);
205 		vp = ndp->ni_vp;
206 	}
207 	if (vp->v_type == VLNK) {
208 		error = EMLINK;
209 		goto bad;
210 	}
211 	if (vp->v_type == VSOCK) {
212 		error = EOPNOTSUPP;
213 		goto bad;
214 	}
215 	if (vp->v_type != VDIR && fmode & O_DIRECTORY) {
216 		error = ENOTDIR;
217 		goto bad;
218 	}
219 	accmode = 0;
220 	if (fmode & (FWRITE | O_TRUNC)) {
221 		if (vp->v_type == VDIR) {
222 			error = EISDIR;
223 			goto bad;
224 		}
225 		accmode |= VWRITE;
226 	}
227 	if (fmode & FREAD)
228 		accmode |= VREAD;
229 	if (fmode & FEXEC)
230 		accmode |= VEXEC;
231 	if ((fmode & O_APPEND) && (fmode & FWRITE))
232 		accmode |= VAPPEND;
233 #ifdef MAC
234 	error = mac_vnode_check_open(cred, vp, accmode);
235 	if (error)
236 		goto bad;
237 #endif
238 	if ((fmode & O_CREAT) == 0) {
239 		if (accmode & VWRITE) {
240 			error = vn_writechk(vp);
241 			if (error)
242 				goto bad;
243 		}
244 		if (accmode) {
245 			error = VOP_ACCESS(vp, accmode, cred, td);
246 			if (error)
247 				goto bad;
248 		}
249 	}
250 	if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
251 		goto bad;
252 
253 	if (fmode & FWRITE) {
254 		vp->v_writecount++;
255 		CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
256 		    __func__, vp, vp->v_writecount);
257 	}
258 	*flagp = fmode;
259 	ASSERT_VOP_LOCKED(vp, "vn_open_cred");
260 	if (!mpsafe)
261 		VFS_UNLOCK_GIANT(vfslocked);
262 	return (0);
263 bad:
264 	NDFREE(ndp, NDF_ONLY_PNBUF);
265 	vput(vp);
266 	VFS_UNLOCK_GIANT(vfslocked);
267 	*flagp = fmode;
268 	ndp->ni_vp = NULL;
269 	return (error);
270 }
271 
272 /*
273  * Check for write permissions on the specified vnode.
274  * Prototype text segments cannot be written.
275  */
276 int
277 vn_writechk(vp)
278 	register struct vnode *vp;
279 {
280 
281 	ASSERT_VOP_LOCKED(vp, "vn_writechk");
282 	/*
283 	 * If the vnode is in use as a process's text (executable)
284 	 * image, as indicated by VV_TEXT being set, we can't
285 	 * allow writing.
286 	 */
287 	if (vp->v_vflag & VV_TEXT)
288 		return (ETXTBSY);
289 
290 	return (0);
291 }
292 
293 /*
294  * Vnode close call
295  */
296 int
297 vn_close(vp, flags, file_cred, td)
298 	register struct vnode *vp;
299 	int flags;
300 	struct ucred *file_cred;
301 	struct thread *td;
302 {
303 	struct mount *mp;
304 	int error, lock_flags;
305 
306 	if (!(flags & FWRITE) && vp->v_mount != NULL &&
307 	    vp->v_mount->mnt_kern_flag & MNTK_EXTENDED_SHARED)
308 		lock_flags = LK_SHARED;
309 	else
310 		lock_flags = LK_EXCLUSIVE;
311 
312 	VFS_ASSERT_GIANT(vp->v_mount);
313 
314 	vn_start_write(vp, &mp, V_WAIT);
315 	vn_lock(vp, lock_flags | LK_RETRY);
316 	if (flags & FWRITE) {
317 		VNASSERT(vp->v_writecount > 0, vp,
318 		    ("vn_close: negative writecount"));
319 		vp->v_writecount--;
320 		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
321 		    __func__, vp, vp->v_writecount);
322 	}
323 	error = VOP_CLOSE(vp, flags, file_cred, td);
324 	vput(vp);
325 	vn_finished_write(mp);
326 	return (error);
327 }
328 
329 /*
330  * Heuristic to detect sequential operation.
331  */
332 static int
333 sequential_heuristic(struct uio *uio, struct file *fp)
334 {
335 
336 	if (atomic_load_acq_int(&(fp->f_flag)) & FRDAHEAD)
337 		return (fp->f_seqcount << IO_SEQSHIFT);
338 
339 	/*
340 	 * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
341 	 * that the first I/O is normally considered to be slightly
342 	 * sequential.  Seeking to offset 0 doesn't change sequentiality
343 	 * unless previous seeks have reduced f_seqcount to 0, in which
344 	 * case offset 0 is not special.
345 	 */
346 	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
347 	    uio->uio_offset == fp->f_nextoff) {
348 		/*
349 		 * f_seqcount is in units of fixed-size blocks so that it
350 		 * depends mainly on the amount of sequential I/O and not
351 		 * much on the number of sequential I/O's.  The fixed size
352 		 * of 16384 is hard-coded here since it is (not quite) just
353 		 * a magic size that works well here.  This size is more
354 		 * closely related to the best I/O size for real disks than
355 		 * to any block size used by software.
356 		 */
357 		fp->f_seqcount += howmany(uio->uio_resid, 16384);
358 		if (fp->f_seqcount > IO_SEQMAX)
359 			fp->f_seqcount = IO_SEQMAX;
360 		return (fp->f_seqcount << IO_SEQSHIFT);
361 	}
362 
363 	/* Not sequential.  Quickly draw down sequentiality. */
364 	if (fp->f_seqcount > 1)
365 		fp->f_seqcount = 1;
366 	else
367 		fp->f_seqcount = 0;
368 	return (0);
369 }
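
/*
 * Worked example of the heuristic above (illustrative only): a read of
 * 64kB issued at the expected offset adds howmany(65536, 16384) == 4 to
 * f_seqcount, and the value handed back to the caller is
 * f_seqcount << IO_SEQSHIFT, which filesystems interpret as a read-ahead
 * hint; it saturates at IO_SEQMAX << IO_SEQSHIFT.
 */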
370 
371 /*
372  * Package up an I/O request on a vnode into a uio and do it.
373  */
374 int
375 vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
376     enum uio_seg segflg, int ioflg, struct ucred *active_cred,
377     struct ucred *file_cred, ssize_t *aresid, struct thread *td)
378 {
379 	struct uio auio;
380 	struct iovec aiov;
381 	struct mount *mp;
382 	struct ucred *cred;
383 	void *rl_cookie;
384 	int error, lock_flags;
385 
386 	VFS_ASSERT_GIANT(vp->v_mount);
387 
388 	auio.uio_iov = &aiov;
389 	auio.uio_iovcnt = 1;
390 	aiov.iov_base = base;
391 	aiov.iov_len = len;
392 	auio.uio_resid = len;
393 	auio.uio_offset = offset;
394 	auio.uio_segflg = segflg;
395 	auio.uio_rw = rw;
396 	auio.uio_td = td;
397 	error = 0;
398 
399 	if ((ioflg & IO_NODELOCKED) == 0) {
400 		if (rw == UIO_READ) {
401 			rl_cookie = vn_rangelock_rlock(vp, offset,
402 			    offset + len);
403 		} else {
404 			rl_cookie = vn_rangelock_wlock(vp, offset,
405 			    offset + len);
406 		}
407 		mp = NULL;
408 		if (rw == UIO_WRITE) {
409 			if (vp->v_type != VCHR &&
410 			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
411 			    != 0)
412 				goto out;
413 			if (MNT_SHARED_WRITES(mp) ||
414 			    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
415 				lock_flags = LK_SHARED;
416 			else
417 				lock_flags = LK_EXCLUSIVE;
418 		} else
419 			lock_flags = LK_SHARED;
420 		vn_lock(vp, lock_flags | LK_RETRY);
421 	} else
422 		rl_cookie = NULL;
423 
424 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
425 #ifdef MAC
426 	if ((ioflg & IO_NOMACCHECK) == 0) {
427 		if (rw == UIO_READ)
428 			error = mac_vnode_check_read(active_cred, file_cred,
429 			    vp);
430 		else
431 			error = mac_vnode_check_write(active_cred, file_cred,
432 			    vp);
433 	}
434 #endif
435 	if (error == 0) {
436 		if (file_cred != NULL)
437 			cred = file_cred;
438 		else
439 			cred = active_cred;
440 		if (rw == UIO_READ)
441 			error = VOP_READ(vp, &auio, ioflg, cred);
442 		else
443 			error = VOP_WRITE(vp, &auio, ioflg, cred);
444 	}
445 	if (aresid)
446 		*aresid = auio.uio_resid;
447 	else
448 		if (auio.uio_resid && error == 0)
449 			error = EIO;
450 	if ((ioflg & IO_NODELOCKED) == 0) {
451 		VOP_UNLOCK(vp, 0);
452 		if (mp != NULL)
453 			vn_finished_write(mp);
454 	}
455  out:
456 	if (rl_cookie != NULL)
457 		vn_rangelock_unlock(vp, rl_cookie);
458 	return (error);
459 }
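
/*
 * Illustrative sketch (not compiled into this file): reading the start of
 * a file that the caller has already locked, using vn_rdwr() with a
 * kernel buffer.  example_read_header() and its arguments are hypothetical.
 */
#if 0
static int
example_read_header(struct vnode *vp, struct thread *td, void *buf, int buflen)
{
	ssize_t resid;
	int error;

	/* IO_NODELOCKED: the caller already holds the vnode lock. */
	error = vn_rdwr(UIO_READ, vp, buf, buflen, (off_t)0, UIO_SYSSPACE,
	    IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td);
	if (error == 0 && resid != 0)
		error = EIO;	/* short read */
	return (error);
}
#endif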
460 
461 /*
462  * Package up an I/O request on a vnode into a uio and do it.  The I/O
463  * request is split up into smaller chunks and we try to avoid saturating
464  * the buffer cache while potentially holding a vnode locked, so we
465  * call bwillwrite() before calling vn_rdwr().  We also call kern_yield()
466  * to give other processes a chance to lock the vnode (either other processes
467  * core'ing the same binary, or unrelated processes scanning the directory).
468  */
469 int
470 vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
471     file_cred, aresid, td)
472 	enum uio_rw rw;
473 	struct vnode *vp;
474 	void *base;
475 	size_t len;
476 	off_t offset;
477 	enum uio_seg segflg;
478 	int ioflg;
479 	struct ucred *active_cred;
480 	struct ucred *file_cred;
481 	size_t *aresid;
482 	struct thread *td;
483 {
484 	int error = 0;
485 	ssize_t iaresid;
486 
487 	VFS_ASSERT_GIANT(vp->v_mount);
488 
489 	do {
490 		int chunk;
491 
492 		/*
493 		 * Force `offset' to a multiple of MAXBSIZE except possibly
494 		 * for the first chunk, so that filesystems only need to
495 		 * write full blocks except possibly for the first and last
496 		 * chunks.
497 		 */
498 		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
499 
500 		if (chunk > len)
501 			chunk = len;
502 		if (rw != UIO_READ && vp->v_type == VREG)
503 			bwillwrite();
504 		iaresid = 0;
505 		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
506 		    ioflg, active_cred, file_cred, &iaresid, td);
507 		len -= chunk;	/* aresid calc already includes length */
508 		if (error)
509 			break;
510 		offset += chunk;
511 		base = (char *)base + chunk;
512 		kern_yield(PRI_USER);
513 	} while (len);
514 	if (aresid)
515 		*aresid = len + iaresid;
516 	return (error);
517 }
518 
519 /*
520  * File table vnode read routine.
521  */
522 static int
523 vn_read(fp, uio, active_cred, flags, td)
524 	struct file *fp;
525 	struct uio *uio;
526 	struct ucred *active_cred;
527 	int flags;
528 	struct thread *td;
529 {
530 	struct vnode *vp;
531 	int error, ioflag;
532 	struct mtx *mtxp;
533 	int advice, vfslocked;
534 	off_t offset;
535 
536 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
537 	    uio->uio_td, td));
538 	mtxp = NULL;
539 	vp = fp->f_vnode;
540 	ioflag = 0;
541 	if (fp->f_flag & FNONBLOCK)
542 		ioflag |= IO_NDELAY;
543 	if (fp->f_flag & O_DIRECT)
544 		ioflag |= IO_DIRECT;
545 	advice = POSIX_FADV_NORMAL;
546 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
547 	/*
548 	 * According to McKusick the vn lock was protecting f_offset here.
549 	 * It is now protected by the FOFFSET_LOCKED flag.
550 	 */
551 	if ((flags & FOF_OFFSET) == 0 || fp->f_advice != NULL) {
552 		mtxp = mtx_pool_find(mtxpool_sleep, fp);
553 		mtx_lock(mtxp);
554 		if ((flags & FOF_OFFSET) == 0) {
555 			while (fp->f_vnread_flags & FOFFSET_LOCKED) {
556 				fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
557 				msleep(&fp->f_vnread_flags, mtxp, PUSER - 1,
558 				    "vnread offlock", 0);
559 			}
560 			fp->f_vnread_flags |= FOFFSET_LOCKED;
561 			uio->uio_offset = fp->f_offset;
562 		}
563 		if (fp->f_advice != NULL &&
564 		    uio->uio_offset >= fp->f_advice->fa_start &&
565 		    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
566 			advice = fp->f_advice->fa_advice;
567 		mtx_unlock(mtxp);
568 	}
569 	vn_lock(vp, LK_SHARED | LK_RETRY);
570 
571 	switch (advice) {
572 	case POSIX_FADV_NORMAL:
573 	case POSIX_FADV_SEQUENTIAL:
574 	case POSIX_FADV_NOREUSE:
575 		ioflag |= sequential_heuristic(uio, fp);
576 		break;
577 	case POSIX_FADV_RANDOM:
578 		/* Disable read-ahead for random I/O. */
579 		break;
580 	}
581 	offset = uio->uio_offset;
582 
583 #ifdef MAC
584 	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
585 	if (error == 0)
586 #endif
587 		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
588 	if ((flags & FOF_OFFSET) == 0) {
589 		fp->f_offset = uio->uio_offset;
590 		mtx_lock(mtxp);
591 		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
592 			wakeup(&fp->f_vnread_flags);
593 		fp->f_vnread_flags = 0;
594 		mtx_unlock(mtxp);
595 	}
596 	fp->f_nextoff = uio->uio_offset;
597 	VOP_UNLOCK(vp, 0);
598 	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
599 	    offset != uio->uio_offset)
600 		error = VOP_ADVISE(vp, offset, uio->uio_offset - 1,
601 		    POSIX_FADV_DONTNEED);
602 	VFS_UNLOCK_GIANT(vfslocked);
603 	return (error);
604 }
605 
606 /*
607  * File table vnode write routine.
608  */
609 static int
610 vn_write(fp, uio, active_cred, flags, td)
611 	struct file *fp;
612 	struct uio *uio;
613 	struct ucred *active_cred;
614 	int flags;
615 	struct thread *td;
616 {
617 	struct vnode *vp;
618 	struct mount *mp;
619 	int error, ioflag, lock_flags;
620 	struct mtx *mtxp;
621 	int advice, vfslocked;
622 
623 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
624 	    uio->uio_td, td));
625 	vp = fp->f_vnode;
626 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
627 	if (vp->v_type == VREG)
628 		bwillwrite();
629 	ioflag = IO_UNIT;
630 	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
631 		ioflag |= IO_APPEND;
632 	if (fp->f_flag & FNONBLOCK)
633 		ioflag |= IO_NDELAY;
634 	if (fp->f_flag & O_DIRECT)
635 		ioflag |= IO_DIRECT;
636 	if ((fp->f_flag & O_FSYNC) ||
637 	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
638 		ioflag |= IO_SYNC;
639 	mp = NULL;
640 	if (vp->v_type != VCHR &&
641 	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
642 		goto unlock;
643 
644 	if ((MNT_SHARED_WRITES(mp) ||
645 	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) &&
646 	    (flags & FOF_OFFSET) != 0) {
647 		lock_flags = LK_SHARED;
648 	} else {
649 		lock_flags = LK_EXCLUSIVE;
650 	}
651 
652 	vn_lock(vp, lock_flags | LK_RETRY);
653 	if ((flags & FOF_OFFSET) == 0)
654 		uio->uio_offset = fp->f_offset;
655 	advice = POSIX_FADV_NORMAL;
656 	if (fp->f_advice != NULL) {
657 		mtxp = mtx_pool_find(mtxpool_sleep, fp);
658 		mtx_lock(mtxp);
659 		if (fp->f_advice != NULL &&
660 		    uio->uio_offset >= fp->f_advice->fa_start &&
661 		    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
662 			advice = fp->f_advice->fa_advice;
663 		mtx_unlock(mtxp);
664 	}
665 	switch (advice) {
666 	case POSIX_FADV_NORMAL:
667 	case POSIX_FADV_SEQUENTIAL:
668 		ioflag |= sequential_heuristic(uio, fp);
669 		break;
670 	case POSIX_FADV_RANDOM:
671 		/* XXX: Is this correct? */
672 		break;
673 	case POSIX_FADV_NOREUSE:
674 		/*
675 		 * Request the underlying FS to discard the buffers
676 		 * and pages after the I/O is complete.
677 		 */
678 		ioflag |= IO_DIRECT;
679 		break;
680 	}
681 
682 #ifdef MAC
683 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
684 	if (error == 0)
685 #endif
686 		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
687 	if ((flags & FOF_OFFSET) == 0)
688 		fp->f_offset = uio->uio_offset;
689 	fp->f_nextoff = uio->uio_offset;
690 	VOP_UNLOCK(vp, 0);
691 	if (vp->v_type != VCHR)
692 		vn_finished_write(mp);
693 unlock:
694 	VFS_UNLOCK_GIANT(vfslocked);
695 	return (error);
696 }
697 
698 static const int io_hold_cnt = 16;
699 
700 /*
701  * vn_io_fault() is a wrapper around vn_read() and vn_write() used to
702  * prevent the following deadlock:
703  *
704  * Assume that thread A reads from vnode vp1 into userspace
705  * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
706  * currently not resident, then the system ends up with the call chain
707  *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
708  *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
709  * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
710  * If, at the same time, thread B reads from vnode vp2 into buffer buf2
711  * backed by the pages of vnode vp1, and some page in buf2 is not
712  * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
713  *
714  * To prevent the lock order reversal and deadlock, vn_io_fault() does
715  * not allow page faults to happen during VOP_READ() or VOP_WRITE().
716  * Instead, it first tries to do the whole range of i/o with page
717  * faults disabled.  If all pages in the i/o buffer are resident and
718  * mapped, the VOP will succeed (barring genuine filesystem errors).
719  * Otherwise, we get back EFAULT, and vn_io_fault() falls back to doing
720  * i/o in chunks, with all pages in the chunk prefaulted and held
721  * using vm_fault_quick_hold_pages().
722  *
723  * Filesystems using this deadlock avoidance scheme should use the
724  * array of held pages from the uio, saved in curthread->td_ma,
725  * instead of calling uiomove().  A helper function,
726  * vn_io_fault_uiomove(), converts a uiomove request into
727  * uiomove_fromphys() over the td_ma array.
728  *
729  * Since vnode locks do not cover the whole i/o anymore, rangelocks
730  * make the current i/o request atomic with respect to other i/os and
731  * truncations.
732  */
733 static int
734 vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
735     int flags, struct thread *td)
736 {
737 	vm_page_t ma[io_hold_cnt + 2];
738 	struct uio *uio_clone, short_uio;
739 	struct iovec short_iovec[1];
740 	fo_rdwr_t *doio;
741 	struct vnode *vp;
742 	void *rl_cookie;
743 	struct mount *mp;
744 	vm_page_t *prev_td_ma;
745 	int cnt, error, save, saveheld, prev_td_ma_cnt;
746 	vm_offset_t addr, end;
747 	vm_prot_t prot;
748 	size_t len, resid;
749 	ssize_t adv;
750 
751 	if (uio->uio_rw == UIO_READ)
752 		doio = vn_read;
753 	else
754 		doio = vn_write;
755 	vp = fp->f_vnode;
756 	if (uio->uio_segflg != UIO_USERSPACE || vp->v_type != VREG ||
757 	    ((mp = vp->v_mount) != NULL &&
758 	    (mp->mnt_kern_flag & MNTK_NO_IOPF) == 0))
759 		return (doio(fp, uio, active_cred, flags, td));
760 
761 	/*
762 	 * UFS follows the IO_UNIT directive and rolls back both
763 	 * uio_offset and uio_resid if an error is encountered during the
764 	 * operation.  But, since the iovec may already be advanced, the
765 	 * uio is still in an inconsistent state.
766 	 *
767 	 * Cache a copy of the original uio, which is advanced to the redo
768 	 * point using UIO_NOCOPY below.
769 	 */
770 	uio_clone = cloneuio(uio);
771 	resid = uio->uio_resid;
772 
773 	short_uio.uio_segflg = UIO_USERSPACE;
774 	short_uio.uio_rw = uio->uio_rw;
775 	short_uio.uio_td = uio->uio_td;
776 
777 	if (uio->uio_rw == UIO_READ) {
778 		prot = VM_PROT_WRITE;
779 		rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
780 		    uio->uio_offset + uio->uio_resid);
781 	} else {
782 		prot = VM_PROT_READ;
783 		if ((fp->f_flag & O_APPEND) != 0 || (flags & FOF_OFFSET) == 0)
784 			/* For appenders, punt and lock the whole range. */
785 			rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
786 		else
787 			rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
788 			    uio->uio_offset + uio->uio_resid);
789 	}
790 
791 	save = vm_fault_disable_pagefaults();
792 	error = doio(fp, uio, active_cred, flags, td);
793 	if (error != EFAULT)
794 		goto out;
795 
796 	uio_clone->uio_segflg = UIO_NOCOPY;
797 	uiomove(NULL, resid - uio->uio_resid, uio_clone);
798 	uio_clone->uio_segflg = uio->uio_segflg;
799 
800 	saveheld = curthread_pflags_set(TDP_UIOHELD);
801 	prev_td_ma = td->td_ma;
802 	prev_td_ma_cnt = td->td_ma_cnt;
803 
804 	while (uio_clone->uio_resid != 0) {
805 		len = uio_clone->uio_iov->iov_len;
806 		if (len == 0) {
807 			KASSERT(uio_clone->uio_iovcnt >= 1,
808 			    ("iovcnt underflow"));
809 			uio_clone->uio_iov++;
810 			uio_clone->uio_iovcnt--;
811 			continue;
812 		}
813 
814 		addr = (vm_offset_t)uio_clone->uio_iov->iov_base;
815 		end = round_page(addr + len);
816 		cnt = howmany(end - trunc_page(addr), PAGE_SIZE);
817 		/*
818 		 * A perfectly misaligned address and length could cause
819 		 * both the start and the end of the chunk to use a partial
820 		 * page.  The +2 accounts for such a situation.
821 		 */
822 		if (cnt > io_hold_cnt + 2) {
823 			len = io_hold_cnt * PAGE_SIZE;
824 			KASSERT(howmany(round_page(addr + len) -
825 			    trunc_page(addr), PAGE_SIZE) <= io_hold_cnt + 2,
826 			    ("cnt overflow"));
827 		}
828 		cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
829 		    addr, len, prot, ma, io_hold_cnt + 2);
830 		if (cnt == -1) {
831 			error = EFAULT;
832 			break;
833 		}
834 		short_uio.uio_iov = &short_iovec[0];
835 		short_iovec[0].iov_base = (void *)addr;
836 		short_uio.uio_iovcnt = 1;
837 		short_uio.uio_resid = short_iovec[0].iov_len = len;
838 		short_uio.uio_offset = uio_clone->uio_offset;
839 		td->td_ma = ma;
840 		td->td_ma_cnt = cnt;
841 
842 		error = doio(fp, &short_uio, active_cred, flags, td);
843 		vm_page_unhold_pages(ma, cnt);
844 		adv = len - short_uio.uio_resid;
845 
846 		uio_clone->uio_iov->iov_base =
847 		    (char *)uio_clone->uio_iov->iov_base + adv;
848 		uio_clone->uio_iov->iov_len -= adv;
849 		uio_clone->uio_resid -= adv;
850 		uio_clone->uio_offset += adv;
851 
852 		uio->uio_resid -= adv;
853 		uio->uio_offset += adv;
854 
855 		if (error != 0 || adv == 0)
856 			break;
857 	}
858 	td->td_ma = prev_td_ma;
859 	td->td_ma_cnt = prev_td_ma_cnt;
860 	curthread_pflags_restore(saveheld);
861 out:
862 	vm_fault_enable_pagefaults(save);
863 	vn_rangelock_unlock(vp, rl_cookie);
864 	free(uio_clone, M_IOV);
865 	return (error);
866 }
867 
868 /*
869  * Helper function to perform the requested uiomove operation using
870  * the held pages for the uio->uio_iov[0].iov_base buffer instead of
871  * copyin/copyout.  Access to the pages with uiomove_fromphys()
872  * instead of iov_base prevents page faults that could occur due to
873  * pmap_collect() invalidating the mapping created by
874  * vm_fault_quick_hold_pages(), or to the pageout daemon, page laundry
875  * or object cleanup revoking the write access from page mappings.
876  *
877  * Filesystems that specify MNTK_NO_IOPF shall use vn_io_fault_uiomove()
878  * instead of plain uiomove().
879  */
880 int
881 vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
882 {
883 	struct uio transp_uio;
884 	struct iovec transp_iov[1];
885 	struct thread *td;
886 	size_t adv;
887 	int error, pgadv;
888 
889 	td = curthread;
890 	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
891 	    uio->uio_segflg != UIO_USERSPACE)
892 		return (uiomove(data, xfersize, uio));
893 
894 	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
895 	transp_iov[0].iov_base = data;
896 	transp_uio.uio_iov = &transp_iov[0];
897 	transp_uio.uio_iovcnt = 1;
898 	if (xfersize > uio->uio_resid)
899 		xfersize = uio->uio_resid;
900 	transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
901 	transp_uio.uio_offset = 0;
902 	transp_uio.uio_segflg = UIO_SYSSPACE;
903 	/*
904 	 * Since transp_iov points to data, and td_ma page array
905 	 * corresponds to original uio->uio_iov, we need to invert the
906 	 * direction of the i/o operation as passed to
907 	 * uiomove_fromphys().
908 	 */
909 	switch (uio->uio_rw) {
910 	case UIO_WRITE:
911 		transp_uio.uio_rw = UIO_READ;
912 		break;
913 	case UIO_READ:
914 		transp_uio.uio_rw = UIO_WRITE;
915 		break;
916 	}
917 	transp_uio.uio_td = uio->uio_td;
918 	error = uiomove_fromphys(td->td_ma,
919 	    ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
920 	    xfersize, &transp_uio);
921 	adv = xfersize - transp_uio.uio_resid;
922 	pgadv =
923 	    (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
924 	    (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
925 	td->td_ma += pgadv;
926 	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
927 	    pgadv));
928 	td->td_ma_cnt -= pgadv;
929 	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
930 	uio->uio_iov->iov_len -= adv;
931 	uio->uio_resid -= adv;
932 	uio->uio_offset += adv;
933 	return (error);
934 }
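
/*
 * Illustrative sketch (not compiled into this file): a filesystem that
 * sets MNTK_NO_IOPF substitutes vn_io_fault_uiomove() for plain uiomove()
 * when copying buffer cache data in its VOP_READ/VOP_WRITE loops.  The
 * helper below and its arguments are hypothetical.
 */
#if 0
static int
example_fs_copy_chunk(struct buf *bp, int blkoffset, int xfersize,
    struct uio *uio)
{

	/*
	 * Behaves like uiomove(), but honors the pages held by
	 * vn_io_fault() in curthread->td_ma when TDP_UIOHELD is set.
	 */
	return (vn_io_fault_uiomove((char *)bp->b_data + blkoffset,
	    xfersize, uio));
}
#endif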
935 
936 /*
937  * File table truncate routine.
938  */
939 static int
940 vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
941     struct thread *td)
942 {
943 	struct vattr vattr;
944 	struct mount *mp;
945 	struct vnode *vp;
946 	void *rl_cookie;
947 	int vfslocked;
948 	int error;
949 
950 	vp = fp->f_vnode;
951 
952 	/*
953 	 * Lock the whole range for truncation.  Otherwise split i/o
954 	 * might happen partly before and partly after the truncation.
955 	 */
956 	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
957 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
958 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
959 	if (error)
960 		goto out1;
961 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
962 	if (vp->v_type == VDIR) {
963 		error = EISDIR;
964 		goto out;
965 	}
966 #ifdef MAC
967 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
968 	if (error)
969 		goto out;
970 #endif
971 	error = vn_writechk(vp);
972 	if (error == 0) {
973 		VATTR_NULL(&vattr);
974 		vattr.va_size = length;
975 		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
976 	}
977 out:
978 	VOP_UNLOCK(vp, 0);
979 	vn_finished_write(mp);
980 out1:
981 	VFS_UNLOCK_GIANT(vfslocked);
982 	vn_rangelock_unlock(vp, rl_cookie);
983 	return (error);
984 }
985 
986 /*
987  * File table vnode stat routine.
988  */
989 static int
990 vn_statfile(fp, sb, active_cred, td)
991 	struct file *fp;
992 	struct stat *sb;
993 	struct ucred *active_cred;
994 	struct thread *td;
995 {
996 	struct vnode *vp = fp->f_vnode;
997 	int vfslocked;
998 	int error;
999 
1000 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1001 	vn_lock(vp, LK_SHARED | LK_RETRY);
1002 	error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
1003 	VOP_UNLOCK(vp, 0);
1004 	VFS_UNLOCK_GIANT(vfslocked);
1005 
1006 	return (error);
1007 }
1008 
1009 /*
1010  * Stat a vnode; implementation for the stat syscall
1011  */
1012 int
1013 vn_stat(vp, sb, active_cred, file_cred, td)
1014 	struct vnode *vp;
1015 	register struct stat *sb;
1016 	struct ucred *active_cred;
1017 	struct ucred *file_cred;
1018 	struct thread *td;
1019 {
1020 	struct vattr vattr;
1021 	register struct vattr *vap;
1022 	int error;
1023 	u_short mode;
1024 
1025 #ifdef MAC
1026 	error = mac_vnode_check_stat(active_cred, file_cred, vp);
1027 	if (error)
1028 		return (error);
1029 #endif
1030 
1031 	vap = &vattr;
1032 
1033 	/*
1034 	 * Initialize defaults for new and unusual fields, so that file
1035 	 * systems which don't support these fields don't need to know
1036 	 * about them.
1037 	 */
1038 	vap->va_birthtime.tv_sec = -1;
1039 	vap->va_birthtime.tv_nsec = 0;
1040 	vap->va_fsid = VNOVAL;
1041 	vap->va_rdev = NODEV;
1042 
1043 	error = VOP_GETATTR(vp, vap, active_cred);
1044 	if (error)
1045 		return (error);
1046 
1047 	/*
1048 	 * Zero the spare stat fields
1049 	 */
1050 	bzero(sb, sizeof *sb);
1051 
1052 	/*
1053 	 * Copy from vattr table
1054 	 */
1055 	if (vap->va_fsid != VNOVAL)
1056 		sb->st_dev = vap->va_fsid;
1057 	else
1058 		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
1059 	sb->st_ino = vap->va_fileid;
1060 	mode = vap->va_mode;
1061 	switch (vap->va_type) {
1062 	case VREG:
1063 		mode |= S_IFREG;
1064 		break;
1065 	case VDIR:
1066 		mode |= S_IFDIR;
1067 		break;
1068 	case VBLK:
1069 		mode |= S_IFBLK;
1070 		break;
1071 	case VCHR:
1072 		mode |= S_IFCHR;
1073 		break;
1074 	case VLNK:
1075 		mode |= S_IFLNK;
1076 		break;
1077 	case VSOCK:
1078 		mode |= S_IFSOCK;
1079 		break;
1080 	case VFIFO:
1081 		mode |= S_IFIFO;
1082 		break;
1083 	default:
1084 		return (EBADF);
1085 	}
1086 	sb->st_mode = mode;
1087 	sb->st_nlink = vap->va_nlink;
1088 	sb->st_uid = vap->va_uid;
1089 	sb->st_gid = vap->va_gid;
1090 	sb->st_rdev = vap->va_rdev;
1091 	if (vap->va_size > OFF_MAX)
1092 		return (EOVERFLOW);
1093 	sb->st_size = vap->va_size;
1094 	sb->st_atim = vap->va_atime;
1095 	sb->st_mtim = vap->va_mtime;
1096 	sb->st_ctim = vap->va_ctime;
1097 	sb->st_birthtim = vap->va_birthtime;
1098 
1099 	/*
1100 	 * According to www.opengroup.org, the meaning of st_blksize is
1101 	 *   "a filesystem-specific preferred I/O block size for this
1102 	 *    object.  In some filesystem types, this may vary from file
1103 	 *    to file"
1104 	 * Use a minimum/default of PAGE_SIZE (e.g. for VCHR).
1105 	 */
1106 
1107 	sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
1108 
1109 	sb->st_flags = vap->va_flags;
1110 	if (priv_check(td, PRIV_VFS_GENERATION))
1111 		sb->st_gen = 0;
1112 	else
1113 		sb->st_gen = vap->va_gen;
1114 
1115 	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
1116 	return (0);
1117 }
1118 
1119 /*
1120  * File table vnode ioctl routine.
1121  */
1122 static int
1123 vn_ioctl(fp, com, data, active_cred, td)
1124 	struct file *fp;
1125 	u_long com;
1126 	void *data;
1127 	struct ucred *active_cred;
1128 	struct thread *td;
1129 {
1130 	struct vnode *vp = fp->f_vnode;
1131 	struct vattr vattr;
1132 	int vfslocked;
1133 	int error;
1134 
1135 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1136 	error = ENOTTY;
1137 	switch (vp->v_type) {
1138 	case VREG:
1139 	case VDIR:
1140 		if (com == FIONREAD) {
1141 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1142 			error = VOP_GETATTR(vp, &vattr, active_cred);
1143 			VOP_UNLOCK(vp, 0);
1144 			if (!error)
1145 				*(int *)data = vattr.va_size - fp->f_offset;
1146 		}
1147 		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
1148 			error = 0;
1149 		else
1150 			error = VOP_IOCTL(vp, com, data, fp->f_flag,
1151 			    active_cred, td);
1152 		break;
1153 
1154 	default:
1155 		break;
1156 	}
1157 	VFS_UNLOCK_GIANT(vfslocked);
1158 	return (error);
1159 }
1160 
1161 /*
1162  * File table vnode poll routine.
1163  */
1164 static int
1165 vn_poll(fp, events, active_cred, td)
1166 	struct file *fp;
1167 	int events;
1168 	struct ucred *active_cred;
1169 	struct thread *td;
1170 {
1171 	struct vnode *vp;
1172 	int vfslocked;
1173 	int error;
1174 
1175 	vp = fp->f_vnode;
1176 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1177 #ifdef MAC
1178 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1179 	error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
1180 	VOP_UNLOCK(vp, 0);
1181 	if (!error)
1182 #endif
1183 
1184 	error = VOP_POLL(vp, events, fp->f_cred, td);
1185 	VFS_UNLOCK_GIANT(vfslocked);
1186 	return (error);
1187 }
1188 
1189 /*
1190  * Acquire the requested lock and then check for validity.  LK_RETRY
1191  * permits vn_lock to return doomed vnodes.
1192  */
1193 int
1194 _vn_lock(struct vnode *vp, int flags, char *file, int line)
1195 {
1196 	int error;
1197 
1198 	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
1199 	    ("vn_lock called with no locktype."));
1200 	do {
1201 #ifdef DEBUG_VFS_LOCKS
1202 		KASSERT(vp->v_holdcnt != 0,
1203 		    ("vn_lock %p: zero hold count", vp));
1204 #endif
1205 		error = VOP_LOCK1(vp, flags, file, line);
1206 		flags &= ~LK_INTERLOCK;	/* Interlock is always dropped. */
1207 		KASSERT((flags & LK_RETRY) == 0 || error == 0,
1208 		    ("LK_RETRY set with incompatible flags (0x%x) or an error occurred (%d)",
1209 		    flags, error));
1210 		/*
1211 		 * Callers specify LK_RETRY if they wish to get dead vnodes.
1212 		 * If RETRY is not set, we return ENOENT instead.
1213 		 */
1214 		if (error == 0 && vp->v_iflag & VI_DOOMED &&
1215 		    (flags & LK_RETRY) == 0) {
1216 			VOP_UNLOCK(vp, 0);
1217 			error = ENOENT;
1218 			break;
1219 		}
1220 	} while (flags & LK_RETRY && error != 0);
1221 	return (error);
1222 }
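
/*
 * Illustrative sketch (not compiled into this file): the two common
 * calling patterns for vn_lock(), where vp and error are hypothetical
 * locals of the enclosing caller.  With LK_RETRY the lock always
 * succeeds, even on a doomed vnode; without it the caller must be
 * prepared to handle ENOENT for a vnode reclaimed while sleeping.
 */
#if 0
	/* Always acquire the lock; check VI_DOOMED afterwards if needed. */
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	/* Refuse to operate on a vnode reclaimed while we slept. */
	error = vn_lock(vp, LK_SHARED);
	if (error != 0)
		return (error);	/* ENOENT for a doomed vnode */
#endif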
1223 
1224 /*
1225  * File table vnode close routine.
1226  */
1227 static int
1228 vn_closefile(fp, td)
1229 	struct file *fp;
1230 	struct thread *td;
1231 {
1232 	struct vnode *vp;
1233 	struct flock lf;
1234 	int vfslocked;
1235 	int error;
1236 
1237 	vp = fp->f_vnode;
1238 
1239 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1240 	if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
1241 		lf.l_whence = SEEK_SET;
1242 		lf.l_start = 0;
1243 		lf.l_len = 0;
1244 		lf.l_type = F_UNLCK;
1245 		(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
1246 	}
1247 
1248 	fp->f_ops = &badfileops;
1249 
1250 	error = vn_close(vp, fp->f_flag, fp->f_cred, td);
1251 	VFS_UNLOCK_GIANT(vfslocked);
1252 	return (error);
1253 }
1254 
1255 /*
1256  * Prepare to start a filesystem write operation. If the operation is
1257  * permitted, then we bump the count of operations in progress and
1258  * proceed. If a suspend request is in progress, we wait until the
1259  * suspension is over, and then proceed.
1260  */
1261 int
1262 vn_start_write(vp, mpp, flags)
1263 	struct vnode *vp;
1264 	struct mount **mpp;
1265 	int flags;
1266 {
1267 	struct mount *mp;
1268 	int error;
1269 
1270 	error = 0;
1271 	/*
1272 	 * If a vnode is provided, get and return the mount point to
1273 	 * which it will write.
1274 	 */
1275 	if (vp != NULL) {
1276 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1277 			*mpp = NULL;
1278 			if (error != EOPNOTSUPP)
1279 				return (error);
1280 			return (0);
1281 		}
1282 	}
1283 	if ((mp = *mpp) == NULL)
1284 		return (0);
1285 
1286 	/*
1287 	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
1288 	 * a vfs_ref().
1289 	 * If a vnode is not provided, we need to acquire a refcount
1290 	 * on the caller-provided mountpoint too, in order to
1291 	 * emulate a vfs_ref().
1292 	 */
1293 	MNT_ILOCK(mp);
1294 	if (vp == NULL)
1295 		MNT_REF(mp);
1296 
1297 	/*
1298 	 * Check on status of suspension.
1299 	 */
1300 	if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
1301 	    mp->mnt_susp_owner != curthread) {
1302 		while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1303 			if (flags & V_NOWAIT) {
1304 				error = EWOULDBLOCK;
1305 				goto unlock;
1306 			}
1307 			error = msleep(&mp->mnt_flag, MNT_MTX(mp),
1308 			    (PUSER - 1) | (flags & PCATCH), "suspfs", 0);
1309 			if (error)
1310 				goto unlock;
1311 		}
1312 	}
1313 	if (flags & V_XSLEEP)
1314 		goto unlock;
1315 	mp->mnt_writeopcount++;
1316 unlock:
1317 	if (error != 0 || (flags & V_XSLEEP) != 0)
1318 		MNT_REL(mp);
1319 	MNT_IUNLOCK(mp);
1320 	return (error);
1321 }
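
/*
 * Illustrative sketch (not compiled into this file): the canonical
 * bracketing of a vnode modification with vn_start_write() and
 * vn_finished_write(), as used by most callers of these functions.
 * example_setattr() and its arguments are hypothetical.
 */
#if 0
static int
example_setattr(struct vnode *vp, struct vattr *vap, struct ucred *cred)
{
	struct mount *mp;
	int error;

	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = VOP_SETATTR(vp, vap, cred);
	VOP_UNLOCK(vp, 0);
	vn_finished_write(mp);
	return (error);
}
#endif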
1322 
1323 /*
1324  * Secondary suspension. Used by operations such as vop_inactive
1325  * routines that are needed by the higher level functions. These
1326  * are allowed to proceed until all the higher level functions have
1327  * completed (indicated by mnt_writeopcount dropping to zero). At that
1328  * time, these operations are halted until the suspension is over.
1329  */
1330 int
1331 vn_start_secondary_write(vp, mpp, flags)
1332 	struct vnode *vp;
1333 	struct mount **mpp;
1334 	int flags;
1335 {
1336 	struct mount *mp;
1337 	int error;
1338 
1339  retry:
1340 	if (vp != NULL) {
1341 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1342 			*mpp = NULL;
1343 			if (error != EOPNOTSUPP)
1344 				return (error);
1345 			return (0);
1346 		}
1347 	}
1348 	/*
1349 	 * If we are not suspended or have not yet reached suspended
1350 	 * mode, then let the operation proceed.
1351 	 */
1352 	if ((mp = *mpp) == NULL)
1353 		return (0);
1354 
1355 	/*
1356 	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
1357 	 * a vfs_ref().
1358 	 * If a vnode is not provided, we need to acquire a refcount
1359 	 * on the caller-provided mountpoint too, in order to
1360 	 * emulate a vfs_ref().
1361 	 */
1362 	MNT_ILOCK(mp);
1363 	if (vp == NULL)
1364 		MNT_REF(mp);
1365 	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
1366 		mp->mnt_secondary_writes++;
1367 		mp->mnt_secondary_accwrites++;
1368 		MNT_IUNLOCK(mp);
1369 		return (0);
1370 	}
1371 	if (flags & V_NOWAIT) {
1372 		MNT_REL(mp);
1373 		MNT_IUNLOCK(mp);
1374 		return (EWOULDBLOCK);
1375 	}
1376 	/*
1377 	 * Wait for the suspension to finish.
1378 	 */
1379 	error = msleep(&mp->mnt_flag, MNT_MTX(mp),
1380 		       (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
1381 	vfs_rel(mp);
1382 	if (error == 0)
1383 		goto retry;
1384 	return (error);
1385 }
1386 
1387 /*
1388  * Filesystem write operation has completed. If we are suspending and this
1389  * operation is the last one, notify the suspender that the suspension is
1390  * now in effect.
1391  */
1392 void
1393 vn_finished_write(mp)
1394 	struct mount *mp;
1395 {
1396 	if (mp == NULL)
1397 		return;
1398 	MNT_ILOCK(mp);
1399 	MNT_REL(mp);
1400 	mp->mnt_writeopcount--;
1401 	if (mp->mnt_writeopcount < 0)
1402 		panic("vn_finished_write: neg cnt");
1403 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
1404 	    mp->mnt_writeopcount <= 0)
1405 		wakeup(&mp->mnt_writeopcount);
1406 	MNT_IUNLOCK(mp);
1407 }
1408 
1409 
1410 /*
1411  * Filesystem secondary write operation has completed. If we are
1412  * suspending and this operation is the last one, notify the suspender
1413  * that the suspension is now in effect.
1414  */
1415 void
1416 vn_finished_secondary_write(mp)
1417 	struct mount *mp;
1418 {
1419 	if (mp == NULL)
1420 		return;
1421 	MNT_ILOCK(mp);
1422 	MNT_REL(mp);
1423 	mp->mnt_secondary_writes--;
1424 	if (mp->mnt_secondary_writes < 0)
1425 		panic("vn_finished_secondary_write: neg cnt");
1426 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
1427 	    mp->mnt_secondary_writes <= 0)
1428 		wakeup(&mp->mnt_secondary_writes);
1429 	MNT_IUNLOCK(mp);
1430 }
1431 
1432 
1433 
1434 /*
1435  * Request a filesystem to suspend write operations.
1436  */
1437 int
1438 vfs_write_suspend(mp)
1439 	struct mount *mp;
1440 {
1441 	int error;
1442 
1443 	MNT_ILOCK(mp);
1444 	if (mp->mnt_susp_owner == curthread) {
1445 		MNT_IUNLOCK(mp);
1446 		return (EALREADY);
1447 	}
1448 	while (mp->mnt_kern_flag & MNTK_SUSPEND)
1449 		msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
1450 	mp->mnt_kern_flag |= MNTK_SUSPEND;
1451 	mp->mnt_susp_owner = curthread;
1452 	if (mp->mnt_writeopcount > 0)
1453 		(void) msleep(&mp->mnt_writeopcount,
1454 		    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
1455 	else
1456 		MNT_IUNLOCK(mp);
1457 	if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0)
1458 		vfs_write_resume(mp);
1459 	return (error);
1460 }
1461 
1462 /*
1463  * Request a filesystem to resume write operations.
1464  */
1465 void
1466 vfs_write_resume(mp)
1467 	struct mount *mp;
1468 {
1469 
1470 	MNT_ILOCK(mp);
1471 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1472 		KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
1473 		mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
1474 				       MNTK_SUSPENDED);
1475 		mp->mnt_susp_owner = NULL;
1476 		wakeup(&mp->mnt_writeopcount);
1477 		wakeup(&mp->mnt_flag);
1478 		curthread->td_pflags &= ~TDP_IGNSUSP;
1479 		MNT_IUNLOCK(mp);
1480 		VFS_SUSP_CLEAN(mp);
1481 	} else
1482 		MNT_IUNLOCK(mp);
1483 }
1484 
1485 /*
1486  * Implement kqueues for files by translating them to the vnode operation.
1487  */
1488 static int
1489 vn_kqfilter(struct file *fp, struct knote *kn)
1490 {
1491 	int vfslocked;
1492 	int error;
1493 
1494 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
1495 	error = VOP_KQFILTER(fp->f_vnode, kn);
1496 	VFS_UNLOCK_GIANT(vfslocked);
1497 
1498 	return (error);
1499 }
1500 
1501 /*
1502  * Simplified in-kernel wrapper calls for extended attribute access.
1503  * Both calls pass in a NULL credential, authorizing as "kernel" access.
1504  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
1505  */
1506 int
1507 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
1508     const char *attrname, int *buflen, char *buf, struct thread *td)
1509 {
1510 	struct uio	auio;
1511 	struct iovec	iov;
1512 	int	error;
1513 
1514 	iov.iov_len = *buflen;
1515 	iov.iov_base = buf;
1516 
1517 	auio.uio_iov = &iov;
1518 	auio.uio_iovcnt = 1;
1519 	auio.uio_rw = UIO_READ;
1520 	auio.uio_segflg = UIO_SYSSPACE;
1521 	auio.uio_td = td;
1522 	auio.uio_offset = 0;
1523 	auio.uio_resid = *buflen;
1524 
1525 	if ((ioflg & IO_NODELOCKED) == 0)
1526 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1527 
1528 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1529 
1530 	/* authorize attribute retrieval as kernel */
1531 	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
1532 	    td);
1533 
1534 	if ((ioflg & IO_NODELOCKED) == 0)
1535 		VOP_UNLOCK(vp, 0);
1536 
1537 	if (error == 0) {
1538 		*buflen = *buflen - auio.uio_resid;
1539 	}
1540 
1541 	return (error);
1542 }
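
/*
 * Illustrative sketch (not compiled into this file): fetching a
 * system-namespace extended attribute into a fixed-size kernel buffer.
 * The attribute name "example" is hypothetical, vp and td come from the
 * enclosing (unshown) caller, and IO_NODELOCKED assumes the caller
 * already holds the vnode lock.
 */
#if 0
	char buf[64];
	int buflen, error;

	buflen = sizeof(buf);
	error = vn_extattr_get(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM,
	    "example", &buflen, buf, td);
	/* On success, buflen holds the number of bytes actually returned. */
#endif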
1543 
1544 /*
1545  * XXX failure mode if partially written?
1546  */
1547 int
1548 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
1549     const char *attrname, int buflen, char *buf, struct thread *td)
1550 {
1551 	struct uio	auio;
1552 	struct iovec	iov;
1553 	struct mount	*mp;
1554 	int	error;
1555 
1556 	iov.iov_len = buflen;
1557 	iov.iov_base = buf;
1558 
1559 	auio.uio_iov = &iov;
1560 	auio.uio_iovcnt = 1;
1561 	auio.uio_rw = UIO_WRITE;
1562 	auio.uio_segflg = UIO_SYSSPACE;
1563 	auio.uio_td = td;
1564 	auio.uio_offset = 0;
1565 	auio.uio_resid = buflen;
1566 
1567 	if ((ioflg & IO_NODELOCKED) == 0) {
1568 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
1569 			return (error);
1570 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1571 	}
1572 
1573 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1574 
1575 	/* authorize attribute setting as kernel */
1576 	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
1577 
1578 	if ((ioflg & IO_NODELOCKED) == 0) {
1579 		vn_finished_write(mp);
1580 		VOP_UNLOCK(vp, 0);
1581 	}
1582 
1583 	return (error);
1584 }
1585 
1586 int
1587 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
1588     const char *attrname, struct thread *td)
1589 {
1590 	struct mount	*mp;
1591 	int	error;
1592 
1593 	if ((ioflg & IO_NODELOCKED) == 0) {
1594 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
1595 			return (error);
1596 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1597 	}
1598 
1599 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1600 
1601 	/* authorize attribute removal as kernel */
1602 	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
1603 	if (error == EOPNOTSUPP)
1604 		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
1605 		    NULL, td);
1606 
1607 	if ((ioflg & IO_NODELOCKED) == 0) {
1608 		vn_finished_write(mp);
1609 		VOP_UNLOCK(vp, 0);
1610 	}
1611 
1612 	return (error);
1613 }
1614 
1615 int
1616 vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
1617 {
1618 	struct mount *mp;
1619 	int ltype, error;
1620 
1621 	mp = vp->v_mount;
1622 	ltype = VOP_ISLOCKED(vp);
1623 	KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
1624 	    ("vn_vget_ino: vp not locked"));
1625 	error = vfs_busy(mp, MBF_NOWAIT);
1626 	if (error != 0) {
1627 		vfs_ref(mp);
1628 		VOP_UNLOCK(vp, 0);
1629 		error = vfs_busy(mp, 0);
1630 		vn_lock(vp, ltype | LK_RETRY);
1631 		vfs_rel(mp);
1632 		if (error != 0)
1633 			return (ENOENT);
1634 		if (vp->v_iflag & VI_DOOMED) {
1635 			vfs_unbusy(mp);
1636 			return (ENOENT);
1637 		}
1638 	}
1639 	VOP_UNLOCK(vp, 0);
1640 	error = VFS_VGET(mp, ino, lkflags, rvp);
1641 	vfs_unbusy(mp);
1642 	vn_lock(vp, ltype | LK_RETRY);
1643 	if (vp->v_iflag & VI_DOOMED) {
1644 		if (error == 0)
1645 			vput(*rvp);
1646 		error = ENOENT;
1647 	}
1648 	return (error);
1649 }
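
/*
 * Illustrative sketch (not compiled into this file): a filesystem lookup
 * routine fetching ".." with vn_vget_ino(), which temporarily drops the
 * lock on the child directory to avoid a lock-order reversal.  dp,
 * parent_ino and vpp are hypothetical names for the locked directory
 * vnode, the parent's inode number and the result pointer.
 */
#if 0
	struct vnode *pvp;
	int error;

	error = vn_vget_ino(dp, parent_ino, LK_EXCLUSIVE, &pvp);
	if (error == 0)
		*vpp = pvp;	/* pvp is returned locked */
#endif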
1650 
1651 int
1652 vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
1653     const struct thread *td)
1654 {
1655 
1656 	if (vp->v_type != VREG || td == NULL)
1657 		return (0);
1658 	PROC_LOCK(td->td_proc);
1659 	if ((uoff_t)uio->uio_offset + uio->uio_resid >
1660 	    lim_cur(td->td_proc, RLIMIT_FSIZE)) {
1661 		kern_psignal(td->td_proc, SIGXFSZ);
1662 		PROC_UNLOCK(td->td_proc);
1663 		return (EFBIG);
1664 	}
1665 	PROC_UNLOCK(td->td_proc);
1666 	return (0);
1667 }
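
/*
 * Illustrative sketch (not compiled into this file): a VOP_WRITE
 * implementation checking the writing process's RLIMIT_FSIZE before
 * extending a regular file, as several filesystems in the tree do.
 * vp, uio and error come from the enclosing (unshown) write routine.
 */
#if 0
	error = vn_rlimit_fsize(vp, uio, uio->uio_td);
	if (error != 0)
		return (error);	/* SIGXFSZ has already been posted */
#endif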
1668 
1669 int
1670 vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
1671     struct thread *td)
1672 {
1673 	struct vnode *vp;
1674 	int error, vfslocked;
1675 
1676 	vp = fp->f_vnode;
1677 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1678 #ifdef AUDIT
1679 	vn_lock(vp, LK_SHARED | LK_RETRY);
1680 	AUDIT_ARG_VNODE1(vp);
1681 	VOP_UNLOCK(vp, 0);
1682 #endif
1683 	error = setfmode(td, active_cred, vp, mode);
1684 	VFS_UNLOCK_GIANT(vfslocked);
1685 	return (error);
1686 }
1687 
1688 int
1689 vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
1690     struct thread *td)
1691 {
1692 	struct vnode *vp;
1693 	int error, vfslocked;
1694 
1695 	vp = fp->f_vnode;
1696 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1697 #ifdef AUDIT
1698 	vn_lock(vp, LK_SHARED | LK_RETRY);
1699 	AUDIT_ARG_VNODE1(vp);
1700 	VOP_UNLOCK(vp, 0);
1701 #endif
1702 	error = setfown(td, active_cred, vp, uid, gid);
1703 	VFS_UNLOCK_GIANT(vfslocked);
1704 	return (error);
1705 }
1706 
1707 void
1708 vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
1709 {
1710 	vm_object_t object;
1711 
1712 	if ((object = vp->v_object) == NULL)
1713 		return;
1714 	VM_OBJECT_LOCK(object);
1715 	vm_object_page_remove(object, start, end, 0);
1716 	VM_OBJECT_UNLOCK(object);
1717 }
1718 
1719 int
1720 vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
1721 {
1722 	struct vattr va;
1723 	daddr_t bn, bnp;
1724 	uint64_t bsize;
1725 	off_t noff;
1726 	int error;
1727 
1728 	KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
1729 	    ("Wrong command %lu", cmd));
1730 
1731 	if (vn_lock(vp, LK_SHARED) != 0)
1732 		return (EBADF);
1733 	if (vp->v_type != VREG) {
1734 		error = ENOTTY;
1735 		goto unlock;
1736 	}
1737 	error = VOP_GETATTR(vp, &va, cred);
1738 	if (error != 0)
1739 		goto unlock;
1740 	noff = *off;
1741 	if (noff >= va.va_size) {
1742 		error = ENXIO;
1743 		goto unlock;
1744 	}
1745 	bsize = vp->v_mount->mnt_stat.f_iosize;
1746 	for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize) {
1747 		error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
1748 		if (error == EOPNOTSUPP) {
1749 			error = ENOTTY;
1750 			goto unlock;
1751 		}
1752 		if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
1753 		    (bnp != -1 && cmd == FIOSEEKDATA)) {
1754 			noff = bn * bsize;
1755 			if (noff < *off)
1756 				noff = *off;
1757 			goto unlock;
1758 		}
1759 	}
1760 	if (noff > va.va_size)
1761 		noff = va.va_size;
1762 	/* noff == va.va_size. There is an implicit hole at the end of the file. */
1763 	if (cmd == FIOSEEKDATA)
1764 		error = ENXIO;
1765 unlock:
1766 	VOP_UNLOCK(vp, 0);
1767 	if (error == 0)
1768 		*off = noff;
1769 	return (error);
1770 }
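
/*
 * Illustrative sketch (not compiled into this file): a filesystem ioctl
 * handler implementing FIOSEEKHOLE/FIOSEEKDATA by delegating to
 * vn_bmap_seekhole().  example_fs_ioctl() is hypothetical, but the
 * argument structure follows the generic VOP_IOCTL interface.
 */
#if 0
static int
example_fs_ioctl(struct vop_ioctl_args *ap)
{

	switch (ap->a_command) {
	case FIOSEEKDATA:
	case FIOSEEKHOLE:
		return (vn_bmap_seekhole(ap->a_vp, ap->a_command,
		    (off_t *)ap->a_data, ap->a_cred));
	default:
		return (ENOTTY);
	}
}
#endif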
1771